/* readfile.c -- part of kdrill
 * (source-browser page header removed; it was not part of the file)
 */

/*
 * This file is for just setting up the structs, etc
 */
/* stdio SHOULD get included by Xos.h or something */
/* but it doesn't with sunos, at least */

#include <errno.h>
#include <stdio.h>       /* for popen, and other things */
#include <stdlib.h>
#include <ctype.h>
#include <Xfuncs.h>     /* handles bzero redefine stuff */
#include <Xlib.h>
#include <Xatom.h>
#include <Xutil.h>
#include <Intrinsic.h>
#include <StringDefs.h>
#include <Xos.h>

#include "defs.h"
#include "utils.h"
#include "externs.h"

/* translations[] keeps track of which kanji it is okay to test the
 *    user on. Likewise with numberofkanji, highestkanji, and lowestkanji.
 *    YES, it is best to keep this in one large array; otherwise
 *    it would be difficult to switch between grade levels.
 *
 *    In this file, readstructs() stores each kanjidic entry at
 *    translations[Kanji] and counts it in numberofkanji, while
 *    readedict() stores edict entries from index MAXKANJIALLOWED+1
 *    upward. lowestkanji and highestkanji are the lowest and highest
 *    indexes those two readers have filled in.
 */
struct translationstruct *translations[MAXTRANSLATIONSALLOWED];
int numberofkanji=0, highestkanji=0, lowestkanji=0;

/* Names of the dictionary files, taken from the "kdictfile" and
 * "edictfile" resources by readstructs() and readedict() respectively.
 * NULL until the corresponding reader has run.
 */
static char *dictname=NULL;
static char *edictname=NULL;

/* getline:
 *    Reads one line of a dictionary from fp into s.
 *
 *    At most MAXLINELEN-1 bytes are read per call, so s must have room
 *    for MAXLINELEN bytes. We do NOT use malloc: a line longer than
 *    that limit is simply handed back in pieces by successive calls.
 *
 *    Any trailing line terminator (LF, CR, or CR LF) is removed, so the
 *    caller never sees it. An empty string is handled safely.
 *
 *    Returns true (1) if a line was read
 *    Returns false (0) on EOF or read error
 *
 *    Used in "readstructs()", below. Also "readedict()"
 *
 *    NOTE(review): the name is the same as the POSIX getline() that
 *    <stdio.h> declares on many systems -- confirm the build does not
 *    expose that prototype, or rename this together with its callers.
 */

int getline(FILE *fp,unsigned char *s)
{
      size_t len;

      if(fgets((char *)s,MAXLINELEN,fp)==NULL)
            return 0; /* probably EOF. We dont care */

      /* Walk back over the terminator. Testing len first keeps us from
       * touching s[-1] when fgets hands back an empty string.
       */
      len=strlen((char *)s);
      while((len>0) && ((s[len-1]=='\n') || (s[len-1]=='\r')))
            s[--len]='\0';

      return 1;

}

/* StripBrackets:
 *    Gets rid of those annoying {english}{english2} brackets.
 *    Copies source to dest, turning "{a}{b}" into "a: b":
 *      - the leading '{' is dropped,
 *      - every later '{' becomes ": ",
 *      - every '}' is dropped.
 *    Copying stops at the first newline or at the end of the string,
 *    and dest is always NUL-terminated.
 *
 *    If the first char of source is not '{', dest is set to the
 *    empty string.
 *
 *    dest must be supplied by the caller and be large enough; for
 *    well-formed input the result is never longer than the source.
 *    See also StripSlash, below
 */
void StripBrackets(char *dest,unsigned char *source)
{
      unsigned char *parse;

      if(source[0] != '{'){
            dest[0] = '\0';
            return;
      }

      /* Test each char BEFORE consuming it, so that a bare "{" (or
       * "{\n") yields an empty string instead of running past the
       * terminator.
       */
      for(parse = &source[1]; (*parse != '\n') && (*parse != '\0'); parse++){
            switch(*parse){
                  case '{':
                        *dest++ = ':';
                        *dest++ = ' ';
                        break;
                  case '}':
                        break;
                  default:
                        *dest++ = (char)*parse;
                        break;
            }
      }
      *dest = '\0';
      return;
}
/* StripSlash
 *    Gets rid of /english/english2/ slashes.
 *    Copies the cleaned up version of source to topdest:
 *    the text between the FIRST '/' and the LAST '/' is copied, and
 *    every '/' inside it is rewritten as ':'.
 *    So "/one/two/" becomes "one:two".
 *
 *    This is for readedict. Probably nothing else should use it.
 *    Modeled directly after StripBrackets.
 *
 *    The first char of source must be '/', and there must be at least
 *    one char between it and the last '/'. Otherwise the line is bad,
 *    and topdest is set to the empty string.
 *
 *    topdest must be supplied by the caller, must not overlap source,
 *    and needs room for strlen(source) bytes at most.
 *
 *    Source is actually assumed to be regular ascii signed char,
 *    but declared as unsigned to stop compiler warnings.
 *
 * return 0 OKAY, 1 bad line
 */
int StripSlash(char *topdest,unsigned char *source)
{
      char *src = (char *)source;
      char *last;
      char *scan;
      size_t englen;

      /* Give the caller a defined (empty) result on every error path */
      topdest[0] = '\0';

      if(src[0] != '/'){
            return 1;
      }

      /* cannot be NULL: src[0] is a '/' */
      last = strrchr(src, '/');
      if(last < &src[2]){
            fprintf(stderr,"Error: english part too short\n");
            fprintf(stderr,"%s\n", src);
            return 1;
      }

      englen = (size_t)(last - src) - 1;
      memcpy(topdest, &src[1], englen);
      topdest[englen] = '\0';

      /* we've copied the relevant part over to topdest. Now rewrite
       * the separators in-place
       */
      for(scan = strchr(topdest, '/'); scan != NULL; scan = strchr(scan, '/')){
            *scan = ':';
      }

      return 0;
}

/* trans_to_index:
 *    Report which slot of translations[] a translation lives in,
 *    as recorded in its kdrill_index field.
 */
int trans_to_index(TRANSLATION trans)
{
      int slot;

      slot = trans->kdrill_index;
      return slot;
}


/* ReadEdictPron:
 * read in kanji/kana part of edictfile line.
 * format is:
 *
 * KANA /english_1/.../
 *
 * or
 *
 * KANJI [KANA] /english_1/english_2/.../
 *
 * *Pstring points at the start of the line on entry. On return it
 * points at the first '/' of the english part, or at the terminating
 * NUL if there is none.
 *
 * trans->pronunciation is always set, to a dup_16() copy of the kana.
 * trans->kanji is only set (also via dup_16()) when a '[' is seen; the
 * caller must preset it for the KANA-only form.
 */
void ReadEdictPron(unsigned char **Pstring, struct translationstruct *trans)
{
      /* Every element stored below uses up at least one input byte, so
       * a line of fewer than MAXLINELEN bytes cannot overrun kbuff.
       */
      XChar2b kbuff[MAXLINELEN];
      XChar2b *kptr = kbuff;
      unsigned char *parse = *Pstring;

      /* Read in a 16-bit string.
       * We dont know if its kana or kanji yet
       */
      while(*parse && (*parse != '/'))
      {
            switch(*parse)
            {
               case ' ':
                  /* 0x2121 is ' ' */
                  kptr->byte1 = 0x21;
                  kptr->byte2 = 0x21;
                  kptr++;
                  parse++;
                  break;

               case '[':
                  /* oops.. the kanji/kana switch */
                  /* save what must be kanji, then start
                   * on kana
                   */
                  kptr->byte1 = 0;
                  kptr->byte2 = 0;

                  trans->kanji =  dup_16(kbuff);
                  kptr = kbuff;
                  /* now reset buffer, and read in another char16
                   * string
                   */
                  parse++;
                  break;

               case ']':
                  parse++;
                  while(*parse && (*parse != '/'))
                        parse++;
                  /* and then we will fall out of the top loop  */
                  break;

               default:
                  if(parse[1] == '\0'){
                        /* Truncated two-byte char: drop the lone byte
                         * rather than read past the terminator. The
                         * loop then ends on the NUL.
                         */
                        parse++;
                        break;
                  }
                  /* one 16-bit char, stored with the high bits stripped */
                  kptr->byte1= (*parse++ & 0x7f);
                  kptr->byte2= (*parse++ & 0x7f);
                  kptr++;
                  break;
            }
      }

      /* when we come out here, we will ALWAYS have kana in
       * the kbuff
       */

      kptr->byte1 = 0;
      kptr->byte2 = 0;

      trans->pronunciation =  dup_16(kbuff);


      *Pstring = parse;
}


/* ReadPronunciation:
 * Okay, it's not actually pronunciation we're reading in.
 * We are reading the "on-yomi" and "kun-yomi" readings
 * in kanjidic. Also, the optional okurigana.
 *
 * Format:
 *     reading{.oku} [reading{.oku}] ...
 *
 * 0x2500 stuff is katakana? (ON?)
 * 0x2400 is hiragana?  (KUN?)
 *
 * The readings are converted into one 16-bit string:
 *   - each two-byte char is stored with its high bits stripped,
 *   - okurigana (the part after '.') is wrapped in 16-bit parentheses
 *     (0x214a ... 0x214b),
 *   - '-' becomes the 16-bit code 0x2141,
 *   - ' ' becomes the 16-bit blank 0x2121,
 *   - any other 7-bit char is skipped.
 * Parsing stops at '{', end of line, or end of string.
 *
 * Returns a dup_16() copy of that string, and leaves *Pstring pointing
 * at the char that stopped the parse.
 * Returns 0, with *Pstring untouched, if the field starts with '{'
 * (only english exists) or if the line would overflow the buffer.
 */
XChar2b * ReadPronunciation(unsigned char **Pstring)
{

      XChar2b kbuff[MAXLINELEN];
      XChar2b *kptr = kbuff;
      unsigned char *parse = *Pstring;
      enum {ERROR,READING, OKURIGANA,BLANK, DONE};
      /* state must live outside the loop: declared with an initializer
       * inside it, it would be reset on every pass.
       */
      int state=BLANK;

      if(*parse == '{'){
            /* only english exists,
             *  (no kanji, even)
             *   so set character to be unusable.
             */
            return 0;
      }
      while(*parse == ' ')
            parse++;

      /* THIS is going to get yeuky.
       *  We are going to parse a line segment which has
       *  reading.oku  pairs.
       * This is REALLY annoying, because the line jumps between
       * 8 -bit and 16-bit chars
       */

      /* okay, bad practice... you tell me what would be better :-/ */

      while(1){

            /* One pass stores at most two elements, or one plus a
             * closing paren when the input is cut short, and the
             * terminator needs one more. So insist on three free
             * slots BEFORE writing anything.
             */
            if((kptr - kbuff) > MAXLINELEN - 3){
                  fprintf(stderr,"ERROR! overflow reading in kanjidic\n");
                  fprintf(stderr,"%s\n",*Pstring);
                  return 0;
            }

            switch(*parse){
                  case '.':
                        parse++;

                        /* we ALWAYS need to close this off later */
                        state = OKURIGANA;
                        /* open paren */
                        kptr->byte1 = 0x21;
                        kptr->byte2 = 0x4a;
                        kptr++;

                        break;
                  case '-':
                        parse++;
#ifdef USEEXTRABLANKS
                        if(state == BLANK){
                              kptr->byte1 = 0x21;
                              kptr->byte2 = 0x21;
                              kptr++;
                        }
#endif
                        kptr->byte1 = 0x21;
                        kptr->byte2 = 0x41;
                        kptr++;
#ifdef USEEXTRABLANKS
                        if(state != BLANK){
                              kptr->byte1 = 0x21;
                              kptr->byte2 = 0x21;
                              kptr++;
                        }
#endif
                        continue;
                        /* start at top of while again */

                  case '\0':
                  case '\n':
                  case '\r':
                  case '{':
                        if(state == OKURIGANA){
                              /* close paren */
                              kptr->byte1 = 0x21;
                              kptr->byte2 = 0x4b;
                              kptr++;
                        }
                        state = DONE;
                        break;

                  case ' ':
                        if(state == OKURIGANA){
                              /* close paren */
                              kptr->byte1 = 0x21;
                              kptr->byte2 = 0x4b;
                              kptr++;
                        }
                        state = BLANK;

                        parse++;
                        kptr->byte1 = 0x21;
                        kptr->byte2 = 0x21;
                        kptr++;
                        break;

                  default:
                        if(*parse <127){
                              /* stray 7-bit char: skip it */
                              if(state == OKURIGANA){
                                    /* close paren */
                                    kptr->byte1 = 0x21;
                                    kptr->byte2 = 0x4b;
                                    kptr++;
                                    puts("Kdrill.. error on kana read-in... ");
                                    puts("Expecting high bit char to start after '.'");
                                    printf("%s\n",*Pstring);

                              }
                              state = BLANK;
                              parse++;
                        } else {
                              if(state != OKURIGANA)
                                    state = READING;
                        }
                        break;
            }

            if(state == DONE){
                  break;
            }
            if(state == BLANK)
                  continue;

            /* else read in another char -- but never step over the
             * terminator. A line that ends right after a '.', or in
             * the middle of a two-byte char, finishes the parse here.
             */
            if((parse[0] == '\0') || (parse[1] == '\0')){
                  if(parse[0] != '\0')
                        parse++;    /* drop the lone half char */
                  if(state == OKURIGANA){
                        /* close paren */
                        kptr->byte1 = 0x21;
                        kptr->byte2 = 0x4b;
                        kptr++;
                  }
                  break;
            }
            kptr->byte1= (*parse++ & 0x7f);
            kptr->byte2= (*parse++ & 0x7f);
            kptr++;

      } /* while(1) */

      /* copy out to struct, and exit */
      kptr->byte1 = 0;
      kptr->byte2 = 0;

      *Pstring = parse;
      return dup_16(kbuff);

}


/* readedict()
 * 
 *    Read in "edict.gz" if it exists
 *    [readstructs handles kanjidic reading]
 *    
 *    We only make very partial entries for edict entries
 *    We just fill out "english" and "pronunciation" entries.
 *
 *    If we cannot extract a kanji entry, the kanji pointer of a
 *    translation will be set to a shared string of '8' on its side
 *
 *    Note that we always start entries at index
 *    translations[MAXKANJIALLOWED+1]. This is to attempt to keep
 *    usefiles working
 *    
 */
void readedict()
{
      unsigned char instring[MAXLINELEN];
      unsigned char *parse, *slashparse;
      int slashcount;
      char edict[MAXLINELEN]; /*PATH to dictionary */
      FILE *fp;
      TRANSLATION newk=NULL,lastk;
      int nextindex = MAXKANJIALLOWED+1;
      int linecount=0;
      static XChar2b no_kanji[2]=
      {
            {0x0, 0x0},
            {0x0,  0x0}
      };

      no_kanji[0].byte1 = (NOKANJI >> 8);
      no_kanji[0].byte2 = (NOKANJI & 0xff);


      /* the following will be NULL if kanjidic not read in */
      lastk = translations[highestkanji];

      GetXtrmString("edictfile","Edictfile",edict);
      edictname = edict;

      if(strncmp(edictname,"none",4)==0){
            fprintf(stderr,"edictfile set to 'none'. Skippping.\n");
            return;
      }

      edictname=malloc(strlen(edict)+1);
      strcpy(edictname, edict);
      fp = open_compressed(edictname);
      if(fp == NULL)
      {
            fprintf(stderr,"Cannot open edict file %s. Skipping.\n",
                  edictname);
            return;
      }
      printf("Opened dictionary %s \n",edictname);
      if(highestkanji == 0)
      {
            lowestkanji = nextindex;
      }

      while(getline(fp, instring) != 0)
      {
            int instrlen;

            linecount++;
            if(linecount%1000 == 0)
            {
                  putchar('.');
                  fflush(stdout);
            }


            if(newk == NULL)
            {
                  newk =  (struct translationstruct *)
                        malloc(sizeof(struct translationstruct));
                  if(newk == NULL)
                  {
                        fprintf(stderr,"OUT OF MEMORY!!\n");
                        exit(errno);
                  }
            }

            bzero(newk, sizeof(*newk));

            /* 1- read first part
             * 2- read optional [part]
             * 3- read english part
             */

            parse = instring;

            newk->kanji = no_kanji;

            ReadEdictPron(&parse, newk);
            if(newk->pronunciation == NULL)
            {
                  fprintf(stderr,"Error reading edict\n");
                  newk = NULL;
                  continue;
            }

            while((*parse != '/') && *parse)
            {
                  parse++;
            }
            slashcount=1;
            slashparse = parse;
            while(*slashparse++)
            {
                  if(*slashparse =='/')
                        slashcount++;
            }
            /* need extra space for expansion */
            instrlen = strlen((char *)parse)+1+ slashcount*4;
            
            newk->english = (char *) malloc(instrlen);
            if(newk->english == NULL){
                  perror("Cannot allocate memory for translation table\n");
                  exit(errno);            
            }
            
            if(StripSlash(newk->english, parse)!=0){
                  fprintf(stderr, "bad line: %s\n", instring);
            }

            /* Success! Set pointers appropriately */
            newk->kdrill_index=nextindex;
            translations[nextindex++] = newk;
            if(lastk != NULL)
            {
                  lastk->nextk = newk;
            }
            lastk = newk;
            newk = NULL;
            
      }

      if(isapipe(fp)){
            pclose(fp);
      } else {
            fclose(fp);
      }
      if(nextindex != MAXKANJIALLOWED+1)
      {
            highestkanji = nextindex-1;
      }

      puts("");

      puts("NOTE: an \"infinity\" sign means there is no kanji.");
      puts("  Switch to \"show meaning\" option to show alternates.");

      return;

}

/* skipfromthree:
 * lets make sure we have one single unified skip encoding here!
 *
 * Packs the three numbers of a SKIP code ("Pone-two-three") into one
 * short, as  (one<<12) | (two<<8) | three.
 *
 * one and two must be in 0..0xf, three in 0..0xff.
 * Returns 0 for anything outside those ranges (corrupted entry).
 */
short skipfromthree(int one, int two, int three){

      int SKIPnum;

      /* Validate BEFORE shifting: left-shifting a negative int is
       * undefined behaviour.
       */
      if((one<0) || (one>0xf) ||
         (two<0) || (two>0xf) ||
         (three<0) || (three>0xff))
      {
#ifdef DEBUG
              printf("corrupted SKIP ('Px-x-x') entry: %d-%d-%d\n",
                     one, two, three);
#endif
              return 0;
      }

      SKIPnum = (one<<12) | (two<<8) | three;

      return (short)(SKIPnum&0xffff);
}


/* parseskip
 * Take a string pointing to the first char AFTER the "P", in
 * kanjidic.
 * So we expect a string like "4-5-11 xxx xxx xxx"
 *
 * The first number must be one char wide, the second one or two chars
 * wide, and both must be followed by a '-'.
 *
 * We will then convert the three numbers into single byte values,
 * and put them in the short we return.
 * In hex, with a full short being [f][f][f][f], that would look like
 * [1][2][3][3], in nibble positions.
 * Although you really shouldn't care what we do with it, just remember that it
 * is a short. We call skipfromthree(), and so should anything else!
 *
 * Returns 0 if the entry is corrupted, which includes a string that
 * ends early: we never look beyond its terminating NUL.
 */
short parseskip(char *input)
{
      int one, two, three;

      if((input == NULL) || (*input == '\0'))
      {
#ifdef DEBUG
            puts("corrupted SKIP ('Px-x-x') entry");
#endif
            return 0;
      }

      one = atoi(input);

      /* now on a char of the string, or on its NUL at worst */
      input++;
      if(*input != '-')
      {
#ifdef DEBUG
            puts("corrupted SKIP ('Px-x-x') entry");
#endif
            return 0;
      }
      input++;
      two = atoi(input);

      /* step over the one or two chars of the second number,
       * but stop dead at the end of the string
       */
      if(*input != '\0')
            input++;
      if((*input != '-') && (*input != '\0'))
            input++;
      if(*input != '-')
      {
#ifdef DEBUG
            puts("corrupted SKIP ('Px-x-x') entry");
#endif
            return 0;
      }
      input++;
      three = atoi(input);

      return skipfromthree(one, two, three);
}

/* readstructs:
 *    the main dictionary reading routine for "kanjidic".
 *    Reads the file named by the "kdictfile" resource (a name starting
 *    with "none" disables it), and fills in the global translations[]
 *    with all that is available for each kanji in the range
 *    MINKANJIALLOWED..MAXKANJIALLOWED: the index fields, grade,
 *    "pronunciation", english translation, and frequency of use
 *    (by native speakers).
 *
 *    Lines shorter than 10 chars, and lines with no readings (english
 *    only), are skipped.
 *
 *    Each entry is stored at translations[Kanji] and chained onto the
 *    previous one through nextk. numberofkanji, lowestkanji and
 *    highestkanji are updated for stored entries only.
 *    Exits the program if memory runs out.
 */
void readstructs(){
      unsigned char instring[MAXLINELEN];
      char dict[200];
      FILE *fp;
      TRANSLATION newk=NULL,lastk=NULL;

      GetXtrmString("kdictfile","Kdictfile",dict);
#ifdef DEBUG
      printf("kdictfile from resources is\" %s\"\n",dict);
#endif

      /* Test the local buffer directly: dictname is file-static and
       * must never be left pointing at this stack array.
       */
      if(strncmp(dict,"none",4)==0){
            fprintf(stderr,"kdictfile set to 'none'. Skippping.\n");
            return;
      }
      dictname=malloc(strlen(dict)+1);
      if(dictname == NULL){
            perror("Cannot allocate memory for translation table\n");
            exit(errno);
      }
      strcpy(dictname, dict);
      fp = open_compressed(dictname);
      if(fp == NULL)
      {
            fprintf(stderr,"Cannot open kanjidic file %s. Skipping.\n",
                  dictname);
            return;
      }

      printf("Opened dictionary %s \n",dictname);

      while (getline(fp,instring) != 0) {
            int Kanji;
            int freq,grade,N,U,H,Q,SKIP;
            unsigned char *parse;
            BYTE strokes;
            int instrlen;     /* length of english part */

            if(strlen((char *)instring) <10) continue;

            /*try to get kanji Index right away */

#define BROKENFONTS 0

            Kanji = xtoi((char *)&instring[2]) + (BROKENFONTS);


            /* skip comments, kanji not specified in
             * the usefile, and invalid single kanji
             */
            if(Kanji < MINKANJIALLOWED) {
                  continue;
            }
            if(Kanji >MAXKANJIALLOWED) {
                  continue;
            }

            parse = &instring[2];

            /* now parse for grade level, frequency, and english */
            freq = grade = N = U = H = SKIP=0;
            strokes=0; Q = -1; /* remember, "0000" IS a valid Qval!*/

            nextword(&parse);

            /* Check for high bit set, which means
             * start of kana definition of kana.
             * We cheat a bit, and let this loop skip over
             * numbers by the fact that they don't match
             * the case statements.
             * A line that just ends (no readings, no '{') must stop
             * the loop too, or we would walk off the end of it.
             */
            while ( (*parse != '\0') && (*parse < 127)  && (*parse != '{') ) {
                  switch(*parse){
                        case 'F':
                              freq = atoi((char *)&parse[1]);
                              break;
                        case 'G':
                              grade = atoi((char *)&parse[1]);
                              break;
                        case 'H':
                              H = atoi((char *)&parse[1]);
                              break;
                        case 'N':
                              N = atoi((char *)&parse[1]);
                              break;
                        case 'P':
                              SKIP = parseskip((char *) &parse[1]);
                              break;
                        case 'Q':
                              Q = atoi((char *)&parse[1]);
                              break;
                        case 'S':
                              strokes= atoi((char *)&parse[1]);
                              break;
                        case 'U':
                              U = xtoi((char *)&parse[1]);
                              if(U&0xffff0000)
                              {
                                    printf("got hi U: %x\n",
                                           U);
                              }
                              break;
                        default:
                              parse++;
                              break;
                  }
                  nextword(&parse);
            } /* while != '{' */


            lastk = newk;

            /* calloc, so that fields we do not set below start out
             * as zero rather than garbage
             */
            newk = (struct translationstruct *)
                  calloc(1, sizeof(struct translationstruct));
            if (newk == NULL){
                  perror("Cannot allocate memory for translation table\n");
                  exit(errno);
            }
            newk->Sindex=SKIP;
            newk->Qindex=Q;
            newk->Uindex=U;
            newk->Hindex=H;
            newk->Nindex=N;
            newk->frequency = freq;
            newk->grade_level = grade;
            newk->Strokecount=strokes;
            newk->incorrect=0;
            newk->kanji=0;
            newk->pronunciation=0;
            newk->nextk = NULL;
#ifdef DEBUG
            printf("Q=%d, U=%d, freq=%d\n", Q, U, freq);
#endif

            newk->pronunciation = ReadPronunciation(&parse);
            if(newk->pronunciation == 0){
                  /* unusable line: forget it, and carry on from the
                   * last good entry
                   */
                  free(newk);
                  newk = lastk;
                  continue;
            }

            /**********************************************
             *  Now we know that we have a useable/wanted *
             *  dictionary definition                     *
             *********************************************/
            /* Only now widen the range, so that lowestkanji and
             * highestkanji always name slots that really are filled.
             */
            if((lowestkanji==highestkanji) && (highestkanji==0)){
                  lowestkanji = highestkanji = Kanji;
            } else{
                  if(Kanji < lowestkanji) lowestkanji = Kanji;
                  if (Kanji > highestkanji) highestkanji = Kanji;
            }

            {
                  /* the kanji itself, as a one-char 16-bit string */
                  XChar2b buff[2];

                  buff[0].byte1 = (Kanji & 0xff00) >> 8;
                  buff[0].byte2 = (Kanji & 0xff);
                  buff[1].byte1 = 0;
                  buff[1].byte2 = 0;
                  newk->kanji = dup_16(buff);
            }
            if(lastk != NULL)
                  lastk->nextk = newk;

            /* parse now sits on the '{' of the english part */
            instrlen = strlen((char *)parse)+1;
            newk->english = (char *) malloc(instrlen);
            if(newk->english == NULL){
                  perror("Cannot allocate memory for translation table\n");
                  exit(errno);
            }

            StripBrackets(newk->english, parse);
            newk->kdrill_index=Kanji;
            translations[Kanji] = newk;
            numberofkanji++;
            if(numberofkanji%1000 == 0)
            {
                  putchar('.');
                  fflush(stdout);
            }

      } /* and repeat until end of file */
      puts("");

      if(isapipe(fp)){
            pclose(fp);
      } else {
            fclose(fp);
      }

}


/* (source-browser page footer removed: "Generated by Doxygen 1.6.0") */