Received: (at submit) by bugs.debian.org; 12 May 2000 14:45:02 +0000 From kiwamu@misterdosv.eei.metro-u.ac.jp Fri May 12 09:45:01 2000 Received: from misterdosv.eei.metro-u.ac.jp [133.86.34.143] by master.debian.org with esmtp (Exim 3.12 #2 (Debian)) id 12qGgX-0005bD-00; Fri, 12 May 2000 09:45:01 -0500 Received: from kiwamu by misterdosv.eei.metro-u.ac.jp with local (Exim 3.12 #1 (Debian)) id 12qGg1-0000V9-00 for ; Fri, 12 May 2000 23:44:29 +0900 Date: Fri, 12 May 2000 23:44:29 +0900 From: kiwamu To: submit@bugs.debian.org Mime-Version: 1.0 Content-Type: text/plain; charset=iso-2022-jp Content-Transfer-Encoding: 7bit X-Mailer: Mutt 0.95.4i-jp2 Message-Id: Sender: Kiwamu Okabe Delivered-To: submit@bugs.debian.org Package: sufary Version: 2.1b3-4 Severity: wishlist Please apply this patch for the optimized SDIC array. See also . ------------------------------------------------------------------------------ --- sufary-2.1b3.orig/mkary/mkary.c +++ sufary-2.1b3/mkary/mkary.c @@ -55,6 +55,8 @@ int dict_mode = MODE_OFF; int j_mode = MODE_OFF; /* 日本語と'<'にしかインデックス張らんモード 981115 */ int bunkatu_sort_mode = MODE_OFF; /* 990219 */ +int sdic_mode = MODE_OFF; /* for SDIC */ +int sdic_tagin = 0; /* for SDIC */ int number_of_block; /* 990219 分割ブロック数 */ @@ -179,6 +181,10 @@ break; case 's': /* -so ソートしかしないモード */ if(argv[1][2] == 'o') sort_only_mode = MODE_ON; + if (argv[1][2] == 'd') { /* for SDIC */ + sdic_mode = MODE_ON; + /* デミリタの後にしかインデックスを作らない */ + } break; case '#': /* #で始まる行はコメントアウト */ comment_out_mode = MODE_ON; @@ -325,7 +331,7 @@ if(!(i % 50000)) fprintf(stderr,"+"); if(!(i % 1000000)) fprintf(stderr," %ldM\n",i/1000000); } - } else if(option_byline == MODE_ON){ /* 一行、一語毎にインデックスを作る */ + } else if(option_byline == MODE_ON && sdic_mode == MODE_OFF){ /* 一行、一語毎にインデックスを作る */ for(i = 0; i < N; i++){ if((char*)strchr(delimitter, text[i]) != NULL && text[i] != '\0') last_char_is_delimitter = 1; @@ -343,6 +349,63 @@ if(!(i % 50000)) fprintf(stderr,"+"); if(!(i % 1000000)) 
fprintf(stderr," %ldM\n",i/1000000); } + } else if (sdic_mode == MODE_ON) { /* SDICモード */ + /* <K>の中にいるときsdic_taginが1 + * <K>の中にだけインデックスを作る */ + fprintf(stderr, "SDIC MODE\n"); + for (i = 0; i < N; i++) { + if (!strncmp(text + i, "<K>", 3)) { /* タグ発見! */ + fwrite(&i, 1, sizeof(long), ofd); + jj++; + sdic_tagin = 1; + i = i + 3 - 1; + last_char_is_delimitter = 1; + } else if (!strncmp(text + i, "</K>", 4)) { + sdic_tagin = 0; + } else if (sdic_tagin == 1) { /* <K>の中では */ + if ((char *) strchr(delimitter, text[i]) != NULL && + text[i] != '\0') { + last_char_is_delimitter = 1; + } else if (!strncmp(text + i, "&amp;", 5)) { /* &amp;&lt;&gt;を読み飛ばす */ + fwrite(&i, 1, sizeof(long), ofd); + jj++; + i = i + 5 - 1; + last_char_is_delimitter = 1; + } else if (!strncmp(text + i, "&lt;", 4)) { + fwrite(&i, 1, sizeof(long), ofd); + jj++; + i = i + 4 - 1; + last_char_is_delimitter = 1; + } else if (!strncmp(text + i, "&gt;", 4)) { + fwrite(&i, 1, sizeof(long), ofd); + jj++; + i = i + 4 - 1; + last_char_is_delimitter = 1; + } else if (last_char_is_delimitter == 1){ + /* EUC漢字の2char目 */ + if (bit_8_mode == MODE_ON && (0x80 & text[i]) != 0x00 + && last_char_is_kanji == 1) { + last_char_is_kanji = 0; + } else { + fwrite(&i, 1, sizeof(long), ofd); + jj++; + if ((0x80 & text[i]) != 0x00) + last_char_is_kanji = 1; + } + if (option_byline == MODE_ON) { + last_char_is_delimitter = 0; + } else { /* 文字単位 */ + last_char_is_delimitter = 1; + } + } + } + if (quiet_mode == MODE_ON || i == 0) + continue; + if (!(i % 50000)) /* このままじゃちゃんと数えないじゃん */ + fprintf(stderr, "+"); + if (!(i % 1000000)) + fprintf(stderr, " %ldM\n", i / 1000000); + } } else { /* 文字毎にインデックスを作る */ for(i = 0; i < N; i++){ /*printf("i %d %d\n",i,last_char_is_kanji);fflush(stdout);*/ @@ -479,7 +542,7 @@ void usage(void){ fprintf(stderr, "\n" "mkary --- array ファイルを作成する\n\n" - "Version 1.7 990616\n\n" + "Version 1.7 990616 + SDICpatch\n\n" "USAGE\n" " mkary [ -l [-#] ] [ -w ] [ -c ] [ -q ] [ -ns ] [ -so ] [ -8 ]\n" " [ -J ] [ -m ] [ -b NUM ] [ -o FILE_NAME ] 
FILE_NAME\n" @@ -492,6 +555,7 @@ " -q : メッセージなし\n" " -ns : ソートしない(No Sort)\n" " -so : ソートだけす(Sort Only)る\n" + " -sd : SDICモード\n" " -8 : 2バイト一文字処理を行なわない\n" " -J : 日本語文字と '<' 以外は無視する(文字単位のとき)\n" " -# : #で始まる行はコメントアウト(行単位のとき)\n" ------------------------------------------------------------------------------ -- Tokyo Metropolitan University Kiwamu Okabe Mail: kiwamu@debian.or.jp URL: http://silica.eei.metro-u.ac.jp/~kiwamu/