262 lines
5.5 KiB
HolyC
262 lines
5.5 KiB
HolyC
|
/*
This file is a stand-alone program
which will regenerate processed dictionary
files from a raw Project Gutenberg
dictionary file.

See $LK,"::/Doc/Credits.DD"$.
*/
|
|||
|
|
|||
|
I64 ACDHexDigit(I64 ch)
{//Value (0-15) of a hex digit character in either case, or -1 if ch is not one.
  ch=ToUpper(ch);
  if ('0'<=ch<='9')
    return ch-'0';
  if ('A'<=ch<='F')
    return ch-'A'+10;
  return -1;
}

U0 ACDPreprocess(U8 *in_name,U8 *out_name)
{/*
Reads the raw dictionary in_name as a document, rewrites the text
entries in place and writes the result under the name out_name.

Conversions noted by the original author:
  <cr><nl> --> <nl>
  $$ --> $$$$
  \'89 --> the single byte 0x89 (any two hex digits)

Only the \'hh conversion is coded here.  The other two are presumably
a consequence of the flags handed to DocRead -- TODO confirm.

An escape is decoded only when both characters after \' are hex
digits.  Anything else, including a \' that sits at the very end of a
tag, is copied through unchanged so the scan never steps over the
terminating zero.

Nothing is written if the input can not be read.
*/
  I64 ch,hi,lo;
  U8 *src,*dst;
  CDoc *doc;
  CDocEntry *doc_e;
  if (doc=DocRead(in_name,DOCF_PLAIN_TEXT_TABS|DOCF_DBL_DOLLARS)) {
    //Entries form a ring which is closed by the doc header itself.
    for (doc_e=doc->head.next;doc_e!=doc;doc_e=doc_e->next) {
      if (doc_e->type_u8==DOCT_TEXT) {
	//Output never grows, so the tag is compacted in place.
	src=dst=doc_e->tag;
	while (ch=*src++) {
	  //src is on the quote here, src[1] and src[2] are the digits.
	  //The && chain stops at the first failure, so src[2] is only
	  //examined when src[1] is a digit (and therefore not the zero).
	  if (ch=='\\' && *src=='\'' &&
		(hi=ACDHexDigit(src[1]))>=0 &&
		(lo=ACDHexDigit(src[2]))>=0) {
	    *dst++=(hi<<4)+lo;
	    src+=3; //quote and both digits
	  } else
	    *dst++=ch;
	}
	*dst=0;
      }
    }
    StrCpy(doc->filename.name,out_name);
    DocWrite(doc);
    DocDel(doc);
  }
}
|
|||
|
|
|||
|
I64 ACDNextCmd(U8 **_ptr)
{/*
Scans forward from *_ptr for the next <tag> whose name LstMatch()
recognizes in the list below and returns the value LstMatch() gave
for it.  Unrecognized tags are skipped.

On success *_ptr is left just past the closing '>'.

Returns a negative value when the text ends first:
  - no further '<' : *_ptr is left on the terminating zero, so
    calling again is harmless and just reports the end again.
  - a '<' with no closing '>' : *_ptr is left just past that '<'.

The text is modified only momentarily (see below) and is restored
before returning.

The caller compares the result against ACD_H1, ACD_DEF,
ACD_PRONUNCIATION, ACD_POS and ACD_EXTRA, so the order of the list
presumably has to agree with those constants -- TODO confirm against
their definitions.
*/
  U8 *ptr=*_ptr,*tag_end;
  I64 ch,res=-1;
  do {
    //Find the start of the next tag.
    do {
      if (!(ch=*ptr++)) {
	ptr--; //Do not leave the cursor beyond the end of the text.
	goto ncmd_done;
      }
    } while (ch!='<');

    //Find the '>' which closes it.
    tag_end=ptr;
    do {
      if (!(ch=*tag_end++)) goto ncmd_done;
    } while (ch!='>');

    //Turn the '>' into a terminator so just the tag name is matched,
    //then put the '>' back.
    *--tag_end=0;
    res=LstMatch(ptr,"h1\0/h1\0def\0/def\0hw\0/hw\0tt\0/tt\0"
	  "ety\0@fld\0@cd\0@blockquote\0@wordforms\0@note\0@altname\0@chform\0"
	  "@cref\0@syn\0/ety\0@/fld\0@/cd\0@/blockquote\0@/wordforms\0@/note\0"
	  "@/altname\0@/chform\0@/cref\0@/syn\0");
    *tag_end++='>';
    ptr=tag_end;
  } while (res<0);

ncmd_done:
  *_ptr=ptr;
  return res;
}
|
|||
|
|
|||
|
U8 *ACDNextEntry(U8 **_ptr)
{/*
Collects the text which starts at *_ptr and returns it as a new
string made with StrNew().  The caller must Free() it.

Collection stops at the end of the input or at the first tag which is
not in the ignore list below.  Tags in the ignore list are dropped and
the text on both sides of them is joined.  Each dollar sign in the
text is written twice.

*_ptr is left on the character that stopped the scan: the '<' of the
tag which was not ignored, or the terminating zero.

The text is gathered in a buffer of ACD_BLK_SIZE bytes.  Text which
does not fit is dropped instead of being written beyond the buffer.
The input is still consumed to the same stopping point, so the
position handed back does not depend on whether truncation happened.
*/
  U8 *res,*ignore,*ptr=*_ptr,buf[ACD_BLK_SIZE],*out_ptr=buf,
	*out_end=buf+ACD_BLK_SIZE-1; //Final byte is kept for the terminator.
  I64 ch,l;
  while (TRUE) {
    //Copy plain text up to the next '<'.
    while (TRUE) {
      if (!(ch=*ptr++)) goto nentry_done;
      if (ch=='<')
	break;
      if (ch=='$$') {
	//A doubled dollar goes in whole or not at all.
	if (out_ptr+2<=out_end) {
	  *out_ptr++=ch;
	  *out_ptr++=ch;
	} else
	  out_end=out_ptr; //Full: stop storing from here on.
      } else if (out_ptr<out_end)
	*out_ptr++=ch;
    }

    //ptr is just past a '<'.  Each name in the list includes its '>'.
    ignore="b>\0i>\0ppp>\0/b>\0/i>\0/p>\0"
	  "ets>\0col>\0spn>\0/ets>\0/col>\0/spn>\0er>\0as>\0cs>\0cd>\0ex>\0"
	  "/er>\0/as>\0/cs>\0/cd>\0/ex>\0"
	  "note>\0/note>\0blockquote>\0/blockquote>\0";
    while (*ignore) {
      l=StrLen(ignore);
      if (!StrNCmp(ptr,ignore,l)) {
	ptr+=l; //Skip the ignored tag and carry on collecting.
	break;
      } else
	ignore+=l+1;
    }
    //Reaching the empty string which ends the list means no match:
    //this tag ends the entry.
    if (!*ignore)
      break;
  }
nentry_done:
  *out_ptr=0;
  res=StrNew(buf);
  *_ptr=ptr-1; //Back up onto the '<' or the terminating zero.
  return res;
}
|
|||
|
|
|||
|
I64 ACDCompareWords(U8 *e1,U8 *e2)
{//Ordering callback handed to QSortI64() by ACDSortWords().
//The result is whatever StrICmp() reports for the two strings
//(presumably a case-insensitive comparison -- TODO confirm).
  I64 order=StrICmp(e1,e2);
  return order;
}
|
|||
|
|
|||
|
U8 *ACDSortWords(U8 *start,I64 size,I64 word_cnt)
{/*
Returns a newly allocated copy of the word list at start with its
records put into the order defined by ACDCompareWords().  The caller
must Free() the result.  The input list is not modified.

start    : word records packed back to back, ended by a zero byte.
	   Each record is a zero terminated string followed by two
	   more bytes, which are copied along with it.
size     : bytes to allocate for the result (the size of the input).
word_cnt : number of records expected in the list.

The temporary pointer table is released before returning.  At most
word_cnt records are indexed, so a list holding more records than
word_cnt can not overrun the table.  If fewer are found only those
are sorted and copied.
*/
  U8 **ptr_array=MAlloc(sizeof(U8 *)*word_cnt),
	*out_start=MAlloc(size),
	*ptr=start,*ptr2;
  I64 i=0;

  //Index the start of every record.
  while (i<word_cnt && *ptr) {
    ptr_array[i++]=ptr;
    ptr+=StrLen(ptr)+3; //string, its zero, two trailing bytes
  }
  word_cnt=i;

  "Sorting...\n"; Sleep(100);
  QSortI64(ptr_array,word_cnt,&ACDCompareWords);
  "Done...\n"; Sleep(100);

  //Copy the records out in sorted order.
  ptr=out_start;
  for (i=0;i<word_cnt;i++) {
    ptr2=ptr_array[i];
    while (*ptr2)
      *ptr++=*ptr2++;
    *ptr++=*ptr2++; //zero
    *ptr++=*ptr2++; //blk lo
    *ptr++=*ptr2++; //blk hi
  }
  *ptr=0; //list terminator

  Free(ptr_array);
  return out_start;
}
|
|||
|
|
|||
|
U0 ACDGenPutField(U8 **_in_ptr,U8 **_out_ptr,I64 type_ch)
{//Helper for ACDGen(): fetch the next entry text from the input and,
//unless it is empty, append type_ch followed by the zero terminated
//text to the output.  Both cursors are advanced.
  U8 *st,*dst=*_out_ptr;
  if (st=ACDNextEntry(_in_ptr)) {
    if (*st) {
      *dst++=type_ch;
      StrCpy(dst,st);
      dst+=StrLen(st)+1;
    }
    Free(st);
  }
  *_out_ptr=dst;
}

U0 ACDGen(U8 *in_file)
{/*
Builds the two processed dictionary files from the preprocessed
dictionary text in in_file.

ACD_DEF_FILENAME receives the definition stream.  For each headword:
  ACD_WORD_CHAR, headword, zero
followed by any number of fields, each stored as a marker
(ACD_DEF_CHAR, ACD_PRONUNCIATION_CHAR, ACD_POS_CHAR or ACD_EXTRA_CHAR),
the text and a zero.  The stream ends with ACD_END_CHAR.

ACD_WORD_FILENAME receives the index, sorted by ACDSortWords().
For each headword:
  ACD_WORD_CHAR, headword, zero, U16 block number
where the block number is the headword's offset in the definition
stream divided by ACD_BLK_SIZE.  The index ends with ACD_END_CHAR.

A headword equal (by StrICmp) to the one before it does not start a
new record, so its fields are appended to the previous headword.

Some statistics are printed along the way.  Nothing is allocated or
written if in_file can not be read.
*/
  I64 cmd,size,word_cnt=0,largest_entry=0;
  U8 *st,*in_ptr=FileRead(in_file,&size),*in_start=in_ptr,
	*out_ptr,*out_start,*word_ptr,*word_start,
	*last_word="",*def_word_start,*sorted_word_start;
  U16 *d;
  if (!in_ptr) return;

  //Both outputs are given the size of the input file.
  out_ptr=out_start=def_word_start=MAlloc(size);
  word_ptr=word_start=MAlloc(size);

  do {
    cmd=ACDNextCmd(&in_ptr);
    if (cmd==ACD_H1) {
next_word:
      //A new headword closes the previous entry, so measure it.
      if (out_ptr-def_word_start>largest_entry)
	largest_entry=out_ptr-def_word_start;
      def_word_start=out_ptr;
      if (st=ACDNextEntry(&in_ptr)) {
	if (*st) {
	  if (StrICmp(st,last_word)) {
	    word_cnt++;

	    //Index record.
	    *word_ptr++=ACD_WORD_CHAR;
	    last_word=word_ptr;
	    StrCpy(word_ptr,st);
	    word_ptr+=StrLen(st)+1;

	    d=word_ptr;
	    *d=(out_ptr-out_start)/ACD_BLK_SIZE;
	    word_ptr+=2;

	    //Start of the entry in the definition stream.
	    *out_ptr++=ACD_WORD_CHAR;
	    StrCpy(out_ptr,st);
	    out_ptr+=StrLen(st)+1;
	  }
	  Free(st);

	  //Take fields until the next headword or the end of the input.
	  do {
	    do {
	      cmd=ACDNextCmd(&in_ptr);
	      if (cmd==ACD_H1)
		goto next_word;
	    } while (cmd>=0 && !(cmd==ACD_DEF||cmd==ACD_PRONUNCIATION||
		  cmd==ACD_POS||cmd==ACD_EXTRA));
	    if (cmd==ACD_DEF)
	      ACDGenPutField(&in_ptr,&out_ptr,ACD_DEF_CHAR);
	    else if (cmd==ACD_PRONUNCIATION)
	      ACDGenPutField(&in_ptr,&out_ptr,ACD_PRONUNCIATION_CHAR);
	    else if (cmd==ACD_POS)
	      ACDGenPutField(&in_ptr,&out_ptr,ACD_POS_CHAR);
	    else if (cmd==ACD_EXTRA)
	      ACDGenPutField(&in_ptr,&out_ptr,ACD_EXTRA_CHAR);
	  } while (cmd==ACD_DEF||cmd==ACD_PRONUNCIATION||
		cmd==ACD_POS||cmd==ACD_EXTRA);
	} else
	  Free(st);
      }
    }
  } while (cmd>=0);

  //No headword follows the last entry, so measure it here.
  if (out_ptr-def_word_start>largest_entry)
    largest_entry=out_ptr-def_word_start;

  *out_ptr++=ACD_END_CHAR;
  *word_ptr++=ACD_END_CHAR;

  Free(in_start);

  "Blk Size:%d\n",ACD_BLK_SIZE;
  "Blk Cnt:%04X\n",(out_ptr-out_start+ACD_BLK_SIZE-1)/ACD_BLK_SIZE;
  "Largest Entry:%d\n",largest_entry;
  "Word Count:%d\n",word_cnt;

  FileWrite(ACD_DEF_FILENAME,out_start,out_ptr-out_start);
  "Def File Size:%d\n",out_ptr-out_start;

  sorted_word_start=ACDSortWords(word_start,word_ptr-word_start,word_cnt);
  FileWrite(ACD_WORD_FILENAME,sorted_word_start,word_ptr-word_start);
  "Word File Size:%d\n",word_ptr-word_start;

  Free(out_start);
  Free(word_start);
  Free(sorted_word_start);
}
|
|||
|
|
|||
|
//Work in the directory given by __DIR__ (presumably the directory
//holding this file) so the names below are found there.
Cd(__DIR__);

//Pass 1: raw dictionary text --> cleaned-up intermediate file.
ACDPreprocess("DICTIONARY.DD","DICTIONARY2.DD");

//Pass 2: intermediate file --> definition file and word index.
ACDGen("DICTIONARY2.DD");
|