262 lines
5.5 KiB
HolyC
Executable File
262 lines
5.5 KiB
HolyC
Executable File
/*
This file is a stand-alone program
which will regenerate processed dictionary
files from a raw Project Gutenberg
dictionary file.

See $LK,"::/Doc/Credits.DD"$.
*/
|
||
|
||
U0 ACDPreprocess(U8 *in_name,U8 *out_name)
{/*Loads in_name as a document, decodes \'hh hex escapes in every
text entry in place and writes the document out as out_name.

<cr><nl>--> <nl>
$$ --> $$$$
\'89 --> ‰

Only the \'hh decoding is done by the loop below.  The other two
conversions are presumably performed by DocRead() through the
DOCF_PLAIN_TEXT_TABS|DOCF_DBL_DOLLARS flags -- TODO confirm.
Nothing is written when the input can not be read.
*/
  I64 ch,i,k;
  U8 *src,*dst;
  CDoc *doc;
  CDocEntry *doc_e;
  if (!(doc=DocRead(in_name,DOCF_PLAIN_TEXT_TABS|DOCF_DBL_DOLLARS)))
    return;
  for (doc_e=doc->head.next;doc_e!=doc;doc_e=doc_e->next) {
    if (doc_e->type_u8==DOCT_TEXT) {
      //The decoded text is never longer than the source, so the
      //tag is rewritten in place with dst trailing src.
      src=dst=doc_e->tag;
      while (ch=*src++) {
        //Decode only when both digit positions exist.  Without this
        //check a tag ending in \' or \'h moved src past the
        //terminating zero and the scan continued out of bounds.
        if (ch=='\\' && *src=='\'' && src[1] && src[2]) {
          src++; //skip the quote
          i=0;
          for (k=0;k<2;k++) {
            ch=ToUpper(*src++);
            i<<=4;
            //HolyC range comparison.  A character that is not a
            //hex digit contributes zero to the value.
            if ('0'<=ch<='9')
              i+=ch-'0';
            else if ('A'<=ch<='F')
              i+=ch-'A'+10;
          }
          *dst++=i;
        } else
          *dst++=ch;
      }
      *dst=0;
    }
  }
  StrCpy(doc->filename.name,out_name);
  DocWrite(doc);
  DocDel(doc);
}
|
||
|
||
I64 ACDNextCmd(U8 **_ptr)
{/*Scans forward from *_ptr for the next <tag> whose name
LstMatch() finds in the list below and returns the LstMatch()
result.  Returns a negative value when the input ends first.

*_ptr is advanced past the '>' of the matched tag.  If the input
ends while looking for '<' it is left one past the terminating
zero; if it ends inside an unclosed tag it is left just after
that tag's '<'.
*/
  U8 *cur=*_ptr,*tag_end;
  I64 ch,res=-1;
  while (res<0) {
    //Find the opening '<'; cur ends up just past it.
    ch=*cur++;
    while (ch && ch!='<')
      ch=*cur++;
    if (!ch)
      break;

    //Find the closing '>'; tag_end ends up just past it.
    tag_end=cur;
    ch=*tag_end++;
    while (ch && ch!='>')
      ch=*tag_end++;
    if (!ch)
      break;

    //Terminate the tag name for the lookup, then put the '>' back.
    tag_end--;
    *tag_end=0;
    res=LstMatch(cur,"h1\0/h1\0def\0/def\0hw\0/hw\0tt\0/tt\0"
          "ety\0@fld\0@cd\0@blockquote\0@wordforms\0@note\0@altname\0@chform\0"
          "@cref\0@syn\0/ety\0@/fld\0@/cd\0@/blockquote\0@/wordforms\0@/note\0"
          "@/altname\0@/chform\0@/cref\0@/syn\0");
    *tag_end='>';
    cur=tag_end+1;
  }
  *_ptr=cur;
  return res;
}
|
||
|
||
U8 *ACDNextEntry(U8 **_ptr)
{/*Collects the text from *_ptr up to the next tag that is not in
the ignore list, or up to the end of the input, and returns it as
a string made with StrNew().  Tags in the ignore list are dropped
from the text and every dollar sign is written twice.

On return *_ptr points at the '<' of the tag that stopped the
scan, or at the terminating zero of the input.

Text that does not fit in the ACD_BLK_SIZE byte work buffer is
truncated; the original wrote past the end of buf instead.
*/
  U8 *res,*ignore,*ptr=*_ptr,buf[ACD_BLK_SIZE],*out_ptr=buf,*out_end;
  I64 ch,l;
  out_end=buf+ACD_BLK_SIZE-1; //last byte is kept for the terminator
  while (TRUE) {
    //Copy plain text up to the next '<'.
    while (TRUE) {
      if (!(ch=*ptr++)) goto nentry_done;
      if (ch=='<')
        break;
      if (ch=='$$') {
        //A dollar is stored doubled; store both or neither so the
        //output never ends in an unpaired dollar.
        if (out_ptr+1<out_end) {
          *out_ptr++=ch;
          *out_ptr++=ch;
        }
      } else if (out_ptr<out_end)
        *out_ptr++=ch;
    }

    //ptr is just past '<'.  Skip the tag if it is an ignored one,
    //otherwise stop here.
    ignore="b>\0i>\0ppp>\0/b>\0/i>\0/p>\0"
          "ets>\0col>\0spn>\0/ets>\0/col>\0/spn>\0er>\0as>\0cs>\0cd>\0ex>\0"
          "/er>\0/as>\0/cs>\0/cd>\0/ex>\0"
          "note>\0/note>\0blockquote>\0/blockquote>\0";
    while (*ignore) {
      l=StrLen(ignore);
      if (!StrNCmp(ptr,ignore,l)) {
        ptr+=l;
        break;
      } else
        ignore+=l+1;
    }
    if (!*ignore) //reached the end of the list: not an ignored tag
      break;
  }
nentry_done:
  *out_ptr++=0;
  res=StrNew(buf);
  *_ptr=ptr-1; //step back onto the '<' or the terminating zero
  return res;
}
|
||
|
||
I64 ACDCompareWords(U8 *e1,U8 *e2)
{//Comparison callback handed to QSortI64() by ACDSortWords().
//The ordering is whatever StrICmp() reports for the two records.
  I64 order=StrICmp(e1,e2);
  return order;
}
|
||
|
||
U8 *ACDSortWords(U8 *start,I64 size,I64 word_cnt)
{/*Returns a newly allocated copy of the word list at start with
its records reordered by QSortI64() using ACDCompareWords().

Each record is a zero-terminated string followed by two block
number bytes.  The list ends at the first record that starts with
a zero byte.  size is the number of bytes allocated for the
result and word_cnt is the number of records sorted and copied.
The caller frees the returned buffer.
*/
  U8 **ptr_array=MAlloc(sizeof(U8 *)*word_cnt),
        *out_start=MAlloc(size),
        *ptr=start,*ptr2;
  I64 i=0;

  //Index the start of every record.
  while (*ptr) {
    ptr_array[i++]=ptr;
    ptr+=StrLen(ptr)+3; //string, terminator and two block bytes
  }
  "Sorting...\n"; Sleep(100);
  QSortI64(ptr_array,word_cnt,&ACDCompareWords);
  "Done...\n"; Sleep(100);

  //Copy the records out in sorted order.
  ptr=out_start;
  for (i=0;i<word_cnt;i++) {
    ptr2=ptr_array[i];
    while (*ptr2)
      *ptr++=*ptr2++;
    *ptr++=*ptr2++; //zero
    *ptr++=*ptr2++; //blk lo
    *ptr++=*ptr2++; //blk hi
  }
  *ptr++=0; //end of list marker

  Free(ptr_array); //was leaked by the original
  return out_start;
}
|
||
|
||
U0 ACDGen(U8 *in_file)
{/*Parses the preprocessed dictionary in_file and writes two
files: the definitions to ACD_DEF_FILENAME and the sorted word
list to ACD_WORD_FILENAME.  A few statistics are printed.

Definition file: one ACD_WORD_CHAR record per headword followed by
its ACD_DEF_CHAR, ACD_PRONUNCIATION_CHAR, ACD_POS_CHAR and
ACD_EXTRA_CHAR records, each a type byte plus a zero-terminated
string, ending with ACD_END_CHAR.

Word list: per headword, ACD_WORD_CHAR, the word, a zero and a U16
holding the headword's offset in the definition file divided by
ACD_BLK_SIZE.

Returns without doing anything if in_file can not be read.
*/
  I64 cmd,size,fld_ch,word_cnt=0,largest_entry=0;
  U8 *st,*in_ptr=FileRead(in_file,&size),*in_start=in_ptr,
        *out_ptr,*out_start,*word_ptr,*word_start,
        *last_word="",*def_word_start,*sorted_word_start;
  U16 *d;
  if (!in_ptr) return;

  //Allocate only after the read succeeded.  The original allocated
  //both buffers first and leaked them on the early return above.
  out_ptr=out_start=def_word_start=MAlloc(size);
  word_ptr=word_start=MAlloc(size);

  do {
    cmd=ACDNextCmd(&in_ptr);
    if (cmd==ACD_H1) {
next_word:
      //A new headword closes the previous entry.
      if (out_ptr-def_word_start>largest_entry)
        largest_entry=out_ptr-def_word_start;
      def_word_start=out_ptr;
      if (st=ACDNextEntry(&in_ptr)) {
        if (*st) {
          //A headword equal to the previous one (per StrICmp) gets no
          //new record; its fields are appended to the previous entry.
          if (StrICmp(st,last_word)) {
            word_cnt++;

            *word_ptr++=ACD_WORD_CHAR;
            last_word=word_ptr;
            StrCpy(word_ptr,st);
            word_ptr+=StrLen(st)+1;

            d=word_ptr;
            *d=(out_ptr-out_start)/ACD_BLK_SIZE;
            word_ptr+=2;

            *out_ptr++=ACD_WORD_CHAR;
            StrCpy(out_ptr,st);
            out_ptr+=StrLen(st)+1;
          }
          Free(st);

          //Emit the fields that belong to this headword.
          while (TRUE) {
            //Skip tags that are neither a field nor a headword.
            do {
              cmd=ACDNextCmd(&in_ptr);
              if (cmd==ACD_H1)
                goto next_word;
            } while (cmd>=0 && !(cmd==ACD_DEF||cmd==ACD_PRONUNCIATION||
                  cmd==ACD_POS||cmd==ACD_EXTRA));

            if (cmd==ACD_DEF)
              fld_ch=ACD_DEF_CHAR;
            else if (cmd==ACD_PRONUNCIATION)
              fld_ch=ACD_PRONUNCIATION_CHAR;
            else if (cmd==ACD_POS)
              fld_ch=ACD_POS_CHAR;
            else if (cmd==ACD_EXTRA)
              fld_ch=ACD_EXTRA_CHAR;
            else
              break; //cmd<0: end of input

            if (st=ACDNextEntry(&in_ptr)) {
              if (*st) {
                *out_ptr++=fld_ch;
                StrCpy(out_ptr,st);
                out_ptr+=StrLen(st)+1;
              }
              Free(st);
            }
          }
        } else
          Free(st);
      }
    }
  } while (cmd>=0);

  //The last entry is closed by the end of input, not by a following
  //headword, so it has to be measured here.
  if (out_ptr-def_word_start>largest_entry)
    largest_entry=out_ptr-def_word_start;

  *out_ptr++=ACD_END_CHAR;
  *word_ptr++=ACD_END_CHAR;

  Free(in_start);

  "Blk Size:%d\n",ACD_BLK_SIZE;
  "Blk Cnt:%04X\n",(out_ptr-out_start+ACD_BLK_SIZE-1)/ACD_BLK_SIZE;
  "Largest Entry:%d\n",largest_entry;
  "Word Count:%d\n",word_cnt;

  FileWrite(ACD_DEF_FILENAME,out_start,out_ptr-out_start);
  "Def File Size:%d\n",out_ptr-out_start;

  sorted_word_start=ACDSortWords(word_start,word_ptr-word_start,word_cnt);
  FileWrite(ACD_WORD_FILENAME,sorted_word_start,word_ptr-word_start);
  "Word File Size:%d\n",word_ptr-word_start;

  Free(out_start);
  Free(word_start);
  Free(sorted_word_start);
}
|
||
|
||
//Script body, run when this file is executed.
//Work in the directory that holds this file.
Cd(__DIR__);
//Write a cleaned copy of the raw dictionary.
ACDPreprocess("DICTIONARY.DD","DICTIONARY2.DD");
//Generate the processed dictionary files from the cleaned copy.
ACDGen("DICTIONARY2.DD");
|