{$A+,B-,D+,E-,F+,G-,I+,L+,N-,O-,R-,S-,V+,X+}
{$M 16384,0,655360}
Program WordCountProg;

Uses Crt;

TYPE
   Str10=String[10];
   InfoType = RECORD
      WordStr:Str10;
      Occur :Word;
   END;

   PointerType=^TreeType;
   TreeType = RECORD
      Info  :InfoType;
      Left  :PointerType;
      Right :PointerType;
   END;

   FinalRec = RECORD
      WordStr  :Str10;
      FirstAddr:LongInt;
      Occur    :Word;
   END;


CONST
   EmptyFlag = 0;
   WordCount      :LongInt=0;
   UniqueWordCount:Word   =0;
   AddrCounter    :LongInt=0;
   LineCounter    :Word   =0;
   LineLength             =80;
   TreeRoot       :PointerType=NIL;
   Delimiters=[' ', ',', '.', #13, #10, '"', ';', '?', '!', '-', ':', '(', ')', '[', ']', '>', '<', #9, '*', #26];
   BufferSize=65520;
   SafteyZone=35;
   WriteBufferSize={10920;}500;
   OccurrenceArraySize=32760;

VAR
   Ind1FileName,Ind2FileName,
   Ind3FileName,TextFileName :String[40];
   DataFile :Text;
   UnTyFile :FILE;
   Ind1File :FILE OF FinalRec;
   AddrFile :FILE OF Word;
   LineIndex:FILE OF LongInt;
   OneRec   :FinalRec;
   Line     :String[LineLength];
   WordBegin,
   WordEnd  :Word;
   LineLen  :Word;
   Temp     :Word;
   i,AddressToWrite:Longint;

PROCEDURE GetFileNames;
   BEGIN
      IF ParamCount=0 THEN
         BEGIN
            Write('Filename: ');
            Readln(TextFileName);
            Writeln;
         END
      ELSE TextFileName:=ParamStr(1);
      Assign(DataFile,TextFileName);
      Assign(UnTyFile,TextFileName);
      IF Pos('.',TextFileName)>0 THEN
         Delete(TextFileName,Pos('.',TextFileName),Length(TextFileName)-Pos('.',TextFileName)+1);
      Ind1FileName:=TextFileName+'.in1';
      Ind2FileName:=TextFileName+'.in2';
      Ind3FileName:=TextFileName+'.in3';
      Assign(Ind1File,Ind1FileName);
      Assign(AddrFile,Ind2FileName);
      Assign(LineIndex,Ind3FileName);
   END;

FUNCTION  CleanUpWord(var InWord:Str10):Boolean;
   VAR X:Byte;
   BEGIN
      IF InWord[1]='''' THEN
         BEGIN
            Delete(InWord,1,1);
            IF Length(InWord)=0 THEN CleanUpWord:=TRUE ELSE CleanUpWord:=FALSE;
         END;
      FOR X:=1 TO Length(InWord) DO IF InWord[X] IN ['A'..'Z'] THEN InWord[X]:=Chr(Ord(InWord[X])+32);
   END;

PROCEDURE ReadFile;

   PROCEDURE InsertOrInc(SearchStr:Str10);
      VAR
         X:Byte;
         NewNode:PointerType;
         Ptr,Back:PointerType;
      BEGIN
         IF CleanUpWord(SearchStr)=TRUE THEN EXIT;
         Inc(WordCount);
         Ptr:=TreeRoot;
         Back:=NIL;
         WHILE Ptr <> NIL DO
            BEGIN
               Back:=Ptr;
               IF Ptr^.Info.WordStr = SearchStr THEN
                  BEGIN
                     Inc(Ptr^.Info.Occur);
                     Exit;
                  END
               ELSE IF Ptr^.Info.WordStr > SearchStr THEN Ptr:=Ptr^.Left
               ELSE Ptr:=Ptr^.Right;
            END;
         New(NewNode);
         Inc(UniqueWordCount);
         WITH NewNode^ DO
            BEGIN
               Left:=NIL;
               Right:=NIL;
               Info.WordStr:=SearchStr;
               Info.Occur:=1;
            END;
         IF Back=NIL THEN TreeRoot:=NewNode
         ELSE IF Back^.Info.WordStr > SearchStr THEN Back^.Left:=NewNode
            ELSE Back^.Right:=NewNode;
      END;{insertorinc}

   BEGIN{readfile}
      Reset(DataFile);
      Rewrite(LineIndex);
      AddrCounter:=0;
      Write(LineIndex,AddrCounter);
      REPEAT
         Readln(DataFile,Line);
         WordBegin:=1;
         LineLen:=Length(Line);
         REPEAT
            WHILE (WordBegin<=LineLen) AND (Line[WordBegin] IN Delimiters) DO Inc(WordBegin);
            WordEnd:=WordBegin+1;
            WHILE (WordEnd<=LineLen) AND (NOT (Line[WordEnd] IN Delimiters)) DO Inc(WordEnd);
            IF WordBegin <= LineLen THEN InsertOrInc(Copy(Line,WordBegin,WordEnd-WordBegin));
            WordBegin:=WordEnd+1;
         UNTIL WordEnd > LineLen;
         AddrCounter:=AddrCounter+LineLen+2;
         AddressToWrite:=AddrCounter;
         Write(LineIndex,AddressToWrite);
      UNTIL EOF(DataFile);
      Close(DataFile);
      Close(LineIndex);
   END;


{****************************************************************************}


Procedure TraverseTree(Address:PointerType);
   BEGIN
      IF Address<>NIL THEN
         BEGIN
            TraverseTree(Address^.Left);
            OneRec.WordStr:=Address^.Info.WordStr;
            OneRec.FirstAddr:=AddrCounter;
            OneRec.Occur:=Address^.Info.Occur;
            Write(Ind1File,OneRec);
            AddrCounter:=AddrCounter+OneRec.Occur;
            TraverseTree(Address^.Right);
            Dispose(Address);
         END;
   END;

PROCEDURE InitializeAddressFile;
   BEGIN
      Rewrite(AddrFile);
      Temp:=EmptyFlag;
      FOR i:=1 TO WordCount DO Write(AddrFile,Temp);
      Close(AddrFile);
   END;


{****************************************************************************}


PROCEDURE GenerateIndex;

   TYPE
      PString=^Str10;
      TextBufferArrayType=ARRAY[0..BufferSize] OF Char;
      TextBufferPtr=^TextBufferArrayType;
      WriteBufferRecord = RECORD
         SeekAddress:LongInt;
         AddressToWrite:Word;
      END;
      WriteBufferArrayType=ARRAY[1..WriteBufferSize] OF WriteBufferRecord;
      WriteBufferPtr=^WriteBufferArrayType;
      OccurrenceArrayType=ARRAY[0..OccurrenceArraySize] OF Word;
      OccurrenceArrayPtr=^OccurrenceArrayType;

   VAR
      TextBuffer:TextBufferPtr;
      WriteBuffer:WriteBufferPtr;
      OccurrenceArray:OccurrenceArrayPtr;
      WriteBufferCounter:Word;
      UniqueWordNum, Count:Word;
      TString:PString;

   FUNCTION  SubAddress(AddrStr:Str10):LongInt;
      VAR
         MidPoint,
         First,
         Last :Word;
         Found:Boolean;
      BEGIN
         Found:=FALSE;
         First:=0;
         Last:=FileSize(Ind1File)-1;
         WHILE (First <= Last) AND (NOT Found) DO
            BEGIN
               MidPoint := (First + Last) DIV 2;
               Seek(Ind1File, MidPoint);
               Read(Ind1File, OneRec);
               IF OneRec.WordStr=AddrStr THEN Found:=TRUE
               ELSE IF OneRec.WordStr > AddrStr THEN Last:=MidPoint-1
                  ELSE First:=MidPoint+1;
            END;
         Inc(OccurrenceArray^[MidPoint]);
         SubAddress:=OneRec.FirstAddr+OccurrenceArray^[MidPoint]-1;
      END;

   PROCEDURE DumpBuffer(BufferFilled:Word);
      VAR
         X,Y,Min:Word;
         SwapTemp:WriteBufferRecord;
      BEGIN
         {sort buffer first}
         FOR X:=1 TO BufferFilled-1 DO
            BEGIN
               Min:=X;
               FOR Y:=X+1 TO BufferFilled DO
                  IF WriteBuffer^[Y].SeekAddress < WriteBuffer^[Min].SeekAddress THEN Min:=Y;
               SwapTemp:=WriteBuffer^[X];
               WriteBuffer^[X]:=WriteBuffer^[Min];
               WriteBuffer^[Min]:=SwapTemp;
            END;
         {and now bung it to disk}
         FOR X:=1 TO BufferFilled DO
            BEGIN
               Seek(AddrFile,WriteBuffer^[X].SeekAddress);
               Write(AddrFile,WriteBuffer^[X].AddressToWrite);
            END;
         WriteBufferCounter:=0;
      END;

   PROCEDURE Index(IndexStr:Str10;FileAddr:Word);
      BEGIN
         IF CleanUpWord(IndexStr)=TRUE THEN EXIT;
         Inc(WriteBufferCounter);
         WriteBuffer^[WriteBufferCounter].SeekAddress:=SubAddress(IndexStr);
         WriteBuffer^[WriteBufferCounter].AddressToWrite:=FileAddr;
         IF WriteBufferCounter=WriteBufferSize THEN DumpBuffer(WriteBufferSize);
      END;

   BEGIN{generate index}
      Reset(UnTyFile,1);
      Reset(AddrFile);
      Reset(Ind1File);
      New(TextBuffer);
      New(WriteBuffer);
      New(OccurrenceArray);
      AddrCounter:=0;
      WriteBufferCounter:=0;
      LineCounter:=0;
      FillChar(OccurrenceArray^,SizeOf(OccurrenceArray^),0);
      Writeln(FileSize(UnTyFile)+3:8,' total bytes to be processed');
      Write('         bytes already processed');
      REPEAT
         BlockRead(UnTyFile,TextBuffer^[1],BufferSize-SafteyZone,Count);
         IF NOT EOF(UnTyFile) THEN
            WHILE NOT (TextBuffer^[Count] IN Delimiters) DO
               BEGIN
                  Inc(Count);
                  BlockRead(UnTyFile,TextBuffer^[Count],1);
               END;
         WordBegin:=1;
         REPEAT
            IF TextBuffer^[WordBegin]=#13 THEN Inc(LineCounter);
            WHILE (WordBegin<=Count) AND (TextBuffer^[WordBegin] IN Delimiters) DO
               BEGIN
                  Inc(WordBegin);
                  IF TextBuffer^[WordBegin]=#13 THEN Inc(LineCounter);
               END;
            WordEnd:=WordBegin+1;
            IF TextBuffer^[WordEnd]=#13 THEN Inc(LineCounter);
            WHILE (WordEnd<=Count) AND (NOT (TextBuffer^[WordEnd] IN Delimiters)) DO
               BEGIN
                  Inc(WordEnd);
                  IF TextBuffer^[WordEnd]=#13 THEN Inc(LineCounter);
               END;
            IF WordBegin <= Count THEN
               BEGIN
                  TString:=@TextBuffer^[WordBegin-1];
                  TString^[0]:=Chr(WordEnd-WordBegin);
                  IF TextBuffer^[WordEnd]<>#13 THEN Index(TString^,LineCounter)
                     ELSE Index(TString^,LineCounter-1);
               END;
            WordBegin:=WordEnd+1;
            GotoXY(1,WhereY);
            Write(AddrCounter+WordBegin:8);
         UNTIL WordEnd > Count;
         AddrCounter:=AddrCounter+Count;
      UNTIL Count=0;
      DumpBuffer(WriteBufferCounter);
      Close(UnTyFile);
      Close(AddrFile);
      Close(Ind1File);
      Dispose(TextBuffer);
      Dispose(WriteBuffer);
      Dispose(OccurrenceArray);
   END;{generate index}


{****************************************************************************}


BEGIN {main}
   TextBackground(Blue);
   TextColor(Yellow);
   ClrScr;
   Writeln;
   Writeln('WordCount:  The Capital Search Index Generation Program.');
   Writeln('Copyright (c) 1991 by The Whitman Group.  All rights reserved.');
   Writeln;Writeln;
   GetFileNames;
   Writeln('Scanning text file (first pass) . . .');Writeln;
   ReadFile;
   Writeln('       Word Count: ',WordCount);
   Writeln('Unique Word Count: ',UniqueWordCount);
   Writeln;
   Writeln('Writing primary index . . .');Writeln;
   Rewrite(Ind1File);
   AddrCounter:=0;
   TraverseTree(TreeRoot);
   Close(Ind1File);
   Writeln('Initializing secondary index . . .');Writeln;
   InitializeAddressFile;
   Writeln('Scanning text file (final pass) . . .');Writeln;
   GenerateIndex;
   Writeln;
   Writeln;
   Writeln('Processing completed.');
   Writeln;
END.
