This was part of my investigation of 'Travesty' text, back in 2002. The 'most common words in the English language' are less often discussed than the 'most frequently used letters'. There are interesting aspects, like using LB's 'sort' routine. Malformed words like 'john_f_#192' and contractions like 'isn't' will cause trouble. For any valid literary analysis you need LARGE samples, and the program will slow down or may need re-writing. Perhaps some day!
No guarantee that this version is fully debugged, either!
I had ideas of dealing correctly with plurals; and of checking against published dictionaries like 'SOWPODS', the official Scrabble-words dictionary. 'Yet to be implemented'.
WORD-FREQUENCY ANALYSIS Original supplied text. Hello World! Zoroastrians unite. The cat sat on the mat . asdf the The THE tHe qwerty aŁ$sd<> )(*& syzygy phantasmagorical Text in lower case, stripped of non-alpha & punctuation. hello world zoroastrians unite the cat sat on the mat asdf the the the the qwerty asd syzygy phantasmagorical With just the unique words, and none repeated. hello world zoroastrians unite the cat sat on mat asdf qwerty asd syzygy phantasmagorical The count of each word in the supplied text. eachWord$( 1) holds "hello " which appeared 1 time(s). eachWord$( 2) holds "world " which appeared 1 time(s). eachWord$( 3) holds "zoroastrians " which appeared 1 time(s). eachWord$( 4) holds "unite " which appeared 1 time(s). eachWord$( 5) holds "the " which appeared 6 time(s). eachWord$( 6) holds "cat " which appeared 1 time(s). eachWord$( 7) holds "sat " which appeared 1 time(s). eachWord$( 8) holds "on " which appeared 1 time(s). eachWord$( 9) holds "mat " which appeared 1 time(s). eachWord$( 10) holds "asdf " which appeared 1 time(s). eachWord$( 11) holds "qwerty " which appeared 1 time(s). eachWord$( 12) holds "asd " which appeared 1 time(s). eachWord$( 13) holds "syzygy " which appeared 1 time(s). The sorted count of each word in the supplied text- word order. eachWord$( 1) holds "asd " which appeared 1 time(s). eachWord$( 2) holds "asdf " which appeared 1 time(s). eachWord$( 3) holds "cat " which appeared 1 time(s). eachWord$( 4) holds "hello " which appeared 1 time(s). eachWord$( 5) holds "mat " which appeared 1 time(s). eachWord$( 6) holds "on " which appeared 1 time(s). eachWord$( 7) holds "phantasmagorical " which appeared 1 time(s). eachWord$( 8) holds "qwerty " which appeared 1 time(s). eachWord$( 9) holds "sat " which appeared 1 time(s). eachWord$( 10) holds "syzygy " which appeared 1 time(s). eachWord$( 11) holds "the " which appeared 6 time(s). eachWord$( 12) holds "unite " which appeared 1 time(s). 
eachWord$( 13) holds "world " which appeared 1 time(s). eachWord$( 14) holds "zoroastrians " which appeared 1 time(s). The sorted count of each word in the supplied text- frequency order. eachWord$( 0) holds "the " which appeared 6 time(s). eachWord$( 1) holds "phantasmagorical " which appeared 1 time(s). eachWord$( 2) holds "qwerty " which appeared 1 time(s). eachWord$( 3) holds "asdf " which appeared 1 time(s). eachWord$( 4) holds "sat " which appeared 1 time(s). eachWord$( 5) holds "hello " which appeared 1 time(s). eachWord$( 6) holds "syzygy " which appeared 1 time(s). eachWord$( 7) holds "on " which appeared 1 time(s). eachWord$( 8) holds "asd " which appeared 1 time(s). eachWord$( 9) holds "cat " which appeared 1 time(s). eachWord$( 10) holds "unite " which appeared 1 time(s). eachWord$( 11) holds "mat " which appeared 1 time(s). eachWord$( 12) holds "zoroastrians " which appeared 1 time(s). eachWord$( 13) holds "world " which appeared 1 time(s). Done
' Word-frequency analysis (Liberty BASIC).
'
' Reads a text file chosen by the user, lower-cases it, keeps only the letters
' a-z and single separating spaces, then counts how often each distinct word
' occurs.  Three listings are printed: first-seen order, alphabetical order and
' descending frequency order.
'
' Storage: eachWord$( n, 1) holds the word (first 40 characters plus a trailing
' space), eachWord$( n, 2) holds its count as a right-justified 8-character
' string so that a plain string sort on column 2 orders the counts numerically.
' Rows 1..j2 are used; row 0 is never filled.

mainwin 120 50

print
print " WORD-FREQUENCY ANALYSIS"
print

global j2, true, false, position

j2    = 1       ' next free row of eachWord$() while counting
true  = -1
false = 0

maxWords = 50000                ' capacity of the word table
dim eachWord$( maxWords, 2)     ' word text in column 1, count (as a string) in column 2

filedialog "Open text file", "*.txt", fileNameIn$

' filedialog leaves the name empty when the user cancels; opening "" would fail.
if fileNameIn$ = "" then
    print " No file selected."
    end
end if

open fileNameIn$ for input as #original
size = lof( #original)
tex$ = input$( #original, size)
'tex$ ="Hello World! The cat sat on the mat . asdf the The THE tHe qwerty aŁ$sd<> )(*& syzygy phantasmagorical"
'   NB in the above string it will incorrectly at present see a word 'asd'    ********************
close #original

print " Original supplied text."
print tex$
print

' ---- Normalise: lower case, letters only, exactly one space between words ----
newtex$      = ""
lastWasSpace = true     ' starting 'true' suppresses any leading space

for loopcounter = 1 to len( tex$)
    i$ = lower$( mid$( tex$, loopcounter, 1))

    ' Line breaks and tabs separate words just as spaces do; without this the
    ' last word of one line would fuse with the first word of the next.
    if i$ = chr$( 13) or i$ = chr$( 10) or i$ = chr$( 9) then i$ = " "

    if i$ = " " then
        ' Decide on the last character EMITTED, not the last one read, so a
        ' stripped character between two spaces cannot leave a double space.
        if lastWasSpace = false then
            newtex$      = newtex$ + " "
            lastWasSpace = true
        end if
    else
        if instr( "abcdefghijklmnopqrstuvwxyz", i$) > 0 then
            newtex$      = newtex$ + i$
            lastWasSpace = false
        end if
    end if
next loopcounter

print " Text in lower case, stripped of non-alpha & punctuation."
print newtex$
print

' ---- Count: walk the words, adding unseen ones to the table ----
position = 1
j2       = 1

print " With just the unique words, and none repeated."

while true
    nextWord$ = word$( newtex$, position, " ")
    if nextWord$ = "" then exit while       ' we've hit end-of-text

    ' alreadyCounted() increments the count itself when the word is known.
    ' Only rows 1..j2-1 are filled, so that is all it is asked to scan.
    if alreadyCounted( nextWord$, j2 - 1) <> true then
        if j2 > maxWords then
            print
            print " Stopped: more than "; maxWords; " unique words."
            exit while
        end if
        print nextWord$; " ";
        eachWord$( j2, 1) = left$( nextWord$ + " ", 40)
        eachWord$( j2, 2) = right$( space$( 8) + "1", 8)
        j2 = j2 + 1
    end if

    position = position + 1
wend

j2 = j2 - 1     ' j2 is now the number of unique words stored in rows 1..j2

print
print

if j2 < 1 then
    print " No words found."
    print
    print "Done"
    end
end if

print " The count of each word in the supplied text."
for i = 1 to j2                 ' first-seen (unsorted) order, every row included
    call printEntry i
next i

print
print " The sorted count of each word in the supplied text- word order."
sort eachWord$(), 1, j2, 1      ' ascending by column 1 (the word)
for i = 1 to j2
    call printEntry i
next i

print
print " The sorted count of each word in the supplied text- frequency order."
sort eachWord$(), j2, 1, 2      ' start > end gives descending order, by column 2 (the count)
for i = 1 to j2
    call printEntry i
next i

print
print "Done"

end

' Print one row of the word table: its index, the stored word and its count.
sub printEntry i
    print " eachWord$( "; using( "######", i); ") holds "; chr$( 34); eachWord$( i, 1);_
          chr$( 34); tab( 50); " which appeared "; eachWord$( i, 2); " time(s)."
end sub

' Look for i$ among rows 1..wordsSoFar of eachWord$().
' Returns true (and adds one to that row's stored count) when the word is
' already present, false otherwise.  The table is left untouched on false.
function alreadyCounted( i$, wordsSoFar)
    alreadyCounted = false
    key$ = left$( i$ + " ", 40)     ' same form the main program stores in column 1
    for j = 1 to wordsSoFar
        if eachWord$( j, 1) = key$ then
            alreadyCounted = true
            ' Keep the count right-justified in 8 characters so string sorting stays numeric.
            eachWord$( j, 2) = right$( space$( 8) + str$( val( eachWord$( j, 2)) + 1), 8)
            exit for
        end if
    next j
end function