This was part of my investigation of 'Travesty' text, back in 2002. The 'most common words in the English language' are less often discussed than the 'most frequently used letters'. There are interesting aspects, like using LB's 'sort' routine. Malformed words like 'john_f_#192' and contractions like 'isn't' will cause trouble. For any valid literary analysis you need LARGE samples, and the program will slow down or may need re-writing. Perhaps some day!
No guarantee that this version is fully de-bugged, either!
I had ideas of dealing correctly with plurals; and of checking against published dictionaries like 'SOWPODS', the official Scrabble-words dictionary. 'Yet to be implemented'.
WORD-FREQUENCY ANALYSIS
Original supplied text.
Hello World! Zoroastrians unite. The cat sat on the mat . asdf the The THE tHe qwerty aŁ$sd<> )(*& syzygy phantasmagorical
Text in lower case, stripped of non-alpha & punctuation.
hello world zoroastrians unite the cat sat on the mat asdf the the the the qwerty asd syzygy phantasmagorical
With just the unique words, and none repeated.
hello world zoroastrians unite the cat sat on mat asdf qwerty asd syzygy phantasmagorical
The count of each word in the supplied text.
eachWord$( 1) holds "hello " which appeared 1 time(s).
eachWord$( 2) holds "world " which appeared 1 time(s).
eachWord$( 3) holds "zoroastrians " which appeared 1 time(s).
eachWord$( 4) holds "unite " which appeared 1 time(s).
eachWord$( 5) holds "the " which appeared 6 time(s).
eachWord$( 6) holds "cat " which appeared 1 time(s).
eachWord$( 7) holds "sat " which appeared 1 time(s).
eachWord$( 8) holds "on " which appeared 1 time(s).
eachWord$( 9) holds "mat " which appeared 1 time(s).
eachWord$( 10) holds "asdf " which appeared 1 time(s).
eachWord$( 11) holds "qwerty " which appeared 1 time(s).
eachWord$( 12) holds "asd " which appeared 1 time(s).
eachWord$( 13) holds "syzygy " which appeared 1 time(s).
The sorted count of each word in the supplied text- word order.
eachWord$( 1) holds "asd " which appeared 1 time(s).
eachWord$( 2) holds "asdf " which appeared 1 time(s).
eachWord$( 3) holds "cat " which appeared 1 time(s).
eachWord$( 4) holds "hello " which appeared 1 time(s).
eachWord$( 5) holds "mat " which appeared 1 time(s).
eachWord$( 6) holds "on " which appeared 1 time(s).
eachWord$( 7) holds "phantasmagorical " which appeared 1 time(s).
eachWord$( 8) holds "qwerty " which appeared 1 time(s).
eachWord$( 9) holds "sat " which appeared 1 time(s).
eachWord$( 10) holds "syzygy " which appeared 1 time(s).
eachWord$( 11) holds "the " which appeared 6 time(s).
eachWord$( 12) holds "unite " which appeared 1 time(s).
eachWord$( 13) holds "world " which appeared 1 time(s).
eachWord$( 14) holds "zoroastrians " which appeared 1 time(s).
The sorted count of each word in the supplied text- frequency order.
eachWord$( 0) holds "the " which appeared 6 time(s).
eachWord$( 1) holds "phantasmagorical " which appeared 1 time(s).
eachWord$( 2) holds "qwerty " which appeared 1 time(s).
eachWord$( 3) holds "asdf " which appeared 1 time(s).
eachWord$( 4) holds "sat " which appeared 1 time(s).
eachWord$( 5) holds "hello " which appeared 1 time(s).
eachWord$( 6) holds "syzygy " which appeared 1 time(s).
eachWord$( 7) holds "on " which appeared 1 time(s).
eachWord$( 8) holds "asd " which appeared 1 time(s).
eachWord$( 9) holds "cat " which appeared 1 time(s).
eachWord$( 10) holds "unite " which appeared 1 time(s).
eachWord$( 11) holds "mat " which appeared 1 time(s).
eachWord$( 12) holds "zoroastrians " which appeared 1 time(s).
eachWord$( 13) holds "world " which appeared 1 time(s).
Done
' Word-frequency analysis.
' Reads a text file chosen by the user, reduces it to lower-case words separated
' by single spaces, then lists every unique word with its count three times:
' in order of first appearance, in alphabetical order, and in descending
' frequency order.
mainwin 120 50
print
print " WORD-FREQUENCY ANALYSIS"
print

global j2, true, false, position
j2 = 1
true = -1
false = 0

dim eachWord$( 50000, 2) ' up to 50000 unique words. Stored as text-string and frequency (as a string)

filedialog "Open text file", "*.txt", fileNameIn$
if fileNameIn$ = "" then
    ' The dialog was cancelled - there is nothing to open.
    print " No file selected."
    end
end if

open fileNameIn$ for input as #original
size = lof( #original)
tex$ = input$( #original, size)
close #original
'tex$ ="Hello World! The cat sat on the mat . asdf the The THE tHe qwerty aŁ$sd<> )(*& syzygy phantasmagorical"
' NB in the above string it will incorrectly at present see a word 'asd' ********************

print " Original supplied text."
print tex$
print

' Build newtex$: lower case, letters and single spaces only.
' wasspace starts as true so that leading whitespace is dropped; the word scan
' below stops at the first empty word, so the cleaned text should not begin
' with a separator.
newtex$ = ""
wasspace = true
for loopcounter = 1 to size
    i$ = lower$( mid$( tex$, loopcounter, 1))
    ' Tabs and line breaks separate words just as spaces do; without this the
    ' last word of one line would be glued onto the first word of the next.
    if i$ = chr$( 9) or i$ = chr$( 10) or i$ = chr$( 13) then i$ = " "
    wanted = instr( "abcdefghijklmnopqrstuvwxyz ", i$)
    if wanted > 0 then
        if i$ = " " then
            ' Collapse runs of spaces into a single space.
            if wasspace = false then newtex$ = newtex$ + i$
            wasspace = true
        else
            newtex$ = newtex$ + i$
            wasspace = false
        end if
    end if
next loopcounter

print " Text in lower case, stripped of non-alpha & punctuation."
print newtex$ ' print the text with lowercase, single spaces, no punctuation.
print

' Collect the unique words. alreadyCounted() increments the stored count when
' the word has been seen before; otherwise the word is stored in the next free
' row, j2, with a count of 1.
position = 1
j2 = 1
print " With just the unique words, and none repeated."
while true
    nextWord$ = word$( newtex$, position, " ")
    if nextWord$ = "" then exit while ' we've hit end-of-text EOT
    if alreadyCounted( nextWord$, j2) <> true then
        print nextWord$; " ";
        eachWord$( j2, 1) = left$( nextWord$ + " ", 40)
        eachWord$( j2, 2) = " 1"
        j2 = j2 + 1
    end if
    position = position + 1
wend
j2 = j2 - 1 ' j2 is now the number of unique words, held in rows 1 to j2.
print
print

if j2 < 1 then
    print " No words found."
    print
    print "Done"
    end
end if

print " The count of each word in the supplied text."
for i = 1 to j2 ' every unique word, in order of first appearance.
    gosub [printRow]
next i
print

print " The sorted count of each word in the supplied text- word order."
sort eachWord$(), 1, j2, 1 ' ascending by column 1 (the word); row 0 is unused and left out.
for i = 1 to j2
    gosub [printRow]
next i
print

' The counts are strings, so they must all have the same width before a string
' sort can put them in numeric order (otherwise " 10" would sort below " 2").
for i = 1 to j2
    eachWord$( i, 2) = right$( "        " + str$( val( eachWord$( i, 2))), 8)
next i

print " The sorted count of each word in the supplied text- frequency order."
sort eachWord$(), j2, 1, 2 ' start > end gives descending order, by column 2 (the count).
for i = 1 to j2
    gosub [printRow]
next i
print
print "Done"
end

[printRow]
    ' Print row i of eachWord$(): its index, the stored word and its count.
    ' The count is re-formatted from its numeric value so that every listing
    ' looks the same whatever width the stored string has.
    print " eachWord$( "; using( "######", i); ") holds "; chr$( 34); eachWord$( i, 1);_
    chr$( 34); tab( 50); " which appeared "; " "; str$( val( eachWord$( i, 2))); " time(s)."
return

function alreadyCounted( i$, wordsSoFar)
    ' Looks for the word i$ in rows 1 to wordsSoFar of the eachWord$() table.
    ' If it is found, the count held in column 2 of that row is increased by
    ' one and the function returns true; otherwise nothing is changed and the
    ' function returns false.
    ' NOTE: despite its name this function is not a pure query - a successful
    ' match updates the table.

    ' Words are stored in column 1 in this padded / truncated form, so build
    ' the comparison key once, in the same form.
    target$ = left$( i$ + " ", 40)

    alreadyCounted = false
    idx = 1
    while idx <= wordsSoFar
        if eachWord$( idx, 1) = target$ then
            'print "Bingo!"
            tally = val( eachWord$( idx, 2)) + 1
            eachWord$( idx, 2) = right$( " " + str$( tally), 8)
            alreadyCounted = true
            exit while
        end if
        idx = idx + 1
    wend
end function