Counting frequency with which words appear in a text file.

This was part of my investigation of 'Travesty' text, back in 2002. The 'most common words in the English language' are less often discussed than the 'most frequently used letters'. There are interesting aspects, like using LB's 'sort' routine. Malformed words like 'john_f_#192' and contractions like 'isn't' will cause trouble. For any valid literary analysis you need LARGE samples, and the program will slow down or may need re-writing. Perhaps some day!

No guarantee that this version is fully de-bugged, either!

I had ideas of dealing correctly with plurals; and of checking against published dictionaries like 'SOWPODS', the official Scrabble-words dictionary. 'Yet to be implemented'.


The following shows typical output.

    WORD-FREQUENCY ANALYSIS

  Original supplied text.
Hello World! Zoroastrians unite. The cat sat  on    the mat . asdf  the The THE tHe qwerty aŁ$sd<> )(*& syzygy phantasmagorical

  Text in lower case, stripped of non-alpha & punctuation.
hello world zoroastrians unite the cat sat on the mat asdf the the the the qwerty asd syzygy phantasmagorical

  With just the unique words, and none repeated.
hello world zoroastrians unite the cat sat on mat asdf qwerty asd syzygy phantasmagorical

  The count of each word in the supplied text.
 eachWord$(      1) holds "hello                                   " which appeared        1 time(s).
 eachWord$(      2) holds "world                                   " which appeared        1 time(s).
 eachWord$(      3) holds "zoroastrians                            " which appeared        1 time(s).
 eachWord$(      4) holds "unite                                   " which appeared        1 time(s).
 eachWord$(      5) holds "the                                     " which appeared        6 time(s).
 eachWord$(      6) holds "cat                                     " which appeared        1 time(s).
 eachWord$(      7) holds "sat                                     " which appeared        1 time(s).
 eachWord$(      8) holds "on                                      " which appeared        1 time(s).
 eachWord$(      9) holds "mat                                     " which appeared        1 time(s).
 eachWord$(     10) holds "asdf                                    " which appeared        1 time(s).
 eachWord$(     11) holds "qwerty                                  " which appeared        1 time(s).
 eachWord$(     12) holds "asd                                     " which appeared        1 time(s).
 eachWord$(     13) holds "syzygy                                  " which appeared        1 time(s).

  The sorted count of each word in the supplied text- word order.
 eachWord$(      1) holds "asd                                     " which appeared        1 time(s).
 eachWord$(      2) holds "asdf                                    " which appeared        1 time(s).
 eachWord$(      3) holds "cat                                     " which appeared        1 time(s).
 eachWord$(      4) holds "hello                                   " which appeared        1 time(s).
 eachWord$(      5) holds "mat                                     " which appeared        1 time(s).
 eachWord$(      6) holds "on                                      " which appeared        1 time(s).
 eachWord$(      7) holds "phantasmagorical                        " which appeared        1 time(s).
 eachWord$(      8) holds "qwerty                                  " which appeared        1 time(s).
 eachWord$(      9) holds "sat                                     " which appeared        1 time(s).
 eachWord$(     10) holds "syzygy                                  " which appeared        1 time(s).
 eachWord$(     11) holds "the                                     " which appeared        6 time(s).
 eachWord$(     12) holds "unite                                   " which appeared        1 time(s).
 eachWord$(     13) holds "world                                   " which appeared        1 time(s).
 eachWord$(     14) holds "zoroastrians                            " which appeared        1 time(s).

  The sorted count of each word in the supplied text- frequency order.
 eachWord$(      0) holds "the                                     " which appeared        6 time(s).
 eachWord$(      1) holds "phantasmagorical                        " which appeared        1 time(s).
 eachWord$(      2) holds "qwerty                                  " which appeared        1 time(s).
 eachWord$(      3) holds "asdf                                    " which appeared        1 time(s).
 eachWord$(      4) holds "sat                                     " which appeared        1 time(s).
 eachWord$(      5) holds "hello                                   " which appeared        1 time(s).
 eachWord$(      6) holds "syzygy                                  " which appeared        1 time(s).
 eachWord$(      7) holds "on                                      " which appeared        1 time(s).
 eachWord$(      8) holds "asd                                     " which appeared        1 time(s).
 eachWord$(      9) holds "cat                                     " which appeared        1 time(s).
 eachWord$(     10) holds "unite                                   " which appeared        1 time(s).
 eachWord$(     11) holds "mat                                     " which appeared        1 time(s).
 eachWord$(     12) holds "zoroastrians                            " which appeared        1 time(s).
 eachWord$(     13) holds "world                                   " which appeared        1 time(s).

Done

The following is the code used, in Liberty BASIC.


    mainwin 120 50

    '   WORD-FREQUENCY ANALYSIS
    '   Reads a text file chosen by the user, reduces it to lower-case words
    '   separated by single spaces, then lists every unique word with its count:
    '   first in order of appearance, then alphabetically, then by frequency.
    '
    '   Storage: eachWord$( n, 1) holds the word padded with spaces to 40 characters,
    '            eachWord$( n, 2) holds its count right-justified in 8 characters,
    '   so that a plain string sort on either column gives the wanted order.
    '   Words occupy rows 1 .. j2; row 0 is never used.
    '
    '   Known limitation: characters other than a-z are simply dropped, so
    '   "isn't" becomes "isnt" and "a$sd" becomes "asd".

    print
    print "    WORD-FREQUENCY ANALYSIS"
    print

    global j2, true, false, position
    j2    =  1
    true  = -1
    false =  0

    maxWords =50000                     '   up to 50000 unique words.
    '   One spare row is allocated: alreadyCounted() is handed the index of the next
    '   free row and inspects it, so that row must exist even when the table is full.
    dim eachWord$( maxWords +1, 2)

    filedialog "Open text file", "*.txt", fileNameIn$

    '   A cancelled dialog returns an empty name; opening it would raise an error.
    if fileNameIn$ ="" then print "  No file selected." : end

    open fileNameIn$ for input as #original
        size    =lof(    #original)
        tex$    =input$( #original, size)

        'tex$      ="Hello World! The cat sat  on    the mat . asdf  the The THE tHe qwerty aŁ$sd<> )(*& syzygy phantasmagorical"
        '            NB in the above string it will incorrectly at present see a word 'asd' ********************
    close #original

    print "  Original supplied text."
    print tex$
    print

    '   ---- Pass 1: lower-case, keep only a-z, collapse runs of separators to one space.
    newtex$   =""
    wasspace  =true                     '   start 'after a space' so leading blanks are not copied.

    for loopcounter =1 to len( tex$)
        i$ =lower$( mid$( tex$, loopcounter, 1))

        '   Line breaks and tabs separate words just as spaces do. Without this the
        '   last word of one line is glued to the first word of the next.
        if ( i$ =chr$( 9)) or ( i$ =chr$( 10)) or ( i$ =chr$( 13)) then i$ =" "

        if instr( "abcdefghijklmnopqrstuvwxyz ", i$) >0 then
            if i$ =" " then
                if wasspace =false then newtex$ =newtex$ +" "
                wasspace =true
            else
                newtex$  =newtex$ +i$
                wasspace =false
            end if
        end if
    next loopcounter

    print "  Text in lower case, stripped of non-alpha & punctuation."
    print newtex$               '   print the text with lowercase, single spaces, no punctuation.
    print

    '   ---- Pass 2: walk the words, storing each new one and counting repeats.
    position  =1
    j2        =1                '   index of the next free row in eachWord$()
    tableFull =false

    print "  With just the unique words, and none repeated."
    while true
        nextWord$ =word$( newtex$, position, " ")
        if  nextWord$="" then exit while        '   we've hit end-of-text EOT

        '   alreadyCounted() also increments the stored count when the word is found.
        if alreadyCounted( nextWord$, j2) <>true then
            if j2 <=maxWords then
                print nextWord$; " ";
                eachWord$( j2, 1)   =left$( nextWord$ +space$( 40), 40)
                eachWord$( j2, 2)   ="       1"
                j2 =j2 +1
            else
                tableFull =true                 '   no room: keep counting known words only.
            end if
        end if
        position =position +1
    wend

    j2 =j2 -1                   '   j2 now holds the number of unique words stored.

    print
    if tableFull =true then print "  Warning: more than "; maxWords; " unique words - the excess were ignored."
    print

    if j2 <1 then print "  No words found." : print : print "Done" : end

    print "  The count of each word in the supplied text."
    call printWordTable j2      '   order of first appearance.

    print
    print "  The sorted count of each word in the supplied text- word order."

    sort eachWord$(), 1, j2, 1  '   ascending by column 1 ( the word). Row 0 is left out.
    call printWordTable j2

    print
    print "  The sorted count of each word in the supplied text- frequency order."

    sort eachWord$(), j2, 1, 2  '   start > end gives descending order, by column 2 ( the count).
    call printWordTable j2

    print
    print "Done"

    end

'   Print rows 1 .. lastRow of eachWord$() as ' eachWord$( n) holds "word" which appeared c time(s).'
sub printWordTable lastRow
    for row =1 to lastRow
        print " eachWord$( "; using( "######", row); ") holds "; chr$( 34); eachWord$( row, 1);_
             chr$( 34); tab( 50); " which appeared "; eachWord$( row, 2); " time(s)."
    next row
end sub

'   Look for word i$ in rows 1 .. wordsSoFar of the eachWord$() table.
'   If present, add one to its stored count ( column 2, kept right-justified in
'   8 characters) and return true; otherwise return false and change nothing.
'   Uses the program's eachWord$() array and its global true / false values.
function alreadyCounted( i$, wordsSoFar)
    '   Stored words are padded to 40 characters, so pad the candidate the same way - once.
    key$  =left$( i$ +space$( 40), 40)
    found =false
    slot  =1

    while ( slot <=wordsSoFar) and ( found =false)
        if eachWord$( slot, 1) =key$ then
            tally =val( eachWord$( slot, 2)) +1
            eachWord$( slot, 2) =right$( space$( 8) +str$( tally), 8)
            found =true
        end if
        slot =slot +1
    wend

    alreadyCounted =found
end function

As always, e-mail me if you meet problems or have suggestions.