# challenge_file_indexing.py


"""
Challenge: File indexer
Write a program which creates an index from the text file :
 TheTyger_WilliamBlake.txt
The index records the occurrences of each word in the text
and the line(s) at which the words occur.
Write the index in a text file with each word on one line
followed by the list of lines. Words should be in alphabetical order.
For example:
a: 4, 9
the: 6, 9, 12
Author: Lauren Gregoire
"""
##
## Creating index
##
text_filename = 'TheTyger_WilliamBlake.txt'
index_filename = 'index_TheTyger_WilliamBlake.txt'
# punctuation that we need to remove from words
delimiter_chars = ",.;:!?"


def build_index(lines, delimiters=delimiter_chars):
    """Build a word index from an iterable of text lines.

    Each line is split on whitespace and every resulting token has the
    characters in `delimiters` stripped from both ends.  Lines are
    numbered starting at 1.

    Parameters:
        lines: iterable of strings (an open text file works).
        delimiters: string of characters to strip from the ends of words.

    Returns:
        A dictionary with words as keys and, as values, the list of line
        numbers (integers) at which the word occurs.  A word occurring
        several times on one line has that line number listed once per
        occurrence.  Words are case-sensitive.
    """
    index = {}
    for line_num, line in enumerate(lines, 1):
        for token in line.split():
            word = token.strip(delimiters)
            # A token made only of delimiter characters (e.g. "...")
            # strips down to the empty string, which is not a word.
            if not word:
                continue
            # `word in dict` / setdefault work on both Python 2 and 3,
            # unlike dict.has_key() which was removed in Python 3.
            index.setdefault(word, []).append(line_num)
    return index


def format_index(index):
    """Return the index as text, one "word: n1, n2, ..." line per word.

    Words are emitted in sorted() order of the dictionary keys and the
    line numbers of each word are joined with ", ".  Every line,
    including the last one, ends with a newline.
    """
    return "".join(
        word + ": " + ", ".join(str(num) for num in index[word]) + "\n"
        for word in sorted(index)
    )


# Only touch the filesystem when executed as a script, so that the
# functions above can be imported without side effects.
if __name__ == "__main__":
    # `with` guarantees the input file is closed, even on error.
    with open(text_filename, "r") as fid:
        index = build_index(fid)
    ##
    ## write index to file
    ##
    with open(index_filename, "w") as index_file:
        index_file.write(format_index(index))