# challenge_file_indexing_function.py


"""
text_file_indexer.py
Exercise:
Given a text file somefile.txt, the program will read it completely,
and while doing so, record the occurrences of each unique word,
and the line numbers on which they occur. This information is
then written to an index file somefile.idx, which is also a text
file.
Author: Lauren Gregoire
"""
def create_index(text_filename, delimiter_chars=",.;:!?"):
    """Build a word index for a text file.

    The file is read line by line. Each line is split on whitespace and the
    characters in ``delimiter_chars`` are stripped from both ends of every
    token. Matching is case-sensitive, and a word that occurs several times
    on one line has that line number recorded once per occurrence.

    Args:
        text_filename: Path of the text file to index.
        delimiter_chars: Characters removed from the start and end of each
            whitespace-separated token.

    Returns:
        A dict mapping each word to the list of 1-based line numbers on
        which it occurs, in increasing order.
    """
    # The index is a dictionary with words as keys and the line numbers
    # as a list of integers.
    index = {}
    with open(text_filename, "r") as fid:
        # Line numbering starts at 1.
        for line_num, line in enumerate(fid, start=1):
            for token in line.split():
                word = token.strip(delimiter_chars)
                if not word:
                    # The token consisted only of delimiter characters
                    # (e.g. "..."), so there is no word to record.
                    continue
                # dict.has_key() no longer exists in Python 3; setdefault
                # creates the list on first sight and reuses it afterwards.
                index.setdefault(word, []).append(line_num)
    return index
def write_index(index, index_filename):
    """Write a word index to a text file, one word per line.

    Each output line has the form ``word: n1, n2, n3`` where the numbers are
    the entries stored for that word. Words are written in the order given
    by ``sorted()`` on the dictionary keys.

    Args:
        index: Dict mapping each word to a list of line numbers.
        index_filename: Path of the file to create or overwrite.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(index_filename, "w") as index_file:
        for word in sorted(index):
            lines_string = ", ".join(str(line) for line in index[word])
            index_file.write(word + ": " + lines_string + "\n")
def index_text_file(text_filename, index_filename="index.txt", delimiter_chars=",.;:!?"):
    """Create an index of the occurrences of words in a text file.

    The index is built with ``create_index`` and written to the text file
    ``index_filename`` with ``write_index``. The output file has one line
    per word, as follows:
            word: line_num, line_num, line_num
    with the words in sorted order.

    Args:
        text_filename: Path of the text file to index.
        index_filename: Path of the index file to write.
        delimiter_chars: Characters stripped from both ends of each word
            before it is indexed.
    """
    # Forward the caller's delimiter set rather than a hard-coded one, so
    # the delimiter_chars argument actually takes effect.
    index = create_index(text_filename, delimiter_chars=delimiter_chars)
    write_index(index, index_filename)
# Index the poem and write the result to its index file; both paths are
# resolved relative to the current working directory.
index_text_file(
    'TheTyger_WilliamBlake.txt',
    index_filename='index_TheTyger_WilliamBlake.txt',
)