Counting Words Serially

In [1]:
import logging
import sys
import string
from collections import defaultdict


def word_count(s):
    """Count how often each word occurs in ``s`` and print the counts.

    Every line is split on whitespace; each token then has all characters in
    ``string.punctuation`` removed and is lowercased. Tokens that are empty
    after this clean-up (blank lines, repeated spaces, punctuation-only
    tokens such as "--") are ignored instead of being counted as ``''``.

    Args:
        s: An iterable of text lines (strings).

    Returns:
        None. The counts are printed to stdout as a plain ``dict`` mapping
        each word to its number of occurrences.
    """
    # For this exercise, write a program that serially counts the number of occurrences
    # of each word in the book Alice in Wonderland.
    #
    # The text of Alice in Wonderland will be fed into your program line-by-line.
    # Your program needs to take each line and do the following:
    # 1) Tokenize the line into string tokens by whitespace
    #    Example: "Hello, World!" should be converted into "Hello," and "World!"
    #    (This part has been done for you.)
    #
    # 2) Remove all punctuation
    #    Example: "Hello," and "World!" should be converted into "Hello" and "World"
    #
    # 3) Make all letters lowercase
    #    Example: "Hello" and "World" should be converted to "hello" and "world"
    #
    # Store the number of times that a word appears in Alice in Wonderland
    # in the word_counts dictionary, and then *print* (don't return) that dictionary
    #
    # In this exercise, print statements will be considered your final output. Because
    # of this, printing a debug statement will cause the grader to break. Instead,
    # you can use the logging module which we've configured for you.
    #
    # For example:
    # logging.info("My debugging message")
    #
    # The logging module can be used to give you more control over your
    # debugging or other messages than you can get by printing them. Messages
    # logged via the logger we configured will be saved to a
    # file. If you click "Test Run", then you will see the contents of that file
    # once your program has finished running.
    #
    # The logging module also has other capabilities; see
    # https://docs.python.org/2/library/logging.html
    # for more information.

    # A set gives O(1) membership checks while filtering characters.
    punctuation = set(string.punctuation)
    word_counts = defaultdict(int)

    for line in s:
        # split() without an argument splits on any run of whitespace and
        # never yields empty tokens, unlike split(" ").
        for token in line.split():
            # Character filtering works on both Python 2 and Python 3,
            # unlike the Python 2-only string.maketrans/translate form.
            word = ''.join(ch for ch in token if ch not in punctuation).lower()
            if word:  # skip tokens that consisted solely of punctuation
                word_counts[word] += 1

    # Convert to a plain dict so the printed form is "{...}" rather than
    # "defaultdict(...)". The parenthesised form runs on Python 2 and 3.
    print(dict(word_counts))
    
# Quick sanity check: count the words of a tiny two-line sample input.
word_count(['a b c a ab cd', 'b bd, sdlkjf'])
{'a': 2, 'bd': 1, 'c': 1, 'b': 2, 'ab': 1, 'sdlkjf': 1, 'cd': 1}

Mapper and Reducer with Aadhaar Data

In [2]:
def mapper():
    """Emit ``district<TAB>aadhaar_generated`` for each valid CSV row on stdin.

    Rows are split on commas. A row is emitted only if it has exactly 12
    fields and is not the header row (recognised by the literal 'District'
    in the district column). All other rows are silently skipped.

    Returns:
        None. Output is written to stdout, one key-value pair per line.
    """

    #Also make sure to fill out the reducer code before clicking "Test Run" or "Submit".

    #Each line will be a comma-separated list of values. The
    #header row WILL be included. Tokenize each row using the
    #commas, and emit (i.e. print) a key-value pair containing the
    #district (not state) and Aadhaar generated, separated by a tab.
    #Skip rows without the correct number of tokens and also skip
    #the header row.

    #You can see a copy of the input Aadhaar data
    #in the link below:
    #https://www.dropbox.com/s/vn8t4uulbsfmalo/aadhaar_data.csv

    #Since you are printing the output of your program, printing a debug
    #statement will interfere with the operation of the grader. Instead,
    #use the logging module, which we've configured to log to a file printed
    #when you click "Test Run". For example:
    #logging.info("My debugging message")
    #
    #Note that, unlike print, logging.info will take only a single argument.
    #So logging.info("my message") will work, but logging.info("my","message") will not.

    expected_fields = 12   # number of tokens in a well-formed row
    district_idx = 3       # column holding the district name
    generated_idx = 8      # column holding the "Aadhaar generated" count
    header_marker = 'District'

    for line in sys.stdin:
        data = line.strip().split(',')
        if len(data) != expected_fields:
            continue  # wrong number of tokens
        district = data[district_idx]
        if district == header_marker:
            continue  # header row
        # Parenthesised single-argument print runs on Python 2 and 3.
        print('{0}\t{1}'.format(district, data[generated_idx]))

    return None

# Run the mapper; it reads its rows from sys.stdin and prints key-value pairs.
mapper()



def reducer():
    """Sum the values of consecutive identical keys read from stdin.

    Each input line is expected to be ``key<TAB>value``. Lines with the same
    key must arrive next to each other, because a total is printed as soon
    as the key changes. For every run of identical keys one line
    ``key<TAB>total`` is printed, with the total as a float.

    Lines that do not split into exactly two tab-separated tokens are
    skipped. Lines whose value is not a valid float are skipped with a
    logged warning. If no valid line is read, nothing is printed.

    Returns:
        None. Output is written to stdout.
    """

    #Also make sure to fill out the mapper code before clicking "Test Run" or "Submit".

    #Each line will be a key-value pair separated by a tab character.
    #Print out each key once, along with the total number of Aadhaar
    #generated, separated by a tab. Make sure each key-value pair is
    #formatted correctly! Here's a sample final key-value pair: 'Gujarat\t5.0'

    #Since you are printing the output of your program, printing a debug
    #statement will interfere with the operation of the grader. Instead,
    #use the logging module, which we've configured to log to a file printed
    #when you click "Test Run". For example:
    #logging.info("My debugging message")
    #Note that, unlike print, logging.info will take only a single argument.
    #So logging.info("my message") will work, but logging.info("my","message") will not.

    old_district = None  # key of the run currently being accumulated
    count = 0.0          # running total for old_district

    for line in sys.stdin:
        data_mapped = line.strip().split('\t')
        if len(data_mapped) != 2:
            # Something has gone wrong. Skip this line.
            continue

        district, raw_value = data_mapped
        try:
            value = float(raw_value)
        except ValueError:
            # A malformed value should not abort the whole reduction.
            logging.warning("Skipping line with non-numeric value: %r", line)
            continue

        if district == old_district:
            # same district
            count += value
        else:
            # new district: flush the previous one unless this is the first
            if old_district is not None:
                print('{0}\t{1}'.format(old_district, count))
            old_district = district
            count = value

    # Print the last district. The guard avoids printing (and previously
    # crashing on an unbound total) when no valid line was read.
    if old_district is not None:
        print('{0}\t{1}'.format(old_district, count))

    return None