Sales per Category

Mapper

In [1]:
#!/usr/bin/python

# Format of each line is:
# date\ttime\tstore name\titem description\tcost\tmethod of payment
#
# We want elements 3 (item description) and 4 (cost)
# We need to write them out to standard output, separated by a tab

import sys

for line in sys.stdin:
    data = line.strip().split("\t")
    if len(data) == 6:
        print "{0}\t{1}".format(data[3], data[4])

Reducer

In [2]:
#!/usr/bin/python

import sys

salesTotal = 0
oldKey = None

# Loop around the data
# It will be in the format key\tval
# Where key is the product category, val is the sale amount
#
# All the sales for a particular product category will be presented,
# then the key will change and we'll be dealing with the next store

for line in sys.stdin:
    data_mapped = line.strip().split("\t")
    if len(data_mapped) != 2:
        # Something has gone wrong. Skip this line.
        continue

    (thisKey, thisSale) = data_mapped

    if oldKey and oldKey != thisKey:
        print oldKey, "\t", salesTotal
        oldKey = thisKey;
        salesTotal = 0
    else:
        oldKey = thisKey

    #oldKey = thisKey
    salesTotal += float(thisSale)

if oldKey != None:
    print oldKey, "\t", salesTotal

Highest Sale

Mapper

In [3]:
#!/usr/bin/python

# Format of each line is:
# date\ttime\tstore name\titem description\tcost\tmethod of payment
#
# We want elements 2 (store name) and 4 (cost)
# We need to write them out to standard output, separated by a tab

import sys

for line in sys.stdin:
    data = line.strip().split("\t")
    if len(data) == 6:
        print "{0}\t{1}".format(data[2], data[4])

Reducer

In [4]:
#!/usr/bin/python

import sys

old_store = None
max_sale = 0

# Loop around the data
# It will be in the format key, val
# Where key is the store, val is the sale amount

for line in sys.stdin:
    data_mapped = line.strip().split("\t")
    if len(data_mapped) != 2:
        # Something has gone wrong. Skip this line.
        continue

    (store, sale) = data_mapped
    sale = float(sale)

	# if it's a new store (and not the first store)
    if old_store and old_store != store:
        print old_store, "\t", max_sale
        old_store = store;
        max_sale = sale
        
    # if it's the same store
    else:
    	old_store = store
        if sale > max_sale:
        	max_sale = sale

if old_store != None:
    print store, "\t", max_sale

Total Sales

Mapper

In [5]:
#!/usr/bin/python

# Format of each line is:
# date\ttime\tstore name\titem description\tcost\tmethod of payment
#
# We want elements 2 (store name) and 4 (cost)
# We need to write them out to standard output, separated by a tab

import sys

for line in sys.stdin:
    data = line.strip().split("\t")
    if len(data) == 6:
        print "{0}\t{1}".format(data[2], data[4])

Reducer

In [6]:
#!/usr/bin/python

import sys

sale = None
sales_value = 0.0
sales_count = 0

# Loop around the data
# It will be in the format key, val
# Where key is the store, val is the sale amount

for line in sys.stdin:
    data_mapped = line.strip().split("\t")
    if len(data_mapped) != 2:
        # Something has gone wrong. Skip this line.
        continue

    (_, sale) = data_mapped
    sales_value += float(sale)
    sales_count += 1

if sale != None:
    print sales_value, "\t", sales_count

Hits to Page

Mapper

In [7]:
#!/usr/bin/python

'''The logfile is in Common Log Format:

10.223.157.186 - - [15/Jul/2009:15:50:35 -0700] "GET /assets/js/lowpro.js HTTP/1.1" 200 10469

%h %l %u %t \"%r\" %>s %b

Where:

* %h is the IP address of the client
* %l is identity of the client, or "-" if it's unavailable
* %u is username of the client, or "-" if it's unavailable
* %t is the time that the server finished processing the request. The format is [day/month/year:hour:minute:second zone]
* %r is the request line from the client (in double quotes). It contains the
    * method
    * path
    * query-string
    * protocol of the request.
* %>s is the status code that the server sends back to the client. You will see mostly status codes 200
  (OK - The request has succeeded), 304 (Not Modified) and 404 (Not Found). See more information on status codes at W3C.org
* %b is the size of the object returned to the client, in bytes. It will be "-" in case of status code 304.
'''

import sys

for line in sys.stdin:
    # extract the log fields
    data = line.replace('[','').replace(']','').replace('"','').split(' ')
    # data = (IP, ID, username, dt [date+time], timezone, method, path, qspr [query-string + protocol/request], status, size)
    if len(data) == 10:
        print "{0}".format(data[6])

Reducer

In [8]:
#!/usr/bin/python

import sys

path = None
path_count = 0

# Loop around the data
# It will be in the format key, val
# Where key is the store, val is the sale amount

for line in sys.stdin:
    newpath = line.strip()
    
    # same path --> add to the count
    if path and path == newpath:
        path_count += 1
    
    # new path --> print old path & count, then reset path & count
    else:
        # print the old path & count as long as it's not the first line
        if path:
            print path, "\t", path_count
            
        path = newpath
        path_count = 1

# print the last path & count
if path != None:
    print path, "\t", path_count

Hits from IP

Mapper

In [9]:
#!/usr/bin/python

'''The logfile is in Common Log Format:

10.223.157.186 - - [15/Jul/2009:15:50:35 -0700] "GET /assets/js/lowpro.js HTTP/1.1" 200 10469

%h %l %u %t \"%r\" %>s %b

Where:

* %h is the IP address of the client
* %l is identity of the client, or "-" if it's unavailable
* %u is username of the client, or "-" if it's unavailable
* %t is the time that the server finished processing the request. The format is [day/month/year:hour:minute:second zone]
* %r is the request line from the client (in double quotes). It contains the
    * method
    * path
    * query-string
    * protocol of the request.
* %>s is the status code that the server sends back to the client. You will see mostly status codes 200
  (OK - The request has succeeded), 304 (Not Modified) and 404 (Not Found). See more information on status codes at W3C.org
* %b is the size of the object returned to the client, in bytes. It will be "-" in case of status code 304.
'''

import sys

for line in sys.stdin:
    # extract the log fields
    data = line.replace('[','').replace(']','').replace('"','').split(' ')
    # data = (IP, ID, username, dt [date+time], timezone, method, path, qspr [query-string + protocol/request], status, size)
    if len(data) == 10:
        print "{0}".format(data[0])

Reducer

In [10]:
#!/usr/bin/python

import sys

key = None
key_count = 0

# Loop around the data
# It will be in the format key, val
# Where key is the store, val is the sale amount

for line in sys.stdin:
    newkey = line.strip()
    
    # same key --> add to the count
    if key and key == newkey:
        key_count += 1
    
    # new key --> print old key & count, then reset key & count
    else:
        # print the old key & count as long as it's not the first line
        if key:
            print key, "\t", key_count
            
        key = newkey
        key_count = 1

# print the last key & count
if key != None:
    print key, "\t", key_count

Mapper

In [11]:
#!/usr/bin/python

'''The logfile is in Common Log Format:

10.223.157.186 - - [15/Jul/2009:15:50:35 -0700] "GET /assets/js/lowpro.js HTTP/1.1" 200 10469

%h %l %u %t \"%r\" %>s %b

Where:

* %h is the IP address of the client
* %l is identity of the client, or "-" if it's unavailable
* %u is username of the client, or "-" if it's unavailable
* %t is the time that the server finished processing the request. The format is [day/month/year:hour:minute:second zone]
* %r is the request line from the client (in double quotes). It contains the
    * method
    * path
    * query-string
    * protocol of the request.
* %>s is the status code that the server sends back to the client. You will see mostly status codes 200
  (OK - The request has succeeded), 304 (Not Modified) and 404 (Not Found). See more information on status codes at W3C.org
* %b is the size of the object returned to the client, in bytes. It will be "-" in case of status code 304.
'''

import sys

for line in sys.stdin:
    # extract the log fields
    data = line.replace('[','').replace(']','').replace('"','').split(' ')
    # data = (IP, ID, username, dt [date+time], timezone, method, path, qspr [query-string + protocol/request], status, size)
    if len(data) == 10:
        print "{0}".format(data[6].replace('http://www.the-associates.co.uk',''))

Reducer

In [12]:
#!/usr/bin/python

import sys

key = None
key_count = 0
max_key = None
max_count = 0

# Loop around the data
# It will be in the format key, val
# Where key is the store, val is the sale amount

for line in sys.stdin:
    newkey = line.strip()
    
    # same key --> add to the count
    if key and key == newkey:
        key_count += 1
    
    # new key
    else:
        # compare the last key_count to max_count
        if max_count < key_count:
            max_count = key_count
            max_key = key
             
        # reset the key and the count
        key = newkey
        key_count = 1
    
# print the maximum key_count and its key
if key != None:
    print max_key, "\t", max_count