Parsing CSV Files

In [1]:
# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# with each data line in the file becoming a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!
import os

DATADIR = "./data/"
DATAFILE = "beatles-diskography.csv"
datafile = os.path.join(DATADIR, DATAFILE)


def parse_file(datafile):
    data = []
    count = 0
    
    with open(datafile, "r") as f:
        for line in f:
            # the header row ==> get the keys
            if count == 0:
                keys = line.strip().split(',')
            elif count <= 10:
                values = line.strip().split(',')
                data.append(dict(zip(keys,values)))
            else:
                break
            
            count += 1

    return data

def test():
    # a simple test of your implementation
    datafile = os.path.join(DATADIR, DATAFILE)
    d = parse_file(datafile)
    firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 
                 'Released': '22 March 1963', 'US Chart Position': '-', 'RIAA Certification': 'Platinum', 
                 'BPI Certification': 'Gold'}
    tenthline = {'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964', 
                 'US Chart Position': '-', 'RIAA Certification': '', 'BPI Certification': 'Gold'}

    assert d[0] == firstline
    assert d[9] == tenthline

    
test()
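
A side note on the manual split above: splitting on "," breaks as soon as a field contains a quoted comma. The standard-library csv module handles quoting for you. A minimal alternative sketch (parse_file_csv and max_rows are illustrative names, not part of the exercise; unlike the version above, it does not strip stray whitespace inside fields):

import csv
from itertools import islice

def parse_file_csv(datafile, max_rows=10):
    # DictReader reads the header row and uses it as the dictionary keys
    with open(datafile, "r") as f:
        reader = csv.DictReader(f)
        # islice stops after the first max_rows data rows
        return [dict(row) for row in islice(reader, max_rows)]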

Reading Excel Files

In [2]:
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
from zipfile import ZipFile
import numpy as np
datafile = "./data/2013_ERCOT_Hourly_Load_Data.xls"

def open_zip(datafile):
    pass    
    #with ZipFile('{0}.zip'.format(datafile), 'r') as myzip:
        #myzip.extractall()


def parse_file(datafile):
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    ### example of how you can get the data
    #sheet_data = [[sheet.cell_value(r, col) for col in range(sheet.ncols)] for r in range(sheet.nrows)]
    
    ### other useful methods:
    # print "\nROWS, COLUMNS, and CELLS:"
    # print "Number of rows in the sheet:", 
    # print sheet.nrows
    # print "Type of data in cell (row 3, col 2):", 
    # print sheet.cell_type(3, 2)
    # print "Value in cell (row 3, col 2):", 
    # print sheet.cell_value(3, 2)
    # print "Get a slice of values in column 3, from rows 1-3:"
    # print sheet.col_values(3, start_rowx=1, end_rowx=4)

    # print "\nDATES:"
    # print "Type of data in cell (row 1, col 0):", 
    # print sheet.cell_type(1, 0)
    # exceltime = sheet.cell_value(1, 0)
    # print "Time in Excel format:",
    # print exceltime
    # print "Convert time to a Python datetime tuple, from the Excel float:",
    # print xlrd.xldate_as_tuple(exceltime, 0)
    
    
    times = sheet.col_values(0, start_rowx=1, end_rowx=None)
    coast = sheet.col_values(1, start_rowx=1, end_rowx=None)
    
    i_max = np.argmax(coast)
    i_min = np.argmin(coast)
    
    maxtime = xlrd.xldate_as_tuple(times[i_max], 0)
    mintime = xlrd.xldate_as_tuple(times[i_min], 0)
    
    maxvalue = coast[i_max]
    minvalue = coast[i_min]
    avgcoast = np.mean(coast)
    
    data = {
            'maxtime': maxtime,
            'maxvalue': maxvalue,
            'mintime': mintime,
            'minvalue': minvalue,
            'avgcoast': avgcoast
    }
    return data


def test():
    open_zip(datafile)
    data = parse_file(datafile)

    assert data['maxtime'] == (2013, 8, 13, 17, 0, 0)
    assert round(data['maxvalue'], 10) == round(18779.02551, 10)


test()
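
The xldate tuples returned by parse_file unpack directly into datetime's constructor, which makes the results easy to print. A small usage sketch, assuming the cell above has already run so parse_file and datafile are defined:

from datetime import datetime

results = parse_file(datafile)
# each (year, month, day, hour, minute, second) tuple maps onto datetime's arguments
print "COAST max", results['maxvalue'], "at", datetime(*results['maxtime'])
print "COAST min", results['minvalue'], "at", datetime(*results['mintime'])
print "COAST average", round(results['avgcoast'], 2)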

JSON Playground

In [3]:
# To experiment with this code freely, you will have to run it locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    # This is the main function for making queries to the MusicBrainz API.
    # The query should return a JSON document.
    params["fmt"] = fmt
    r = requests.get(url + uid, params=params)
    print "requesting", r.url

    if r.status_code == requests.codes.ok:
        return r.json()
    else:
        r.raise_for_status()


def query_by_name(url, params, name):
    # This adds an artist name to the query parameters before calling
    # the query_site function above.
    params["query"] = "artist:" + name
    return query_site(url, params)


def pretty_print(data, indent=4):
    # After we get our output, we can format it to be more readable
    # by using this function.
    if isinstance(data, dict):
        print json.dumps(data, indent=indent, sort_keys=True)
    else:
        print data


def main():
    '''
    Modify the function calls and indexing below to answer the questions on
    the next quiz. HINT: Note how the output we get from the site is a
    multi-level JSON document, so try making print statements to step through
    the structure one level at a time or copy the output to a separate output
    file.
    '''
    results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
    #pretty_print(results)
    print results.keys()
    print results['artists'][0]['name']
    print len(results)

    artist_id = results["artists"][1]["id"]
    print "\nARTIST:"
    pretty_print(results["artists"][1])

    artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
    releases = artist_data["releases"]
    print "\nONE RELEASE:"
    pretty_print(releases[0], indent=2)
    release_titles = [r["title"] for r in releases]

    print "\nALL TITLES:"
    for t in release_titles:
        print t


if __name__ == '__main__':
    main()
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json
[u'count', u'offset', u'artists', u'created']
Nirvana
4

ARTIST:
{
    "area": {
        "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 
        "name": "United Kingdom", 
        "sort-name": "United Kingdom"
    }, 
    "begin-area": {
        "id": "f03d09b3-39dc-4083-afd6-159e3f0d462f", 
        "name": "London", 
        "sort-name": "London"
    }, 
    "country": "GB", 
    "disambiguation": "60s band from the UK", 
    "id": "9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6", 
    "life-span": {
        "begin": "1967", 
        "ended": null
    }, 
    "name": "Nirvana", 
    "score": "100", 
    "sort-name": "Nirvana", 
    "tags": [
        {
            "count": 1, 
            "name": "rock"
        }, 
        {
            "count": 1, 
            "name": "pop"
        }, 
        {
            "count": 1, 
            "name": "progressive rock"
        }, 
        {
            "count": 1, 
            "name": "orchestral"
        }, 
        {
            "count": 1, 
            "name": "british"
        }, 
        {
            "count": 1, 
            "name": "power pop"
        }, 
        {
            "count": 1, 
            "name": "psychedelic rock"
        }, 
        {
            "count": 1, 
            "name": "soft rock"
        }, 
        {
            "count": 1, 
            "name": "symphonic rock"
        }, 
        {
            "count": 1, 
            "name": "english"
        }
    ], 
    "type": "Group"
}
requesting http://musicbrainz.org/ws/2/artist/9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6?fmt=json&inc=releases

ONE RELEASE:
{
  "barcode": null, 
  "country": "GB", 
  "date": "1969", 
  "disambiguation": "", 
  "id": "0b44cb36-550a-491d-bfd9-8751271f9de7", 
  "packaging": null, 
  "quality": "normal", 
  "release-events": [
    {
      "area": {
        "disambiguation": "", 
        "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 
        "iso-3166-1-codes": [
          "GB"
        ], 
        "name": "United Kingdom", 
        "sort-name": "United Kingdom"
      }, 
      "date": "1969"
    }
  ], 
  "status": "Official", 
  "text-representation": {
    "language": "eng", 
    "script": "Latn"
  }, 
  "title": "To Markos III"
}

ALL TITLES:
To Markos III
Travelling on a Cloud
Songs Of Love And Praise
Songs of Love and Praise
Songs of Love and Praise
Secret Theatre
The Story of Simon Simopath
Me And My Friend
All of Us
The Story of Simon Simopath
To Markos III
Chemistry
Local Anaesthetic
Orange & Blue
Pentecost Hotel
Black Flower
All of Us
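
The other query_type entries work the same way when looking an artist up by id. A small sketch reusing the UK Nirvana id from the output above (the "aliases" key is what the inc=aliases parameter requests; .get guards against it being absent):

nirvana_uk_id = "9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6"
alias_data = query_site(ARTIST_URL, query_type["aliases"], nirvana_uk_id)
# print just the alias names rather than the full alias documents
pretty_print([a["name"] for a in alias_data.get("aliases", [])])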

Exploring JSON

In [4]:
import pprint
pp = pprint.PrettyPrinter(indent=2)

# How many bands are there named "First Aid Kit"?
results = query_by_name(ARTIST_URL, query_type["simple"], "First Aid Kit")
names = [artist['name'] for artist in results['artists'] if artist['name']=='First Aid Kit']
print '\n***There are', len(names), 'bands named "First Aid Kit"\n'

# begin-area name for Queen?
results = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
queen = [artist for artist in results['artists'] if artist['name']=='Queen']
print '\n***The begin-area name for Queen is', queen[0]['begin-area']['name'],'\n'

# Spanish alias for the Beatles?
results = query_by_name(ARTIST_URL, query_type["simple"], "Beatles")
beatles = [artist for artist in results['artists'] if (artist['name']=='Beatles' or artist['name']=='The Beatles')]
#pp.pprint(beatles)
#print beatles[0]['aliases']
pp.pprint([alias['name'] for alias in beatles[0]['aliases']])
print '\n'

# Nirvana disambiguation
results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
nirvana = [artist for artist in results['artists'] if artist['name']=='Nirvana']
print '\n***The disambiguation for Nirvana is:',nirvana[0]['disambiguation'],'\n'

# When was One Direction formed?
results = query_by_name(ARTIST_URL, query_type["simple"], "One Direction")
onedirection = [artist for artist in results['artists'] if artist['name']=='One Direction']
print '\n***One Direction began', onedirection[0]['life-span']['begin']
requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AFirst+Aid+Kit&fmt=json

***There are 2 bands named "First Aid Kit"

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AQueen&fmt=json

***The begin-area name for Queen is London 

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ABeatles&fmt=json
[ u'\ub354 \ube44\ud2c0\uc988',
  u'\u30b6\u30fb\u30d3\u30fc\u30c8\u30eb\u30ba',
  u'Beatles',
  u'Beetles',
  u'fab four',
  u'Los Beatles',
  u'The Beatles',
  u'\u0411\u0438\u0442\u043b\u0437',
  u'\u62ab\u5934\u58eb',
  u'\u62ab\u982d\u56db']


requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json

***The disambiguation for Nirvana is: 90s US grunge band 

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AOne+Direction&fmt=json

***One Direction began 2010-07
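
One caveat with the lookups above: they index straight into keys such as 'begin-area', 'aliases' and 'life-span', which not every artist record carries, so a missing key raises a KeyError. A more defensive version of the One Direction lookup, as a sketch, uses .get with defaults:

results = query_by_name(ARTIST_URL, query_type["simple"], "One Direction")
matches = [a for a in results['artists'] if a['name'] == 'One Direction']
if matches:
    # .get returns a default instead of raising KeyError when a field is missing
    begin = matches[0].get('life-span', {}).get('begin')
    print '\n***One Direction began', begin if begin else 'unknown'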