Extracting Data

In [1]:
#!/usr/bin/env python
# Your task here is to extract data from the XML about the authors of an
# article and add it to a list, one item per author.
# See the provided data structure for the expected format.
# The tags for first name, surname, and email should map directly
# to the dictionary keys.
import xml.etree.ElementTree as ET

article_file = "./data/exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        # Each child tag may be absent, so check for None before
        # dereferencing .text
        fnm = author.find('fnm')
        if fnm is not None:
            fnm = fnm.text
            
        snm = author.find('snm')
        if snm is not None:
            snm = snm.text
            
        email = author.find('email')
        if email is not None:
            email = email.text
        
        data = {
                "fnm": fnm,
                "snm": snm,
                "email": email
        }
        
        authors.append(data)

    return authors


def test():
    solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]
    
    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["fnm"] == solution[1]["fnm"]


test()
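
The find-then-check pattern above repeats once per tag. The None guard can live in one place instead; a minimal refactoring sketch (the get_text name is mine, not part of the exercise):

def get_text(parent, tag):
    # Return the text of the first matching child, or None if the tag is absent
    elem = parent.find(tag)
    return elem.text if elem is not None else None


def get_authors_compact(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        authors.append({
            "fnm": get_text(author, 'fnm'),
            "snm": get_text(author, 'snm'),
            "email": get_text(author, 'email'),
        })
    return authors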

Handling Attributes

In [2]:
#!/usr/bin/env python
# Your task here is to extract data from the XML about the authors of an
# article and add it to a list, one item per author.
# See the provided data structure for the expected format.
# The tags for first name, surname, and email should map directly to the
# dictionary keys, but you also have to extract the "iid" attribute from
# each "insr" tag and collect the values in a list under the key "insr".
import xml.etree.ElementTree as ET

article_file = "./data/exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        # Every author in this file has all three tags, so no None checks here
        data = {
                "fnm": author.find('fnm').text,
                "snm": author.find('snm').text,
                "email": author.find('email').text,
                # An author can be affiliated with several institutions,
                # so collect every iid in a list
                "insr": [insr.attrib['iid'] for insr in author.findall('insr')]
        }
        authors.append(data)

    return authors


def test():
    solution = [{'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
                {'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
                {'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
                {'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
                {'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
                {'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
                {'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
                {'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

    root = get_root(article_file)
    data = get_authors(root)

    assert data[0] == solution[0]
    assert data[1]["insr"] == solution[1]["insr"]


test()
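
ElementTree exposes an element's attributes as a plain dict via .attrib, and elem.get() is the forgiving accessor that returns None (or a supplied default) instead of raising KeyError when an attribute is missing. A small sketch against the same file ('xid' is a made-up attribute name, used only to show the default):

root = get_root(article_file)
first_author = root.find('./fm/bibl/aug/au')
for insr in first_author.findall('insr'):
    print(insr.attrib)             # e.g. {'iid': 'I1'}
    print(insr.get('iid'))         # 'I1'
    print(insr.get('xid', 'n/a'))  # 'n/a': the default for a missing attribute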

In [3]:
from bs4 import BeautifulSoup

def options(soup, element_id):
    # Collect the value attribute of every <option> inside the element
    # with the given id
    option_values = []
    select_list = soup.find(id=element_id)
    for option in select_list.find_all('option'):
        option_values.append(option['value'])
    return option_values

def print_list(label, codes):
    print('\n%s:' % label)
    for c in codes:
        print(c)

def main():
    # This page is saved by the "Scraping Solution" cell below
    soup = BeautifulSoup(open('./data/virgin_and_logan_airport.html'), 'lxml')

    airports = options(soup, 'AirportList')
    carriers = options(soup, 'CarrierList')

    print_list('Airports', airports)
    print_list('Carriers', carriers)

main()
Airports:
All
AllMajors
ATL
BWI
BOS
CLT
MDW
ORD
DFW
DEN
DTW
FLL
IAH
LAS
LAX
MIA
MSP
JFK
LGA
EWR
MCO
PHL
PHX
PDX
SLC
SAN
SFO
SEA
TPA
DCA
IAD
AllOthers
UXM
ABR
ABI
DYS
ADK
VZF
BQN
AKK
KKI
AKI
AKO
CAK
7AK
KQA
AUK
ALM
ALS
ABY
ALB
ABQ
ZXB
WKK
AED
AEX
AXN
AET
ABE
AIA
APN
DQH
AOO
AMA
ABL
OQZ
AOS
OTS
AKP
EDF
DQL
MRI
ANC
AND
AGN
ANI
ANN
ANB
ANV
ATW
ACV
ARC
ADM
AVL
HTS
ASE
AST
AHN
AKB
PDK
FTY
ACY
ATT
ATK
MER
AUO
AGS
AUG
AUS
A28
BFL
BGR
BHB
BRW
BTI
BQV
A2K
BTR
BTL
AK2
A56
BTY
BPT
BVD
WBQ
BKW
BED
A11
KBE
BLV
BLI
BLM
JVL
BVU
BJI
RDM
BEH
BET
BTT
OQB
A50
BIC
BIG
BGQ
BMX
PWR
A85
BIL
BIX
BGM
KBC
BHM
BIS
BYW
BID
BMG
BMI
BFB
BYH
BCT
BOI
RLU
BXS
BLD
BYA
BWG
BZN
BFD
A23
BRD
BKG
PWT
KTS
BDR
TRI
BKX
RBH
BRO
BWD
BQK
BCE
BKC
BUF
IFP
BUR
BRL
BTV
MVW
BNO
BTM
JQF
UXI
CDW
C01
ADW
CDL
CGI
LUR
EHM
CZF
A61
A40
CYT
MDH
CLD
CNM
A87
CPR
CDC
CID
JRV
NRR
CEM
CDR
CIK
CMI
WCR
CHS
CRW
SPB
STT
CHO
CYM
CHA
CYF
WA7
CEX
EGA
NCN
KCN
VAK
CYS
PWK
DPA
LOT
CKX
CIC
CEF
KCG
KCL
WQZ
KCQ
CZN
CIV
ZXH
SSB
STX
CHU
LUK
CVG
OQC
A12
CHP
IRC
CLP
CKB
BKL
CLE
CGF
CFT
CLK
ZXN
CVN
ZXI
OOB
COD
CFA
KCC
A69
CDB
CXF
CLL
KCR
COS
COA
COU
CAE
CSG
CBM
GTR
OSU
CMH
LCK
CKU
CDV
CBA
CRP
CEZ
CVO
CIL
CGA
CEC
CKD
CUW
CPX
CBE
DCK
ADS
DAL
RBD
AFW
FTW
DGB
DAN
WQW
MGY
DAY
DAB
AA8
SCC
FVZ
A02
DTH
DTR
DEC
XXV
A36
DHB
DRG
DRT
DLF
DJN
DMN
APA
FTG
DSM
DTL
DET
DTT
YIP
DVL
DIK
DLG
DIO
DDC
FVQ
DOF
DHN
DOV
DRF
A22
FQQ
DUJ
DBQ
DLH
AMK
DRO
EAA
EGE
FRG
HTO
ESN
ESD
EAU
EDA
EDW
EEK
EGX
KKU
KEK
ZXO
IPL
ELD
BIF
ELP
ELV
ELI
EKO
ELM
LYU
ELY
EMK
WDG
ERI
ESC
EUG
EVV
EVM
PAE
EXI
EIL
FAI
FBK
A01
SUU
FAJ
KFP
FWL
FAR
FMN
FYV
XNA
FAY
POB
FFM
FIC
FAQ
FLG
FNT
FLO
FNL
WRI
FOD
FQW
FHU
TBN
RSW
FPR
FSI
FSM
FWA
FWH
FYU
FKL
VZE
FAT
FRD
FBS
FNR
GNV
GVL
GBH
GAL
GUP
GAM
GEK
GCK
GYY
GCC
AQY
GGW
AZ3
GDV
AK6
FVW
GLV
GNU
GYR
FVX
JGC
GCN
AZ1
GFK
GRI
GJT
GRR
GPZ
VWZ
GMT
XWA
KGX
GBD
GTF
GRB
GSO
GLH
PGV
GVT
GSP
UAM
GUM
GUF
GPT
GKN
GUC
GST
HGR
HNS
A03
HNM
CMX
VWD
ZXJ
HRL
MDT
HRO
BDL
PIB
HVR
HWI
HHR
HDN
HYS
HKB
HLN
T2X
HIB
HKY
HIO
ITO
HHH
HBH
HOB
HGZ
HOL
HYL
HCR
HOM
HST
VWX
HNL
MKK
HNH
HPB
HOP
HOT
EFD
HOU
HUS
HSV
HON
HSL
HUT
HYA
HYG
WHD
ICY
IDA
IGG
ILI
ZXF
IND
MQJ
INL
A57
IYK
IMT
IWD
ISP
SAW
ITH
KIB
A59
A26
MKL
JAC
JAN
NZC
JAX
NIP
OAJ
JMS
JHW
VZM
JON
JST
JBR
JLN
JNU
OGG
KAE
A37
A35
KKK
AZO
LUP
FCA
KLG
KAL
MUE
KNB
MKC
MCI
JHM
JRF
KKL
A65
KYK
KXA
KUK
VZR
VZY
FQD
VIK
MVM
EAR
ENA
KEH
KTN
WFB
DQU
EYW
NQX
QQB
IAN
GRK
ILE
A29
KVC
AKN
IGM
ISO
KPN
IRK
KKB
KVL
KZH
06A
LMT
KLW
SZL
TYS
OBU
A43
ADQ
KDK
A41
KNK
KGK
KOA
KKH
KOT
OTZ
KKA
KYU
LKK
UUK
KWT
KWK
LSE
LAF
LFT
LCH
XXW
HII
LMA
TVL
LNY
ZXK
WJF
LNS
LAN
LAR
LRD
KLN
HSH
LSV
VGT
LBE
LZU
LAW
ALZ
LEB
VA4
KLL
LWB
LWS
LEW
LWT
LEX
LBL
LIH
UXA
LVD
LNK
LIT
05A
LGU
LNI
LGB
LIJ
GGG
LPS
LPR
LAM
SDF
LBB
LYH
MCN
MSN
A75
MMH
MNZ
MHT
MHK
MBL
MLY
KMO
MZJ
MTH
MYH
MWA
MQT
MLL
MVY
MCW
MSS
MYK
MAZ
MYL
MXY
MCK
OQA
MCG
MCL
MFR
MDR
MYU
MLB
OQL
MEM
XXX
MCE
MEI
OQM
MFH
MTM
WMK
MPB
6B0
MDO
MAF
MDY
MLS
NQA
MKE
MHM
STP
MIB
MOT
MNT
MFE
MSO
CNY
BFM
MOB
MOD
VZG
MLI
MLU
MRY
MGM
MTJ
UXR
MGW
MMU
MVL
KMY
MWH
MOS
CWA
MUO
MOU
A13
MSL
VZC
MKG
MYR
NNK
WQR
AA2
ACK
KEB
APC
WNA
KPM
PKA
APF
BNA
NKI
NLG
ENN
EWB
EWN
HVN
ARA
GON
NEW
MSY
DQN
KNW
JRB
TSS
NYC
SWF
LFI
PHF
ONP
WWT
EWK
IAG
NME
NIB
IKO
NIN
RQI
WTK
OME
NNL
ORV
OFK
ORF
NGU
OTH
LBF
MA5
OHC
ORT
NUI
NUL
NUP
ZNC
ODW
OAK
OCF
OFU
HIF
OGD
OGS
OKC
OJC
JCI
OLH
KOY
XWS
OLV
OLM
OMA
ONN
ONT
OPH
ORL
OSH
KOZ
OWB
UOX
OXR
PBK
PAH
PGA
PPG
PCE
PSP
PMD
PAQ
PFN
ECP
PAM
PKD
PKB
PSC
PRB
DQR
1G4
DQW
WQJ
PDB
PEC
PLN
PDT
PNS
NPA
PIA
KPV
VYS
GUS
PSG
PNF
PNE
LUF
DVT
AZA
SCF
PIR
PIP
UGB
PQS
SOP
AGC
PIT
PSF
PTU
PLB
PBG
PTR
PIH
A27
KPB
PHO
PIZ
POQ
PNC
PSE
PTK
PVY
PTD
PTC
PTA
CLM
KPY
KPC
PGM
PTH
A48
ORI
PML
PPV
TWD
A17
KPR
PCA
WQU
PTV
PWM
PSM
PRC
PQI
BLF
PPC
PVD
PVC
PVU
PUO
A39
PUB
PUW
OQP
PGD
AK5
UIN
KWN
RDU
RMP
RCA
RAP
RDG
RDV
RDB
A76
A04
RDR
RDD
RNO
RNT
RHI
RIC
RIL
RIV
RIW
ROA
RCE
RST
ROC
RKS
RFD
RKD
RWI
ROG
FAL
RME
RSJ
ROW
ROP
RBY
RUI
RSH
RSN
RUT
SAC
SMF
SAD
MBS
SPN
SLE
SLT
SLN
SNS
SBY
SMN
ZXM
SJT
SKF
SAT
NKX
MYF
NZY
SJC
WSJ
SIG
SJU
SBP
SDP
KSR
OQS
SFB
SNA
SBA
SAF
SMX
STS
SLK
SRQ
CIU
SVN
SAV
SVA
SCM
BFF
AVP
SYB
BFI
LKE
SDX
A07
WLK
SOV
A31
SQV
SWD
SHX
SKK
A90
A77
SXP
SYA
SHR
OQV
SHH
SOW
BAD
SHV
SHG
SDY
SVC
SUX
FSD
NKT
SIT
SKJ
SGY
SKW
SLQ
SCJ
MQY
SXQ
SBN
WSN
SVW
GEG
SPI
SGF
UST
STC
STG
SGU
STJ
CPS
STL
SUS
KSM
SMK
SNP
PIE
RMN
STF
SCE
SHD
WSB
WBB
WA6
VZO
SVS
SWO
SCK
SRV
SSC
SUN
SYR
TCM
TIW
TCT
TKA
TLH
MCF
TAL
TSM
TLJ
TEK
TAV
TWE
TLF
TLA
TEX
TKE
HUF
A30
TEB
TEH
TXK
DLS
TVF
KTB
TNC
TIQ
TOG
TKJ
TKI
OOK
TOL
TPH
FOE
JZE
TVC
TTN
TTD
TUS
TUL
TLT
UTM
WTL
TNK
TUP
TCL
TWF
TWA
TYR
TYE
UGI
UGS
UMT
UMB
UNK
DUT
UTO
VDZ
VLD
VPS
VPZ
VNY
VUO
VEE
VEL
VCT
VCV
A67
VQS
VCB
A70
VIS
OQI
CNW
ACT
AIN
AWK
WAA
ALW
WWA
OXC
KWF
ALO
ART
ATY
EAT
ENV
AWM
PBI
KWP
WYS
WST
BAF
FOK
WSX
WWP
WMO
HPN
DQS
SPS
ICT
WDB
VZN
IPT
ISN
WOW
ILG
ILM
WGO
INW
INT
WA5
WSM
OLF
ORH
WRL
WRG
YKM
YAK
XWC
WYB
YNG
A63
NYL
YUM
KZB
AK8

Carriers:
All
AllUS
AllForeign
AS
AA
5Y
DL
MQ
EV
F9
HA
B6
OO
WN
NK
UA
VX
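
A note on parsers: when no parser is named, BeautifulSoup silently picks the best one installed, so the same code can parse differently on another machine. Pinning the parser, as done above, keeps results reproducible. A minimal sketch of the two common choices:

from bs4 import BeautifulSoup

markup = '<p>hello'
# lxml is fast but requires the third-party lxml package
print(BeautifulSoup(markup, 'lxml'))
# html.parser ships with the standard library; no extra install needed
print(BeautifulSoup(markup, 'html.parser'))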

Using Beautiful Soup

In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Please note that the function 'make_request' is provided for your reference only.
# You will not be able to actually use it from within the Udacity web UI.
# Your task is to process the HTML using BeautifulSoup, extract the hidden
# form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
# values in the data dictionary.
# All your changes should be in the 'extract_data' function
from bs4 import BeautifulSoup
import requests
import json

html_page = "./data/page_source.html"


def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, 'lxml')
        # ASP.NET round-trips page state through these hidden form fields;
        # both values must be echoed back for the POST to be accepted
        data['eventvalidation'] = soup.find(id='__EVENTVALIDATION')['value']
        data['viewstate'] = soup.find(id='__VIEWSTATE')['value']

    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert data["viewstate"].startswith("/wEPDwUKLTI")

    
test()
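
Outside the Udacity UI, the two functions chain together as below. Note that against the live site the hidden fields should come from a fresh GET rather than a saved page (stale tokens may be rejected), which is exactly what the Scraping Solution cell below does:

data = extract_data(html_page)
html_text = make_request(data)  # POST echoing the extracted hidden fields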

Best Practices for Scraping

  1. Look at how a browser makes requests
  2. Emulate in code
  3. If stuff blows up, look at your HTTP traffic (see the logging sketch after this list)
  4. Return to Step 1 until it works
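
For step 3, requests can be made to print its raw wire traffic. A minimal sketch, assuming the Python 2 environment this notebook ran in; in Python 3 the module is http.client, and newer requests versions log under plain "urllib3":

import logging
import httplib  # Python 2; use http.client in Python 3

# Print full request and response headers for every connection
httplib.HTTPConnection.debuglevel = 1

logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True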

Scraping Solution

In [5]:
from bs4 import BeautifulSoup
import requests

s = requests.Session()

# GET the page once to pick up fresh __VIEWSTATE / __EVENTVALIDATION tokens
r = s.get('http://www.transtats.bts.gov/Data_Elements.aspx?Data=2')
soup = BeautifulSoup(r.text, 'lxml')
viewstate = soup.find(id='__VIEWSTATE')['value']
eventvalidation = soup.find(id='__EVENTVALIDATION')['value']

# POST the form back, echoing the tokens, to request Virgin America (VX)
# data for Boston Logan (BOS)
r = s.post('http://www.transtats.bts.gov/Data_Elements.aspx?Data=2',
           data={'AirportList': 'BOS',
                 'CarrierList': 'VX',
                 'Submit': 'Submit',
                 '__EVENTTARGET': '',
                 '__EVENTARGUMENT': '',
                 '__EVENTVALIDATION': eventvalidation,
                 '__VIEWSTATE': viewstate})

with open('./data/virgin_and_logan_airport.html', 'w') as f:
    f.write(r.text)
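
The file written here is the same ./data/virgin_and_logan_airport.html that the options()/print_list() cell above parses, so rerunning that cell is a quick sanity check on a fresh download. Equivalently, in a few lines:

soup = BeautifulSoup(open('./data/virgin_and_logan_airport.html'), 'lxml')
assert 'BOS' in options(soup, 'AirportList')
assert 'VX' in options(soup, 'CarrierList')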