# bls_eni.py
# Author: Eni Awowale & Coline Dony
# Date first written: December 18, 2018
# Date last updated: August 15, 2019
# Purpose: Extract BLS data
# Problem Statement:
# For several projects at the AAG, it would be valuable to utilize data
# made available by the Bureau of Labor Statistics (BLS) on jobs
# and occupations. This main framework calls different modules that
# were developed to extract these data using the BLS API and re-format these
# data in different formats (spreadsheets, maps, etc.) as needed.
"""
1. PYTHON LIBRARIES
all libraries necessary to run all scripts
"""
import os
import requests
import json
# Coline's and Eni's Directories
# Local checkout of the bls-and-geographers repository. The path is
# machine-specific: activate the `folder` line that matches the user running
# the script.
# folder = r'C:\Users\cdony\Google Drive\GitHub\bls-and-geographers'
folder = r'C:\Users\oawowale\Documents\GitHub\bls-and-geographers'
# Make that folder the working directory so the relative file names used
# further down ('Salary Data 2018 updated.txt', 'list_series_id.txt',
# 'bls_state_occupational_employment.txt') resolve inside it.
# os.chdir raises an OSError if the folder does not exist on this machine.
os.chdir(folder)
"""
BUILDING BLOCKS TO CREATE, SEND, and GET THE RESPONSE OF A BLS QUERY
see https://www.bls.gov/developers/api_signature_v2.htm#parameters
A BLS QUERY is composed of:
(A) the data we are requesting (in the form of a series_id)
(B) the start and end years we want to extract data from (optional)
(C) whether or not we want to use an API key (optional)
A BLS HTTP REQUEST needs to be composed of the following items:
(D) the BLS API location (there are 2 locations: v1 and v2),
(E) the data query (see C.)
(F) the header, which contains what format we want to be sent
(G) Using the "GET" function of an HTTP REQUEST the response can be read.
"""
# A. We are requesting "Occupational Employment Statistics" data under
# the Employment and Unemployment Survey, which requires a specific format
# https://www.bls.gov/help/hlpforma.htm#OE
# Data Viewer app: https://beta.bls.gov/dataQuery/find?removeAll=1
# What a single series_id should be made of
# The components are listed in the order in which they are concatenated
# when the series ids are built in step A4.
# NOTE: state_code and occupation_code only hold example values here; both
# names are re-bound by the loops further down before any series id is built.
prefix = 'OE' # Occupational Employment
seasonal_code = 'U' # Seasonal adjustment code (presumably 'U' = not seasonally adjusted; confirm on the BLS format page)
area_type_code = 'S' # Statewide
state_code = '01' # 01 stands for Alabama for example
area_code = '00000' # Statewide
industry_code = '000000' # Cross-industry, Private, Federal, State, and Local Government
occupation_code = '193092' # 19-3092 stands for geographer for example
data_type = '01' # Employment
# A1. State codes: read the states code dictionary from BLS website
# The BLS State codes represent each state and its name as used in their databases
# see https://download.bls.gov/pub/time.series/sa/sa.state
# The parsing below expects one header row followed by rows of exactly two
# tab-separated fields: <state code> TAB <state name>.
BLS_dictionary_url = r'https://download.bls.gov/pub/time.series/sa/sa.state'
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error page into an explicit failure
# instead of silently producing an empty dictionary.
bls_dictionary_response = requests.get(BLS_dictionary_url, timeout=30)
bls_dictionary_response.raise_for_status()
BLS_dictionary_textfile = bls_dictionary_response.text
# Maps BLS state code -> state name
bls_states_db = {}
# Skip the header row, then keep every well-formed row
for line in BLS_dictionary_textfile.split('\n')[1:]:
    try:
        state_code, state_name = line.strip().split('\t')
    except ValueError:
        # Blank or malformed row (not exactly two fields): ignore it
        continue
    bls_states_db[state_code] = state_name
# A2. Occupation codes: read the occupation code dictionary from BLS website
# The BLS Occupation codes represent the 6-digit code and its name as used in their databases
# see https://download.bls.gov/pub/time.series/oe/oe.occupation
# The parsing below expects one header row followed by tab-separated rows
# whose first two fields are <occupation code> and <occupation name>; any
# further fields are ignored.
BLS_dictionary_url = r'https://download.bls.gov/pub/time.series/oe/oe.occupation'
# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error page into an explicit failure
# instead of silently producing an empty dictionary.
bls_dictionary_response = requests.get(BLS_dictionary_url, timeout=30)
bls_dictionary_response.raise_for_status()
BLS_dictionary_textfile = bls_dictionary_response.text
# Maps 6-digit BLS occupation code -> occupation name
bls_occupations_db = {}
# Skip the header row, then keep every well-formed row
for line in BLS_dictionary_textfile.split('\n')[1:]:
    try:
        occupation_code_6digit, occupation_name = line.strip().split('\t')[:2]
    except ValueError:
        # Blank or malformed row (fewer than two fields): ignore it
        continue
    bls_occupations_db[occupation_code_6digit] = occupation_name
# A3. AAG salary data: read the occupation codes associated with geography jobs
# from the AAG salary spreadsheet (provided by Mark Revell in June 2019)
# The file is read as tab-separated text: one header row, then one row per
# geography occupation with the occupation name in the first column and its
# 8-digit code in the second column.
AAG_salary_data_textfilename = r'Salary Data 2018 updated.txt'
# Read every line up front; the context manager closes the file handle
with open(AAG_salary_data_textfilename) as aag_salary_data_file:
    AAG_salary_data_textfile = aag_salary_data_file.readlines()
# Column names of the spreadsheet (not used further in this step)
headers = AAG_salary_data_textfile[0].split('\t')
# Maps 6-digit BLS occupation code ->
#   {'Main occupation name': name of the 6-digit occupation,
#    'Geography occupations': {8-digit code: 8-digit occupation name}}
aag_occupations_db = {}
for line in AAG_salary_data_textfile[1:]:
    # Drop the line ending first so it cannot end up inside the code when a
    # row has only two columns
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) < 2:
        # Blank or single-column row: nothing to record
        continue
    # The occupations in the salary data file are based on the BLS 8-digit codes
    occ_name_8digit, occ_code_8digit = fields[:2]
    if occ_code_8digit == '':
        # Row without an occupation code
        continue
    # To make queries to the BLS API, we need the 6-digit code instead:
    # remove the '-' and '.' separators and keep the first six characters
    occ_code_6digit = occ_code_8digit.replace('-', '').replace('.', '')[:6]
    if occ_code_6digit not in bls_occupations_db:
        # Code unknown to BLS: fall back to the code whose last digit is 0.
        # If that code is unknown too, the lookup below raises KeyError.
        occ_code_6digit = occ_code_6digit[:5] + '0'
    occ_name_6digit = bls_occupations_db[occ_code_6digit]
    # Adding the occupation to aag_occupations_db, creating the entry for
    # its 6-digit code on first sight
    entry = aag_occupations_db.setdefault(
        occ_code_6digit,
        {'Main occupation name': occ_name_6digit,
         'Geography occupations': {}})
    entry['Geography occupations'][occ_code_8digit] = occ_name_8digit
# List of 6-digit occupation codes that relate to geography jobs
aag_occupations = list(aag_occupations_db)
# A4: Series_ids: Create a list of the series ids we are interested
# to request to the BLS API (and store them to a text file for
# documentation purposes)
series_ids = []
# The context manager closes 'list_series_id.txt' even if building an id fails
with open('list_series_id.txt', 'w') as series_ids_file:
    # One series id per (state, geography occupation) pair, following the
    # BLS series_id format; state_code and occupation_code are re-bound here
    for state_code in bls_states_db:
        for occupation_code in aag_occupations:
            series_id = (prefix + seasonal_code + area_type_code + state_code
                         + area_code + industry_code + occupation_code
                         + data_type)
            series_ids.append(series_id)
            series_ids_file.write(series_id + '\n')
# Split the list into chunks of 50 series ids, one chunk per API request
# (presumably the per-request series limit of the BLS v2 API; confirm against
# the BLS API documentation)
series_ids_chunks = [series_ids[i:i + 50] for i in range(0, len(series_ids), 50)]
# B. The years we want to extract from
# Both values are passed as the "startyear" and "endyear" fields of the query
# in the (currently disabled) request step; with equal values a single year
# is requested.
startyear = 2018
endyear = 2018
# C. Make Requests to the BLS API (using multiple series ids)
# https://www.bls.gov/developers/api_signature_v2.htm#multiple
# BLS API keys:
# Maps a label for the key owner -> BLS API registration key. Only
# bls_api_key['Coline'] is referenced, and only inside the disabled request
# step further down.
# NOTE(review): these registration keys are committed in plain text, so
# anyone who can read this file can use them. Consider loading them from an
# environment variable or an untracked config file, and rotating the keys
# that were committed.
bls_api_key = {'Eni 1': 'de6366639eb64fa79045c9071a080dd5',
'Eni 2': '9954b4145b514eae81c6fe9a93739299',
'Eni 3': '8649eca48fd747478b7ba9a7095b2473',
'Coline': '41d57752042240da84a71fd2ba7c748d'}
# Store bls responses to a text file
bls_responses_textfilename = r'bls_state_occupational_employment.txt'
# The request step below is wrapped in a triple-quoted string, so it is NOT
# executed: Python evaluates the string and discards it. It is kept as a
# record of how the response file was produced:
#   - one POST request per chunk of series ids, with the query sent as JSON;
#   - a header row 'State' followed by the codes in aag_occupations, then one
#     tab-separated row per state with one column per occupation code, in
#     aag_occupations order;
#   - '*' marks an occupation for which no series was returned, 'none' a
#     series without data, and '-' inside a value is replaced by 'no est.'.
# The reading step further down expects this file to already exist in the
# working directory.
'''
bls_responses_textfile = open(bls_responses_textfilename, 'w')
bls_responses_textfile.write('State\t' + '\t'.join(aag_occupations))
state_values = []
for series_ids_chunk in series_ids_chunks:
data_query = json.dumps({"seriesid": series_ids_chunk,
"startyear": startyear,
"endyear": endyear,
"registrationkey": bls_api_key['Coline']})
# BLS API location
bls_api_location = 'https://api.bls.gov/publicAPI/v2/timeseries/data/'
# Query Header: requests the response to be in the JSON format
headers = {'Content-type': 'application/json'}
# Send the query to the BLS API
post_request = requests.post(bls_api_location, data=data_query, headers=headers)
# Get the API response as text and convert it to a JSON dictionary
get_response = json.loads(post_request.text)
# Extracting data from the API response
for series in get_response['Results']['series']:
series_id = series['seriesID']
state_code = series_id[4:6]
state_name = bls_states_db[state_code]
occupation_code_6digit = series_id[17:23]
occupation_index = aag_occupations.index(occupation_code_6digit)
try: employment = series['data'][0]['value'].replace('-', 'no est.')
except: employment = 'none'
if state_name not in state_values:
bls_responses_textfile.write('\t'.join(state_values) + '\n')
state_values = [state_name] + ['*']*len(aag_occupations)
state_values[occupation_index + 1] = employment
bls_responses_textfile.write('\t'.join(state_values) + '\n')
bls_responses_textfile.close()
'''
# Read responses from the text file and calculate
# the top 5 occupations
# The file is expected to hold a header row followed by one tab-separated
# row per state: the state name, then one employment value per occupation
# code in aag_occupations order.
# The context manager closes the file handle once all lines are read
with open(bls_responses_textfilename) as bls_responses_file:
    bls_responses_textfile = bls_responses_file.readlines()
# Maps state name -> {occupation code: {'employment text': raw value,
#                                       'employment int': numeric value},
#                     'top 5': [occupation codes, highest employment first]}
bls_states_values_db = {}
for line in bls_responses_textfile[1:]:
    # Remove the line ending so it does not stay attached to the last value
    state_data = line.rstrip('\r\n').split('\t')
    state_name = state_data[0]
    if not state_name:
        # Blank row: no state to record
        continue
    bls_states_values_db[state_name] = {}
    # Convert text values into integers
    employment_ints = []
    for occupation_code, employment_text in zip(aag_occupations, state_data[1:]):
        try:
            employment_int = int(employment_text)
        except ValueError:
            # Non-numeric placeholders such as '*', 'none' or 'no est.'
            # count as zero employment for the ranking
            employment_int = 0
        employment_ints.append(employment_int)
        bls_states_values_db[state_name][occupation_code] = {'employment text': employment_text,
                                                             'employment int': employment_int}
    # Calculate top 5 occupations per state: sort (employment, code) pairs in
    # descending order, so ties on employment are broken by the higher code
    ranked = sorted(zip(employment_ints, aag_occupations), reverse=True)
    bls_states_values_db[state_name]['top 5'] = [
        occupation_code for employment_int, occupation_code in ranked[:5]]
    # NOTE(review): the indentation of this loop was lost in the source this
    # was recovered from; it is placed inside the per-state loop so the top 5
    # of every state is printed. Confirm against the intended output.
    for occupation_code in bls_states_values_db[state_name]['top 5']:
        occupation_name = aag_occupations_db[occupation_code]['Main occupation name']
        employment = bls_states_values_db[state_name][occupation_code]['employment text']
        print(occupation_name, ' : ', employment)
# Unbind the imported module names from this namespace. Any statement added
# after this point that uses requests or json will raise NameError; no later
# use is visible in this file.
del requests
del json