-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCleanMPData_KLowell.py
204 lines (202 loc) · 10.4 KB
/
CleanMPData_KLowell.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 29 12:23:59 2017
@author: Kim Earl Lowell
"""
#%%
# KL code.This will clean some of the Martin's Point claims data.
################ concatchars #####################
# Function concatchars checks for non-numeric information in cells where it
# appears that textual information contained a comma -- e.g., "Statistical,
# non-specific chest pain." It concatenates the two fields separating them
# by a slash. It checks for strings that contain no digits.
def concatchars(dfout,i,subjcol):
# Change things if the cell is not null and it contains only characters.
if pd.notnull(dfout.iloc[i,subjcol]) and not any(char.isdigit()
for char in str(dfout.iloc[i,subjcol])):
dfout.iloc[i,subjcol-1]=str(dfout.iloc[i,subjcol-1])+'/'+\
str(dfout.iloc[i,subjcol])
for j in range(subjcol+1,dfout.shape[1]-1):
dfout.iloc[i,j-1]=dfout.iloc[i,j]
dfout.iloc[i,dfout.shape[1]-1]=''
return dfout
################ concatmixed #####################
# Function concatchars checks if a string as any letters at all in cells where
# it appears that textual information contained a comma -- e.g., "Pneumonia, 13
# cc drug A". It concatenates the two fields separating them
# by a slash. It checks for strings that contain no digits.
def concatmixed(dfout,i,subjcol):
# Change things if the cell is not null and it contains only characters.
if pd.notnull(dfout.iloc[i,subjcol]) and any(char.isalpha()
for char in str(dfout.iloc[i,subjcol])):
dfout.iloc[i,subjcol-1]=str(dfout.iloc[i,subjcol-1])+'/'+\
str(dfout.iloc[i,subjcol])
for j in range(subjcol+1,dfout.shape[1]-1):
dfout.iloc[i,j-1]=dfout.iloc[i,j]
dfout.iloc[i,dfout.shape[1]-1]=''
return dfout
############## addleadzero #####################
# Function addleadzero adds a zero to the front of zip codes that had the
# zero stripped off (probably when the csv file was created).
def addleadzero(dfout,i,subjcol):
print(i,type(dfout.iloc[i,subjcol]))
print(len(dfout.iloc[i,subjcol]))
if pd.notnull(dfout.iloc[i,subjcol]) and len(dfout.iloc[i,subjcol])<5:
dfout.iloc[i,subjcol]='0'+dfout.iloc[i,subjcol]
#################################################
############### tominutes #######################
# Function tominutes converts time in seconds to minutes and seconds
def tominutes(timeinseconds):
# minutes, seconds = divmod(timeinseconds, 60)
# hours, minutes = divmod(minutes, 60)
hours=int(timeinseconds/3600)
minutes=int((timeinseconds- hours*3600)/60)
seconds=int(timeinseconds-hours*3600-minutes*60)
return hours,minutes,seconds
#################################################
############## MAIN BODY OF PROGRAM #############
import pandas as pd
import time
# Set up input and output files and read input file.
#inpath = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Test_Kim/"
#infile='Claims_201401_201412_Kim_1053.csv'
inpath = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Raw/"
infile='Claims_201401_201412.csv'
#infile='Claims_201501_201512.csv'
#infile='Claims_201501_201512.csv'
#infile='Claims_201701_201712.csv'
outfile = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Clean/CleanClaims_201401_201412.csv"
#outfile = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Clean/CleanClaims_201401_201412.csv"
#outfile = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Clean/CleanClaims_201501_201512.csv"
#outfile = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Clean/CleanClaims_201501_201512.csv"
#outfile = "C:/Analytics/DATA911/Martins_Point/MP_Data/MP_Clean/CleanClaims_201701_201712.csv"
# Set up counters, timers, output dataframe, etc.
start_time=time.time()
chunksize=10000
chunknumb=0
#colnames=['MembID','Insurance','YesNo','MembState','Age','Gender','MDState',
# 'Diag1','Diag2','Diag3','Diag4','Cost1','Cost2','Cost3']
#cols=[0,2,6,7,11,12,21,39,41,43,45,51,52,53]
dfnewout=pd.DataFrame()
# Ignore initial space in a field rather than stripping it later. And for some
# reason, using nrows=x causes changes in the type of variable created. The
# dtype option changes that. names=range(57) forces the reading of 57 columns
# even though Col 57 is sparsely populated. Read in chunks.
for dfout in pd.read_csv(inpath+infile, chunksize=chunksize, header=None,
skipinitialspace=True,index_col=False,
dtype={19:object, 22:object, 27:object,
28:object, 47:object, 53:object, 55:object, 56:object,
57:object,58:object,59:object}, names=range(59)):
chunknumb=chunknumb+1
startrow=(chunknumb-1)*chunksize+1
print('\nProcess chunk',chunknumb,' Starting row:',startrow)
currenttime=time.time()
elapsedtime=currenttime-start_time
hours,mins,seconds=tominutes(elapsedtime)
millionrowtime=elapsedtime/startrow*1000000
mhours,mmins,mseconds=tominutes(millionrowtime)
print(' Elapsed time(h.m.s):',int(hours),':',
int(mins),':',int(seconds),'\n Million row time:',int(mhours),
':',int(mmins),':',int(mseconds))
#dfin = pd.read_csv(inpath+infile,header=None,skipinitialspace=True,nrows=1000,
# dtype={19:object,22:object,27:object,28:object,47:object,
# 53:object,55:object,56:object,57:object},
# names=range(57))
# Loop through dataframe fixing anomalies as we go.
droplist=[]
for i in range(dfout.shape[0]):
# Because I drop some rows (based on missing values mjessing up the data),
# I get errors. Test for this error and break out of loop if it does not work.
try:
dummyvar=pd.notnull(dfout.iloc[i,25])
except:
break
# If Col 26 (25 in Python) contains a first name instead of a blank, an extra
# column is present in the data base. Overwrite Col 29 (28 in Python) which
# seems to be the added column. PATCH after AND: It was also discovered that
# if Col 30 (usually a zip code) is empty, then things have not been shifted.
if pd.notnull(dfout.iloc[i,25]) and pd.notnull(dfout.iloc[i,30]):
# Col 27 is empty. Overwrite. Note that there is now a superfluous column that
# will have to be deleted.
for j in range(28,dfout.shape[1]):
dfout.iloc[i,j-1]=dfout.iloc[i,j]
# If column 29 is numbers rather than a state abbreviation, there is an error.
# Move from Col Q (16) to the end of all variables right one space. Start
# with rightmost variables
if pd.notnull(dfout.iloc[i,29]) and str(dfout.iloc[i,29]).isdigit():
for j in range(dfout.shape[1]-1,16,-1):
dfout.iloc[i,j]=dfout.iloc[i,j-1]
# PATCH: Unfixable problems associated with claims for which Servicing_prov_zip
# is blank. ***** Add row to drop list for dropping later.
# ********Ignore the row/lose the claim and get next claim.
# Shift rows 17 to 26 right one column. Reset index so rows are not skipped.
# if i>510 and i < 540:
# print(i,len(str(dfout.iloc[i,30])),dfout.iloc[i,29:33])
if pd.isnull(dfout.iloc[i,30]) or len(str(dfout.iloc[i,30])) < 3:
droplist.insert(0,i)
# dfout=dfout.drop(dfout.index[i])
# dfout=dfout.reset_index(drop=True)
# continue
# If column 22 is letters (a state abbreviation), there is an error. Move
# from Col Q (16) to Col 24 right one columns
if pd.notnull(dfout.iloc[i,22]) and not str(dfout.iloc[i,22]).isdigit():
for j in range(17,26):
dfout.iloc[i,j-1]=dfout.iloc[i,j]
dfout.iloc[i,25]=''
# If a number of columns contains text ONLY, the text must be appended to the field
# to the left, and all columns to the right shifted one cell to the left.
# This is prevalent enough to write a function concatcols.
# Column 38 (Excel col AM) is part of a diagnosis.
subjcol=38
dfout=concatchars(dfout,i,subjcol)
# Columns 41, 43, 45, and 47 are also part of a diagnosis.
subjcol=41
dfout=concatchars(dfout,i,subjcol)
subjcol=43
dfout=concatchars(dfout,i,subjcol)
subjcol=45
dfout=concatchars(dfout,i,subjcol)
subjcol=47
dfout=concatchars(dfout,i,subjcol)
# For Col 51, there may be 3 columns that are part of a procedure description.
# The part to concatenate may contain letters and numbers, so a different
# function is required.
subjcol=51
for k in range(3):
dfout=concatmixed(dfout,i,subjcol)
# Some cases could not be fixed, this is indicated by column 54 being empty.
# If this occurs, add the row to the drop list.
if pd.isnull(dfout.iloc[i,54]):
droplist.insert(0,i)
# Delete right-most columns that have become superfluous and add the new
# chunk to the dataframe.
dfout=dfout.drop(dfout.columns[55:59],axis=1)
# Drop rows for which there were problems.
for row in droplist:
dfout=dfout.drop(dfout.index[[row]])
# Column 21 is blank. If needed, it should be labelled Servicing_prov_address2
# and be placed after Col 20 Servicing_prov_address1
# dfout=dfout.drop(dfout.columns[21],axis=1)
dfnewout=pd.concat([dfnewout,dfout],axis=0)
####################################
# if chunknumb >= 5:
# break
####################################
# Write the cleaned and subsetted dataframe to a csv file.
colnames=['Member_no_NPHI','Full_name_NPHI','LOB','plan_code','current_plan_desc',\
'enroll_status','Overlap_flag','phys_state','zip','DOB_NPHI','DOB_Year',\
'New_age','gender','Death_Year','PCP_id',\
'PCP_prov_TYPE','PCP_prov_SPECIALTY',\
'PCP_contract_affiliation','PCP_address1','PCP_address2','PCP_city',\
'PCP_state','PCP_zip','Health_System_2',\
'Servicing_prov_LASTNAME','Servicing_prov_FIRSTNAME',\
'Servicing_prov_degree','Servicing_prov_address1',\
'Servicing_prov_city','Servicing_prov_state','Servicing_prov_zip',\
'de_id_claim_number','Service_date_full_date','paid_denied_flag',\
'status_level_1','status_level_2','status_level_3','Diag1_AHRQ_long',\
'Diag1_AHRQ_short','Diag1','Diag1_desc','Diag2','Diag2_desc',\
'Diag3','Diag3_desc','Diag4','Diag4_desc','DRG_code','DRG_desc',\
'Procedure_code','Procedure_desc',\
'Net_amt','Billed_amt','Allowed_amt','Service_Yearmo']
dfnewout.columns=colnames
dfnewout.to_csv(path_or_buf=outfile, index=False)