Skip to content

Commit

Permalink
Add synthetic and incomplete records
Browse files Browse the repository at this point in the history
  • Loading branch information
vivverma9 committed Nov 8, 2024
1 parent 316de93 commit e21b87d
Show file tree
Hide file tree
Showing 7 changed files with 14,085 additions and 8,474 deletions.
9,000 changes: 5,064 additions & 3,936 deletions data/interim/survey_data_clean.csv

Large diffs are not rendered by default.

12,421 changes: 8,485 additions & 3,936 deletions data/processed/data_model_output.csv

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions data/processed/revised_names.csv
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,19 @@ STAY_INFORMED,stay_informed_label
SURVEY_LANGUAGE_Code_,survey_language
SURVEY_LANGUAGE,survey_language_label
SURVEY_LANGUAGE_Other_,survey_language_other
REFUS_AGE_OBSERVED_Code_,delete
REFUS_AGE_OBSERVED,delete
REFUS_GENDER_OBSRVED_Code_,delete
REFUS_GENDER_OBSRVED,delete
REFUS_RACETHN_OBSERV_1_,delete
REFUS_RACETHN_OBSERV_2_,delete
REFUS_RACETHN_OBSERV_3_,delete
REFUS_RACETHN_OBSERV_4_,delete
REFUS_RACETHN_OBSERV_5_,delete
REFUS_RACETHN_OBSERV_6_,delete
REFUS_RACETHN_OBSERV_7_,delete
REFUS_RACETHN_OBSERV_8_,delete
REFUS_RACETHN_OBSERV_Other_,delete
Date started,delete
HAVE_5_MIN_FOR_SURVE_Code_,delete
HAVE_5_MIN_FOR_SURVE,delete
16 changes: 15 additions & 1 deletion data_model/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,12 +583,26 @@ class Respondent(PydanticModel):
Data model for a survey respondent. It includes attributes common to air passengers and employees.
"""

respondentid: NoneOrNan[int] = Field(
respondentid: Union[int,str] = Field(
..., description="Unique identifier for the respondent")
"""
Unique identifier for the respondent.
"""

record_type_synthetic: bool = Field(
..., description = "True if the record is synthetically generated"
)
"""
True if the record is synthetically generated
"""

is_completed: bool = Field(
..., description = "True if the record is complete"
)
"""
True if the record is complete
"""

date_completed: NoneOrNanString[datetime] = Field(
..., description = "Date and time when respondent completed the survey"
)
Expand Down
1 change: 1 addition & 0 deletions data_model/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,7 @@ class ActivityType(IntEnum):
CONVENTION_CENTER = 4
OTHER_BUSINESS = 5
OTHER_RESIDENCE = 6
SAN_DIEGO_AIRPORT = 7
OTHER = 98
REFUSED = 99

Expand Down
59 changes: 59 additions & 0 deletions data_model/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from collections import defaultdict
from typing import Any, get_origin, get_args, Annotated, Optional
from enum import Enum, IntEnum
import pandas as pd
import enums as e

def extract_base_type(typ):
"""
Expand Down Expand Up @@ -104,3 +106,60 @@ def military_to_clock(military_time):

# Format the output string
return f"{clock_hours:02d}:{minutes:02d} {period}"


def add_synthetic_records(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append a synthetic arriving-passenger record for every completed departing-passenger record.

    Each synthetic record is a copy of the source row, so sociodemographic and
    all other attributes are kept as they are. Only trip-direction attributes
    are changed:

    * ``respondentid`` is prefixed with ``'syn-'``.
    * ``passenger_type`` is set to ``PassengerType.ARRIVING`` and
      ``inbound_or_outbound`` is flipped (1 becomes 2, anything else becomes 1).
    * ``main_mode`` is exchanged with ``reverse_mode`` when the respondent
      reported one, otherwise with ``reverse_mode_predicted``.
    * Flight origin/destination, access/egress mode, activity type and
      state/city/zip columns are exchanged between their origin-side and
      destination-side counterparts.
    * ``record_type_synthetic`` is set to 1.

    Args:
        df: Survey records. Must contain every column named above, plus
            ``is_completed``.

    Returns:
        A new DataFrame holding the original rows followed by the synthetic
        rows, with a fresh 0..n-1 index. ``df`` itself is not modified.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # (origin-side column, destination-side column) pairs whose values trade
    # places in the synthetic record.
    # NOTE: the to/from airport transit route columns are intentionally not
    # part of this table; they are copied unchanged from the source row.
    swapped_columns = (
        ('previous_flight_origin', 'next_flight_destination'),
        ('access_mode', 'egress_mode'),
        ('origin_activity_type', 'destination_activity_type'),
        ('origin_activity_type_other', 'destination_activity_type_other'),
        ('origin_state', 'destination_state'),
        ('origin_city', 'destination_city'),
        ('origin_zip', 'destination_zip'),
    )

    synthetic_records = []
    for _, row in df.iterrows():
        # Explicit comparison with True (rather than truthiness) so that a
        # missing value (NaN is truthy) is not treated as "completed".
        is_completed = row['is_completed'] == True  # noqa: E712
        if row['passenger_type'] != e.PassengerType.DEPARTING or not is_completed:
            continue

        synthetic_record = row.copy()

        # Identify the record and flip its direction.
        synthetic_record['respondentid'] = 'syn-' + str(row['respondentid'])
        synthetic_record['inbound_or_outbound'] = 2 if row['inbound_or_outbound'] == 1 else 1
        synthetic_record['passenger_type'] = e.PassengerType.ARRIVING

        # Flip the main and reverse modes. A missing reverse_mode (NaN/None/NA)
        # must fall through to the predicted reverse mode; plain truthiness
        # would treat NaN as "present" and copy NaN into main_mode.
        reverse_mode = row['reverse_mode']
        if pd.notna(reverse_mode) and bool(reverse_mode):
            synthetic_record['main_mode'] = reverse_mode
            synthetic_record['reverse_mode'] = row['main_mode']
        else:
            synthetic_record['main_mode'] = row['reverse_mode_predicted']
            synthetic_record['reverse_mode_predicted'] = row['main_mode']

        # Exchange every origin-side attribute with its destination-side twin,
        # always reading from the untouched source row.
        for origin_column, destination_column in swapped_columns:
            synthetic_record[origin_column] = row[destination_column]
            synthetic_record[destination_column] = row[origin_column]

        synthetic_record['record_type_synthetic'] = 1
        synthetic_records.append(synthetic_record)

    if not synthetic_records:
        # Nothing to append: return a re-indexed copy, mirroring the
        # ignore_index=True result of the concat below without concatenating
        # an empty frame.
        return df.reset_index(drop=True)

    synthetic_df = pd.DataFrame(synthetic_records)

    # Original rows first, synthetic rows after, with a fresh index.
    return pd.concat([df, synthetic_df], ignore_index=True)
Loading

0 comments on commit e21b87d

Please sign in to comment.