Add synthetic and incomplete records

SANDAG · Nov 8, 2024 · e21b87d · e21b87d
1 parent 316de93
commit e21b87d
Show file tree

Hide file tree

Showing 7 changed files with 14,085 additions and 8,474 deletions.
diff --git a/data/interim/survey_data_clean.csv b/data/interim/survey_data_clean.csv
diff --git a/data/processed/data_model_output.csv b/data/processed/data_model_output.csv
diff --git a/data/processed/revised_names.csv b/data/processed/revised_names.csv
@@ -358,3 +358,19 @@ STAY_INFORMED,stay_informed_label
 SURVEY_LANGUAGE_Code_,survey_language
 SURVEY_LANGUAGE,survey_language_label
 SURVEY_LANGUAGE_Other_,survey_language_other
+REFUS_AGE_OBSERVED_Code_,delete
+REFUS_AGE_OBSERVED,delete
+REFUS_GENDER_OBSRVED_Code_,delete
+REFUS_GENDER_OBSRVED,delete
+REFUS_RACETHN_OBSERV_1_,delete
+REFUS_RACETHN_OBSERV_2_,delete
+REFUS_RACETHN_OBSERV_3_,delete
+REFUS_RACETHN_OBSERV_4_,delete
+REFUS_RACETHN_OBSERV_5_,delete
+REFUS_RACETHN_OBSERV_6_,delete
+REFUS_RACETHN_OBSERV_7_,delete
+REFUS_RACETHN_OBSERV_8_,delete
+REFUS_RACETHN_OBSERV_Other_,delete
+Date started,delete
+HAVE_5_MIN_FOR_SURVE_Code_,delete
+HAVE_5_MIN_FOR_SURVE,delete
diff --git a/data_model/data_model.py b/data_model/data_model.py
@@ -583,12 +583,26 @@ class Respondent(PydanticModel):
     Data model for a survey respondent. It includes attributes common to air passengers and employees.
     """
 
-    respondentid: NoneOrNan[int] = Field(
+    respondentid: Union[int,str] = Field(
         ..., description="Unique identifier for the respondent")
     """
     Unique identifier for the respondent.
     """
 
+    record_type_synthetic: bool =  Field(
+        ..., description = "True if the record is synthetically generated"
+    )
+    """
+    True if the record is synthetically generated
+    """
+
+    is_completed: bool = Field(
+        ..., description = "True if the record is complete"
+    )
+    """
+    True if the record is complete
+    """
+
     date_completed: NoneOrNanString[datetime] = Field(
         ..., description = "Date and time when respondent completed the survey"
     )

diff --git a/data_model/enums.py b/data_model/enums.py
@@ -928,6 +928,7 @@ class ActivityType(IntEnum):
     CONVENTION_CENTER = 4
     OTHER_BUSINESS = 5
     OTHER_RESIDENCE = 6 
+    SAN_DIEGO_AIRPORT = 7
     OTHER = 98
     REFUSED = 99
 

diff --git a/data_model/utils.py b/data_model/utils.py
@@ -5,6 +5,8 @@
 from collections import defaultdict
 from typing import Any, get_origin, get_args, Annotated, Optional
 from enum import Enum, IntEnum
+import pandas as pd
+import enums as e
 
 def extract_base_type(typ):
     """
@@ -104,3 +106,60 @@ def military_to_clock(military_time):
 
     # Format the output string
     return f"{clock_hours:02d}:{minutes:02d} {period}"
+
+
+def add_synthetic_records(df):
+    """
+    Append a synthetic arriving-passenger record for every completed departing-passenger record.
+
+    Each synthetic record starts as a copy of its source row, so sociodemographic and all
+    other untouched attributes stay the same. Only the trip-direction attributes are
+    reversed: passenger type, inbound/outbound flag, previous/next flight, main/reverse
+    mode, access/egress mode, and the origin/destination activity type, state, city and zip.
+
+    Args:
+        df: Survey responses, one row per respondent. It must contain the columns read
+            below (``passenger_type``, ``is_completed``, ``respondentid``, the mode columns
+            and the origin/destination columns); a missing column raises ``KeyError`` when
+            a row that needs it is processed.
+
+    Returns:
+        A new DataFrame holding the original rows followed by the synthetic rows, with a
+        fresh 0..n-1 index. ``df`` itself is not modified.
+    """
+    # Column pairs whose values trade places when the direction of the trip is reversed.
+    swapped_columns = (
+        ('previous_flight_origin', 'next_flight_destination'),
+        ('access_mode', 'egress_mode'),
+        ('origin_activity_type', 'destination_activity_type'),
+        ('origin_activity_type_other', 'destination_activity_type_other'),
+        ('origin_state', 'destination_state'),
+        ('origin_city', 'destination_city'),
+        ('origin_zip', 'destination_zip'),
+    )
+
+    synthetic_records = []
+    for _, row in df.iterrows():
+        # Only completed departing-passenger responses get a synthetic counterpart.
+        # `== True` is deliberate: a missing (NaN) flag compares unequal and is skipped.
+        if row['passenger_type'] != e.PassengerType.DEPARTING or not (row['is_completed'] == True):  # noqa: E712
+            continue
+
+        synthetic_record = row.copy()
+        synthetic_record['respondentid'] = f"syn-{row['respondentid']}"
+        synthetic_record['passenger_type'] = e.PassengerType.ARRIVING
+        # Flip the direction code: 1 becomes 2, anything else (including missing) becomes 1.
+        synthetic_record['inbound_or_outbound'] = 2 if row['inbound_or_outbound'] == 1 else 1
+
+        # Always read from the untouched source row so the swaps cannot interfere.
+        for first, second in swapped_columns:
+            synthetic_record[first], synthetic_record[second] = row[second], row[first]
+
+        # Main mode comes from the reported reverse mode when there is one, otherwise
+        # from the predicted reverse mode. A missing value (NaN) is truthy in Python,
+        # so it has to be excluded explicitly or it would be taken as "reported".
+        reverse_mode = row['reverse_mode']
+        if pd.notna(reverse_mode) and reverse_mode:
+            synthetic_record['main_mode'] = reverse_mode
+            synthetic_record['reverse_mode'] = row['main_mode']
+        else:
+            synthetic_record['main_mode'] = row['reverse_mode_predicted']
+            synthetic_record['reverse_mode_predicted'] = row['main_mode']
+
+        # NOTE: the to_/from_airport_transit_route_* columns are copied unchanged;
+        # they are not reversed for the synthetic trip.
+
+        # The data model declares this flag as a bool.
+        synthetic_record['record_type_synthetic'] = True
+        synthetic_records.append(synthetic_record)
+
+    if not synthetic_records:
+        # Nothing to append; still hand back a new frame with a fresh index,
+        # matching what the concatenation below produces.
+        return df.reset_index(drop=True)
+
+    synthetic_df = pd.DataFrame(synthetic_records)
+    return pd.concat([df, synthetic_df], ignore_index=True)