Skip to content

Commit

Permalink
update parsed courses with sentiment info
Browse files Browse the repository at this point in the history
  • Loading branch information
pybeebee committed Mar 31, 2024
1 parent 959b227 commit e04df39
Show file tree
Hide file tree
Showing 5 changed files with 741,095 additions and 37,392 deletions.
134 changes: 134 additions & 0 deletions backend/port_sentiment_info_to_parsed_courses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import os
import json
import argparse
import subprocess
import os
import json
import shutil

"""
This file runs sentiment classification on CourseTable data as follows:
Args
- target_data_path: Path to the desired folder containing .json files for courses that must be updated with sentiment analysis resulst from specified years
- sentiment_data_path: Path to folder containing .json files equipped with sentiment analysis results
- years_to_port: List containing integers representing what years of sentiment info to include in the given .json files for parsed courses
Processing
- Consider each .json file in 'data_path'
- For each item (representing a single course) in the .json:
- Retrieve the relevant course evaluation files for the year(s) specified
- Store result as a new field(s) in the json course_objectect
Result
- Updated .json files with new sentiment field for each course's json course_objectect, written in-place.
Notes
- This file assumes sentiment_classification.py has already been run on the course evaluation data.
"""

# Main function to loop over all JSON course_objectects for the given year
def main(args):

# Look at all parsed course files
for filename in os.listdir(args.target_data_path):
# import ipdb; ipdb.set_trace()

# Consider each file for the relevant years & load
if filename.endswith(".json") and int(filename[:4]) in args.years_to_port:
print("On year", filename)

season_file_path = os.path.join(args.target_data_path, filename)
with open(season_file_path, 'r') as f:
season_course = json.load(f)

# Consider each course in the relevant year/season
count = 0
for course_object in season_course:
reviews_missing = True

season_code = course_object.get("season_code", "")
crns = course_object.get("crns")
count += 1

# Identify the CRN of the course
for crn in crns:
print(f"On: {season_code}-{crn} / index {count}")

# Inspect if there are any json entries for that course in the specified season
grep_cmd = f"ls {args.sentiment_data_path} | grep {season_code}-{crn}"
try: # if so, write the sentiment data to the file
grep_output = subprocess.check_output(grep_cmd, shell=True).decode().strip().split("\n")
season_filename = grep_output[0]
sentiment_file_path = os.path.join(args.sentiment_data_path, season_filename)
with open(sentiment_file_path, 'r') as sentiment_file:
sentiment_json = json.load(sentiment_file)
course_object["sentiment_info"] = sentiment_json["sentiment_info"]
print(f"Finished {season_code}-{crn}")
reviews_missing = False
break
except: # if no matches found, continue
continue
if reviews_missing:
course_object["sentiment_info"] = {
"YC401": {
"sentiment_labels": [],
"sentiment_scores": [],
"sentiment_counts": {},
"sentiment_distribution": {},
"sentiment_overall": ""
},
"YC403": {
"sentiment_labels": [],
"sentiment_scores": [],
"sentiment_counts": {},
"sentiment_distribution": {},
"sentiment_overall": ""
},
"YC409": {
"sentiment_labels": [],
"sentiment_scores": [],
"sentiment_counts": {},
"sentiment_distribution": {},
"sentiment_overall": ""
},
"final_label": "",
"final_count": 0,
"final_proportion": 0.0,
"final_counts": {},
"final_distribution": {}
}

with open(season_file_path, 'w') as f:
json.dump(season_course, f, indent=4)


############################################################
############ RUN SENTIMENT CLASSIFICATION HERE #############
############################################################

if __name__ == "__main__":

parser = argparse.ArgumentParser()

# Specify the folder path where JSON files are located
parser.add_argument("--target_data_path",
type=str,
default="data/parsed_courses",
help="Folder where the .json files that need sentiment info copied over are located.")

parser.add_argument("--sentiment_data_path",
type=str,
default="data/course_evals",
help="Folder where the .json files with sentiment info are located.")

parser.add_argument("--years_to_port",
nargs="*", # 0 or more values expected => creates a list
type=int,
default = [2023], # other options: YC401, YC403
help="Specify what years of sentiment info to include in the parsed course data .json files.")

args = parser.parse_args()

main(args)

3 changes: 1 addition & 2 deletions data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,4 @@
data/course_embeddings/
data/course_json_cache/
data/demand_stats/
data/season_courses/
data/parsed_courses/
data/season_courses/
Loading

0 comments on commit e04df39

Please sign in to comment.