Update sentiment data for summer terms & update all parsed info
pybeebee committed Apr 9, 2024
1 parent e04df39 commit a587954
Showing 448 changed files with 1,511,466 additions and 70,855 deletions.
268 changes: 268 additions & 0 deletions backend/sentiment_classification_for_summer_courses.py
@@ -0,0 +1,268 @@
import os
import json
import argparse
from typing import List, Dict
from transformers import pipeline
from collections import Counter

"""
This file runs sentiment classification on CourseTable data for SUMMER terms as follows:
Args
- data_path: Path to the desired folder containing .json files for courses
- sentiment_input_fields: List containing IDs of the desired review questions in each course's json object to consider for overall sentiment score computation
Processing
- Consider each .json file in 'data_path' (where the term code ends in "02" to indicate summer term, e.g. 202302-20009.json)
- For each item (representing a single course) in the .json:
- Retrieve the relevant review information
- Format information as string
- Pass to sentiment analysis model
- Store result as a new field(s) in the json object
- Perform further processing as specified below, in Sentiment Analysis Details
Sentiment Analysis Details
- Model used: https://huggingface.co/siebert/sentiment-roberta-large-english
- For a given course, apply sentiment analysis to EACH student-written response to the review questions in consideration
- Count up the # of positive & negative labaels
- Compute proportions of each label and use the max proportion one as the true label
- If the positive and negative ratings are close in count (proportion differs by <0.1, set the label to neutral with score 0.5)
- Update the current course's json object with the following:
- For each review question:
- List[str]: sentiment labels for each review
- List[float]: sentiment scores for each review
- Dict[int]: raw counts of pos/neg reviews
- Dict[float]: proportions of pos/neg reviews
- List[str/float]: overall label for the question & float
- Overall:
- str: final sentiment label, computed using fields specified in sentiment_input_fields
- int: raw count for final sentiment label
- float: final sentiment label's proportion score
- Dict[int]: final raw counts of pos/neg reviews, across fields specified in sentiment_input_fields
- Dict[float]: final proportions of pos/neg reviews, across fields specified in sentiment_input_fields
Result
- Updated .json files with new sentiment field for each course's json object, written in-place to the original files
"""

# Load sentiment model for use throughout rest of code
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english", max_length=512, truncation=True)

# Return the most common sentiment label together with its proportion (or count)
def get_most_common(
    stats_dict: Counter,
):
    # if the POSITIVE and NEGATIVE values are within 0.1 of each other
    # (i.e. the proportions are close), set the label to neutral with score 0.5
    if abs(stats_dict['POSITIVE'] - stats_dict['NEGATIVE']) < 0.1:
        return 'neutral', 0.5
    return stats_dict.most_common(1)[0]
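
# A quick illustration of the neutral-threshold rule above, using hypothetical
# proportion Counters (not taken from real course data):
#   get_most_common(Counter({'POSITIVE': 0.52, 'NEGATIVE': 0.48}))  # -> ('neutral', 0.5)
#   get_most_common(Counter({'POSITIVE': 0.80, 'NEGATIVE': 0.20}))  # -> ('POSITIVE', 0.8)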

# Function to perform sentiment analysis
def analyze(
sentiment_input: str,
):
result = sentiment_analysis(sentiment_input)
return result[0]['label'], result[0]['score']
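
# For reference: the transformers pipeline returns a list with one dict per input,
# roughly [{'label': 'POSITIVE', 'score': 0.99}], so analyze("Great course!") would
# yield something like ('POSITIVE', 0.99). The score shown here is illustrative only;
# the siebert model emits POSITIVE/NEGATIVE labels, which the counting below relies on.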

# Obtain sentiment results for students' responses to a given evaluation question for a particular course
def process(
    question: Dict,
):
# If no comments are listed
if question['comments']==[]:
return [], [], Counter(), Counter(), ''

    # Collect per-response sentiment labels and scores
labels = []
scores = []

# perform sentiment classification per response
for comment in question['comments']:
sentiment_label, sentiment_score = analyze(comment)
labels.append(sentiment_label)
scores.append(sentiment_score)

# compute counts per sentiment class & normalize to get distribution
counts = Counter(labels)
distr = Counter(labels)
if 'NEGATIVE' not in counts.keys():
counts['NEGATIVE'] = 0
distr['NEGATIVE'] = 0
if 'POSITIVE' not in counts.keys():
counts['POSITIVE'] = 0
distr['POSITIVE'] = 0

num_labels = len(labels)
for sentiment in distr.keys():
distr[sentiment] /= num_labels

    overall_label = get_most_common(distr)  # (label, proportion) pair with the max proportion

return labels, scores, counts, distr, overall_label
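
# Illustrative (hypothetical) shape of the data handled by process() above: a question
# object looks roughly like {"question_id": "SU109", "comments": ["Loved the course!", "Too fast-paced."]},
# and process() returns the per-comment labels and scores, the Counter of raw counts,
# the normalized label distribution, and the (label, proportion) pair for the question overall.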

# Main function to loop over all JSON files in a folder
def main(args):

    # Identify relevant data files -- keep only .json files for summer terms (term codes ending in "02") & sort them for a consistent ordering
    all_jsons = sorted(sorted([filename for filename in os.listdir(args.data_path) if (filename.endswith(".json") and "02-" in filename)], key=str.lower), key=len)

# Look at all the files
for idx, filename in enumerate(all_jsons):

# Start on specified file index
if idx >= args.start_file_idx:

            # Write the current filename to a log file (in case the run needs to be paused/resumed later)
            with open(os.path.join(args.data_path, "0_num_files_analyzed_so_far.txt"), "w") as f:
                f.write("\nOn file: " + filename)
                f.write("\nCorresponding index: " + str(idx))

# Read the json file for the current course
file_path = os.path.join(args.data_path, filename)
with open(file_path, 'r') as file:
course = json.load(file)

final_counts = Counter()
course['sentiment_info'] = {}

# Check that written reviews are provided for the course
if course['enrollment']['responses'] > 0:

# Retrieve relevant review info
narratives = course['narratives']
for question in narratives:

# What knowledge, skills, and insights did you develop by taking this course?
if 'SU100' in args.sentiment_input_fields and 'SU100' in question['question_id']:
labels, scores, counts, distr, overall_label = process(question)
course['sentiment_info']['SU100'] = {
'sentiment_labels': labels,
'sentiment_scores': scores,
'sentiment_counts': counts,
'sentiment_distribution': distr,
'sentiment_overall': overall_label,
}
final_counts += counts

# What are the strengths and weaknesses of this course and how could it be improved?
elif 'SU103' in args.sentiment_input_fields and 'SU103' in question['question_id']:
labels, scores, counts, distr, overall_label = process(question)
course['sentiment_info']['SU103'] = {
'sentiment_labels': labels,
'sentiment_scores': scores,
'sentiment_counts': counts,
'sentiment_distribution': distr,
'sentiment_overall': overall_label,
}
final_counts += counts

# Would you recommend this course to another student? Please explain.
elif 'SU109' in args.sentiment_input_fields and 'SU109' in question['question_id']:
labels, scores, counts, distr, overall_label = process(question)
course['sentiment_info']['SU109'] = {
'sentiment_labels': labels,
'sentiment_scores': scores,
'sentiment_counts': counts,
'sentiment_distribution': distr,
'sentiment_overall': overall_label,
}
final_counts += counts

                # Edge case: none of the requested review questions had any responses
if final_counts==Counter():
course['sentiment_info']["final_label"] = ''
course['sentiment_info']["final_count"] = 0
course['sentiment_info']["final_proportion"] = 0.
course['sentiment_info']["final_counts"] = Counter()
course['sentiment_info']["final_distribution"] = Counter()

                # Otherwise, record final results: final_count is taken from the raw counts
                # (where the neutral rule only fires on an exact positive/negative tie), while
                # final_label and final_proportion are recomputed on the normalized distribution
                # so the <0.1 neutral threshold applies to proportions
else:
final_label, final_count = get_most_common(final_counts)
final_distr = final_counts.copy()
num_labels = sum(final_counts.values())
for label in final_distr.keys():
final_distr[label] /= num_labels
final_label, final_proportion = get_most_common(final_distr)

course['sentiment_info']["final_label"] = final_label
course['sentiment_info']["final_count"] = final_count
course['sentiment_info']["final_proportion"] = final_proportion
course['sentiment_info']["final_counts"] = final_counts
course['sentiment_info']["final_distribution"] = final_distr

# Write back the updated data to the same file
with open(file_path, 'w') as file:
json.dump(course, file, indent=4)

else:
course['sentiment_info'] = {}
if 'SU100' in args.sentiment_input_fields:
course['sentiment_info']['SU100'] = {
'sentiment_labels': [],
'sentiment_scores': [],
'sentiment_counts': Counter(),
'sentiment_distribution': Counter(),
'sentiment_overall': '',
}
if 'SU103' in args.sentiment_input_fields:
course['sentiment_info']['SU103'] = {
'sentiment_labels': [],
'sentiment_scores': [],
'sentiment_counts': Counter(),
'sentiment_distribution': Counter(),
'sentiment_overall': '',
}
if 'SU109' in args.sentiment_input_fields:
course['sentiment_info']['SU109'] = {
'sentiment_labels': [],
'sentiment_scores': [],
'sentiment_counts': Counter(),
'sentiment_distribution': Counter(),
'sentiment_overall': '',
}
course['sentiment_info']["final_label"] = ''
course['sentiment_info']["final_count"] = 0
course['sentiment_info']["final_proportion"] = 0.
course['sentiment_info']["final_counts"] = Counter()
course['sentiment_info']["final_distribution"] = Counter()

# Write back the updated data to the same file
with open(file_path, 'w') as file:
json.dump(course, file, indent=4)

print("Finished",filename)


############################################################
############ RUN SENTIMENT CLASSIFICATION HERE #############
############################################################

# Call the main function to process all JSON files
# Example call: python backend/sentiment_classification_for_summer_courses.py --data_path=./data/course_evals --sentiment_input_fields SU109 SU103 SU100
if __name__ == "__main__":

parser = argparse.ArgumentParser()

# Specify the folder path where JSON files are located
parser.add_argument("--data_path",
type=str,
default="data/test_courses",
help="Folder where the .json files for the course data are located.")

parser.add_argument("--start_file_idx",
type=int,
default=0,
help="Specify which index of file to start sentiment analysis on. For when the run is paused and desired to be resumed later.")

parser.add_argument("--sentiment_input_fields",
nargs="*", # 0 or more values expected => creates a list
type=str,
default = ['SU109'], # other options: SU100, SU103
help="Specify what field(s)/attribute(s) of each course to use to compute the overall final sentiment score/label.")

args = parser.parse_args()

main(args)

4 changes: 2 additions & 2 deletions data/course_evals/0_num_files_analyzed_so_far.txt
@@ -1,3 +1,3 @@

-On file: 202303-23567.json
-Corresponding index: 10879
+On file: 202302-30767.json
+Corresponding index: 458
40 changes: 35 additions & 5 deletions data/course_evals/202102-30019.json
@@ -165,10 +165,40 @@
"title": "AFAM S305 01/ENGL S305 01 - African American Autobiography"
},
"sentiment_info": {
"final_label": "",
"final_count": 0,
"final_proportion": 0.0,
"final_counts": {},
"final_distribution": {}
"SU109": {
"sentiment_labels": [
"POSITIVE",
"POSITIVE",
"POSITIVE",
"POSITIVE"
],
"sentiment_scores": [
0.9981436729431152,
0.9987603425979614,
0.9981896281242371,
0.9988548755645752
],
"sentiment_counts": {
"POSITIVE": 4,
"NEGATIVE": 0
},
"sentiment_distribution": {
"POSITIVE": 1.0,
"NEGATIVE": 0.0
},
"sentiment_overall": [
"POSITIVE",
1.0
]
},
"final_label": "POSITIVE",
"final_count": 4,
"final_proportion": 1.0,
"final_counts": {
"POSITIVE": 4
},
"final_distribution": {
"POSITIVE": 1.0
}
}
}
40 changes: 35 additions & 5 deletions data/course_evals/202102-30020.json
@@ -165,10 +165,40 @@
"title": "AFAM S305 01/ENGL S305 01 - African American Autobiography"
},
"sentiment_info": {
"final_label": "",
"final_count": 0,
"final_proportion": 0.0,
"final_counts": {},
"final_distribution": {}
"SU109": {
"sentiment_labels": [
"POSITIVE",
"POSITIVE",
"POSITIVE",
"POSITIVE"
],
"sentiment_scores": [
0.9981436729431152,
0.9987603425979614,
0.9981896281242371,
0.9988548755645752
],
"sentiment_counts": {
"POSITIVE": 4,
"NEGATIVE": 0
},
"sentiment_distribution": {
"POSITIVE": 1.0,
"NEGATIVE": 0.0
},
"sentiment_overall": [
"POSITIVE",
1.0
]
},
"final_label": "POSITIVE",
"final_count": 4,
"final_proportion": 1.0,
"final_counts": {
"POSITIVE": 4
},
"final_distribution": {
"POSITIVE": 1.0
}
}
}