Merge pull request #10 from HumanCompatibleAI/dataimprovements
Dataimprovements
JACProjec authored Mar 29, 2024
2 parents 6cf1331 + d4ebf19 commit 14a1066
Showing 5 changed files with 179,752 additions and 117,166 deletions.
6 changes: 4 additions & 2 deletions sample_data/README.md
@@ -73,7 +73,8 @@ process of fixing each of these:

 - **Twitter:** Missing threads and views. Currently includes
   quotes as a substitute. However, all of these appear blank. We
-  have subsequently simulated threads with random assignment.
+  have subsequently simulated threads with random assignment. In addition,
+  we have randomly simulated engagement metrics.

 1. Our data is also from various times, but should be fairly general
    thematically, i.e. not leaning towards particular demographics.
@@ -97,7 +98,8 @@ Tweets from Jan 1st 2023

 - Unfortunately this set does not contain views for individual tweets.
   We will assess additional sources for this information. Additionally,
-  lots of quotes, replies, reposts are all 0
+  because lots of quotes, replies and reposts are all 0, we have simulated
+  random engagement metrics based on follower counts.

 Sourced from
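The simulated threads mentioned above are produced by assign_parents in sample_data/data_pull.py (the function named in the hunk header below). This diff does not show its body, so the following is only a minimal sketch of random parent assignment, assuming a pandas DataFrame with id and created_at columns; the name assign_parents_sketch, the thread_frac rate and the seed are all illustrative, not the repo's actual values.

import numpy as np
import pandas as pd

def assign_parents_sketch(sample: pd.DataFrame, thread_frac: float = 0.3, seed: int = 0) -> pd.DataFrame:
    # Illustrative only: mark a random fraction of tweets as replies by
    # pointing their parent_id at some strictly earlier tweet.
    rng = np.random.default_rng(seed)
    sample = sample.sort_values('created_at').reset_index(drop=True)
    sample['parent_id'] = ''
    candidates = sample.index[1:]  # the earliest tweet has nothing to reply to
    chosen = rng.choice(candidates, size=int(thread_frac * len(candidates)), replace=False)
    for i in chosen:
        parent = rng.integers(0, i)  # any strictly earlier tweet may be the parent
        sample.loc[i, 'parent_id'] = sample.loc[parent, 'id']
    return sample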
36 changes: 31 additions & 5 deletions sample_data/data_pull.py
@@ -185,19 +185,45 @@ def assign_parents(sample):

 # Our structure for tweets. Without 'posts' as a concept, only one structure is needed
 transformed_data = []
+
+# Randomisation of engagement metrics (current data is majority zero, this will change if we come across improved data)
+# Randomisation will combine a proportional amount of follower count with a random noise variable on top
+reply_seed = 1
+repost_seed = 2
+like_seed = 3
+quote_seed = 4
+noise_std = 1
+
+np.random.seed(reply_seed)
+sample['simulated_replies'] = round((sample['followers_count'] * 0.001) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+np.random.seed(repost_seed)
+sample['simulated_reposts'] = round((sample['followers_count'] * 0.005) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+np.random.seed(like_seed)
+sample['simulated_likes'] = round((sample['followers_count'] * 0.01) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+np.random.seed(quote_seed)
+sample['simulated_quotes'] = round((sample['followers_count'] * 0.005) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+
 # Grab relevant fields
 for _, row in sample.iterrows():
     transformed_row = {
         "id": row['id'],
+        "parent_id": row.get('parent_id', ''),
         "text": row['text'],
+        "expanded_url": row.get('expanded_url',None),
         "author_name_hash": row['author_id'],
-        "type": "tweet",
+        "type": 'tweet',
         "created_at": row['created_at'].strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(row['created_at']) else '',
         "engagements": {
-            'reply': row.get('reply_count', 0),
-            'repost': row.get('retweet_count', 0),
-            'like': row.get('like_count', 0),
-            'quote': row.get('quote_count', 0)
+            'reply': row.get('simulated_replies', 0),
+            'repost': row.get('simulated_reposts', 0),
+            'like': row.get('simulated_likes', 0),
+            'quote': row.get('simulated_quotes', 0)
         },
+        "user_metrics": {
+            "followers": row.get('followers_count', 0),
+            "following": row.get('following_count', 0),
+            "tweet_count": row.get('tweet_count', 0),
+            "listed_count": row.get('listed_count', 0),
+        }
     }
     transformed_data.append(transformed_row)
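As a sanity check on the rates above: a user with 10,000 followers gets on average 0.001 × 10,000 = 10 simulated replies, 50 reposts, 100 likes and 50 quotes, so the N(0, 1) noise only materially affects low-follower accounts. The four near-identical assignments could also be collapsed into one seeded helper; the sketch below is a possible refactor, not the repo's code, and note that numpy's newer Generator API draws different numbers than the legacy np.random.seed calls in the diff.

import numpy as np
import pandas as pd

# (seed, rate) per metric, copied from the diff above.
SIMULATION_PARAMS = {
    'simulated_replies': (1, 0.001),
    'simulated_reposts': (2, 0.005),
    'simulated_likes': (3, 0.01),
    'simulated_quotes': (4, 0.005),
}

def simulate_engagement(sample: pd.DataFrame, noise_std: float = 1.0) -> pd.DataFrame:
    # Each metric is a proportional share of follower count plus Gaussian
    # noise, rounded, clipped at zero, and cast to int.
    for column, (seed, rate) in SIMULATION_PARAMS.items():
        rng = np.random.default_rng(seed)  # fixed seed keeps runs reproducible
        noise = rng.normal(loc=0, scale=noise_std, size=len(sample))
        sample[column] = (sample['followers_count'] * rate + noise).round().clip(lower=0).astype(int)
    return sample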
29 changes: 26 additions & 3 deletions sample_data/preprocessing.py
@@ -151,7 +151,7 @@ def static_hashed(x):

 # Lastly, we export our data out
 filtered_facebook = merged[['id','parent_id','all_post_ids','text','author_name_hash','type','created_at','like', 'love', 'haha', 'wow', 'sad', 'angry', 'comments','shares']]
-filtered_facebook.to_csv(os.path.join(script_dir,'facebook_data/rrocfbsed/filtered_comment_post.csv'), index=False)
+# filtered_facebook.to_csv(os.path.join(script_dir,'facebook_data/rrocfbsed/filtered_comment_post.csv'), index=False)


 # TWITTER PREPROCESSING
@@ -167,16 +167,39 @@ def static_hashed(x):
 with open(file_path, 'r', encoding='utf-8') as json_file:
     for line in json_file:
         json_obj = json.loads(line.strip())
-        if 'data' in json_obj:
+        if 'data' in json_obj and 'includes' in json_obj:
             data_part = json_obj['data']
+            includes = json_obj['includes']
+
+            # Preprocess ID, author_id, and created_at
             if 'id' in data_part:
                 data_part['id'] = hashed(data_part['id'])
             if 'author_id' in data_part:
                 data_part['author_id'] = hashed(data_part['author_id'])
             if 'created_at' in data_part:
                 created = datetime.strptime(data_part['created_at'], '%Y-%m-%dT%H:%M:%S.%fZ')
                 data_part['created_at'] = str(created.strftime('%Y-%m-%d %H:%M:%S'))
-            jsons.append(data_part)
+
+            # Check for expanded_url
+            entities = data_part.get('entities', {})
+            urls = entities.get('urls', [])
+            expanded_url = urls[0].get('expanded_url', None) if urls else None
+
+            # Extracting user metrics from the first user in includes.users
+            users = includes.get('users', [])
+            user_metrics = users[0].get('public_metrics', {}) if users else None
+
+            # Only append if expanded_url is not None or user_metrics is not None and has content
+            if expanded_url or (user_metrics and any(user_metrics.values())):
+                data_part['expanded_url'] = expanded_url
+                if user_metrics:
+                    data_part.update({
+                        'followers_count': user_metrics.get('followers_count', 0),
+                        'following_count': user_metrics.get('following_count', 0),
+                        'tweet_count': user_metrics.get('tweet_count', 0),
+                        'listed_count': user_metrics.get('listed_count', 0),
+                    })
+                jsons.append(data_part)

 with open(os.path.join(script_dir,'twitter_data/processed/filtered_jan_2023.json'), 'w', encoding='utf-8') as output_file:
     json.dump(jsons, output_file, indent=4)
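For reference, the input this loop expects is one JSON object per line in the Twitter API v2 format with user expansions. The field names below match what the code reads; the concrete values are invented for illustration.

import json

line = json.dumps({
    "data": {
        "id": "1609971234567890123",  # hashed by the loop above
        "author_id": "2244994945",  # hashed as well
        "created_at": "2023-01-01T12:34:56.000Z",
        "text": "example tweet",
        "entities": {"urls": [{"expanded_url": "https://example.com/article"}]},
    },
    "includes": {
        "users": [{"public_metrics": {
            "followers_count": 120,
            "following_count": 80,
            "tweet_count": 4100,
            "listed_count": 3,
        }}],
    },
})
json_obj = json.loads(line)
assert 'data' in json_obj and 'includes' in json_obj  # passes the new filter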