Merge pull request #10 from HumanCompatibleAI/dataimprovements
Dataimprovements
JACProjec authored Mar 29, 2024
2 parents 6cf1331 + d4ebf19 commit 14a1066
Showing 5 changed files with 179,752 additions and 117,166 deletions.
6 changes: 4 additions & 2 deletions sample_data/README.md
@@ -73,7 +73,8 @@ process of fixing each of these:

 - **Twitter:** Missing threads and views. Currently includes
   quotes as a substitute. However, all of these appear blank. We
-  have subsequently simulated threads with random assignment.
+  have subsequently simulated threads with random assignment. In addition,
+  we have randomly simulated engagement metrics.

 1. Our data is also from various times, but should be fairly general
    thematically, i.e. not leaning towards particular demographics.
@@ -97,7 +98,8 @@ Tweets from Jan 1st 2023

 - Unfortunately this set does not contain views for individual tweets.
   We will assess additional sources for this information. Additionally,
-  lots of quotes, replies, reposts are all 0
+  because lots of quotes, replies and reposts are all 0, we have simulated
+  random engagement metrics based on follower counts.

 Sourced from
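The simulated threads mentioned above are produced by assign_parents in sample_data/data_pull.py (the function named in the hunk header below). This diff does not show its body, so the following is only a minimal sketch of random parent assignment, assuming a pandas DataFrame with id and created_at columns; the name assign_parents_sketch, the thread_frac rate and the seed are all illustrative, not the repo's actual values.

import numpy as np
import pandas as pd

def assign_parents_sketch(sample: pd.DataFrame, thread_frac: float = 0.3, seed: int = 0) -> pd.DataFrame:
    # Illustrative only: mark a random fraction of tweets as replies by
    # pointing their parent_id at some strictly earlier tweet.
    rng = np.random.default_rng(seed)
    sample = sample.sort_values('created_at').reset_index(drop=True)
    sample['parent_id'] = ''
    candidates = sample.index[1:]  # the earliest tweet has nothing to reply to
    chosen = rng.choice(candidates, size=int(thread_frac * len(candidates)), replace=False)
    for i in chosen:
        parent = rng.integers(0, i)  # any strictly earlier tweet may be the parent
        sample.loc[i, 'parent_id'] = sample.loc[parent, 'id']
    return sample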
36 changes: 31 additions & 5 deletions sample_data/data_pull.py
@@ -185,19 +185,45 @@ def assign_parents(sample):

 # Our structure for tweets. Without 'posts' as a concept, only one structure is needed
 transformed_data = []
+
+# Randomisation of engagement metrics (current data is majority zero, this will change if we come across improved data)
+# Randomisation will combine a proportional amount of follower count with a random noise variable on top
+reply_seed = 1
+repost_seed = 2
+like_seed = 3
+quote_seed = 4
+noise_std = 1
+
+np.random.seed(reply_seed)
+sample['simulated_replies'] = round((sample['followers_count'] * 0.001) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+np.random.seed(repost_seed)
+sample['simulated_reposts'] = round((sample['followers_count'] * 0.005) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+np.random.seed(like_seed)
+sample['simulated_likes'] = round((sample['followers_count'] * 0.01) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+np.random.seed(quote_seed)
+sample['simulated_quotes'] = round((sample['followers_count'] * 0.005) + np.random.normal(loc=0, scale=noise_std, size=len(sample)), 0).clip(lower=0).astype(int)
+
 # Grab relevant fields
 for _, row in sample.iterrows():
     transformed_row = {
         "id": row['id'],
+        "parent_id": row.get('parent_id', ''),
         "text": row['text'],
+        "expanded_url": row.get('expanded_url',None),
         "author_name_hash": row['author_id'],
-        "type": "tweet",
+        "type": 'tweet',
         "created_at": row['created_at'].strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(row['created_at']) else '',
         "engagements": {
-            'reply': row.get('reply_count', 0),
-            'repost': row.get('retweet_count', 0),
-            'like': row.get('like_count', 0),
-            'quote': row.get('quote_count', 0)
+            'reply': row.get('simulated_replies', 0),
+            'repost': row.get('simulated_reposts', 0),
+            'like': row.get('simulated_likes', 0),
+            'quote': row.get('simulated_quotes', 0)
         },
+        "user_metrics": {
+            "followers": row.get('followers_count', 0),
+            "following": row.get('following_count', 0),
+            "tweet_count": row.get('tweet_count', 0),
+            "listed_count": row.get('listed_count', 0),
+        }
     }
     transformed_data.append(transformed_row)
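As a sanity check on the rates above: a user with 10,000 followers gets on average 0.001 × 10,000 = 10 simulated replies, 50 reposts, 100 likes and 50 quotes, so the N(0, 1) noise only materially affects low-follower accounts. The four near-identical assignments could also be collapsed into one seeded helper; the sketch below is a possible refactor, not the repo's code, and note that numpy's newer Generator API draws different numbers than the legacy np.random.seed calls in the diff.

import numpy as np
import pandas as pd

# (seed, rate) per metric, copied from the diff above.
SIMULATION_PARAMS = {
    'simulated_replies': (1, 0.001),
    'simulated_reposts': (2, 0.005),
    'simulated_likes': (3, 0.01),
    'simulated_quotes': (4, 0.005),
}

def simulate_engagement(sample: pd.DataFrame, noise_std: float = 1.0) -> pd.DataFrame:
    # Each metric is a proportional share of follower count plus Gaussian
    # noise, rounded, clipped at zero, and cast to int.
    for column, (seed, rate) in SIMULATION_PARAMS.items():
        rng = np.random.default_rng(seed)  # fixed seed keeps runs reproducible
        noise = rng.normal(loc=0, scale=noise_std, size=len(sample))
        sample[column] = (sample['followers_count'] * rate + noise).round().clip(lower=0).astype(int)
    return sample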
29 changes: 26 additions & 3 deletions sample_data/preprocessing.py
@@ -151,7 +151,7 @@ def static_hashed(x):

 # Lastly, we export our data out
 filtered_facebook = merged[['id','parent_id','all_post_ids','text','author_name_hash','type','created_at','like', 'love', 'haha', 'wow', 'sad', 'angry', 'comments','shares']]
-filtered_facebook.to_csv(os.path.join(script_dir,'facebook_data/rrocfbsed/filtered_comment_post.csv'), index=False)
+# filtered_facebook.to_csv(os.path.join(script_dir,'facebook_data/rrocfbsed/filtered_comment_post.csv'), index=False)


 # TWITTER PREPROCESSING
@@ -167,16 +167,39 @@ def static_hashed(x):
 with open(file_path, 'r', encoding='utf-8') as json_file:
     for line in json_file:
         json_obj = json.loads(line.strip())
-        if 'data' in json_obj:
+        if 'data' in json_obj and 'includes' in json_obj:
             data_part = json_obj['data']
+            includes = json_obj['includes']
+
+            # Preprocess ID, author_id, and created_at
             if 'id' in data_part:
                 data_part['id'] = hashed(data_part['id'])
             if 'author_id' in data_part:
                 data_part['author_id'] = hashed(data_part['author_id'])
             if 'created_at' in data_part:
                 created = datetime.strptime(data_part['created_at'], '%Y-%m-%dT%H:%M:%S.%fZ')
                 data_part['created_at'] = str(created.strftime('%Y-%m-%d %H:%M:%S'))
-            jsons.append(data_part)
+
+            # Check for expanded_url
+            entities = data_part.get('entities', {})
+            urls = entities.get('urls', [])
+            expanded_url = urls[0].get('expanded_url', None) if urls else None
+
+            # Extracting user metrics from the first user in includes.users
+            users = includes.get('users', [])
+            user_metrics = users[0].get('public_metrics', {}) if users else None
+
+            # Only append if expanded_url is not None or user_metrics is not None and has content
+            if expanded_url or (user_metrics and any(user_metrics.values())):
+                data_part['expanded_url'] = expanded_url
+                if user_metrics:
+                    data_part.update({
+                        'followers_count': user_metrics.get('followers_count', 0),
+                        'following_count': user_metrics.get('following_count', 0),
+                        'tweet_count': user_metrics.get('tweet_count', 0),
+                        'listed_count': user_metrics.get('listed_count', 0),
+                    })
+                jsons.append(data_part)

 with open(os.path.join(script_dir,'twitter_data/processed/filtered_jan_2023.json'), 'w', encoding='utf-8') as output_file:
     json.dump(jsons, output_file, indent=4)
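For reference, the input this loop expects is one JSON object per line in the Twitter API v2 format with user expansions. The field names below match what the code reads; the concrete values are invented for illustration.

import json

line = json.dumps({
    "data": {
        "id": "1609971234567890123",  # hashed by the loop above
        "author_id": "2244994945",  # hashed as well
        "created_at": "2023-01-01T12:34:56.000Z",
        "text": "example tweet",
        "entities": {"urls": [{"expanded_url": "https://example.com/article"}]},
    },
    "includes": {
        "users": [{"public_metrics": {
            "followers_count": 120,
            "following_count": 80,
            "tweet_count": 4100,
            "listed_count": 3,
        }}],
    },
})
json_obj = json.loads(line)
assert 'data' in json_obj and 'includes' in json_obj  # passes the new filter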