data_for_ramtin.py
import numpy as np
import argparse
import os
import time
import traceback
from termcolor import colored, cprint
import pickle
import tqdm
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from utils import get_lexicon, get_channels
from captions import Lexicon, Documents, CaptionIndex
from captions.query import Query
from captions.util import PostingUtil
from scan_face_gender import timeline_gender

# Default context window width in seconds
DEFAULT_CONTEXT = 30


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('index_dir', type=str,
                        help='Directory containing index files')
    parser.add_argument('-c', dest='window_size', type=int,
                        default=DEFAULT_CONTEXT,
                        help='Context window width (default: {})'.format(
                            DEFAULT_CONTEXT))
    return parser.parse_args()

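
# Example invocation (the index path below is illustrative only, not a path
# from the original repository):
#
#   python data_for_ramtin.py /path/to/caption-index -c 30
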

def data_generator(index_dir, window_size, include_stop_words=False):
    """Given an index directory and a window size, output a list of
    (sentence,
     number of men on screen,
     number of women on screen,
     mean number of men on screen,
     mean number of women on screen,
     channel)
    tuples. The sentence can be built with or without stop words.
    """
    # Locate the transcript index files
    doc_path = os.path.join(index_dir, 'docs.list')
    lex_path = os.path.join(index_dir, 'words.lex')
    idx_path = os.path.join(index_dir, 'index.bin')
    # Document id ranges per channel; `channel` selects the range to sample from
    channel = 'MSNBC'
    var = {'CNN': (1, 82529), 'FOX': (82530, 162639), 'MSNBC': (162640, 246922)}
    # Number of documents to sample
    SIZE = 20000
    documents = Documents.load(doc_path)
    lexicon = Lexicon.load(lex_path)
    # Getting words
    words = get_lexicon()
    # Getting channels
    docid_to_channels = get_channels()
    # Selecting stop words
    stop_words = set(list(STOP_WORDS) + ["know", "don", "ve", "say", "way", "said",
                                         "ll", "think", "thing", "don’t", "like",
                                         "got", "people", "going", "talk", "right",
                                         "happened", ">>"])
    start_idx, end_idx = var[channel]
    doc_idxs = list(np.random.choice(np.arange(start_idx, end_idx), SIZE))
    # Create lemmatizer
    stemmer = WordNetLemmatizer()
    # Container for result tuples
    results = []
    with CaptionIndex(idx_path, lexicon, documents) as index:
        for doc_id in tqdm.tqdm(doc_idxs):
            ## Get channel
            channel = docid_to_channels[doc_id]
            count = 1
            # Loading the timeline of faces and their gender
            timeline = timeline_gender(str(doc_id))
            # Skip documents with no detected faces
            if not timeline.shape[0]:
                continue
            # Get all the transcripts
            postings = index.intervals(int(doc_id))
            sentence = ""
            starttime = None
            for p in postings:
                if starttime is None:
                    starttime = p.start
                # Cut the sentence after window_size seconds
                if p.end - starttime > window_size * count:
                    t1 = int(starttime)
                    t2 = min(int(p.end), len(timeline))
                    # Check if any faces appear in the timeframe
                    if not (timeline[t1:t2] == 0).all():
                        male_timeline = timeline[t1:t2, 0]
                        female_timeline = timeline[t1:t2, 1]
                        results.append((sentence,
                                        np.sum(male_timeline),
                                        np.sum(female_timeline),
                                        np.mean(male_timeline),
                                        np.mean(female_timeline),
                                        channel))
                    # Start a new sentence
                    count += 1
                    starttime = p.end
                    sentence = ""
                # Get words in posting
                tokens = index.tokens(0, p.idx, p.len)
                if not tokens:
                    continue
                for token in tokens:
                    # Getting corresponding word
                    word = words[token]
                    # Add the word if we keep stop words or if it is not a stop word
                    if include_stop_words or (word not in stop_words and len(word) > 1):
                        stemmed_word = stemmer.lemmatize(word)
                        sentence += stemmed_word + " "
    return results


## DEPRECATED: returns the first five fields of each tuple (drops the channel)
def gen(index_dir, silent, window_size):
    return [x[0:5] for x in data_generator(index_dir, window_size, False)]


if __name__ == '__main__':
    args = get_args()
    for x in data_generator(args.index_dir, args.window_size, True):
        print(x)
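
# Example of calling the generator directly and saving its output to disk
# (the index path and output file name are illustrative, not part of the
# original pipeline):
#
#   results = data_generator('/path/to/caption-index', DEFAULT_CONTEXT,
#                            include_stop_words=False)
#   with open('sentence_gender_data.p', 'wb') as f:
#       pickle.dump(results, f)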