From 69ded01a58fc2b247ba7835cfe4eea623be1ddef Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 5 Nov 2021 16:17:05 -0500 Subject: [PATCH 1/9] add class --- pliers/extractors/text.py | 114 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 8735ae231..ad65a1f0a 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -855,3 +855,117 @@ def _extract(self, stims): return ExtractorResult(word_counter, stims, self, features=self.features, onsets=onsets, durations=durations) + +class GPTForwardLMExtractor(ComplexTextExtractor): + ''' Returns predictions for GPT forward language modeling + Args: + pretrained_model (str): A string specifying which transformer + model to use. Can be any gpt-class model, either pretrained or + https://huggingface.co/transformers/pretrained_models.html + or path to custom model. + tokenizer (str): Type of tokenization used in the tokenization step. + If different from model, out-of-vocabulary tokens may be treated + as unknown tokens. + model_class (str): Specifies model type. Must be one of 'AutoModel' + (encoding extractor) or 'AutoModelWithLMHead' (language model). + These are generic model classes, which use the value of + pretrained_model to infer the model-specific transformers + class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel + or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. + framework (str): name deep learning framework to use. Must be 'pt' + (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + return_input (bool): if True, the extractor returns encoded token + and encoded word as features. + model_kwargs (dict): Named arguments for transformer model. + See https://huggingface.co/transformers/main_classes/model.html + tokenizer_kwargs (dict): Named arguments for tokenizer. + See https://huggingface.co/transformers/main_classes/tokenizer.html + ''' + + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'model_class', 'return_input', 'model_kwargs', 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', 'framework', 'model_class', + 'tokenizer_type') + # Add tokenizer + def __init__(self, + pretrained_model='bert-base-uncased', + tokenizer='bert-base-uncased', + model_class='AutoModel', + framework='pt', + return_input=False, + model_kwargs=None, + tokenizer_kwargs=None): + verify_dependencies(['transformers']) + if framework not in ['pt', 'tf']: + raise(ValueError('''Invalid framework; + must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) + self.pretrained_model = pretrained_model + self.tokenizer_type = tokenizer + self.model_class = model_class + self.framework = framework + self.return_input = return_input + self.model_kwargs = model_kwargs if model_kwargs else {} + self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} + model = model_class if self.framework == 'pt' else 'TF' + model_class + self.model = getattr(transformers, model).from_pretrained( + pretrained_model, **self.model_kwargs) + self.tokenizer = transformers.GPTTokenizerFast.from_pretrained( + tokenizer, **self.tokenizer_kwargs) + super().__init__() + + def _mask_words(self, wds): + ''' Called by _preprocess method. Takes list of words in the Stim as + input (i.e. the .text attribute for each TextStim in the + ComplexTextStim). If class has mask attribute, replaces word in + the input sequence with [MASK] token based on the value of mask + (either index in the sequence, or word to replace). 
Here, returns + list of words (without masking) + ''' + return wds + + def _preprocess(self, stims): + ''' Extracts text, onset, duration from ComplexTextStim, masks target + words (if relevant), tokenizes the input, and casts words, onsets, + and durations to token-level lists. Called within _extract method + to prepare input for the model. ''' + els = [(e.text, e.onset, e.duration) for e in stims.elements] + wds, ons, dur = map(list, zip(*els)) + tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] + n_tok = [len(t) for t in tok] + stims.name = ' '.join(wds) if stims.name == '' else stims.name + wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) + tok = list(flatten(tok)) + idx = self.tokenizer.encode(tok, return_tensors=self.framework) + return wds, ons, dur, tok, idx + + def _extract(self, stims): + ''' Takes stim as input, preprocesses it, feeds it to Bert model, + then postprocesses the output ''' + wds, ons, dur, tok, idx = self._preprocess(stims) + preds = self.model(idx) + data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) + return ExtractorResult(data, stims, self, features=feat, onsets=ons, + durations=dur) + + def _postprocess(self, stims, preds, tok, wds, ons, dur): + ''' Postprocesses model output (subsets relevant information, + transforms it where relevant, adds model metadata). + Takes prediction array, token list, word list, onsets + and durations and input. Here, returns token-level encodings + (excluding special tokens). + ''' + out = preds.last_hidden_state[:, 1:-1, :] + if self.framework == 'pt': + out = out.detach() + out = out.numpy().squeeze() + data = [out.tolist()] + feat = ['encoding'] + if self.return_input: + data += [tok, wds] + feat += ['token', 'word'] + return data, feat, ons, dur + + def _to_df(self, result): + res_df = pd.DataFrame(dict(zip(result.features, result._data))) + res_df['object_id'] = range(res_df.shape[0]) + return res_df From 3daa22daa73714594ccb2c0679e77571690bb46e Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 5 Nov 2021 16:46:29 -0500 Subject: [PATCH 2/9] make notes on structure --- pliers/extractors/text.py | 123 ++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index ad65a1f0a..224bc802e 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -866,16 +866,10 @@ class GPTForwardLMExtractor(ComplexTextExtractor): tokenizer (str): Type of tokenization used in the tokenization step. If different from model, out-of-vocabulary tokens may be treated as unknown tokens. - model_class (str): Specifies model type. Must be one of 'AutoModel' - (encoding extractor) or 'AutoModelWithLMHead' (language model). - These are generic model classes, which use the value of - pretrained_model to infer the model-specific transformers - class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel - or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. + model_class (str): Specifies model class from transformers. + tokenizer_class (str): Specifies tokenizer type from transformers. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - return_input (bool): if True, the extractor returns encoded token - and encoded word as features. model_kwargs (dict): Named arguments for transformer model. See https://huggingface.co/transformers/main_classes/model.html tokenizer_kwargs (dict): Named arguments for tokenizer. 
@@ -883,16 +877,16 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'return_input', 'model_kwargs', 'tokenizer_kwargs') + 'model_class', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', 'tokenizer_type') # Add tokenizer def __init__(self, - pretrained_model='bert-base-uncased', - tokenizer='bert-base-uncased', - model_class='AutoModel', + pretrained_model='gpt2', + tokenizer='gpt2', + model_class='GPT2LMHeadModel', + tokenizer_class='GPT2TokenizerFast', framework='pt', - return_input=False, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -903,69 +897,68 @@ def __init__(self, self.tokenizer_type = tokenizer self.model_class = model_class self.framework = framework - self.return_input = return_input self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( pretrained_model, **self.model_kwargs) - self.tokenizer = transformers.GPTTokenizerFast.from_pretrained( + self.tokenizer = getattr(transformers, tokenizer_class).from_pretrained( tokenizer, **self.tokenizer_kwargs) super().__init__() def _mask_words(self, wds): - ''' Called by _preprocess method. Takes list of words in the Stim as - input (i.e. the .text attribute for each TextStim in the - ComplexTextStim). If class has mask attribute, replaces word in - the input sequence with [MASK] token based on the value of mask - (either index in the sequence, or word to replace). Here, returns - list of words (without masking) - ''' - return wds - - def _preprocess(self, stims): - ''' Extracts text, onset, duration from ComplexTextStim, masks target - words (if relevant), tokenizes the input, and casts words, onsets, - and durations to token-level lists. Called within _extract method - to prepare input for the model. ''' - els = [(e.text, e.onset, e.duration) for e in stims.elements] - wds, ons, dur = map(list, zip(*els)) - tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] - n_tok = [len(t) for t in tok] - stims.name = ' '.join(wds) if stims.name == '' else stims.name - wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) - tok = list(flatten(tok)) - idx = self.tokenizer.encode(tok, return_tensors=self.framework) - return wds, ons, dur, tok, idx + pass + + #def _preprocess(self, stims): + # ''' Extracts text, onset, duration from ComplexTextStim, masks target + # words (if relevant), tokenizes the input, and casts words, onsets, + # and durations to token-level lists. Called within _extract method + # to prepare input for the model. 
''' + # els = [(e.text, e.onset, e.duration) for e in stims.elements] + # wds, ons, dur = map(list, zip(*els)) + # tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] + # n_tok = [len(t) for t in tok] + # stims.name = ' '.join(wds) if stims.name == '' else stims.name + # wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) + # tok = list(flatten(tok)) + # idx = self.tokenizer.encode(tok, return_tensors=self.framework) + # return wds, ons, dur, tok, idx def _extract(self, stims): ''' Takes stim as input, preprocesses it, feeds it to Bert model, then postprocesses the output ''' - wds, ons, dur, tok, idx = self._preprocess(stims) - preds = self.model(idx) - data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) - return ExtractorResult(data, stims, self, features=feat, onsets=ons, - durations=dur) - - def _postprocess(self, stims, preds, tok, wds, ons, dur): - ''' Postprocesses model output (subsets relevant information, - transforms it where relevant, adds model metadata). - Takes prediction array, token list, word list, onsets - and durations and input. Here, returns token-level encodings - (excluding special tokens). - ''' - out = preds.last_hidden_state[:, 1:-1, :] - if self.framework == 'pt': - out = out.detach() - out = out.numpy().squeeze() - data = [out.tolist()] - feat = ['encoding'] - if self.return_input: - data += [tok, wds] - feat += ['token', 'word'] - return data, feat, ons, dur + # do the masking and get the labels = true token (what could be predicted), and the original word (_mask) + # pass to model + # add option to return true word and true token + # return the whole distribution or a trimmed distribution + # challenge: get onset for the last word, and how to deal with duration + + pass + #wds, ons, dur, tok, idx = self._preprocess(stims) + #preds = self.model(idx) + #data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) + #return ExtractorResult(data, stims, self, features=feat, onsets=ons, + # durations=dur) + + #def _postprocess(self, stims, preds, tok, wds, ons, dur): + # ''' Postprocesses model output (subsets relevant information, + # transforms it where relevant, adds model metadata). + # Takes prediction array, token list, word list, onsets + # and durations and input. Here, returns token-level encodings + # (excluding special tokens). 
+ # ''' + # out = preds.last_hidden_state[:, 1:-1, :] + # if self.framework == 'pt': + # out = out.detach() + # out = out.numpy().squeeze() + # data = [out.tolist()] + # feat = ['encoding'] + # if self.return_input: + # data += [tok, wds] + # feat += ['token', 'word'] + # return data, feat, ons, dur - def _to_df(self, result): - res_df = pd.DataFrame(dict(zip(result.features, result._data))) - res_df['object_id'] = range(res_df.shape[0]) - return res_df + #def _to_df(self, result): + # res_df = pd.DataFrame(dict(zip(result.features, result._data))) + # res_df['object_id'] = range(res_df.shape[0]) + # return res_df From e377e4a99ddc8c5407024dff9c1ce317e3e94cb2 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 5 Nov 2021 18:33:50 -0500 Subject: [PATCH 3/9] cleanup --- pliers/extractors/text.py | 56 +++++---------------------------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 224bc802e..18f298e46 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -906,59 +906,17 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super().__init__() - def _mask_words(self, wds): + def _mask(self, wds): pass - - #def _preprocess(self, stims): - # ''' Extracts text, onset, duration from ComplexTextStim, masks target - # words (if relevant), tokenizes the input, and casts words, onsets, - # and durations to token-level lists. Called within _extract method - # to prepare input for the model. ''' - # els = [(e.text, e.onset, e.duration) for e in stims.elements] - # wds, ons, dur = map(list, zip(*els)) - # tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] - # n_tok = [len(t) for t in tok] - # stims.name = ' '.join(wds) if stims.name == '' else stims.name - # wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) - # tok = list(flatten(tok)) - # idx = self.tokenizer.encode(tok, return_tensors=self.framework) - # return wds, ons, dur, tok, idx def _extract(self, stims): ''' Takes stim as input, preprocesses it, feeds it to Bert model, then postprocesses the output ''' - # do the masking and get the labels = true token (what could be predicted), and the original word (_mask) + # does the onset refer to the last or the second-last word? + # tokenize all but last word + # tokenize last word, and take first token # pass to model - # add option to return true word and true token - # return the whole distribution or a trimmed distribution - # challenge: get onset for the last word, and how to deal with duration - + # return predictions and true token (id, word or token) + # enable trimming distribution pass - #wds, ons, dur, tok, idx = self._preprocess(stims) - #preds = self.model(idx) - #data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) - #return ExtractorResult(data, stims, self, features=feat, onsets=ons, - # durations=dur) - - #def _postprocess(self, stims, preds, tok, wds, ons, dur): - # ''' Postprocesses model output (subsets relevant information, - # transforms it where relevant, adds model metadata). - # Takes prediction array, token list, word list, onsets - # and durations and input. Here, returns token-level encodings - # (excluding special tokens). 
- # ''' - # out = preds.last_hidden_state[:, 1:-1, :] - # if self.framework == 'pt': - # out = out.detach() - # out = out.numpy().squeeze() - # data = [out.tolist()] - # feat = ['encoding'] - # if self.return_input: - # data += [tok, wds] - # feat += ['token', 'word'] - # return data, feat, ons, dur - - #def _to_df(self, result): - # res_df = pd.DataFrame(dict(zip(result.features, result._data))) - # res_df['object_id'] = range(res_df.shape[0]) - # return res_df + From 0876fc3c7f405067a526fdaeecede5f866b3c00b Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Dec 2021 19:39:39 -0600 Subject: [PATCH 4/9] implement preprocessing/tokenization method --- pliers/extractors/text.py | 104 ++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 11 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 18f298e46..afe8a9b3d 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -876,11 +876,17 @@ class GPTForwardLMExtractor(ComplexTextExtractor): See https://huggingface.co/transformers/main_classes/tokenizer.html ''' - _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'model_kwargs', 'tokenizer_kwargs') - _model_attributes = ('pretrained_model', 'framework', 'model_class', - 'tokenizer_type') - # Add tokenizer + _log_attributes = ('pretrained_model', + 'framework', + 'tokenizer_type', + 'model_class', + 'model_kwargs', + 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', + 'framework', + 'model_class', + 'tokenizer_type') + def __init__(self, pretrained_model='gpt2', tokenizer='gpt2', @@ -906,17 +912,93 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super().__init__() - def _mask(self, wds): - pass + + def _preprocess(self, stims): + ''' Extracts text, onset, duration from ComplexTextStim, masks target + words (if relevant), tokenizes the input, and casts words, onsets, + and durations to token-level lists. Called within _extract method + to prepare input for the model. ''' + els = [(e.text, e.onset, e.duration) for e in stims.elements] + wds, ons, dur = map(list, zip(*els)) + c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) + c_tok = [self.tokenizer.tokenize(w) for w in c_wds] + n_tok = [len(t) for t in c_tok] + stims.name = ' '.join(wds) if stims.name == '' else stims.name + wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [c_wds, + c_ons, + c_dur]) + c_tok = list(flatten(c_tok)) + c_idx = self.tokenizer.encode(c_tok, return_tensors=self.framework) + t_wds = ' ' + wds[-1] + t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[:,:1][0,0] + t_tkn = self.tokenizer.decode(t_id) + return c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn def _extract(self, stims): - ''' Takes stim as input, preprocesses it, feeds it to Bert model, - then postprocesses the output ''' + c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn = self._preprocess(stims) # does the onset refer to the last or the second-last word? 
- # tokenize all but last word - # tokenize last word, and take first token + # pass to model # return predictions and true token (id, word or token) # enable trimming distribution pass +''' + def _postprocess(self, stims, preds, tok, wds, ons, dur): + if self.framework == 'pt': + preds = preds.logits[:,1:-1,:].detach().numpy() + else: + preds = preds.logits[:,1:-1,:].numpy() + if self.return_softmax: + preds = scipy.special.softmax(preds, axis=-1) + out_idx = preds[0,self.mask_pos,:].argsort()[::-1] + if self.top_n: + sub_idx = out_idx[:self.top_n] + elif self.target: + sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) + elif self.threshold: + sub_idx = np.where(preds[0,self.mask_pos,:] >= self.threshold)[0] + else: + sub_idx = out_idx + out_idx = [idx for idx in out_idx if idx in sub_idx] + feat = self.tokenizer.convert_ids_to_tokens(out_idx) + feat = [f.capitalize() if len(f)==len(f.encode()) else f for f in feat] + data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] + if self.return_masked_word: + feat, data = self._return_masked_word(preds, feat, data) + if self.return_input: + data += [stims.name] + feat += ['sequence'] + mask_ons = listify(stims.elements[self.mask_pos].onset) + mask_dur = listify(stims.elements[self.mask_pos].duration) + return data, feat, mask_ons, mask_dur + + + def _return_masked_word(self, preds, feat, data): + if self.mask_token in self.tokenizer.vocab: + true_vocab_idx = self.tokenizer.vocab[self.mask_token] + true_score = preds[0,self.mask_pos,true_vocab_idx] + else: + true_score = np.nan + logging.warning('True token not in vocabulary. Returning NaN') + feat += ['true_word', 'true_word_score'] + data += [self.mask_token, true_score] + return feat, data + + + def _compute_metrics(self, outputs, wd_id, tokenizer): + loss = float(outputs.loss.cpu().detach().numpy()) + top_id = torch.argmax(outputs.logits[0,-1,:], + axis=-1) + top_token = tokenizer.decode(top_id) + softmaxed = self.softmax_fn(outputs.logits) + prob_true = softmaxed[0,-1,wd_id] + prob_true = float(prob_true.cpu().detach().numpy()) + prob_predicted = float(softmaxed[0,-1,top_id].cpu().detach().numpy()) + softmaxed = softmaxed[0,-1,:].cpu().detach().numpy() + entr = entropy(softmaxed) + top_5 = int(softmaxed.argsort().argsort()[wd_id] >= 50252) + top_10 = int(softmaxed.argsort().argsort()[wd_id] >= 50247) + return top_token, loss, entr, prob_true, prob_predicted, top_5, top_10 + +''' \ No newline at end of file From 0ade1ed003f90c2788d06d2eab35548dec2aef78 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Dec 2021 10:06:17 -0600 Subject: [PATCH 5/9] first pass at complete implementation --- pliers/extractors/text.py | 147 ++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 78 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index afe8a9b3d..42a9866d0 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -856,36 +856,40 @@ def _extract(self, stims): features=self.features, onsets=onsets, durations=durations) + class GPTForwardLMExtractor(ComplexTextExtractor): - ''' Returns predictions for GPT forward language modeling + ''' Returns next word predictions for GPT2 . Args: pretrained_model (str): A string specifying which transformer - model to use. Can be any gpt-class model, either pretrained or - https://huggingface.co/transformers/pretrained_models.html - or path to custom model. + model to use. tokenizer (str): Type of tokenization used in the tokenization step. 
- If different from model, out-of-vocabulary tokens may be treated - as unknown tokens. - model_class (str): Specifies model class from transformers. - tokenizer_class (str): Specifies tokenizer type from transformers. + If different from model, out-of-vocabulary tokens may be treated as + unknown tokens. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - model_kwargs (dict): Named arguments for transformer model. - See https://huggingface.co/transformers/main_classes/model.html + top_n (int): Specifies how many of the highest-probability tokens are + to be returned. Mutually exclusive with target and threshold. + target (str or list): Vocabulary token(s) for which probability is to + be returned. Tokens defined in the vocabulary change across + tokenizers. Mutually exclusive with top_n and threshold. + threshold (float): If defined, only values above this threshold will + be returned. Mutually exclusive with top_n and target. + return_softmax (bool): if True, returns probability scores instead of + raw predictions. + return_true (bool): if True, returns true_token and its probability. + return_input (bool): whether to return input sequence + model_kwargs (dict): Named arguments for pretrained model. + See: https://huggingface.co/transformers/main_classes/model.html + and https://huggingface.co/transformers/model_doc/bert.html. tokenizer_kwargs (dict): Named arguments for tokenizer. - See https://huggingface.co/transformers/main_classes/tokenizer.html + See https://huggingface.co/transformers/main_classes/tokenizer.html. ''' - _log_attributes = ('pretrained_model', - 'framework', - 'tokenizer_type', - 'model_class', - 'model_kwargs', - 'tokenizer_kwargs') - _model_attributes = ('pretrained_model', - 'framework', - 'model_class', - 'tokenizer_type') + _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', + 'threshold', 'tokenizer_type', 'return_softmax', 'return_true', + 'return_input') + _model_attributes = ('pretrained_model', 'framework', 'top_n', + 'target', 'threshold', 'tokenizer_type') def __init__(self, pretrained_model='gpt2', @@ -893,6 +897,12 @@ def __init__(self, model_class='GPT2LMHeadModel', tokenizer_class='GPT2TokenizerFast', framework='pt', + top_n=None, + threshold=None, + target=None, + return_true=True, + return_softmax=False, + return_input=True, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -910,14 +920,26 @@ def __init__(self, pretrained_model, **self.model_kwargs) self.tokenizer = getattr(transformers, tokenizer_class).from_pretrained( tokenizer, **self.tokenizer_kwargs) + self.target = listify(target) + if self.target: + missing = set(self.target) - set(self.tokenizer.vocab.keys()) + if missing: + logging.warning(f'{missing} not in vocabulary. Dropping.') + present = set(self.target) & set(self.tokenizer.vocab.keys()) + self.target = list(present) + if self.target == []: + raise ValueError('No valid target token. Import transformers' + ' and run transformers.GPT2Tokenizer.from_pretrained' + f'(\'{tokenizer}\').vocab.keys() to see available tokens') + self.top_n = top_n + self.threshold = threshold + self.return_softmax = return_softmax + self.return_true = return_true + self.return_input = return_input super().__init__() - def _preprocess(self, stims): - ''' Extracts text, onset, duration from ComplexTextStim, masks target - words (if relevant), tokenizes the input, and casts words, onsets, - and durations to token-level lists. 
Called within _extract method - to prepare input for the model. ''' + ''' Tokenizes input and returns context and target info ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) @@ -931,74 +953,43 @@ def _preprocess(self, stims): c_idx = self.tokenizer.encode(c_tok, return_tensors=self.framework) t_wds = ' ' + wds[-1] t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[:,:1][0,0] - t_tkn = self.tokenizer.decode(t_id) - return c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn + t_tok = self.tokenizer.decode(t_id) + return c_ons, c_dur, c_idx, t_id, t_tok # omit c_wds and c_tok - def _extract(self, stims): - c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn = self._preprocess(stims) - # does the onset refer to the last or the second-last word? - - # pass to model - # return predictions and true token (id, word or token) - # enable trimming distribution - pass -''' - def _postprocess(self, stims, preds, tok, wds, ons, dur): + def _extract(self, stims): + c_ons, c_dur, c_idx, t_id, t_tok = self._preprocess(stims) + outputs = self.model(c_idx) if self.framework == 'pt': - preds = preds.logits[:,1:-1,:].detach().numpy() + preds = outputs.logits[0,-1,:].detach().numpy() else: - preds = preds.logits[:,1:-1,:].numpy() + preds = outputs.logits[0,-1,:].numpy() if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) - out_idx = preds[0,self.mask_pos,:].argsort()[::-1] + out_idx = preds.argsort()[::-1] if self.top_n: sub_idx = out_idx[:self.top_n] elif self.target: sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) elif self.threshold: - sub_idx = np.where(preds[0,self.mask_pos,:] >= self.threshold)[0] + sub_idx = np.where(preds >= self.threshold)[0] else: sub_idx = out_idx out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) feat = [f.capitalize() if len(f)==len(f.encode()) else f for f in feat] - data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] - if self.return_masked_word: - feat, data = self._return_masked_word(preds, feat, data) + data = [listify(p) for p in preds[out_idx]] + if self.return_true: + feat += ['true_token', 'true_token_score'] + data += [t_tok, preds[t_id]] if self.return_input: - data += [stims.name] feat += ['sequence'] - mask_ons = listify(stims.elements[self.mask_pos].onset) - mask_dur = listify(stims.elements[self.mask_pos].duration) - return data, feat, mask_ons, mask_dur - - - def _return_masked_word(self, preds, feat, data): - if self.mask_token in self.tokenizer.vocab: - true_vocab_idx = self.tokenizer.vocab[self.mask_token] - true_score = preds[0,self.mask_pos,true_vocab_idx] - else: - true_score = np.nan - logging.warning('True token not in vocabulary. 
Returning NaN') - feat += ['true_word', 'true_word_score'] - data += [self.mask_token, true_score] - return feat, data + data += [stims.name] + ons = listify(c_ons[-1]) + dur = listify(c_dur[-1]) + return data, feat, ons, dur - - def _compute_metrics(self, outputs, wd_id, tokenizer): - loss = float(outputs.loss.cpu().detach().numpy()) - top_id = torch.argmax(outputs.logits[0,-1,:], - axis=-1) - top_token = tokenizer.decode(top_id) - softmaxed = self.softmax_fn(outputs.logits) - prob_true = softmaxed[0,-1,wd_id] - prob_true = float(prob_true.cpu().detach().numpy()) - prob_predicted = float(softmaxed[0,-1,top_id].cpu().detach().numpy()) - softmaxed = softmaxed[0,-1,:].cpu().detach().numpy() - entr = entropy(softmaxed) - top_5 = int(softmaxed.argsort().argsort()[wd_id] >= 50252) - top_10 = int(softmaxed.argsort().argsort()[wd_id] >= 50247) - return top_token, loss, entr, prob_true, prob_predicted, top_5, top_10 - -''' \ No newline at end of file + def _to_df(self, result): + res_df = pd.DataFrame(dict(zip(result.features, result._data))) + res_df['object_id'] = range(res_df.shape[0]) + return res_df From ad2088948a2b7a0a24462596161c148c618e93f7 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Dec 2021 10:07:26 -0600 Subject: [PATCH 6/9] add to init --- pliers/extractors/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pliers/extractors/__init__.py b/pliers/extractors/__init__.py index cc1f983e5..653000dea 100644 --- a/pliers/extractors/__init__.py +++ b/pliers/extractors/__init__.py @@ -72,7 +72,7 @@ VADERSentimentExtractor, SpaCyExtractor, WordCounterExtractor, BertExtractor, BertSequenceEncodingExtractor, BertLMExtractor, - BertSentimentExtractor) + BertSentimentExtractor, GPTForwardLMExtractor) from .video import (FarnebackOpticalFlowExtractor) __all__ = [ @@ -154,6 +154,7 @@ 'BertSequenceEncodingExtractor', 'BertLMExtractor', 'BertSentimentExtractor', + 'GPTForwardLMExtractor', 'AudiosetLabelExtractor', 'WordCounterExtractor', 'MetricExtractor', From b6215c9706a002a1c788ba78846ce91371618a76 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Dec 2021 11:09:04 -0600 Subject: [PATCH 7/9] finalize implementation --- pliers/extractors/text.py | 53 ++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 42a9866d0..24b181374 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -879,15 +879,12 @@ class GPTForwardLMExtractor(ComplexTextExtractor): return_true (bool): if True, returns true_token and its probability. return_input (bool): whether to return input sequence model_kwargs (dict): Named arguments for pretrained model. - See: https://huggingface.co/transformers/main_classes/model.html - and https://huggingface.co/transformers/model_doc/bert.html. tokenizer_kwargs (dict): Named arguments for tokenizer. - See https://huggingface.co/transformers/main_classes/tokenizer.html. 
''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', - 'threshold', 'tokenizer_type', 'return_softmax', 'return_true', - 'return_input') + 'threshold', 'tokenizer_type', 'return_softmax', 'return_true_word', + 'return_true_token', 'return_input', 'return_context') _model_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'threshold', 'tokenizer_type') @@ -900,9 +897,11 @@ def __init__(self, top_n=None, threshold=None, target=None, - return_true=True, - return_softmax=False, + return_true_token=True, + return_true_word=False, + return_softmax=None, return_input=True, + return_context=True, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -934,7 +933,9 @@ def __init__(self, self.top_n = top_n self.threshold = threshold self.return_softmax = return_softmax - self.return_true = return_true + self.return_context = return_context + self.return_true_word = return_true_word + self.return_true_token = return_true_token self.return_input = return_input super().__init__() @@ -942,24 +943,18 @@ def _preprocess(self, stims): ''' Tokenizes input and returns context and target info ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) - c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) - c_tok = [self.tokenizer.tokenize(w) for w in c_wds] - n_tok = [len(t) for t in c_tok] + c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) # second last + c_tok = self.tokenizer.encode(' '.join(c_wds), return_tensors=self.framework) stims.name = ' '.join(wds) if stims.name == '' else stims.name - wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [c_wds, - c_ons, - c_dur]) - c_tok = list(flatten(c_tok)) - c_idx = self.tokenizer.encode(c_tok, return_tensors=self.framework) t_wds = ' ' + wds[-1] - t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[:,:1][0,0] + t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[0,0] t_tok = self.tokenizer.decode(t_id) - return c_ons, c_dur, c_idx, t_id, t_tok # omit c_wds and c_tok + return c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds def _extract(self, stims): - c_ons, c_dur, c_idx, t_id, t_tok = self._preprocess(stims) - outputs = self.model(c_idx) + c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds = self._preprocess(stims) + outputs = self.model(c_tok) if self.framework == 'pt': preds = outputs.logits[0,-1,:].detach().numpy() else: @@ -976,18 +971,24 @@ def _extract(self, stims): else: sub_idx = out_idx out_idx = [idx for idx in out_idx if idx in sub_idx] - feat = self.tokenizer.convert_ids_to_tokens(out_idx) - feat = [f.capitalize() if len(f)==len(f.encode()) else f for f in feat] - data = [listify(p) for p in preds[out_idx]] - if self.return_true: + feat = [self.tokenizer.decode(o) for o in out_idx] + data = [listify(float(p)) for p in preds[out_idx]] + if self.return_true_token: feat += ['true_token', 'true_token_score'] - data += [t_tok, preds[t_id]] + data += [t_tok, float(preds[t_id])] + if self.return_true_word: + feat += ['true_word'] + data += [t_wds] + if self.return_context: + feat += ['context'] + data += [' '.join(c_wds)] if self.return_input: feat += ['sequence'] data += [stims.name] ons = listify(c_ons[-1]) dur = listify(c_dur[-1]) - return data, feat, ons, dur + return ExtractorResult(data, stims, self, + features=feat, onsets=ons, durations=dur) def _to_df(self, result): res_df = pd.DataFrame(dict(zip(result.features, result._data))) From e28d7daa93c2a6782c15d3dc1db4bd0be3e48554 Mon Sep 17 00:00:00 2001 From: 
rbroc Date: Fri, 3 Dec 2021 10:38:24 -0600 Subject: [PATCH 8/9] enable last or second to last onset --- pliers/extractors/text.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 24b181374..1bea85d6c 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -858,7 +858,7 @@ def _extract(self, stims): class GPTForwardLMExtractor(ComplexTextExtractor): - ''' Returns next word predictions for GPT2 . + ''' Returns next word predictions for GPT models . Args: pretrained_model (str): A string specifying which transformer model to use. @@ -878,13 +878,16 @@ class GPTForwardLMExtractor(ComplexTextExtractor): raw predictions. return_true (bool): if True, returns true_token and its probability. return_input (bool): whether to return input sequence + onset (str): whether the onset in the result is the one from + the target word ('target') or from the last word in the + context ('last_context') model_kwargs (dict): Named arguments for pretrained model. tokenizer_kwargs (dict): Named arguments for tokenizer. ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'threshold', 'tokenizer_type', 'return_softmax', 'return_true_word', - 'return_true_token', 'return_input', 'return_context') + 'return_true_token', 'return_input', 'return_context', 'onset') _model_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'threshold', 'tokenizer_type') @@ -902,12 +905,16 @@ def __init__(self, return_softmax=None, return_input=True, return_context=True, + onset='target', model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) if framework not in ['pt', 'tf']: raise(ValueError('''Invalid framework; must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) + if onset not in ['target', 'last_context']: + raise(ValueError('''Onset must be one of + 'target' or 'last_context'.''')) self.pretrained_model = pretrained_model self.tokenizer_type = tokenizer self.model_class = model_class @@ -937,6 +944,7 @@ def __init__(self, self.return_true_word = return_true_word self.return_true_token = return_true_token self.return_input = return_input + self.onset = onset super().__init__() def _preprocess(self, stims): @@ -949,11 +957,13 @@ def _preprocess(self, stims): t_wds = ' ' + wds[-1] t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[0,0] t_tok = self.tokenizer.decode(t_id) - return c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds - + return ((c_ons, c_dur, c_tok, c_wds), + (t_id, t_tok, t_wds, ons[-1], dur[-1])) def _extract(self, stims): - c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds = self._preprocess(stims) + c_outs, t_outs = self._preprocess(stims) + c_ons, c_dur, c_tok, c_wds = c_outs + t_id, t_tok, t_wds, t_ons, t_dur = t_outs outputs = self.model(c_tok) if self.framework == 'pt': preds = outputs.logits[0,-1,:].detach().numpy() @@ -985,8 +995,12 @@ def _extract(self, stims): if self.return_input: feat += ['sequence'] data += [stims.name] - ons = listify(c_ons[-1]) - dur = listify(c_dur[-1]) + if self.onset == 'target': + ons = listify(t_ons) + dur = listify(t_dur) + else: + ons = listify(c_ons[-1]) + dur = listify(c_dur[-1]) return ExtractorResult(data, stims, self, features=feat, onsets=ons, durations=dur) From 549b56fdcf9ca7b360ddee8ec20ce1224cfe339e Mon Sep 17 00:00:00 2001 From: Roberta Rocca <32483140+rbroc@users.noreply.github.com> Date: Tue, 7 Dec 2021 17:30:33 -0600 Subject: [PATCH 9/9] avoid confounding vocab items and 
additional features --- pliers/extractors/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 1bea85d6c..121b7b0f3 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -990,10 +990,10 @@ def _extract(self, stims): feat += ['true_word'] data += [t_wds] if self.return_context: - feat += ['context'] + feat += ['lm_context'] data += [' '.join(c_wds)] if self.return_input: - feat += ['sequence'] + feat += ['lm_sequence'] data += [stims.name] if self.onset == 'target': ons = listify(t_ons)
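As a quick end-to-end check of the new extractor, the sketch below shows one way it could be exercised once this series is merged. It assumes the class is importable from `pliers.extractors` (as added to `__init__.py` in PATCH 6/9) and that a `ComplexTextStim` built from plain text supplies the word elements; the example sentence and parameter choices are illustrative only and are not part of the patches.

```python
# Illustrative usage sketch only; not part of this patch series.
# Assumes pliers with these patches applied, plus transformers and PyTorch installed.
from pliers.stimuli import ComplexTextStim
from pliers.extractors import GPTForwardLMExtractor

# With text input and the default unit='word', each word becomes an element;
# the extractor treats the last word as the target and the rest as context.
stim = ComplexTextStim(text='the cat sat on the mat')

ext = GPTForwardLMExtractor(
    pretrained_model='gpt2',
    top_n=5,                 # keep only the five highest-scoring next tokens
    return_softmax=True,     # report probabilities rather than raw logits
    return_true_token=True,  # also report the target's first token and its score
)

result = ext.transform(stim)
df = result.to_df()
# Expected columns, per the final _extract/_to_df: one per top-n predicted token,
# plus 'true_token', 'true_token_score', 'lm_context', 'lm_sequence', and
# onset/duration taken from the target word (onset='target' by default).
print(df.T)
```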
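For reviewers who want to sanity-check the scoring logic outside pliers, the following sketch reproduces the core of `_preprocess`/`_extract` directly against `transformers`: encode the context, take the logits at the last position, softmax them, and look up the first sub-token of the space-prefixed target word. This is a minimal sketch under the same assumptions as the final patch (PyTorch backend, GPT-2 byte-level tokenizer), not the PR's own code.

```python
# Minimal sketch of the scoring step the extractor implements, written directly
# against transformers so the extractor's output can be cross-checked.
import torch
from scipy.special import softmax
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

context_words = ['the', 'cat', 'sat', 'on', 'the']
target_word = 'mat'

# Encode the context as one sequence (mirrors _preprocess in the final patch).
context_ids = tokenizer.encode(' '.join(context_words), return_tensors='pt')

# GPT-2 uses byte-level BPE, so a word-initial token carries a leading space;
# as in the patch, only the first sub-token of the target word is scored.
target_id = tokenizer.encode(' ' + target_word)[0]

with torch.no_grad():
    logits = model(context_ids).logits[0, -1, :].numpy()

probs = softmax(logits, axis=-1)
top_ids = probs.argsort()[::-1][:5]
print('top-5 next tokens:', [tokenizer.decode(i) for i in top_ids])
print('p(first target token):', float(probs[target_id]))
```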