From 69ded01a58fc2b247ba7835cfe4eea623be1ddef Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 5 Nov 2021 16:17:05 -0500 Subject: [PATCH 1/9] add class --- pliers/extractors/text.py | 114 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 8735ae231..ad65a1f0a 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -855,3 +855,117 @@ def _extract(self, stims): return ExtractorResult(word_counter, stims, self, features=self.features, onsets=onsets, durations=durations) + +class GPTForwardLMExtractor(ComplexTextExtractor): + ''' Returns predictions for GPT forward language modeling + Args: + pretrained_model (str): A string specifying which transformer + model to use. Can be any gpt-class model, either pretrained or + https://huggingface.co/transformers/pretrained_models.html + or path to custom model. + tokenizer (str): Type of tokenization used in the tokenization step. + If different from model, out-of-vocabulary tokens may be treated + as unknown tokens. + model_class (str): Specifies model type. Must be one of 'AutoModel' + (encoding extractor) or 'AutoModelWithLMHead' (language model). + These are generic model classes, which use the value of + pretrained_model to infer the model-specific transformers + class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel + or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. + framework (str): name deep learning framework to use. Must be 'pt' + (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + return_input (bool): if True, the extractor returns encoded token + and encoded word as features. + model_kwargs (dict): Named arguments for transformer model. + See https://huggingface.co/transformers/main_classes/model.html + tokenizer_kwargs (dict): Named arguments for tokenizer. + See https://huggingface.co/transformers/main_classes/tokenizer.html + ''' + + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'model_class', 'return_input', 'model_kwargs', 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', 'framework', 'model_class', + 'tokenizer_type') + # Add tokenizer + def __init__(self, + pretrained_model='bert-base-uncased', + tokenizer='bert-base-uncased', + model_class='AutoModel', + framework='pt', + return_input=False, + model_kwargs=None, + tokenizer_kwargs=None): + verify_dependencies(['transformers']) + if framework not in ['pt', 'tf']: + raise(ValueError('''Invalid framework; + must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) + self.pretrained_model = pretrained_model + self.tokenizer_type = tokenizer + self.model_class = model_class + self.framework = framework + self.return_input = return_input + self.model_kwargs = model_kwargs if model_kwargs else {} + self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} + model = model_class if self.framework == 'pt' else 'TF' + model_class + self.model = getattr(transformers, model).from_pretrained( + pretrained_model, **self.model_kwargs) + self.tokenizer = transformers.GPTTokenizerFast.from_pretrained( + tokenizer, **self.tokenizer_kwargs) + super().__init__() + + def _mask_words(self, wds): + ''' Called by _preprocess method. Takes list of words in the Stim as + input (i.e. the .text attribute for each TextStim in the + ComplexTextStim). If class has mask attribute, replaces word in + the input sequence with [MASK] token based on the value of mask + (either index in the sequence, or word to replace). 
Here, returns + list of words (without masking) + ''' + return wds + + def _preprocess(self, stims): + ''' Extracts text, onset, duration from ComplexTextStim, masks target + words (if relevant), tokenizes the input, and casts words, onsets, + and durations to token-level lists. Called within _extract method + to prepare input for the model. ''' + els = [(e.text, e.onset, e.duration) for e in stims.elements] + wds, ons, dur = map(list, zip(*els)) + tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] + n_tok = [len(t) for t in tok] + stims.name = ' '.join(wds) if stims.name == '' else stims.name + wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) + tok = list(flatten(tok)) + idx = self.tokenizer.encode(tok, return_tensors=self.framework) + return wds, ons, dur, tok, idx + + def _extract(self, stims): + ''' Takes stim as input, preprocesses it, feeds it to Bert model, + then postprocesses the output ''' + wds, ons, dur, tok, idx = self._preprocess(stims) + preds = self.model(idx) + data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) + return ExtractorResult(data, stims, self, features=feat, onsets=ons, + durations=dur) + + def _postprocess(self, stims, preds, tok, wds, ons, dur): + ''' Postprocesses model output (subsets relevant information, + transforms it where relevant, adds model metadata). + Takes prediction array, token list, word list, onsets + and durations and input. Here, returns token-level encodings + (excluding special tokens). + ''' + out = preds.last_hidden_state[:, 1:-1, :] + if self.framework == 'pt': + out = out.detach() + out = out.numpy().squeeze() + data = [out.tolist()] + feat = ['encoding'] + if self.return_input: + data += [tok, wds] + feat += ['token', 'word'] + return data, feat, ons, dur + + def _to_df(self, result): + res_df = pd.DataFrame(dict(zip(result.features, result._data))) + res_df['object_id'] = range(res_df.shape[0]) + return res_df From 3daa22daa73714594ccb2c0679e77571690bb46e Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 5 Nov 2021 16:46:29 -0500 Subject: [PATCH 2/9] make notes on structure --- pliers/extractors/text.py | 123 ++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index ad65a1f0a..224bc802e 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -866,16 +866,10 @@ class GPTForwardLMExtractor(ComplexTextExtractor): tokenizer (str): Type of tokenization used in the tokenization step. If different from model, out-of-vocabulary tokens may be treated as unknown tokens. - model_class (str): Specifies model type. Must be one of 'AutoModel' - (encoding extractor) or 'AutoModelWithLMHead' (language model). - These are generic model classes, which use the value of - pretrained_model to infer the model-specific transformers - class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel - or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. + model_class (str): Specifies model class from transformers. + tokenizer_class (str): Specifies tokenizer type from transformers. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - return_input (bool): if True, the extractor returns encoded token - and encoded word as features. model_kwargs (dict): Named arguments for transformer model. See https://huggingface.co/transformers/main_classes/model.html tokenizer_kwargs (dict): Named arguments for tokenizer. 
@@ -883,16 +877,16 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'return_input', 'model_kwargs', 'tokenizer_kwargs') + 'model_class', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', 'tokenizer_type') # Add tokenizer def __init__(self, - pretrained_model='bert-base-uncased', - tokenizer='bert-base-uncased', - model_class='AutoModel', + pretrained_model='gpt2', + tokenizer='gpt2', + model_class='GPT2LMHeadModel', + tokenizer_class='GPT2TokenizerFast', framework='pt', - return_input=False, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -903,69 +897,68 @@ def __init__(self, self.tokenizer_type = tokenizer self.model_class = model_class self.framework = framework - self.return_input = return_input self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( pretrained_model, **self.model_kwargs) - self.tokenizer = transformers.GPTTokenizerFast.from_pretrained( + self.tokenizer = getattr(transformers, tokenizer_class).from_pretrained( tokenizer, **self.tokenizer_kwargs) super().__init__() def _mask_words(self, wds): - ''' Called by _preprocess method. Takes list of words in the Stim as - input (i.e. the .text attribute for each TextStim in the - ComplexTextStim). If class has mask attribute, replaces word in - the input sequence with [MASK] token based on the value of mask - (either index in the sequence, or word to replace). Here, returns - list of words (without masking) - ''' - return wds - - def _preprocess(self, stims): - ''' Extracts text, onset, duration from ComplexTextStim, masks target - words (if relevant), tokenizes the input, and casts words, onsets, - and durations to token-level lists. Called within _extract method - to prepare input for the model. ''' - els = [(e.text, e.onset, e.duration) for e in stims.elements] - wds, ons, dur = map(list, zip(*els)) - tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] - n_tok = [len(t) for t in tok] - stims.name = ' '.join(wds) if stims.name == '' else stims.name - wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) - tok = list(flatten(tok)) - idx = self.tokenizer.encode(tok, return_tensors=self.framework) - return wds, ons, dur, tok, idx + pass + + #def _preprocess(self, stims): + # ''' Extracts text, onset, duration from ComplexTextStim, masks target + # words (if relevant), tokenizes the input, and casts words, onsets, + # and durations to token-level lists. Called within _extract method + # to prepare input for the model. 
''' + # els = [(e.text, e.onset, e.duration) for e in stims.elements] + # wds, ons, dur = map(list, zip(*els)) + # tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] + # n_tok = [len(t) for t in tok] + # stims.name = ' '.join(wds) if stims.name == '' else stims.name + # wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) + # tok = list(flatten(tok)) + # idx = self.tokenizer.encode(tok, return_tensors=self.framework) + # return wds, ons, dur, tok, idx def _extract(self, stims): ''' Takes stim as input, preprocesses it, feeds it to Bert model, then postprocesses the output ''' - wds, ons, dur, tok, idx = self._preprocess(stims) - preds = self.model(idx) - data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) - return ExtractorResult(data, stims, self, features=feat, onsets=ons, - durations=dur) - - def _postprocess(self, stims, preds, tok, wds, ons, dur): - ''' Postprocesses model output (subsets relevant information, - transforms it where relevant, adds model metadata). - Takes prediction array, token list, word list, onsets - and durations and input. Here, returns token-level encodings - (excluding special tokens). - ''' - out = preds.last_hidden_state[:, 1:-1, :] - if self.framework == 'pt': - out = out.detach() - out = out.numpy().squeeze() - data = [out.tolist()] - feat = ['encoding'] - if self.return_input: - data += [tok, wds] - feat += ['token', 'word'] - return data, feat, ons, dur + # do the masking and get the labels = true token (what could be predicted), and the original word (_mask) + # pass to model + # add option to return true word and true token + # return the whole distribution or a trimmed distribution + # challenge: get onset for the last word, and how to deal with duration + + pass + #wds, ons, dur, tok, idx = self._preprocess(stims) + #preds = self.model(idx) + #data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) + #return ExtractorResult(data, stims, self, features=feat, onsets=ons, + # durations=dur) + + #def _postprocess(self, stims, preds, tok, wds, ons, dur): + # ''' Postprocesses model output (subsets relevant information, + # transforms it where relevant, adds model metadata). + # Takes prediction array, token list, word list, onsets + # and durations and input. Here, returns token-level encodings + # (excluding special tokens). 
+ # ''' + # out = preds.last_hidden_state[:, 1:-1, :] + # if self.framework == 'pt': + # out = out.detach() + # out = out.numpy().squeeze() + # data = [out.tolist()] + # feat = ['encoding'] + # if self.return_input: + # data += [tok, wds] + # feat += ['token', 'word'] + # return data, feat, ons, dur - def _to_df(self, result): - res_df = pd.DataFrame(dict(zip(result.features, result._data))) - res_df['object_id'] = range(res_df.shape[0]) - return res_df + #def _to_df(self, result): + # res_df = pd.DataFrame(dict(zip(result.features, result._data))) + # res_df['object_id'] = range(res_df.shape[0]) + # return res_df From e377e4a99ddc8c5407024dff9c1ce317e3e94cb2 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 5 Nov 2021 18:33:50 -0500 Subject: [PATCH 3/9] cleanup --- pliers/extractors/text.py | 56 +++++---------------------------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 224bc802e..18f298e46 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -906,59 +906,17 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super().__init__() - def _mask_words(self, wds): + def _mask(self, wds): pass - - #def _preprocess(self, stims): - # ''' Extracts text, onset, duration from ComplexTextStim, masks target - # words (if relevant), tokenizes the input, and casts words, onsets, - # and durations to token-level lists. Called within _extract method - # to prepare input for the model. ''' - # els = [(e.text, e.onset, e.duration) for e in stims.elements] - # wds, ons, dur = map(list, zip(*els)) - # tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] - # n_tok = [len(t) for t in tok] - # stims.name = ' '.join(wds) if stims.name == '' else stims.name - # wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) - # tok = list(flatten(tok)) - # idx = self.tokenizer.encode(tok, return_tensors=self.framework) - # return wds, ons, dur, tok, idx def _extract(self, stims): ''' Takes stim as input, preprocesses it, feeds it to Bert model, then postprocesses the output ''' - # do the masking and get the labels = true token (what could be predicted), and the original word (_mask) + # does the onset refer to the last or the second-last word? + # tokenize all but last word + # tokenize last word, and take first token # pass to model - # add option to return true word and true token - # return the whole distribution or a trimmed distribution - # challenge: get onset for the last word, and how to deal with duration - + # return predictions and true token (id, word or token) + # enable trimming distribution pass - #wds, ons, dur, tok, idx = self._preprocess(stims) - #preds = self.model(idx) - #data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) - #return ExtractorResult(data, stims, self, features=feat, onsets=ons, - # durations=dur) - - #def _postprocess(self, stims, preds, tok, wds, ons, dur): - # ''' Postprocesses model output (subsets relevant information, - # transforms it where relevant, adds model metadata). - # Takes prediction array, token list, word list, onsets - # and durations and input. Here, returns token-level encodings - # (excluding special tokens). 
- # ''' - # out = preds.last_hidden_state[:, 1:-1, :] - # if self.framework == 'pt': - # out = out.detach() - # out = out.numpy().squeeze() - # data = [out.tolist()] - # feat = ['encoding'] - # if self.return_input: - # data += [tok, wds] - # feat += ['token', 'word'] - # return data, feat, ons, dur - - #def _to_df(self, result): - # res_df = pd.DataFrame(dict(zip(result.features, result._data))) - # res_df['object_id'] = range(res_df.shape[0]) - # return res_df + From 0876fc3c7f405067a526fdaeecede5f866b3c00b Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Dec 2021 19:39:39 -0600 Subject: [PATCH 4/9] implement preprocessing/tokenization method --- pliers/extractors/text.py | 104 ++++++++++++++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 11 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 18f298e46..afe8a9b3d 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -876,11 +876,17 @@ class GPTForwardLMExtractor(ComplexTextExtractor): See https://huggingface.co/transformers/main_classes/tokenizer.html ''' - _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'model_kwargs', 'tokenizer_kwargs') - _model_attributes = ('pretrained_model', 'framework', 'model_class', - 'tokenizer_type') - # Add tokenizer + _log_attributes = ('pretrained_model', + 'framework', + 'tokenizer_type', + 'model_class', + 'model_kwargs', + 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', + 'framework', + 'model_class', + 'tokenizer_type') + def __init__(self, pretrained_model='gpt2', tokenizer='gpt2', @@ -906,17 +912,93 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super().__init__() - def _mask(self, wds): - pass + + def _preprocess(self, stims): + ''' Extracts text, onset, duration from ComplexTextStim, masks target + words (if relevant), tokenizes the input, and casts words, onsets, + and durations to token-level lists. Called within _extract method + to prepare input for the model. ''' + els = [(e.text, e.onset, e.duration) for e in stims.elements] + wds, ons, dur = map(list, zip(*els)) + c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) + c_tok = [self.tokenizer.tokenize(w) for w in c_wds] + n_tok = [len(t) for t in c_tok] + stims.name = ' '.join(wds) if stims.name == '' else stims.name + wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [c_wds, + c_ons, + c_dur]) + c_tok = list(flatten(c_tok)) + c_idx = self.tokenizer.encode(c_tok, return_tensors=self.framework) + t_wds = ' ' + wds[-1] + t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[:,:1][0,0] + t_tkn = self.tokenizer.decode(t_id) + return c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn def _extract(self, stims): - ''' Takes stim as input, preprocesses it, feeds it to Bert model, - then postprocesses the output ''' + c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn = self._preprocess(stims) # does the onset refer to the last or the second-last word? 
- # tokenize all but last word - # tokenize last word, and take first token + # pass to model # return predictions and true token (id, word or token) # enable trimming distribution pass +''' + def _postprocess(self, stims, preds, tok, wds, ons, dur): + if self.framework == 'pt': + preds = preds.logits[:,1:-1,:].detach().numpy() + else: + preds = preds.logits[:,1:-1,:].numpy() + if self.return_softmax: + preds = scipy.special.softmax(preds, axis=-1) + out_idx = preds[0,self.mask_pos,:].argsort()[::-1] + if self.top_n: + sub_idx = out_idx[:self.top_n] + elif self.target: + sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) + elif self.threshold: + sub_idx = np.where(preds[0,self.mask_pos,:] >= self.threshold)[0] + else: + sub_idx = out_idx + out_idx = [idx for idx in out_idx if idx in sub_idx] + feat = self.tokenizer.convert_ids_to_tokens(out_idx) + feat = [f.capitalize() if len(f)==len(f.encode()) else f for f in feat] + data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] + if self.return_masked_word: + feat, data = self._return_masked_word(preds, feat, data) + if self.return_input: + data += [stims.name] + feat += ['sequence'] + mask_ons = listify(stims.elements[self.mask_pos].onset) + mask_dur = listify(stims.elements[self.mask_pos].duration) + return data, feat, mask_ons, mask_dur + + + def _return_masked_word(self, preds, feat, data): + if self.mask_token in self.tokenizer.vocab: + true_vocab_idx = self.tokenizer.vocab[self.mask_token] + true_score = preds[0,self.mask_pos,true_vocab_idx] + else: + true_score = np.nan + logging.warning('True token not in vocabulary. Returning NaN') + feat += ['true_word', 'true_word_score'] + data += [self.mask_token, true_score] + return feat, data + + + def _compute_metrics(self, outputs, wd_id, tokenizer): + loss = float(outputs.loss.cpu().detach().numpy()) + top_id = torch.argmax(outputs.logits[0,-1,:], + axis=-1) + top_token = tokenizer.decode(top_id) + softmaxed = self.softmax_fn(outputs.logits) + prob_true = softmaxed[0,-1,wd_id] + prob_true = float(prob_true.cpu().detach().numpy()) + prob_predicted = float(softmaxed[0,-1,top_id].cpu().detach().numpy()) + softmaxed = softmaxed[0,-1,:].cpu().detach().numpy() + entr = entropy(softmaxed) + top_5 = int(softmaxed.argsort().argsort()[wd_id] >= 50252) + top_10 = int(softmaxed.argsort().argsort()[wd_id] >= 50247) + return top_token, loss, entr, prob_true, prob_predicted, top_5, top_10 + +''' \ No newline at end of file From 0ade1ed003f90c2788d06d2eab35548dec2aef78 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Dec 2021 10:06:17 -0600 Subject: [PATCH 5/9] first pass at complete implementation --- pliers/extractors/text.py | 147 ++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 78 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index afe8a9b3d..42a9866d0 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -856,36 +856,40 @@ def _extract(self, stims): features=self.features, onsets=onsets, durations=durations) + class GPTForwardLMExtractor(ComplexTextExtractor): - ''' Returns predictions for GPT forward language modeling + ''' Returns next word predictions for GPT2 . Args: pretrained_model (str): A string specifying which transformer - model to use. Can be any gpt-class model, either pretrained or - https://huggingface.co/transformers/pretrained_models.html - or path to custom model. + model to use. tokenizer (str): Type of tokenization used in the tokenization step. 
- If different from model, out-of-vocabulary tokens may be treated - as unknown tokens. - model_class (str): Specifies model class from transformers. - tokenizer_class (str): Specifies tokenizer type from transformers. + If different from model, out-of-vocabulary tokens may be treated as + unknown tokens. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - model_kwargs (dict): Named arguments for transformer model. - See https://huggingface.co/transformers/main_classes/model.html + top_n (int): Specifies how many of the highest-probability tokens are + to be returned. Mutually exclusive with target and threshold. + target (str or list): Vocabulary token(s) for which probability is to + be returned. Tokens defined in the vocabulary change across + tokenizers. Mutually exclusive with top_n and threshold. + threshold (float): If defined, only values above this threshold will + be returned. Mutually exclusive with top_n and target. + return_softmax (bool): if True, returns probability scores instead of + raw predictions. + return_true (bool): if True, returns true_token and its probability. + return_input (bool): whether to return input sequence + model_kwargs (dict): Named arguments for pretrained model. + See: https://huggingface.co/transformers/main_classes/model.html + and https://huggingface.co/transformers/model_doc/bert.html. tokenizer_kwargs (dict): Named arguments for tokenizer. - See https://huggingface.co/transformers/main_classes/tokenizer.html + See https://huggingface.co/transformers/main_classes/tokenizer.html. ''' - _log_attributes = ('pretrained_model', - 'framework', - 'tokenizer_type', - 'model_class', - 'model_kwargs', - 'tokenizer_kwargs') - _model_attributes = ('pretrained_model', - 'framework', - 'model_class', - 'tokenizer_type') + _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', + 'threshold', 'tokenizer_type', 'return_softmax', 'return_true', + 'return_input') + _model_attributes = ('pretrained_model', 'framework', 'top_n', + 'target', 'threshold', 'tokenizer_type') def __init__(self, pretrained_model='gpt2', @@ -893,6 +897,12 @@ def __init__(self, model_class='GPT2LMHeadModel', tokenizer_class='GPT2TokenizerFast', framework='pt', + top_n=None, + threshold=None, + target=None, + return_true=True, + return_softmax=False, + return_input=True, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -910,14 +920,26 @@ def __init__(self, pretrained_model, **self.model_kwargs) self.tokenizer = getattr(transformers, tokenizer_class).from_pretrained( tokenizer, **self.tokenizer_kwargs) + self.target = listify(target) + if self.target: + missing = set(self.target) - set(self.tokenizer.vocab.keys()) + if missing: + logging.warning(f'{missing} not in vocabulary. Dropping.') + present = set(self.target) & set(self.tokenizer.vocab.keys()) + self.target = list(present) + if self.target == []: + raise ValueError('No valid target token. Import transformers' + ' and run transformers.GPT2Tokenizer.from_pretrained' + f'(\'{tokenizer}\').vocab.keys() to see available tokens') + self.top_n = top_n + self.threshold = threshold + self.return_softmax = return_softmax + self.return_true = return_true + self.return_input = return_input super().__init__() - def _preprocess(self, stims): - ''' Extracts text, onset, duration from ComplexTextStim, masks target - words (if relevant), tokenizes the input, and casts words, onsets, - and durations to token-level lists. 
Called within _extract method - to prepare input for the model. ''' + ''' Tokenizes input and returns context and target info ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) @@ -931,74 +953,43 @@ def _preprocess(self, stims): c_idx = self.tokenizer.encode(c_tok, return_tensors=self.framework) t_wds = ' ' + wds[-1] t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[:,:1][0,0] - t_tkn = self.tokenizer.decode(t_id) - return c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn + t_tok = self.tokenizer.decode(t_id) + return c_ons, c_dur, c_idx, t_id, t_tok # omit c_wds and c_tok - def _extract(self, stims): - c_wds, c_ons, c_dur, c_tok, c_idx, t_id, t_tkn = self._preprocess(stims) - # does the onset refer to the last or the second-last word? - - # pass to model - # return predictions and true token (id, word or token) - # enable trimming distribution - pass -''' - def _postprocess(self, stims, preds, tok, wds, ons, dur): + def _extract(self, stims): + c_ons, c_dur, c_idx, t_id, t_tok = self._preprocess(stims) + outputs = self.model(c_idx) if self.framework == 'pt': - preds = preds.logits[:,1:-1,:].detach().numpy() + preds = outputs.logits[0,-1,:].detach().numpy() else: - preds = preds.logits[:,1:-1,:].numpy() + preds = outputs.logits[0,-1,:].numpy() if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) - out_idx = preds[0,self.mask_pos,:].argsort()[::-1] + out_idx = preds.argsort()[::-1] if self.top_n: sub_idx = out_idx[:self.top_n] elif self.target: sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) elif self.threshold: - sub_idx = np.where(preds[0,self.mask_pos,:] >= self.threshold)[0] + sub_idx = np.where(preds >= self.threshold)[0] else: sub_idx = out_idx out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) feat = [f.capitalize() if len(f)==len(f.encode()) else f for f in feat] - data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] - if self.return_masked_word: - feat, data = self._return_masked_word(preds, feat, data) + data = [listify(p) for p in preds[out_idx]] + if self.return_true: + feat += ['true_token', 'true_token_score'] + data += [t_tok, preds[t_id]] if self.return_input: - data += [stims.name] feat += ['sequence'] - mask_ons = listify(stims.elements[self.mask_pos].onset) - mask_dur = listify(stims.elements[self.mask_pos].duration) - return data, feat, mask_ons, mask_dur - - - def _return_masked_word(self, preds, feat, data): - if self.mask_token in self.tokenizer.vocab: - true_vocab_idx = self.tokenizer.vocab[self.mask_token] - true_score = preds[0,self.mask_pos,true_vocab_idx] - else: - true_score = np.nan - logging.warning('True token not in vocabulary. 
Returning NaN') - feat += ['true_word', 'true_word_score'] - data += [self.mask_token, true_score] - return feat, data + data += [stims.name] + ons = listify(c_ons[-1]) + dur = listify(c_dur[-1]) + return data, feat, ons, dur - - def _compute_metrics(self, outputs, wd_id, tokenizer): - loss = float(outputs.loss.cpu().detach().numpy()) - top_id = torch.argmax(outputs.logits[0,-1,:], - axis=-1) - top_token = tokenizer.decode(top_id) - softmaxed = self.softmax_fn(outputs.logits) - prob_true = softmaxed[0,-1,wd_id] - prob_true = float(prob_true.cpu().detach().numpy()) - prob_predicted = float(softmaxed[0,-1,top_id].cpu().detach().numpy()) - softmaxed = softmaxed[0,-1,:].cpu().detach().numpy() - entr = entropy(softmaxed) - top_5 = int(softmaxed.argsort().argsort()[wd_id] >= 50252) - top_10 = int(softmaxed.argsort().argsort()[wd_id] >= 50247) - return top_token, loss, entr, prob_true, prob_predicted, top_5, top_10 - -''' \ No newline at end of file + def _to_df(self, result): + res_df = pd.DataFrame(dict(zip(result.features, result._data))) + res_df['object_id'] = range(res_df.shape[0]) + return res_df From ad2088948a2b7a0a24462596161c148c618e93f7 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Dec 2021 10:07:26 -0600 Subject: [PATCH 6/9] add to init --- pliers/extractors/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pliers/extractors/__init__.py b/pliers/extractors/__init__.py index cc1f983e5..653000dea 100644 --- a/pliers/extractors/__init__.py +++ b/pliers/extractors/__init__.py @@ -72,7 +72,7 @@ VADERSentimentExtractor, SpaCyExtractor, WordCounterExtractor, BertExtractor, BertSequenceEncodingExtractor, BertLMExtractor, - BertSentimentExtractor) + BertSentimentExtractor, GPTForwardLMExtractor) from .video import (FarnebackOpticalFlowExtractor) __all__ = [ @@ -154,6 +154,7 @@ 'BertSequenceEncodingExtractor', 'BertLMExtractor', 'BertSentimentExtractor', + 'GPTForwardLMExtractor', 'AudiosetLabelExtractor', 'WordCounterExtractor', 'MetricExtractor', From b6215c9706a002a1c788ba78846ce91371618a76 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Dec 2021 11:09:04 -0600 Subject: [PATCH 7/9] finalize implementation --- pliers/extractors/text.py | 53 ++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 42a9866d0..24b181374 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -879,15 +879,12 @@ class GPTForwardLMExtractor(ComplexTextExtractor): return_true (bool): if True, returns true_token and its probability. return_input (bool): whether to return input sequence model_kwargs (dict): Named arguments for pretrained model. - See: https://huggingface.co/transformers/main_classes/model.html - and https://huggingface.co/transformers/model_doc/bert.html. tokenizer_kwargs (dict): Named arguments for tokenizer. - See https://huggingface.co/transformers/main_classes/tokenizer.html. 
''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', - 'threshold', 'tokenizer_type', 'return_softmax', 'return_true', - 'return_input') + 'threshold', 'tokenizer_type', 'return_softmax', 'return_true_word', + 'return_true_token', 'return_input', 'return_context') _model_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'threshold', 'tokenizer_type') @@ -900,9 +897,11 @@ def __init__(self, top_n=None, threshold=None, target=None, - return_true=True, - return_softmax=False, + return_true_token=True, + return_true_word=False, + return_softmax=None, return_input=True, + return_context=True, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -934,7 +933,9 @@ def __init__(self, self.top_n = top_n self.threshold = threshold self.return_softmax = return_softmax - self.return_true = return_true + self.return_context = return_context + self.return_true_word = return_true_word + self.return_true_token = return_true_token self.return_input = return_input super().__init__() @@ -942,24 +943,18 @@ def _preprocess(self, stims): ''' Tokenizes input and returns context and target info ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) - c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) - c_tok = [self.tokenizer.tokenize(w) for w in c_wds] - n_tok = [len(t) for t in c_tok] + c_wds, c_ons, c_dur = (l[:-1] for l in [wds,ons,dur]) # second last + c_tok = self.tokenizer.encode(' '.join(c_wds), return_tensors=self.framework) stims.name = ' '.join(wds) if stims.name == '' else stims.name - wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [c_wds, - c_ons, - c_dur]) - c_tok = list(flatten(c_tok)) - c_idx = self.tokenizer.encode(c_tok, return_tensors=self.framework) t_wds = ' ' + wds[-1] - t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[:,:1][0,0] + t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[0,0] t_tok = self.tokenizer.decode(t_id) - return c_ons, c_dur, c_idx, t_id, t_tok # omit c_wds and c_tok + return c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds def _extract(self, stims): - c_ons, c_dur, c_idx, t_id, t_tok = self._preprocess(stims) - outputs = self.model(c_idx) + c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds = self._preprocess(stims) + outputs = self.model(c_tok) if self.framework == 'pt': preds = outputs.logits[0,-1,:].detach().numpy() else: @@ -976,18 +971,24 @@ def _extract(self, stims): else: sub_idx = out_idx out_idx = [idx for idx in out_idx if idx in sub_idx] - feat = self.tokenizer.convert_ids_to_tokens(out_idx) - feat = [f.capitalize() if len(f)==len(f.encode()) else f for f in feat] - data = [listify(p) for p in preds[out_idx]] - if self.return_true: + feat = [self.tokenizer.decode(o) for o in out_idx] + data = [listify(float(p)) for p in preds[out_idx]] + if self.return_true_token: feat += ['true_token', 'true_token_score'] - data += [t_tok, preds[t_id]] + data += [t_tok, float(preds[t_id])] + if self.return_true_word: + feat += ['true_word'] + data += [t_wds] + if self.return_context: + feat += ['context'] + data += [' '.join(c_wds)] if self.return_input: feat += ['sequence'] data += [stims.name] ons = listify(c_ons[-1]) dur = listify(c_dur[-1]) - return data, feat, ons, dur + return ExtractorResult(data, stims, self, + features=feat, onsets=ons, durations=dur) def _to_df(self, result): res_df = pd.DataFrame(dict(zip(result.features, result._data))) From e28d7daa93c2a6782c15d3dc1db4bd0be3e48554 Mon Sep 17 00:00:00 2001 From: 
rbroc Date: Fri, 3 Dec 2021 10:38:24 -0600 Subject: [PATCH 8/9] enable last or second to last onset --- pliers/extractors/text.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 24b181374..1bea85d6c 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -858,7 +858,7 @@ def _extract(self, stims): class GPTForwardLMExtractor(ComplexTextExtractor): - ''' Returns next word predictions for GPT2 . + ''' Returns next word predictions for GPT models . Args: pretrained_model (str): A string specifying which transformer model to use. @@ -878,13 +878,16 @@ class GPTForwardLMExtractor(ComplexTextExtractor): raw predictions. return_true (bool): if True, returns true_token and its probability. return_input (bool): whether to return input sequence + onset (str): whether the onset in the result is the one from + the target word ('target') or from the last word in the + context ('last_context') model_kwargs (dict): Named arguments for pretrained model. tokenizer_kwargs (dict): Named arguments for tokenizer. ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'threshold', 'tokenizer_type', 'return_softmax', 'return_true_word', - 'return_true_token', 'return_input', 'return_context') + 'return_true_token', 'return_input', 'return_context', 'onset') _model_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'threshold', 'tokenizer_type') @@ -902,12 +905,16 @@ def __init__(self, return_softmax=None, return_input=True, return_context=True, + onset='target', model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) if framework not in ['pt', 'tf']: raise(ValueError('''Invalid framework; must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) + if onset not in ['target', 'last_context']: + raise(ValueError('''Onset must be one of + 'target' or 'last_context'.''')) self.pretrained_model = pretrained_model self.tokenizer_type = tokenizer self.model_class = model_class @@ -937,6 +944,7 @@ def __init__(self, self.return_true_word = return_true_word self.return_true_token = return_true_token self.return_input = return_input + self.onset = onset super().__init__() def _preprocess(self, stims): @@ -949,11 +957,13 @@ def _preprocess(self, stims): t_wds = ' ' + wds[-1] t_id = self.tokenizer.encode(t_wds, return_tensors=self.framework)[0,0] t_tok = self.tokenizer.decode(t_id) - return c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds - + return ((c_ons, c_dur, c_tok, c_wds), + (t_id, t_tok, t_wds, ons[-1], dur[-1])) def _extract(self, stims): - c_ons, c_dur, c_tok, c_wds, t_id, t_tok, t_wds = self._preprocess(stims) + c_outs, t_outs = self._preprocess(stims) + c_ons, c_dur, c_tok, c_wds = c_outs + t_id, t_tok, t_wds, t_ons, t_dur = t_outs outputs = self.model(c_tok) if self.framework == 'pt': preds = outputs.logits[0,-1,:].detach().numpy() @@ -985,8 +995,12 @@ def _extract(self, stims): if self.return_input: feat += ['sequence'] data += [stims.name] - ons = listify(c_ons[-1]) - dur = listify(c_dur[-1]) + if self.onset == 'target': + ons = listify(t_ons) + dur = listify(t_dur) + else: + ons = listify(c_ons[-1]) + dur = listify(c_dur[-1]) return ExtractorResult(data, stims, self, features=feat, onsets=ons, durations=dur) From 549b56fdcf9ca7b360ddee8ec20ce1224cfe339e Mon Sep 17 00:00:00 2001 From: Roberta Rocca <32483140+rbroc@users.noreply.github.com> Date: Tue, 7 Dec 2021 17:30:33 -0600 Subject: [PATCH 9/9] avoid confounding vocab items and 
additional features --- pliers/extractors/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 1bea85d6c..121b7b0f3 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -990,10 +990,10 @@ def _extract(self, stims): feat += ['true_word'] data += [t_wds] if self.return_context: - feat += ['context'] + feat += ['lm_context'] data += [' '.join(c_wds)] if self.return_input: - feat += ['sequence'] + feat += ['lm_sequence'] data += [stims.name] if self.onset == 'target': ons = listify(t_ons)
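As a quick end-to-end check of the new extractor, the sketch below shows one way it could be exercised once this series is merged. It assumes the class is importable from `pliers.extractors` (as added to `__init__.py` in PATCH 6/9) and that a `ComplexTextStim` built from plain text supplies the word elements; the example sentence and parameter choices are illustrative only and are not part of the patches.

```python
# Illustrative usage sketch only; not part of this patch series.
# Assumes pliers with these patches applied, plus transformers and PyTorch installed.
from pliers.stimuli import ComplexTextStim
from pliers.extractors import GPTForwardLMExtractor

# With text input and the default unit='word', each word becomes an element;
# the extractor treats the last word as the target and the rest as context.
stim = ComplexTextStim(text='the cat sat on the mat')

ext = GPTForwardLMExtractor(
    pretrained_model='gpt2',
    top_n=5,                 # keep only the five highest-scoring next tokens
    return_softmax=True,     # report probabilities rather than raw logits
    return_true_token=True,  # also report the target's first token and its score
)

result = ext.transform(stim)
df = result.to_df()
# Expected columns, per the final _extract/_to_df: one per top-n predicted token,
# plus 'true_token', 'true_token_score', 'lm_context', 'lm_sequence', and
# onset/duration taken from the target word (onset='target' by default).
print(df.T)
```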
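For reviewers who want to sanity-check the scoring logic outside pliers, the following sketch reproduces the core of `_preprocess`/`_extract` directly against `transformers`: encode the context, take the logits at the last position, softmax them, and look up the first sub-token of the space-prefixed target word. This is a minimal sketch under the same assumptions as the final patch (PyTorch backend, GPT-2 byte-level tokenizer), not the PR's own code.

```python
# Minimal sketch of the scoring step the extractor implements, written directly
# against transformers so the extractor's output can be cross-checked.
import torch
from scipy.special import softmax
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

context_words = ['the', 'cat', 'sat', 'on', 'the']
target_word = 'mat'

# Encode the context as one sequence (mirrors _preprocess in the final patch).
context_ids = tokenizer.encode(' '.join(context_words), return_tensors='pt')

# GPT-2 uses byte-level BPE, so a word-initial token carries a leading space;
# as in the patch, only the first sub-token of the target word is scored.
target_id = tokenizer.encode(' ' + target_word)[0]

with torch.no_grad():
    logits = model(context_ids).logits[0, -1, :].numpy()

probs = softmax(logits, axis=-1)
top_ids = probs.argsort()[::-1][:5]
print('top-5 next tokens:', [tokenizer.decode(i) for i in top_ids])
print('p(first target token):', float(probs[target_id]))
```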