diff --git a/cat.zip b/cat.zip
new file mode 100644
index 0000000..baf64cf
Binary files /dev/null and b/cat.zip differ
diff --git a/cat/dataset.py b/cat/dataset.py
index 1df70f6..09aadb1 100644
--- a/cat/dataset.py
+++ b/cat/dataset.py
@@ -5,14 +5,29 @@
def loader(instance_path,
label_path,
+ label_multi_path,
subset_labels_path,
split_labels=False,
mapping=None):
# subset_labels = set(subset_labels)
+
+ multi_labels = []
+ with open(label_multi_path, 'r') as file:
+ for line in file:
+            current_array = eval(line.strip())  # SECURITY: eval on file content — prefer ast.literal_eval
+ multi_labels.append(current_array)
+
+ # multi_labels = open(label_multi_path)
+ # multi_labels = [x for x in multi_labels]
+
labels = open(label_path)
labels = [x.strip().lower().split() for x in labels]
- subset_labels = open(subset_labels_path)
+ # subset_labels = open(subset_labels_path)
+ subset_labels = []
+ with open(subset_labels_path, 'r', encoding='utf-8') as file:
+ for line in file:
+ subset_labels.append(line.strip())
subset_labels = set([x.strip().lower() for x in subset_labels])
# print(subset_labels)
@@ -21,7 +36,7 @@ def loader(instance_path,
# subset_labels = {'wine', 'place', 'food'}
instances = []
- for line in open(instance_path):
+ for line in open(instance_path, encoding='utf-8'):
instances.append(line.strip().lower().split())
if split_labels:
@@ -29,7 +44,7 @@ def loader(instance_path,
instances, gold = zip(*[(x, y[0]) for x, y in zip(instances, labels)
if len(y) == 1])
- # y[0] in subset_labels])
+ # and y[0] in subset_labels])
if mapping is not None:
gold = [mapping.get(x, x) for x in gold]
@@ -38,63 +53,25 @@ def loader(instance_path,
y = le.fit_transform(gold)
label_set = le.classes_.tolist()
- return instances, y, label_set, subset_labels, gold
-
-
-rest_14_test = partial(loader,
- instance_path="data/restaurant_test_2014_tok.txt", # noqa
- label_path="data/labels_restaurant_test_2014.txt", # noqa
- subset_labels={"ambience",
- "service",
- "food"})
-
-
-rest_14_train = partial(loader,
- instance_path="data/restaurant_train_2014.txt", # noqa
- label_path="data/labels_restaurant_train_2014.txt", # noqa
- subset_labels={"ambience",
- "service",
- "food"})
-
-
-ganu_test = partial(loader,
- instance_path="data/test_tok.txt",
- label_path="data/test_label.txt",
- subset_labels={"ambience",
- "staff",
- "food"})
-
-
-rest_15_train = partial(loader,
- instance_path="data/restaurant_train_2015_tok.txt",
- label_path="data/labels_restaurant_train_2015.txt",
- subset_labels={"ambience",
- "service",
- "food"},
- split_labels=True)
-
-rest_15_test = partial(loader,
- instance_path="data/restaurant_test_2015_tok.txt",
- label_path="data/labels_restaurant_test_2015.txt",
- subset_labels={"ambience",
- "service",
- "food"},
- split_labels=True)
+ return instances, y, label_set, subset_labels, gold, multi_labels
-toy_test = partial(loader,
- instance_path="../data/0/toy_test.txt",
- label_path="../data/0/toy_test_label.txt",
- subset_labels_path="../data/toy_train_label.txt",
- split_labels=True)
+# rest_15_test = partial(loader,
+# instance_path="data/restaurant_test_2015_tok.txt",
+# label_path="data/labels_restaurant_test_2015.txt",
+# subset_labels={"ambience",
+# "service",
+# "food"},
+# split_labels=True)
-def restaurants_train():
- yield rest_14_train()
- yield rest_15_train()
+def test(f, dataset):
+ for h in range(0, 101, 10):
+ data_test = partial(loader,
+ instance_path=f"../data/{dataset}/test/{h}/test.txt",
+ label_path=f"../data/{dataset}/test/{h}/test_label.txt",
+ label_multi_path=f"../data/{dataset}/test/{h}/test_label_multi.txt",
+ subset_labels_path=f"../data/{dataset}/train/{f}/train_label.txt",
+ split_labels=True)
-def restaurants_test():
- yield toy_test()
- # yield rest_14_test()
- # yield rest_15_test()
- # yield ganu_test()
+ yield data_test()
\ No newline at end of file
diff --git a/cat/utils.py b/cat/utils.py
index 2540b7c..c468399 100644
--- a/cat/utils.py
+++ b/cat/utils.py
@@ -5,7 +5,7 @@
def conll2text(paths, outpath):
"""Write a conll file to a text file."""
- with open(outpath, 'w') as f:
+ with open(outpath, 'w', encoding='utf-8') as f:
for path in paths:
for sent in pyconll.iter_from_file(path):
txt = []
diff --git a/cmn/__init__.py b/cmn/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cmn/mams.py b/cmn/mams.py
new file mode 100644
index 0000000..9e722e7
--- /dev/null
+++ b/cmn/mams.py
@@ -0,0 +1,93 @@
+import os, spacy
+from tqdm import tqdm
+import xml.etree.ElementTree as ET
+
+from cmn.review import Review
+
+class MAMSReview(Review):
+ def __init__(self, id, sentences, time, author, aos):
+        super().__init__(id, sentences, time, author, aos)
+
+ @staticmethod
+ def xmlloader(path):
+ reviews_list = []
+ nlp = spacy.load("en_core_web_sm")
+ tree = ET.parse(path)
+ sentences = tree.getroot()
+ i = -1
+ for sentence in sentences: # each sentence is an individual review, unlike SemEval16
+ i += 1
+
+ text = ""
+ tokens = []
+ aos_list_list = []
+
+ for data in sentence:
+ if data.tag == "text": # clean the associated aspect tokens from punctuations
+ raw_text = data.text
+ current_text = raw_text
+ opinion_text = sentence.findall(".//aspectTerm")
+ for o in opinion_text:
+ aspect = o.attrib["term"]
+ aspect_list = aspect.split()
+ if len(aspect_list) == 0: # contains no aspect (mams dataset doesn't have NULL aspects)
+ continue
+ letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))
+ current_text = current_text.replace(' ', ' ')
+ current_text = current_text[0:letter_index_tuple[0]] + ' ' + aspect + ' ' + current_text[letter_index_tuple[1]+1:]
+ #print("processing text:" + str(current_text))
+ tokens = current_text.split()
+
+ if data.tag == "aspectTerms":
+ aos_list = []
+ for o in data: # each o is an aspectTerm
+
+ sentiment = o.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0')
+
+ aspect = o.attrib["term"]
+ aspect_list = aspect.split() # the aspect may consist more than 1 word
+ if len(aspect_list) == 0:
+ continue
+
+ letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))
+
+ # find the aspect instance of all text instances of the phrase
+ #print(tokens)
+
+                        text_incidences = [i for i in range(len(raw_text))
+                                           if raw_text.startswith(aspect, i)
+                                           and (i == 0 or not raw_text[i-1].isalpha())
+                                           and (i + len(aspect) >= len(raw_text) or not raw_text[i+len(aspect)].isalpha())]
+ #print("text incidences: " + str(text_incidences))
+ idx_of_from = text_incidences.index(letter_index_tuple[0])
+ #print("index of from: " + str(idx_of_from))
+
+ # find the location of the aspect token
+ start_token_of_aspect = [i for i in range(len(tokens))
+ if i + len(aspect_list) <= len(tokens)
+ and tokens[i:i + len(aspect_list)] == aspect_list]
+
+ #print("start token of aspect: " + str(start_token_of_aspect))
+
+ idx_start_token_of_aspect = start_token_of_aspect[idx_of_from]
+
+ idx_aspect_list = list(
+ range(idx_start_token_of_aspect, idx_start_token_of_aspect + len(aspect_list)))
+
+ # compile the final aos 3-tuple for each aspect
+ aos = (idx_aspect_list, [], eval(sentiment))
+
+ if len(aos) != 0:
+ aos_list.append(aos)
+
+ if len(aos_list) != 0:
+ aos_list_list.append(aos_list)
+
+ if len(aos_list_list) == 0: # if no aspect in the sentence, it is not added
+ continue
+
+ reviews_list.append(
+ Review(id=i, sentences=[[str(t).lower() for t in current_text.split()]], time=None,
+ author=None, aos=aos_list_list, lempos=""))
+
+ return reviews_list
\ No newline at end of file
diff --git a/cmn/review.py b/cmn/review.py
new file mode 100644
index 0000000..a3dbcb1
--- /dev/null
+++ b/cmn/review.py
@@ -0,0 +1,271 @@
+import pandas as pd, copy, numpy as np
+from scipy.spatial.distance import cosine
+
+class Review(object):
+ translator_mdl = None; translator_tokenizer = None
+ semantic_mdl = None; align_mdl = None
+ def __init__(self, id, sentences, time=None, author=None, aos=None, lempos=None, parent=None, lang='eng_Latn', category=None):
+ self.id = id
+ self.sentences = sentences #list of sentences of list of tokens
+ self.time = time
+ self.author = author
+ self.aos = aos #list of list of aspect_opinion_sentiment triples for per sentence, e.g., [[([7,8], [10, 11, 12], -1), ([15,17], [20], +1)]]
+ self.lempos = lempos
+ self.lang = lang
+ self.category = category
+
+ self.parent = parent
+ self.augs = {} #distionary of translated and backtranslated augmentations of this review in object format, e.g.,
+ # {'deu_Latn': (Review1(self.id, 'dies ist eine bewertung', None, None, None, None, self, 'deu_Latn'),
+ # Review2(self.id, 'this is a review', None, None, None, None, self, 'eng_Latn'),
+ # semantic_similarity_score)
+
+ def to_dict(self, w_augs=False):
+ result = [{'id': self.id,
+ 'text': self.get_txt(),
+ 'sentences': self.sentences,
+ 'aos': self.get_aos(), #self.parent.get_aos() if self.parent else self.get_aos(),
+ 'lang': self.lang,
+ 'orig': False if self.parent else True}]
+ if not w_augs: return result
+ for k in self.augs:
+ #result += self.augs[k][0].to_dict()
+ result += self.augs[k][1].to_dict()
+ return result
+
+ def get_aos(self):
+ r = []
+ if not self.aos: return r
+ for i, aos in enumerate(self.aos): r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos])
+ return r
+
+ def get_txt(self): return '. '.join(' '.join(s) for s in self.sentences)
+
+ def hide_aspects(self):
+ r = copy.deepcopy(self)
+ for i, sent in enumerate(r.sentences):
+ # [sent.pop(k) for j, _, _ in r.aos[i] for k in j]
+ for j, _, _ in r.aos[i]:
+ for k in j: sent[k] = '#####'
+ return r
+
+ def preprocess(self): return self # note that any removal of words breakes the aos indexing!
+
+ def translate(self, tgt, settings):
+ src = self.lang
+ if not Review.translator_mdl:
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ Review.translator_mdl = AutoModelForSeq2SeqLM.from_pretrained(settings['nllb'])
+ Review.translator_tokenizer = AutoTokenizer.from_pretrained(settings['nllb'])
+
+ from transformers import pipeline
+ Review.translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=src, tgt_lang=tgt, max_length=settings['max_l'], device=settings['device'])
+ Review.back_translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=tgt, tgt_lang=src, max_length=settings['max_l'], device=settings['device'])
+
+ translated_txt = Review.translator(self.get_txt())[0]['translation_text']
+ translated_obj = Review(id=self.id, sentences=[[str(t).lower() for t in translated_txt.split()]], parent=self, lang=tgt, time=None, author=None, aos=None)
+ translated_obj.aos, _ = self.semalign(translated_obj)
+
+ back_translated_txt = Review.back_translator(translated_txt)[0]['translation_text']
+ back_translated_obj = Review(id=self.id, sentences=[[str(t).lower() for t in back_translated_txt.split()]], parent=self, lang=src, time=None, author=None, aos=None)
+ back_translated_obj.aos, _ = self.semalign(back_translated_obj)
+
+ self.augs[tgt] = (translated_obj, back_translated_obj, self.semsim(back_translated_obj))
+ return self.augs[tgt]
+
+ def semsim(self, other):
+ if not Review.semantic_mdl:
+ from sentence_transformers import SentenceTransformer
+ Review.semantic_mdl = SentenceTransformer("johngiorgi/declutr-small")
+ me, you = Review.semantic_mdl.encode([self.get_txt(), other.get_txt()])
+ return 1 - cosine(me, you)
+
+ def semalign(self, other):
+ if not Review.align_mdl:
+ from simalign import SentenceAligner
+ Review.align_mdl = SentenceAligner(model="bert", token_type="bpe", matching_methods="i")
+ aligns = [Review.align_mdl.get_word_aligns(s1, o1)['itermax'] for s1, o1 in zip(self.sentences, other.sentences)]
+ other_aos = []
+ for i, (aos, _) in enumerate(zip(self.aos, self.sentences)):
+ for (a, o, s) in aos:
+ other_a = [idx2 for idx in a for idx1, idx2 in aligns[i] if idx == idx1]
+ other_a.sort()
+ other_aos.append((other_a, o, s))
+ return other_aos, aligns
+
+ def get_lang_stats(self):
+ import nltk
+ from rouge import Rouge
+ from sklearn.metrics import accuracy_score
+
+ result = {}
+ r = self.get_txt()
+ result['r_ntoken'] = len(r.split())
+ for lang in self.augs.keys():
+ r_ = self.augs[lang][1].get_txt()
+ # r_ = r #for testing purpose => should be very close to 1 for all metrics
+ result[lang + '_r_backtrans_ntoken'] = len(r_.split())
+ result[lang + '_semsim'] = self.augs[lang][2]
+ result[lang + '_bleu'] = np.mean(nltk.translate.bleu_score.sentence_bleu([r.split()], r_.split(), weights=[(1 / bleu_no,) * bleu_no for bleu_no in range(1, min(4, result['r_ntoken'] + 1))]))
+ # https://pypi.org/project/rouge/
+ result[lang + '_rouge_f'] = np.mean([v['f'] for k, v in Rouge(metrics=[f'rouge-{i+1}' for i in range(0, min(5, len(r.split())))]).get_scores(r_, r)[0].items()])
+ # we need to make r_ as equal size as r
+ result[lang + '_em'] = accuracy_score(r.split(), r_.split()[:result['r_ntoken']] if len(r_.split()) > result['r_ntoken'] else r_.split() + [''] * (result['r_ntoken'] - len(r_.split())))
+ return result
+
+ @staticmethod
+ def load(path): pass
+
+ @staticmethod
+ def to_df(reviews, w_augs=False): return pd.DataFrame.from_dict([rr for r in reviews for rr in r.to_dict(w_augs)])
+
+ @staticmethod
+ def translate_batch(reviews, tgt, settings):
+ src = reviews[0].lang
+ if not Review.translator_mdl:
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ Review.translator_mdl = AutoModelForSeq2SeqLM.from_pretrained(settings['nllb'])
+ Review.translator_tokenizer = AutoTokenizer.from_pretrained(settings['nllb'])
+
+ from transformers import pipeline
+ translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=src, tgt_lang=tgt, max_length=settings['max_l'], device=settings['device'])
+ back_translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=tgt, tgt_lang=src, max_length=settings['max_l'], device=settings['device'])
+
+ reviews_txt = [r.get_txt() for r in reviews]
+ translated_txt = translator(reviews_txt)
+ back_translated_txt = back_translator([r_['translation_text'] for r_ in translated_txt])
+
+ for i, r in enumerate(reviews):
+ translated_obj = Review(id=r.id, sentences=[[str(t).lower() for t in translated_txt[i]['translation_text'].split()]], parent=r, lang=tgt, time=None, author=None, aos=None, lempos=None,)
+ translated_obj.aos, _ = r.semalign(translated_obj)
+
+ back_translated_obj = Review(id=r.id, sentences=[[str(t).lower() for t in back_translated_txt[i]['translation_text'].split()]], parent=r, lang=src, time=None, author=None, aos=r.aos, lempos=None,)
+ r.augs[tgt] = (translated_obj, back_translated_obj, r.semsim(back_translated_obj))
+
+ @staticmethod
+ def get_stats(datapath, output, cache=True, plot=True, plot_title=None):
+ try:
+ print(f'Loading the stats pickle from {datapath}...')
+ if not cache: raise FileNotFoundError
+ stats = pd.read_pickle(f'{output}/stats.pkl')
+ if plot: Review.plot_dist(stats, output, plot_title)
+ except FileNotFoundError:
+ print(f'File {datapath} not found! Generating stats ...')
+ reviews = pd.read_pickle(datapath)
+ from collections import Counter
+ stats = {'*nreviews': len(reviews), '*naspects': 0, '*ntokens': 0}
+ asp_nreviews = Counter() # aspects : number of reviews that contains the aspect
+ token_nreviews = Counter() # tokens : number of reviews that contains the token
+ nreviews_naspects = Counter() # v number of reviews with 1 aspect, ..., k aspects, ...
+ nreviews_ntokens = Counter() # v number of reviews with 1 token, ..., k tokens, ...
+ nreviews_category = Counter() # v number of categories with 1 review, ..., k reviews, ...
+ reviews_lang_stats = []
+
+ for r in reviews:
+ r_aspects = r.get_aos()[0]
+ r_tokens = [token for sentence in r.sentences for token in sentence]
+ asp_nreviews.update(' '.join(a) for (a, o, s) in r_aspects)
+ token_nreviews.update(token for token in r_tokens)
+ nreviews_naspects.update([len(r_aspects)])
+ nreviews_ntokens.update([len(r_tokens)])
+ if hasattr(r, 'category'): nreviews_category.update(r.category)
+
+ reviews_lang_stats.append(r.get_lang_stats())
+
+ naspects_nreviews = Counter(asp_nreviews.values()) # v number of aspects with 1 review, ..., k reviews, ...
+ ntokens_nreviews = Counter(token_nreviews.values()) # v number of tokens with 1 review, ..., k reviews, ...
+ stats["*naspects"] = len(asp_nreviews.keys()) # unique. Non-unique number of aspects: sum(asp_nreviews.values())
+ stats["*ntokens"] = len(token_nreviews.keys()) # unique. Non-unique number of tokens: sum(token_nreviews.values())
+ stats['nreviews_naspects'] = {k: v for k, v in sorted(nreviews_naspects.items(), key=lambda item: item[1], reverse=True)}
+ stats['nreviews_ntokens'] = {k: v for k, v in sorted(nreviews_ntokens.items(), key=lambda item: item[1], reverse=True)}
+ stats['naspects_nreviews'] = {k: v for k, v in sorted(naspects_nreviews.items(), key=lambda item: item[1], reverse=True)}
+ stats['ntokens_nreviews'] = {k: v for k, v in sorted(ntokens_nreviews.items(), key=lambda item: item[1], reverse=True)}
+ stats['nreviews_category'] = {k: v for k, v in sorted(nreviews_category.items(), key=lambda item: item[1], reverse=True)}
+ stats['*avg_ntokens_review'] = sum(k * v for k, v in nreviews_ntokens.items()) / sum(nreviews_ntokens.values()) # average number of tokens per review
+ stats['*avg_naspects_review'] = sum(k * v for k, v in nreviews_naspects.items()) / sum(nreviews_naspects.values()) # average number of aspects per review
+ stats['*avg_lang_stats'] = pd.DataFrame.from_dict(reviews_lang_stats).mean().to_dict()
+ if output: pd.to_pickle(stats, f'{output}/stats.pkl')
+ if plot: Review.plot_dist(stats, output, plot_title)
+ import json
+ print(json.dumps(stats, indent=4))
+ # print(stats)
+ return stats
+
+ @staticmethod
+ def plot_dist(stats, output, plot_title):
+ from matplotlib import pyplot as plt
+ plt.rcParams.update({'font.family': 'Consolas'})
+ print("plotting distribution data ...")
+ for k, v in stats.items():
+ if (not k.startswith("*")): # the * values cannot be plotted
+ fig = plt.figure(k, figsize=(1.5, 1.5))
+ ax = fig.add_subplot(1, 1, 1)
+ ax.set_facecolor('whitesmoke')
+ ax.loglog(*zip(*stats[k].items()), marker='x', linestyle='None', markeredgecolor='m')
+ ax.set_xlabel(k.split('_')[1][0].replace('n', '#') + k.split('_')[1][1:])
+ ax.set_ylabel(k.split('_')[0][0].replace('n', '#') + k.split('_')[0][1:])
+ ax.grid(True, color="#93a1a1", alpha=0.3)
+ ax.minorticks_off()
+ ax.xaxis.set_tick_params(size=2, direction='in')
+ ax.yaxis.set_tick_params(size=2, direction='in')
+
+ # wrapping labels
+ labels = []
+ for l in ax.get_xticklabels():
+ l.set_text('\n#'.join(l.get_text().split("#")))
+ labels.append(l)
+ ax.set_xticklabels(labels, ha='left')
+
+ ax.xaxis.get_label().set_size(12)
+ ax.yaxis.get_label().set_size(12)
+ ax.set_title(plot_title)
+ fig.savefig(f'{output}/{k}.pdf', dpi=100, bbox_inches='tight')
+ plt.show()
+
+ @staticmethod
+ def plot_semsim_dist(datapath, output, plot_title):
+ from matplotlib import pyplot as plt
+ plt.rcParams.update({'font.family': 'Consolas'})
+ import seaborn as sns
+ reviews = pd.read_pickle(datapath)
+
+ hist_dict = [{'original': 'eng_Latn', 'target': Review.lang_title(k), 'score': v[2]} for r in reviews for k, v in r.augs.items()]
+ df = pd.DataFrame.from_dict(hist_dict)
+ fig = plt.figure(figsize=(6, 2))
+ ax = fig.add_subplot(1, 1, 1)
+ x_range = [i / 10 for i in range(0, 11)]
+ # ax.set_ylim([0, len(reviews)])
+ # plt.yscale('log')
+ plt.ylabel('#reviews')
+ plt.xlabel('similarity score')
+ ax.set_title(plot_title, x=0.2, y=0.8, fontsize=11)
+ ax.set_facecolor('whitesmoke')
+ h = sns.histplot(df,
+ x='score',
+ hue='target',
+ element='step', #also try 'poly'
+ stat='density',
+ common_norm=False)
+ sns.move_legend(ax, 'upper left')
+
+ plt.legend([])
+
+ h.legend_.set_title(None)
+ h.set(xticks=x_range)
+ plt.savefig(f'{output}', dpi=100, bbox_inches='tight')
+ # df.to_csv(f'{output.replace("pdf", "csv")}')
+ plt.show()
+ # plt.clf()
+
+ @staticmethod
+ def lang_title(lang_code):
+ if lang_code == 'zho_Hans': return 'chinese'
+ elif lang_code == 'deu_Latn': return 'german'
+ elif lang_code == 'fra_Latn': return 'french'
+ elif lang_code == 'arb_Arab': return 'arabic'
+ elif lang_code == 'pes_Arab': return 'farsi'
+ elif lang_code == 'spa_Latn': return 'spanish'
+ elif lang_code == 'eng_Latn': return 'english'
+ elif lang_code == 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn': return 'all'
+    elif lang_code is None: return ['pes_Arab', 'zho_Hans', 'deu_Latn', 'arb_Arab', 'fra_Latn', 'spa_Latn', 'all']
+
diff --git a/cmn/semeval.py b/cmn/semeval.py
new file mode 100644
index 0000000..0278535
--- /dev/null
+++ b/cmn/semeval.py
@@ -0,0 +1,107 @@
+import os, spacy
+from tqdm import tqdm
+import xml.etree.ElementTree as et
+
+#nlp = spacy.load("en_core_web_sm") # en_core_web_trf for transformer-based; error ==> python -m spacy download en_core_web_sm
+
+from cmn.review import Review
+
+class SemEvalReview(Review):
+
+    def __init__(self, id, sentences, time, author, aos): super().__init__(id, sentences, time, author, aos)
+
+ @staticmethod
+ def load(path):
+ if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path)
+        return SemEvalReview._txtloader(path)
+
+ @staticmethod
+ def _txtloader(path):
+ reviews = []
+ with tqdm(total=os.path.getsize(path)) as pbar, open(path, "r", encoding='utf-8') as f:
+ for i, line in enumerate(f.readlines()):
+ pbar.update(len(line))
+ sentence, aos = line.split('####')
+ aos = aos.replace('\'POS\'', '+1').replace('\'NEG\'', '-1').replace('\'NEU\'', '0')
+
+ # for the current datafile, each row is a review of single sentence!
+ # sentence = nlp(sentence)
+ reviews.append(Review(id=i, sentences=[[str(t).lower() for t in sentence.split()]], time=None, author=None,
+ aos=[eval(aos)], lempos=None,
+ parent=None, lang='eng_Latn'))
+ return reviews
+
+ @staticmethod
+ def _xmlloader(path):
+ reviews_list = []
+ xtree = et.parse(path).getroot()
+ if xtree.tag == 'Reviews': reviews = [SemEvalReview._parse(xsentence) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences]
+ if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence) for xsentence in tqdm(xtree)]
+
+ return [r for r in reviews if r]
+
+ @staticmethod
+ def _map_idx(aspect, text):
+ # aspect: ('token', from_char, to_char)
+ text_tokens = text[:aspect[1]].split()
+ # to fix if "aaaa ,b, c" ",b c" if b is the aspect
+ if len(text_tokens) > 0 and not text[aspect[1] - 1].isspace(): text_tokens.pop()
+ aspect_tokens = aspect[0].split()
+
+ # tmp = [*text] #mutable string :)
+ # # these two blank space add bug to the char indexes for aspects if a sentence have multiple aspects!
+ # tmp[aspect[1]: aspect[2]] = [' '] + [*aspect[0]] + [' ']
+ # text = ''.join(tmp)
+
+ return [i for i in range(len(text_tokens), len(text_tokens) + len(aspect_tokens))]
+
+ @staticmethod
+ def _parse(xsentence):
+ id = xsentence.attrib["id"]
+ aos = []; aos_cats = []
+ for element in xsentence:
+ if element.tag == 'text': sentence = element.text # we consider each sentence as a signle review
+ elif element.tag == 'Opinions':#semeval-15-16
+ #
+ for opinion in element:
+ if opinion.attrib["target"] == 'NULL': continue
+ # we may have duplicates for the same aspect due to being in different category like in semeval 2016's
+ aspect = (opinion.attrib["target"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
+ # we need to map char index to token index in aspect
+ aspect = SemEvalReview._map_idx(aspect, sentence)
+ category = opinion.attrib["category"] # 'RESTAURANT#GENERAL'
+ sentiment = opinion.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0') #'+1'
+ aos.append((aspect, [], sentiment, opinion.attrib["target"]))
+ aos_cats.append(category)
+ aos = sorted(aos, key=lambda x: int(x[0][0])) #based on start of sentence
+
+ elif element.tag == 'aspectTerms':#semeval-14
+ #
+ for opinion in element:
+ if opinion.attrib["term"] == 'NULL': continue
+ # we may have duplicates for the same aspect due to being in different category like in semeval 2016's
+ aspect = (opinion.attrib["term"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10)
+ # we need to map char index to token index in aspect
+ aspect = SemEvalReview._map_idx(aspect, sentence)
+ sentiment = opinion.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0') #'+1'
+ aos.append((aspect, [], sentiment, opinion.attrib["term"]))
+
+ aos = sorted(aos, key=lambda x: int(x[0][0])) #based on start of sentence
+
+ elif element.tag == 'aspectCategories': # semeval-14
+ for opinion in element:
+ #
+ aos_cats.append(opinion.attrib["category"])
+
+ #sentence = nlp(sentence) # as it does some processing, it destroys the token idx for aspect term
+ tokens = sentence.split()
+ # to fix ",a b c," to "a b c"
+ # to fix '"sales" team' to 'sales team' => semeval-14-labptop-
+ # todo: fix 'Food-awesome.' to 'food awesome' => semeval-14-restaurant-
+ for i, (idxlist, o, s, aspect_token) in enumerate(aos):
+ for j, idx in enumerate(idxlist): tokens[idx] = aspect_token.split()[j].replace('"', '')
+ aos[i] = (idxlist, o, s)
+ return Review(id=id, sentences=[[str(t).lower() for t in tokens]], time=None, author=None,
+ aos=[aos], lempos=None,
+ parent=None, lang='eng_Latn', category=aos_cats) if aos else None
+
diff --git a/experiments/experiment_test-googletranslate.py b/experiments/experiment_test-googletranslate.py
new file mode 100644
index 0000000..4286ccf
--- /dev/null
+++ b/experiments/experiment_test-googletranslate.py
@@ -0,0 +1,133 @@
+"""Experiment on the test data."""
+import json
+import os
+
+import numpy as np
+
+# LADy_eval
+import pytrec_eval
+import pandas as pd
+
+from cat.simple import get_scores, attention, rbf_attention
+from cat.dataset import test
+from reach import Reach
+from sklearn.metrics import precision_recall_fscore_support
+from collections import defaultdict, Counter
+from itertools import product
+
+
+GAMMA = .03
+BEST_ATT = {"n_noun": 980}
+BEST_RBF = {"n_noun": 200}
+
+if __name__ == "__main__":
+
+ # LADy_eval
+ metrics = ['P', 'recall', 'ndcg_cut', 'map_cut', 'success']
+ topkstr = '1,5,10,100'
+ metrics_set = set()
+ for m in metrics:
+ metrics_set.add(f'{m}_{topkstr}')
+ datasets = []
+ for d in ['googletranslate-twitter']: # 'googletranslate-SemEval-14-L'
+ for l in ['en', 'fa', 'zh-CN', 'de', 'ar', 'fr', 'es', 'fa.zh-CN.de.ar.fr.es']: #
+ if l == 'en':
+ datasets.append(f'{d}')
+ else:
+ datasets.append(f'{d}-{l}')
+ for dataset in datasets:
+
+ output_path = f'../output-googletranslate/{dataset}'
+ if not os.path.isdir(output_path):
+ os.makedirs(output_path)
+
+ mean_list = [pd.DataFrame() for i in range(0, 11)]
+ for f in range(5):
+ fold_path = f'{dataset}/train/{f}'
+ scores = defaultdict(dict)
+ r = Reach.load(f'../embeddings/{fold_path}/vecs_w2v.vec',
+ unk_word="")
+ d = json.load(open(f'../data/{fold_path}/nouns.json'))
+
+ nouns = Counter()
+ for k, v in d.items():
+ if k.lower() in r.items:
+ nouns[k.lower()] += v
+
+ embedding_paths = [f'../embeddings/{fold_path}/vecs_w2v.vec']
+ # bundles = ((rbf_attention, attention), embedding_paths)
+ bundles = ((rbf_attention, ), embedding_paths)
+
+ for att, path in product(*bundles):
+ r = Reach.load(path, unk_word="")
+
+ if att == rbf_attention:
+ candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
+ else:
+ candidates, _ = zip(*nouns.most_common(BEST_ATT["n_noun"]))
+
+ aspects = [[x] for x in candidates]
+ sorted_output = []
+ for idx, (instances, y, label_set, subset_labels, gold, multi_labels) in enumerate(test(f, dataset)):
+ # output_path_hidden = f'{output_path}/{idx*10}'
+ # if not os.path.isdir(output_path_hidden):
+ # os.makedirs(output_path_hidden)
+ # print("label_set", label_set)
+ s = get_scores(instances,
+ aspects,
+ r,
+ subset_labels,
+ gamma=GAMMA,
+ remove_oov=False,
+ attention_func=att)
+
+ # print("predicted", s)
+ # print("subset_labels", subset_labels)
+ # print("gold", list(gold))
+ output = [[(label, value) for value, label in zip(sublist, subset_labels)] for sublist in s]
+ sorted_output = [sorted(sublist, key=lambda x: x[1], reverse=True) for sublist in output]
+ # print("output", sorted_output)
+
+ qrel = dict()
+ run = dict()
+
+ for i, word in enumerate(multi_labels):
+ q_key = 'q{}'.format(i)
+ # qrel[q_key] = {word: 1}
+ qrel[q_key] = {w: 1 for w in word}
+
+ for i, sublist in enumerate(sorted_output):
+ q_key = 'q{}'.format(i)
+ run[q_key] = {}
+ for j, (word, _) in enumerate(sublist):
+ run[q_key][word] = len(sublist) - j
+
+ # print("qrel: ", qrel)
+ # print("run: ", run)
+
+ print(f'pytrec_eval for {metrics_set} for fold {f} with {idx*10} percent hidden aspect in dataset {dataset}...')
+ df = pd.DataFrame.from_dict(pytrec_eval.RelevanceEvaluator(qrel, metrics_set).evaluate(run))
+ df_mean = df.mean(axis=1).to_frame('mean')
+ df_mean.to_csv(f'{output_path}/f{f}.model.ad.pred.{idx/10}.eval.mean.csv')
+ mean_list[idx] = pd.concat([mean_list[idx], df_mean], axis=1)
+ for i in range(0, 11):
+ # output_path_hidden = f'{output_path}/{i*10}'
+ mean_list[i].mean(axis=1).to_frame('mean').to_csv(f'{output_path}/model.ad.pred.eval.mean.{i/10}.csv')
+ # y_pred = s.argmax(1)
+ # f1_score = precision_recall_fscore_support(y, y_pred)
+ # f1_macro = precision_recall_fscore_support(y,
+ # y_pred,
+ # average="weighted")
+ # scores[(att, path)][idx] = (f1_score, f1_macro)
+ #
+ # att_score = {k: v for k, v in scores.items() if k[0] == attention}
+ # att_per_class = [[z[x][0][:-1] for x in range(3)]
+ # for z in att_score.values()]
+ # att_per_class = np.stack(att_per_class).mean(0)
+ # att_macro = np.mean([v[2][1][:-1] for v in att_score.values()], 0)
+ #
+ # rbf_score = {k: v for k, v in scores.items() if k[0] == rbf_attention}
+ # rbf_per_class = [[z[x][0][:-1] for x in range(3)]
+ # for z in rbf_score.values()]
+ # rbf_per_class = np.stack(rbf_per_class).mean(0)
+ # rbf_macro = np.mean([v[2][1][:-1] for v in rbf_score.values()], 0)
diff --git a/experiments/experiment_test.py b/experiments/experiment_test.py
index 4a0e39e..29677bd 100644
--- a/experiments/experiment_test.py
+++ b/experiments/experiment_test.py
@@ -9,7 +9,7 @@
import pandas as pd
from cat.simple import get_scores, attention, rbf_attention
-from cat.dataset import restaurants_test
+from cat.dataset import test
from reach import Reach
from sklearn.metrics import precision_recall_fscore_support
from collections import defaultdict, Counter
@@ -23,76 +23,96 @@
if __name__ == "__main__":
# LADy_eval
- output_path = "../output/"
- if not os.path.isdir(output_path):
- os.makedirs(output_path)
metrics = ['P', 'recall', 'ndcg_cut', 'map_cut', 'success']
topkstr = '1,5,10,100'
metrics_set = set()
for m in metrics:
- metrics_set.add(f'{m}_{topkstr}')
-
- scores = defaultdict(dict)
- r = Reach.load("../embeddings/toy_vecs_w2v.vec",
- unk_word="")
- d = json.load(open("../data/toy_nouns.json"))
-
- nouns = Counter()
- for k, v in d.items():
- if k.lower() in r.items:
- nouns[k.lower()] += v
-
- embedding_paths = ["../embeddings/toy_vecs_w2v.vec"]
- # bundles = ((rbf_attention, attention), embedding_paths)
- bundles = ((rbf_attention, ), embedding_paths)
-
- for att, path in product(*bundles):
- r = Reach.load(path, unk_word="")
-
- if att == rbf_attention:
- candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
- else:
- candidates, _ = zip(*nouns.most_common(BEST_ATT["n_noun"]))
-
- aspects = [[x] for x in candidates]
- sorted_output = []
- for idx, (instances, y, label_set, subset_labels, gold) in enumerate(restaurants_test()):
- # print("label_set", label_set)
- s = get_scores(instances,
- aspects,
- r,
- subset_labels,
- gamma=GAMMA,
- remove_oov=False,
- attention_func=att)
-
- # print("predicted", s)
- # print("subset_labels", subset_labels)
- # print("gold", list(gold))
- output = [[(label, value) for value, label in zip(sublist, subset_labels)] for sublist in s]
- sorted_output = [sorted(sublist, key=lambda x: x[1], reverse=True) for sublist in output]
- # print("output", sorted_output)
-
- qrel = dict()
- run = dict()
-
- for i, word in enumerate(gold):
- q_key = 'q{}'.format(i)
- qrel[q_key] = {word: 1}
-
- for i, sublist in enumerate(sorted_output):
- q_key = 'q{}'.format(i)
- run[q_key] = {}
- for j, (word, _) in enumerate(sublist):
- run[q_key][word] = len(sublist) - j
-
- print("qrel: ", qrel)
- print("run: ", run)
-
- print(f'pytrec_eval for {metrics_set} ...')
- df = pd.DataFrame.from_dict(pytrec_eval.RelevanceEvaluator(qrel, metrics_set).evaluate(run))
- df_mean = df.mean(axis=1).to_frame('mean')
- df_mean.to_csv(f'{output_path}pred.eval.mean.csv')
+ metrics_set.add(f'{m}_{topkstr}')
+ datasets = []
+ for d in ['lowresource-2014r']:
+ for l in ['lao_Laoo', 'san_Deva']:
+ if l == 'eng':
+ datasets.append(f'{d}')
+ else:
+ datasets.append(f'{d}-{l}')
+ for dataset in datasets:
+
+ output_path = f'../output-low-resource/{dataset}'
+ if not os.path.isdir(output_path):
+ os.makedirs(output_path)
+
+ mean_list = [pd.DataFrame() for i in range(0, 11)]
+ for f in range(5):
+ fold_path = f'{dataset}/train/{f}'
+ scores = defaultdict(dict)
+ r = Reach.load(f'../embeddings/{fold_path}/vecs_w2v.vec',
+ unk_word="")
+ d = json.load(open(f'../data/{fold_path}/nouns.json'))
+
+ nouns = Counter()
+ for k, v in d.items():
+ if k.lower() in r.items:
+ nouns[k.lower()] += v
+
+ embedding_paths = [f'../embeddings/{fold_path}/vecs_w2v.vec']
+ # bundles = ((rbf_attention, attention), embedding_paths)
+ bundles = ((rbf_attention, ), embedding_paths)
+
+ for att, path in product(*bundles):
+ r = Reach.load(path, unk_word="")
+
+ if att == rbf_attention:
+ candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
+ else:
+ candidates, _ = zip(*nouns.most_common(BEST_ATT["n_noun"]))
+
+ aspects = [[x] for x in candidates]
+ sorted_output = []
+ for idx, (instances, y, label_set, subset_labels, gold, multi_labels) in enumerate(test(f, dataset)):
+ # output_path_hidden = f'{output_path}/{idx*10}'
+ # if not os.path.isdir(output_path_hidden):
+ # os.makedirs(output_path_hidden)
+ # print("label_set", label_set)
+ s = get_scores(instances,
+ aspects,
+ r,
+ subset_labels,
+ gamma=GAMMA,
+ remove_oov=False,
+ attention_func=att)
+
+ # print("predicted", s)
+ # print("subset_labels", subset_labels)
+ # print("gold", list(gold))
+ output = [[(label, value) for value, label in zip(sublist, subset_labels)] for sublist in s]
+ sorted_output = [sorted(sublist, key=lambda x: x[1], reverse=True) for sublist in output]
+ # print("output", sorted_output)
+
+ qrel = dict()
+ run = dict()
+
+ for i, word in enumerate(multi_labels):
+ q_key = 'q{}'.format(i)
+ # qrel[q_key] = {word: 1}
+ qrel[q_key] = {w: 1 for w in word}
+
+ for i, sublist in enumerate(sorted_output):
+ q_key = 'q{}'.format(i)
+ run[q_key] = {}
+ for j, (word, _) in enumerate(sublist):
+ run[q_key][word] = len(sublist) - j
+
+ # print("qrel: ", qrel)
+ # print("run: ", run)
+
+ print(f'pytrec_eval for {metrics_set} for fold {f} with {idx*10} percent hidden aspect in dataset {dataset}...')
+ df = pd.DataFrame.from_dict(pytrec_eval.RelevanceEvaluator(qrel, metrics_set).evaluate(run))
+ df_mean = df.mean(axis=1).to_frame('mean')
+ df_mean.to_csv(f'{output_path}/f{f}.model.ad.pred.{idx/10}.eval.mean.csv')
+ mean_list[idx] = pd.concat([mean_list[idx], df_mean], axis=1)
+ for i in range(0, 11):
+ # output_path_hidden = f'{output_path}/{i*10}'
+ mean_list[i].mean(axis=1).to_frame('mean').to_csv(f'{output_path}/model.ad.pred.eval.mean.{i/10}.csv')
# y_pred = s.argmax(1)
# f1_score = precision_recall_fscore_support(y, y_pred)
# f1_macro = precision_recall_fscore_support(y,
diff --git a/experiments/preprocessing_embeddings-googletranslate.py b/experiments/preprocessing_embeddings-googletranslate.py
new file mode 100644
index 0000000..da9b945
--- /dev/null
+++ b/experiments/preprocessing_embeddings-googletranslate.py
@@ -0,0 +1,41 @@
+"""Creating fragments takes a long time so we treat it as a
+pre-processing step."""
+import logging
+import os
+
+from gensim.models import Word2Vec
+from cat.fragments import create_noun_counts
+from cat.utils import conll2text
+
+logging.basicConfig(level=logging.INFO)
+
+
+if __name__ == "__main__":
+ datasets = []
+ for d in ['googletranslate-twitter']:
+ for l in ['en', 'fa', 'zh-CN', 'de', 'ar', 'fr', 'es', 'fa.zh-CN.de.ar.fr.es']:
+ if l == 'en':
+ datasets.append(f'{d}')
+ else:
+ datasets.append(f'{d}-{l}')
+ for dataset in datasets:
+ for f in range(5):
+ fold_path = f'{dataset}/train/{f}'
+ paths = [f'../data/{fold_path}/input.conllu']
+ create_noun_counts(paths, f'../data/{fold_path}/nouns.json')
+ conll2text(paths, f'../data/{fold_path}/all_txt.txt')
+ corpus = [x.lower().strip().split()
+ for x in open(f'../data/{fold_path}/all_txt.txt', encoding='utf-8')]
+
+ f = Word2Vec(corpus,
+ sg=0,
+ negative=5,
+ window=10,
+ vector_size=200,
+ min_count=2,
+ epochs=40,
+ workers=10)
+ embedding_path = f"../embeddings/{fold_path}"
+ if not os.path.isdir(embedding_path):
+ os.makedirs(embedding_path)
+ f.wv.save_word2vec_format(f'{embedding_path}/vecs_w2v.vec')
diff --git a/experiments/preprocessing_embeddings.py b/experiments/preprocessing_embeddings.py
index 76039ca..d86fa54 100644
--- a/experiments/preprocessing_embeddings.py
+++ b/experiments/preprocessing_embeddings.py
@@ -1,6 +1,7 @@
"""Creating fragments takes a long time so we treat it as a
pre-processing step."""
import logging
+import os
from gensim.models import Word2Vec
from cat.fragments import create_noun_counts
@@ -10,21 +11,31 @@
if __name__ == "__main__":
+ datasets = []
+ for d in ['lowresource-2014r']:
+ for l in ['lao_Laoo', 'san_Deva']:
+ if l == 'eng':
+ datasets.append(f'{d}')
+ else:
+ datasets.append(f'{d}-{l}')
+ for dataset in datasets:
+ for f in range(5):
+ fold_path = f'{dataset}/train/{f}'
+ paths = [f'../data/{fold_path}/input.conllu']
+ create_noun_counts(paths, f'../data/{fold_path}/nouns.json')
+ conll2text(paths, f'../data/{fold_path}/all_txt.txt')
+ corpus = [x.lower().strip().split()
+ for x in open(f'../data/{fold_path}/all_txt.txt', encoding='utf-8')]
- paths = ["../data/input.conllu"]
- create_noun_counts(paths,
- "../data/toy_nouns.json")
- conll2text(paths, "../data/toy_all_txt.txt")
- corpus = [x.lower().strip().split()
- for x in open("../data/toy_all_txt.txt")]
-
- f = Word2Vec(corpus,
- sg=0,
- negative=5,
- window=10,
- vector_size=200,
- min_count=2,
- epochs=5,
- workers=10)
-
- f.wv.save_word2vec_format(f"../embeddings/toy_vecs_w2v.vec")
+ f = Word2Vec(corpus,
+ sg=0,
+ negative=5,
+ window=10,
+ vector_size=200,
+ min_count=2,
+ epochs=200,
+ workers=10)
+ embedding_path = f"../embeddings/{fold_path}"
+ if not os.path.isdir(embedding_path):
+ os.makedirs(embedding_path)
+ f.wv.save_word2vec_format(f'{embedding_path}/vecs_w2v.vec')
diff --git a/text-to-CONLLu.py b/text-to-CONLLu.py
index 53fc960..85a485f 100644
--- a/text-to-CONLLu.py
+++ b/text-to-CONLLu.py
@@ -129,4 +129,16 @@ def main():
if __name__ == "__main__":
- main()
\ No newline at end of file
+ datasets = []
+ for d in ['lowresource-2015', 'lowresource-2016', 'lowresource-2014l', 'lowresource-2014r']: # , 'googletranslate-SemEval-14-L']:
+ for l in ['lao_Laoo', 'san_Deva']:
+ if l == 'eng':
+ datasets.append(f'{d}')
+ else:
+ datasets.append(f'{d}-{l}')
+ for dataset in datasets:
+ for f in range(5):
+ sys.argv.append(f'data/{dataset}/train/{f}/train.txt')
+ sys.argv.append(f'data/{dataset}/train/{f}/input.conllu')
+ main()
+ del sys.argv[-2:]
diff --git a/wrapper.py b/wrapper.py
index 6934c74..4a8fc52 100644
--- a/wrapper.py
+++ b/wrapper.py
@@ -11,9 +11,8 @@
def load(reviews, splits):
print('\nLoading reviews and preprocessing ...')
- print('#' * 50)
+ print('_' * 50)
try:
- print('\nLoading reviews files ...')
with open(f'{reviews}', 'rb') as f:
reviews = pickle.load(f)
with open(f'{splits}', 'r') as f:
@@ -25,44 +24,129 @@ def load(reviews, splits):
return reviews, splits
-def preprocess(org_reviews):
+def get_aos_augmented(review):
+ r = []
+ if not review.aos: return r
+ for i, aos in enumerate(review.aos): r.append([([review.sentences[i][j] for j in a], [review.sentences[i][j] for j in o], s) for (a, o, s) in aos])
+ return r
+
+
+def preprocess(org_reviews, status, lang):
reviews_list = []
label_list = []
for r in org_reviews:
if not len(r.aos[0]):
continue
else:
- for aos_instance in r.get_aos():
- for aos in aos_instance[0][0]:
- reviews_list.append(r.get_txt())
- label_list.append(aos)
- if r.augs:
- for key, value in r.items():
- for aos_instance in r[key][1].get_aos():
- for aos in aos_instance[0][0]:
- reviews_list.append(r[key][1].get_txt())
- label_list.append(aos)
+
+ if status == 'test':
+ reviews_list.append(r.get_txt())
+ label_list.append(r.get_aos()[0][0][0][0])
+
+ elif status == 'multi-test': # test should not be duplicated in case of having more than one aspect
+ reviews_list.append(r.get_txt())
+ label_per_review = []
+ for aos in r.get_aos()[0][0][0]:
+ label_per_review.append(aos)
+ label_list.append(label_per_review)
+
+ else: # train should be duplicated in case of having more than one aspect
+ for aos in r.get_aos()[0][0][0]:
+ text = r.get_txt()
+ label = aos
+ reviews_list.append(text)
+ label_list.append(label)
+
+ '''
+ if len(r.get_aos()[0][0][0]) == 1:
+ text = r.get_txt()
+ label = r.get_aos()[0][0][0][0]
+ reviews_list.append(text)
+ label_list.append(label)
+ '''
+
+ if r.augs and status == 'train': # data for train can be augmented
+
+ # if lang == 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn':
+ if lang == 'fa.zh-CN.de.ar.fr.es':
+ for key, value in r.augs.items():
+ for aos_instance in get_aos_augmented(r.augs[key][1])[0][0][0]:
+ text = r.augs[key][1].get_txt()
+ label = aos_instance
+ reviews_list.append(text)
+ label_list.append(label)
+ else:
+ # for l in lang.split('.'):
+ # for aos_instance in get_aos_augmented(r.augs[l][1])[0][0][0]:
+ # text = r.augs[l][1].get_txt()
+ # label = aos_instance
+ # reviews_list.append(text)
+ # label_list.append(label)
+ for aos_instance in get_aos_augmented(r.augs[lang][1])[0][0][0]:
+ text = r.augs[lang][1].get_txt()
+ label = aos_instance
+ reviews_list.append(text)
+ label_list.append(label)
+ '''
+ # for key, value in r.augs.items():
+ # if len(get_aos_augmented(r.augs[key][1])) == 0:
+ # text = r.augs[key][1].get_txt()
+ # reviews_list.append(text)
+ # continue
+ # for aos_instance in r.augs[key][1].get_aos()[0]:
+ if lang == 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn':
+ for key, value in r.augs.items():
+ if len(get_aos_augmented(r.augs[key][1])[0][0][0]) == 1:
+ text = r.augs[key][1].get_txt()
+ label = get_aos_augmented(r.augs[key][1])[0][0][0][0]
+ reviews_list.append(text)
+ # label_list.append(label)
+ elif len(get_aos_augmented(r.augs[lang][1])[0][0][0]) == 1:
+ text = r.augs[lang][1].get_txt()
+ label = get_aos_augmented(r.augs[lang][1])[0][0][0][0]
+ reviews_list.append(text)
+ # label_list.append(label)
+ '''
+
return reviews_list, label_list
-# python main.py -ds_name [YOUR_DATASET_NAME] -sgd_lr [YOUR_LEARNING_RATE_FOR_SGD] -win [YOUR_WINDOW_SIZE] -optimizer [YOUR_OPTIMIZER] -rnn_type [LSTM|GRU] -attention_type [bilinear|concat]
def main(args):
- if not os.path.isdir(f'{args.output}'): os.makedirs(f'{args.output}')
+ output_path = f'{args.output}/{args.dname}/'
+ if not os.path.isdir(output_path): os.makedirs(output_path)
org_reviews, splits = load(args.reviews, args.splits)
+
+ for f in range(5):
+ path = f'{output_path}train/{f}/'
+ if not os.path.isdir(path): os.makedirs(path)
+ train, label_list = preprocess(np.array(org_reviews)[splits['folds'][str(f)]['train']].tolist(), 'train', args.lang)
+
+ with open(f'{path}train.txt', 'w', encoding='utf-8') as file:
+ for d in train:
+ file.write(d + '\n')
+ with open(f'{path}train_label.txt', 'w', encoding='utf-8') as file:
+ for d in label_list:
+ file.write(d + '\n')
+
test = np.array(org_reviews)[splits['test']].tolist()
for h in range(0, 101, 10):
- path = f'{args.output}/{h}/{args.dname}'
- if not os.path.isdir(f'{args.output}/{h}'):
- os.makedirs(f'{args.output}/{h}')
+ path = f'{output_path}/test/{h}/'
+ if not os.path.isdir(path):
+ os.makedirs(path)
- preprocessed_test, label_list = preprocess(test)
+ preprocessed_test, label_list = preprocess(test, 'test', args.lang)
- with open(f'{path}_test_label.txt', 'w') as file:
+ with open(f'{path}test_label.txt', 'w', encoding='utf-8') as file:
for d in label_list:
file.write(d + '\n')
+ _, labels_list = preprocess(test, 'multi-test', args.lang)
+ with open(f'{path}test_label_multi.txt', 'w', encoding='utf-8') as file:
+ for d in labels_list:
+ file.write(str(d) + '\n')
+
hp = h / 100
test_hidden = []
for t in range(len(test)):
@@ -70,34 +154,48 @@ def main(args):
test_hidden.append(test[t].hide_aspects())
else:
test_hidden.append(test[t])
- preprocessed_test, label_list = preprocess(test_hidden)
+ preprocessed_test, label_list = preprocess(test_hidden, 'test', args.lang)
- with open(f'{path}_test.txt', 'w') as file:
+ with open(f'{path}test.txt', 'w', encoding='utf-8') as file:
for d in preprocessed_test:
file.write(d + '\n')
- train, label_list = preprocess(np.array(org_reviews)[splits['folds']['0']['train']].tolist())
- path = f'{args.output}/{args.dname}'
- with open(f'{path}_train.txt', 'w') as file:
- for d in train:
- file.write(d + '\n')
- with open(f'{path}_train_label.txt', 'w') as file:
- for d in label_list:
- file.write(d + '\n')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='CAt Wrapper')
- parser.add_argument('--dname', dest='dname', type=str, default='toy')
+ parser.add_argument('--dname', dest='dname', type=str, default='SemEval-14-R')
parser.add_argument('--reviews', dest='reviews', type=str,
- default='data/reviews.pkl',
+ default='data/2015SB12/reviews.pes_Arab.pkl',
help='raw dataset file path')
parser.add_argument('--splits', dest='splits', type=str,
- default='data/splits.json',
+ default='data/2015SB12/splits.json',
help='raw dataset file path')
parser.add_argument('--output', dest='output', type=str,
- default='data/',
+ default='data',
help='output path')
+ parser.add_argument('--lang', dest='lang', type=str,
+ default='eng',
+ help='language')
args = parser.parse_args()
- main(args)
+ # 'SemEval14L','SemEval14R', '2015SB12', '2016SB5'
+ # 'output-twitter-modified'
+ # 'googletranslate-2015SB12','googletranslate-2016SB5','googletranslate-SemEval-14-L'
+ # for dataset in ['googletranslate-2015SB12','googletranslate-2016SB5','googletranslate-SemEval-14-L','googletranslate-SemEval-14-R', 'googletranslate-twitter']:
+ for dataset in ['lowresource-2015', 'lowresource-2016', 'lowresource-2014l', 'lowresource-2014r']:
+ args.splits = f'data/{dataset}/splits.json'
+ # for lang in []:'eng', 'pes_Arab', 'zho_Hans', 'deu_Latn', 'arb_Arab', 'fra_Latn', 'spa_Latn',
+ # # 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn'
+ for lang in ['lao_Laoo', 'san_Deva']:
+ # if lang == 'en':
+ if lang == 'eng':
+ args.lang = lang
+ args.dname = f'{dataset}'
+ args.reviews = f'data/{dataset}/reviews.pkl'
+ else:
+ args.lang = lang
+ args.dname = f'{dataset}-{lang}'
+ args.reviews = f'data/{dataset}/reviews.{lang}.pkl'
+ print(args)
+ main(args)