diff --git a/cat.zip b/cat.zip new file mode 100644 index 0000000..baf64cf Binary files /dev/null and b/cat.zip differ diff --git a/cat/dataset.py b/cat/dataset.py index 1df70f6..09aadb1 100644 --- a/cat/dataset.py +++ b/cat/dataset.py @@ -5,14 +5,29 @@ def loader(instance_path, label_path, + label_multi_path, subset_labels_path, split_labels=False, mapping=None): # subset_labels = set(subset_labels) + + multi_labels = [] + with open(label_multi_path, 'r') as file: + for line in file: + current_array = eval(line.strip()) + multi_labels.append(current_array) + + # multi_labels = open(label_multi_path) + # multi_labels = [x for x in multi_labels] + labels = open(label_path) labels = [x.strip().lower().split() for x in labels] - subset_labels = open(subset_labels_path) + # subset_labels = open(subset_labels_path) + subset_labels = [] + with open(subset_labels_path, 'r', encoding='utf-8') as file: + for line in file: + subset_labels.append(line.strip()) subset_labels = set([x.strip().lower() for x in subset_labels]) # print(subset_labels) @@ -21,7 +36,7 @@ def loader(instance_path, # subset_labels = {'wine', 'place', 'food'} instances = [] - for line in open(instance_path): + for line in open(instance_path, encoding='utf-8'): instances.append(line.strip().lower().split()) if split_labels: @@ -29,7 +44,7 @@ def loader(instance_path, instances, gold = zip(*[(x, y[0]) for x, y in zip(instances, labels) if len(y) == 1]) - # y[0] in subset_labels]) + # and y[0] in subset_labels]) if mapping is not None: gold = [mapping.get(x, x) for x in gold] @@ -38,63 +53,25 @@ def loader(instance_path, y = le.fit_transform(gold) label_set = le.classes_.tolist() - return instances, y, label_set, subset_labels, gold - - -rest_14_test = partial(loader, - instance_path="data/restaurant_test_2014_tok.txt", # noqa - label_path="data/labels_restaurant_test_2014.txt", # noqa - subset_labels={"ambience", - "service", - "food"}) - - -rest_14_train = partial(loader, - instance_path="data/restaurant_train_2014.txt", # noqa - label_path="data/labels_restaurant_train_2014.txt", # noqa - subset_labels={"ambience", - "service", - "food"}) - - -ganu_test = partial(loader, - instance_path="data/test_tok.txt", - label_path="data/test_label.txt", - subset_labels={"ambience", - "staff", - "food"}) - - -rest_15_train = partial(loader, - instance_path="data/restaurant_train_2015_tok.txt", - label_path="data/labels_restaurant_train_2015.txt", - subset_labels={"ambience", - "service", - "food"}, - split_labels=True) - -rest_15_test = partial(loader, - instance_path="data/restaurant_test_2015_tok.txt", - label_path="data/labels_restaurant_test_2015.txt", - subset_labels={"ambience", - "service", - "food"}, - split_labels=True) + return instances, y, label_set, subset_labels, gold, multi_labels -toy_test = partial(loader, - instance_path="../data/0/toy_test.txt", - label_path="../data/0/toy_test_label.txt", - subset_labels_path="../data/toy_train_label.txt", - split_labels=True) +# rest_15_test = partial(loader, +# instance_path="data/restaurant_test_2015_tok.txt", +# label_path="data/labels_restaurant_test_2015.txt", +# subset_labels={"ambience", +# "service", +# "food"}, +# split_labels=True) -def restaurants_train(): - yield rest_14_train() - yield rest_15_train() +def test(f, dataset): + for h in range(0, 101, 10): + data_test = partial(loader, + instance_path=f"../data/{dataset}/test/{h}/test.txt", + label_path=f"../data/{dataset}/test/{h}/test_label.txt", + label_multi_path=f"../data/{dataset}/test/{h}/test_label_multi.txt", + subset_labels_path=f"../data/{dataset}/train/{f}/train_label.txt", + split_labels=True) -def restaurants_test(): - yield toy_test() - # yield rest_14_test() - # yield rest_15_test() - # yield ganu_test() + yield data_test() \ No newline at end of file diff --git a/cat/utils.py b/cat/utils.py index 2540b7c..c468399 100644 --- a/cat/utils.py +++ b/cat/utils.py @@ -5,7 +5,7 @@ def conll2text(paths, outpath): """Write a conll file to a text file.""" - with open(outpath, 'w') as f: + with open(outpath, 'w', encoding='utf-8') as f: for path in paths: for sent in pyconll.iter_from_file(path): txt = [] diff --git a/cmn/__init__.py b/cmn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cmn/mams.py b/cmn/mams.py new file mode 100644 index 0000000..9e722e7 --- /dev/null +++ b/cmn/mams.py @@ -0,0 +1,93 @@ +import os, spacy +from tqdm import tqdm +import xml.etree.ElementTree as ET + +from cmn.review import Review + +class MAMSReview(Review): + def __init__(self, id, sentences, time, author, aos): + super().__init__(self, id, sentences, time, author, aos) + + @staticmethod + def xmlloader(path): + reviews_list = [] + nlp = spacy.load("en_core_web_sm") + tree = ET.parse(path) + sentences = tree.getroot() + i = -1 + for sentence in sentences: # each sentence is an individual review, unlike SemEval16 + i += 1 + + text = "" + tokens = [] + aos_list_list = [] + + for data in sentence: + if data.tag == "text": # clean the associated aspect tokens from punctuations + raw_text = data.text + current_text = raw_text + opinion_text = sentence.findall(".//aspectTerm") + for o in opinion_text: + aspect = o.attrib["term"] + aspect_list = aspect.split() + if len(aspect_list) == 0: # contains no aspect (mams dataset doesn't have NULL aspects) + continue + letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to'])) + current_text = current_text.replace(' ', ' ') + current_text = current_text[0:letter_index_tuple[0]] + ' ' + aspect + ' ' + current_text[letter_index_tuple[1]+1:] + #print("processing text:" + str(current_text)) + tokens = current_text.split() + + if data.tag == "aspectTerms": + aos_list = [] + for o in data: # each o is an aspectTerm + + sentiment = o.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0') + + aspect = o.attrib["term"] + aspect_list = aspect.split() # the aspect may consist more than 1 word + if len(aspect_list) == 0: + continue + + letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to'])) + + # find the aspect instance of all text instances of the phrase + #print(tokens) + + text_incidences = [i for i in range(len(raw_text)) + if raw_text.startswith(aspect, i) + and not raw_text[i-1].isalpha() + and not raw_text[i+len(aspect)].isalpha()] + #print("text incidences: " + str(text_incidences)) + idx_of_from = text_incidences.index(letter_index_tuple[0]) + #print("index of from: " + str(idx_of_from)) + + # find the location of the aspect token + start_token_of_aspect = [i for i in range(len(tokens)) + if i + len(aspect_list) <= len(tokens) + and tokens[i:i + len(aspect_list)] == aspect_list] + + #print("start token of aspect: " + str(start_token_of_aspect)) + + idx_start_token_of_aspect = start_token_of_aspect[idx_of_from] + + idx_aspect_list = list( + range(idx_start_token_of_aspect, idx_start_token_of_aspect + len(aspect_list))) + + # compile the final aos 3-tuple for each aspect + aos = (idx_aspect_list, [], eval(sentiment)) + + if len(aos) != 0: + aos_list.append(aos) + + if len(aos_list) != 0: + aos_list_list.append(aos_list) + + if len(aos_list_list) == 0: # if no aspect in the sentence, it is not added + continue + + reviews_list.append( + Review(id=i, sentences=[[str(t).lower() for t in current_text.split()]], time=None, + author=None, aos=aos_list_list, lempos="")) + + return reviews_list \ No newline at end of file diff --git a/cmn/review.py b/cmn/review.py new file mode 100644 index 0000000..a3dbcb1 --- /dev/null +++ b/cmn/review.py @@ -0,0 +1,271 @@ +import pandas as pd, copy, numpy as np +from scipy.spatial.distance import cosine + +class Review(object): + translator_mdl = None; translator_tokenizer = None + semantic_mdl = None; align_mdl = None + def __init__(self, id, sentences, time=None, author=None, aos=None, lempos=None, parent=None, lang='eng_Latn', category=None): + self.id = id + self.sentences = sentences #list of sentences of list of tokens + self.time = time + self.author = author + self.aos = aos #list of list of aspect_opinion_sentiment triples for per sentence, e.g., [[([7,8], [10, 11, 12], -1), ([15,17], [20], +1)]] + self.lempos = lempos + self.lang = lang + self.category = category + + self.parent = parent + self.augs = {} #distionary of translated and backtranslated augmentations of this review in object format, e.g., + # {'deu_Latn': (Review1(self.id, 'dies ist eine bewertung', None, None, None, None, self, 'deu_Latn'), + # Review2(self.id, 'this is a review', None, None, None, None, self, 'eng_Latn'), + # semantic_similarity_score) + + def to_dict(self, w_augs=False): + result = [{'id': self.id, + 'text': self.get_txt(), + 'sentences': self.sentences, + 'aos': self.get_aos(), #self.parent.get_aos() if self.parent else self.get_aos(), + 'lang': self.lang, + 'orig': False if self.parent else True}] + if not w_augs: return result + for k in self.augs: + #result += self.augs[k][0].to_dict() + result += self.augs[k][1].to_dict() + return result + + def get_aos(self): + r = [] + if not self.aos: return r + for i, aos in enumerate(self.aos): r.append([([self.sentences[i][j] for j in a], [self.sentences[i][j] for j in o], s) for (a, o, s) in aos]) + return r + + def get_txt(self): return '. '.join(' '.join(s) for s in self.sentences) + + def hide_aspects(self): + r = copy.deepcopy(self) + for i, sent in enumerate(r.sentences): + # [sent.pop(k) for j, _, _ in r.aos[i] for k in j] + for j, _, _ in r.aos[i]: + for k in j: sent[k] = '#####' + return r + + def preprocess(self): return self # note that any removal of words breakes the aos indexing! + + def translate(self, tgt, settings): + src = self.lang + if not Review.translator_mdl: + from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + Review.translator_mdl = AutoModelForSeq2SeqLM.from_pretrained(settings['nllb']) + Review.translator_tokenizer = AutoTokenizer.from_pretrained(settings['nllb']) + + from transformers import pipeline + Review.translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=src, tgt_lang=tgt, max_length=settings['max_l'], device=settings['device']) + Review.back_translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=tgt, tgt_lang=src, max_length=settings['max_l'], device=settings['device']) + + translated_txt = Review.translator(self.get_txt())[0]['translation_text'] + translated_obj = Review(id=self.id, sentences=[[str(t).lower() for t in translated_txt.split()]], parent=self, lang=tgt, time=None, author=None, aos=None) + translated_obj.aos, _ = self.semalign(translated_obj) + + back_translated_txt = Review.back_translator(translated_txt)[0]['translation_text'] + back_translated_obj = Review(id=self.id, sentences=[[str(t).lower() for t in back_translated_txt.split()]], parent=self, lang=src, time=None, author=None, aos=None) + back_translated_obj.aos, _ = self.semalign(back_translated_obj) + + self.augs[tgt] = (translated_obj, back_translated_obj, self.semsim(back_translated_obj)) + return self.augs[tgt] + + def semsim(self, other): + if not Review.semantic_mdl: + from sentence_transformers import SentenceTransformer + Review.semantic_mdl = SentenceTransformer("johngiorgi/declutr-small") + me, you = Review.semantic_mdl.encode([self.get_txt(), other.get_txt()]) + return 1 - cosine(me, you) + + def semalign(self, other): + if not Review.align_mdl: + from simalign import SentenceAligner + Review.align_mdl = SentenceAligner(model="bert", token_type="bpe", matching_methods="i") + aligns = [Review.align_mdl.get_word_aligns(s1, o1)['itermax'] for s1, o1 in zip(self.sentences, other.sentences)] + other_aos = [] + for i, (aos, _) in enumerate(zip(self.aos, self.sentences)): + for (a, o, s) in aos: + other_a = [idx2 for idx in a for idx1, idx2 in aligns[i] if idx == idx1] + other_a.sort() + other_aos.append((other_a, o, s)) + return other_aos, aligns + + def get_lang_stats(self): + import nltk + from rouge import Rouge + from sklearn.metrics import accuracy_score + + result = {} + r = self.get_txt() + result['r_ntoken'] = len(r.split()) + for lang in self.augs.keys(): + r_ = self.augs[lang][1].get_txt() + # r_ = r #for testing purpose => should be very close to 1 for all metrics + result[lang + '_r_backtrans_ntoken'] = len(r_.split()) + result[lang + '_semsim'] = self.augs[lang][2] + result[lang + '_bleu'] = np.mean(nltk.translate.bleu_score.sentence_bleu([r.split()], r_.split(), weights=[(1 / bleu_no,) * bleu_no for bleu_no in range(1, min(4, result['r_ntoken'] + 1))])) + # https://pypi.org/project/rouge/ + result[lang + '_rouge_f'] = np.mean([v['f'] for k, v in Rouge(metrics=[f'rouge-{i+1}' for i in range(0, min(5, len(r.split())))]).get_scores(r_, r)[0].items()]) + # we need to make r_ as equal size as r + result[lang + '_em'] = accuracy_score(r.split(), r_.split()[:result['r_ntoken']] if len(r_.split()) > result['r_ntoken'] else r_.split() + [''] * (result['r_ntoken'] - len(r_.split()))) + return result + + @staticmethod + def load(path): pass + + @staticmethod + def to_df(reviews, w_augs=False): return pd.DataFrame.from_dict([rr for r in reviews for rr in r.to_dict(w_augs)]) + + @staticmethod + def translate_batch(reviews, tgt, settings): + src = reviews[0].lang + if not Review.translator_mdl: + from transformers import AutoTokenizer, AutoModelForSeq2SeqLM + Review.translator_mdl = AutoModelForSeq2SeqLM.from_pretrained(settings['nllb']) + Review.translator_tokenizer = AutoTokenizer.from_pretrained(settings['nllb']) + + from transformers import pipeline + translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=src, tgt_lang=tgt, max_length=settings['max_l'], device=settings['device']) + back_translator = pipeline("translation", model=Review.translator_mdl, tokenizer=Review.translator_tokenizer, src_lang=tgt, tgt_lang=src, max_length=settings['max_l'], device=settings['device']) + + reviews_txt = [r.get_txt() for r in reviews] + translated_txt = translator(reviews_txt) + back_translated_txt = back_translator([r_['translation_text'] for r_ in translated_txt]) + + for i, r in enumerate(reviews): + translated_obj = Review(id=r.id, sentences=[[str(t).lower() for t in translated_txt[i]['translation_text'].split()]], parent=r, lang=tgt, time=None, author=None, aos=None, lempos=None,) + translated_obj.aos, _ = r.semalign(translated_obj) + + back_translated_obj = Review(id=r.id, sentences=[[str(t).lower() for t in back_translated_txt[i]['translation_text'].split()]], parent=r, lang=src, time=None, author=None, aos=r.aos, lempos=None,) + r.augs[tgt] = (translated_obj, back_translated_obj, r.semsim(back_translated_obj)) + + @staticmethod + def get_stats(datapath, output, cache=True, plot=True, plot_title=None): + try: + print(f'Loading the stats pickle from {datapath}...') + if not cache: raise FileNotFoundError + stats = pd.read_pickle(f'{output}/stats.pkl') + if plot: Review.plot_dist(stats, output, plot_title) + except FileNotFoundError: + print(f'File {datapath} not found! Generating stats ...') + reviews = pd.read_pickle(datapath) + from collections import Counter + stats = {'*nreviews': len(reviews), '*naspects': 0, '*ntokens': 0} + asp_nreviews = Counter() # aspects : number of reviews that contains the aspect + token_nreviews = Counter() # tokens : number of reviews that contains the token + nreviews_naspects = Counter() # v number of reviews with 1 aspect, ..., k aspects, ... + nreviews_ntokens = Counter() # v number of reviews with 1 token, ..., k tokens, ... + nreviews_category = Counter() # v number of categories with 1 review, ..., k reviews, ... + reviews_lang_stats = [] + + for r in reviews: + r_aspects = r.get_aos()[0] + r_tokens = [token for sentence in r.sentences for token in sentence] + asp_nreviews.update(' '.join(a) for (a, o, s) in r_aspects) + token_nreviews.update(token for token in r_tokens) + nreviews_naspects.update([len(r_aspects)]) + nreviews_ntokens.update([len(r_tokens)]) + if hasattr(r, 'category'): nreviews_category.update(r.category) + + reviews_lang_stats.append(r.get_lang_stats()) + + naspects_nreviews = Counter(asp_nreviews.values()) # v number of aspects with 1 review, ..., k reviews, ... + ntokens_nreviews = Counter(token_nreviews.values()) # v number of tokens with 1 review, ..., k reviews, ... + stats["*naspects"] = len(asp_nreviews.keys()) # unique. Non-unique number of aspects: sum(asp_nreviews.values()) + stats["*ntokens"] = len(token_nreviews.keys()) # unique. Non-unique number of tokens: sum(token_nreviews.values()) + stats['nreviews_naspects'] = {k: v for k, v in sorted(nreviews_naspects.items(), key=lambda item: item[1], reverse=True)} + stats['nreviews_ntokens'] = {k: v for k, v in sorted(nreviews_ntokens.items(), key=lambda item: item[1], reverse=True)} + stats['naspects_nreviews'] = {k: v for k, v in sorted(naspects_nreviews.items(), key=lambda item: item[1], reverse=True)} + stats['ntokens_nreviews'] = {k: v for k, v in sorted(ntokens_nreviews.items(), key=lambda item: item[1], reverse=True)} + stats['nreviews_category'] = {k: v for k, v in sorted(nreviews_category.items(), key=lambda item: item[1], reverse=True)} + stats['*avg_ntokens_review'] = sum(k * v for k, v in nreviews_ntokens.items()) / sum(nreviews_ntokens.values()) # average number of tokens per review + stats['*avg_naspects_review'] = sum(k * v for k, v in nreviews_naspects.items()) / sum(nreviews_naspects.values()) # average number of aspects per review + stats['*avg_lang_stats'] = pd.DataFrame.from_dict(reviews_lang_stats).mean().to_dict() + if output: pd.to_pickle(stats, f'{output}/stats.pkl') + if plot: Review.plot_dist(stats, output, plot_title) + import json + print(json.dumps(stats, indent=4)) + # print(stats) + return stats + + @staticmethod + def plot_dist(stats, output, plot_title): + from matplotlib import pyplot as plt + plt.rcParams.update({'font.family': 'Consolas'}) + print("plotting distribution data ...") + for k, v in stats.items(): + if (not k.startswith("*")): # the * values cannot be plotted + fig = plt.figure(k, figsize=(1.5, 1.5)) + ax = fig.add_subplot(1, 1, 1) + ax.set_facecolor('whitesmoke') + ax.loglog(*zip(*stats[k].items()), marker='x', linestyle='None', markeredgecolor='m') + ax.set_xlabel(k.split('_')[1][0].replace('n', '#') + k.split('_')[1][1:]) + ax.set_ylabel(k.split('_')[0][0].replace('n', '#') + k.split('_')[0][1:]) + ax.grid(True, color="#93a1a1", alpha=0.3) + ax.minorticks_off() + ax.xaxis.set_tick_params(size=2, direction='in') + ax.yaxis.set_tick_params(size=2, direction='in') + + # wrapping labels + labels = [] + for l in ax.get_xticklabels(): + l.set_text('\n#'.join(l.get_text().split("#"))) + labels.append(l) + ax.set_xticklabels(labels, ha='left') + + ax.xaxis.get_label().set_size(12) + ax.yaxis.get_label().set_size(12) + ax.set_title(plot_title) + fig.savefig(f'{output}/{k}.pdf', dpi=100, bbox_inches='tight') + plt.show() + + @staticmethod + def plot_semsim_dist(datapath, output, plot_title): + from matplotlib import pyplot as plt + plt.rcParams.update({'font.family': 'Consolas'}) + import seaborn as sns + reviews = pd.read_pickle(datapath) + + hist_dict = [{'original': 'eng_Latn', 'target': Review.lang_title(k), 'score': v[2]} for r in reviews for k, v in r.augs.items()] + df = pd.DataFrame.from_dict(hist_dict) + fig = plt.figure(figsize=(6, 2)) + ax = fig.add_subplot(1, 1, 1) + x_range = [i / 10 for i in range(0, 11)] + # ax.set_ylim([0, len(reviews)]) + # plt.yscale('log') + plt.ylabel('#reviews') + plt.xlabel('similarity score') + ax.set_title(plot_title, x=0.2, y=0.8, fontsize=11) + ax.set_facecolor('whitesmoke') + h = sns.histplot(df, + x='score', + hue='target', + element='step', #also try 'poly' + stat='density', + common_norm=False) + sns.move_legend(ax, 'upper left') + + plt.legend([]) + + h.legend_.set_title(None) + h.set(xticks=x_range) + plt.savefig(f'{output}', dpi=100, bbox_inches='tight') + # df.to_csv(f'{output.replace("pdf", "csv")}') + plt.show() + # plt.clf() + + @staticmethod + def lang_title(lang_code): + if lang_code == 'zho_Hans': return 'chinese' + elif lang_code == 'deu_Latn': return 'german' + elif lang_code == 'fra_Latn': return 'french' + elif lang_code == 'arb_Arab': return 'arabic' + elif lang_code == 'pes_Arab': return 'farsi' + elif lang_code == 'spa_Latn': return 'spanish' + elif lang_code == 'eng_Latn': return 'english' + elif lang_code == 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn': return 'all' + elif lang_code == None: return ['pes_Arab', 'zho_Hans', 'deu_Latn', 'arb_Arab', 'fra_Latn', 'spa_Latn', 'all'] + diff --git a/cmn/semeval.py b/cmn/semeval.py new file mode 100644 index 0000000..0278535 --- /dev/null +++ b/cmn/semeval.py @@ -0,0 +1,107 @@ +import os, spacy +from tqdm import tqdm +import xml.etree.ElementTree as et + +#nlp = spacy.load("en_core_web_sm") # en_core_web_trf for transformer-based; error ==> python -m spacy download en_core_web_sm + +from cmn.review import Review + +class SemEvalReview(Review): + + def __init__(self, id, sentences, time, author, aos): super().__init__(self, id, sentences, time, author, aos) + + @staticmethod + def load(path): + if str(path).endswith('.xml'): return SemEvalReview._xmlloader(path) + return SemEvalReview._txtloader(input) + + @staticmethod + def _txtloader(path): + reviews = [] + with tqdm(total=os.path.getsize(path)) as pbar, open(path, "r", encoding='utf-8') as f: + for i, line in enumerate(f.readlines()): + pbar.update(len(line)) + sentence, aos = line.split('####') + aos = aos.replace('\'POS\'', '+1').replace('\'NEG\'', '-1').replace('\'NEU\'', '0') + + # for the current datafile, each row is a review of single sentence! + # sentence = nlp(sentence) + reviews.append(Review(id=i, sentences=[[str(t).lower() for t in sentence.split()]], time=None, author=None, + aos=[eval(aos)], lempos=None, + parent=None, lang='eng_Latn')) + return reviews + + @staticmethod + def _xmlloader(path): + reviews_list = [] + xtree = et.parse(path).getroot() + if xtree.tag == 'Reviews': reviews = [SemEvalReview._parse(xsentence) for xreview in tqdm(xtree) for xsentences in xreview for xsentence in xsentences] + if xtree.tag == 'sentences': reviews = [SemEvalReview._parse(xsentence) for xsentence in tqdm(xtree)] + + return [r for r in reviews if r] + + @staticmethod + def _map_idx(aspect, text): + # aspect: ('token', from_char, to_char) + text_tokens = text[:aspect[1]].split() + # to fix if "aaaa ,b, c" ",b c" if b is the aspect + if len(text_tokens) > 0 and not text[aspect[1] - 1].isspace(): text_tokens.pop() + aspect_tokens = aspect[0].split() + + # tmp = [*text] #mutable string :) + # # these two blank space add bug to the char indexes for aspects if a sentence have multiple aspects! + # tmp[aspect[1]: aspect[2]] = [' '] + [*aspect[0]] + [' '] + # text = ''.join(tmp) + + return [i for i in range(len(text_tokens), len(text_tokens) + len(aspect_tokens))] + + @staticmethod + def _parse(xsentence): + id = xsentence.attrib["id"] + aos = []; aos_cats = [] + for element in xsentence: + if element.tag == 'text': sentence = element.text # we consider each sentence as a signle review + elif element.tag == 'Opinions':#semeval-15-16 + # + for opinion in element: + if opinion.attrib["target"] == 'NULL': continue + # we may have duplicates for the same aspect due to being in different category like in semeval 2016's + aspect = (opinion.attrib["target"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10) + # we need to map char index to token index in aspect + aspect = SemEvalReview._map_idx(aspect, sentence) + category = opinion.attrib["category"] # 'RESTAURANT#GENERAL' + sentiment = opinion.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0') #'+1' + aos.append((aspect, [], sentiment, opinion.attrib["target"])) + aos_cats.append(category) + aos = sorted(aos, key=lambda x: int(x[0][0])) #based on start of sentence + + elif element.tag == 'aspectTerms':#semeval-14 + # + for opinion in element: + if opinion.attrib["term"] == 'NULL': continue + # we may have duplicates for the same aspect due to being in different category like in semeval 2016's + aspect = (opinion.attrib["term"], int(opinion.attrib["from"]), int(opinion.attrib["to"])) #('place', 5, 10) + # we need to map char index to token index in aspect + aspect = SemEvalReview._map_idx(aspect, sentence) + sentiment = opinion.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0') #'+1' + aos.append((aspect, [], sentiment, opinion.attrib["term"])) + + aos = sorted(aos, key=lambda x: int(x[0][0])) #based on start of sentence + + elif element.tag == 'aspectCategories': # semeval-14 + for opinion in element: + # + aos_cats.append(opinion.attrib["category"]) + + #sentence = nlp(sentence) # as it does some processing, it destroys the token idx for aspect term + tokens = sentence.split() + # to fix ",a b c," to "a b c" + # to fix '"sales" team' to 'sales team' => semeval-14-labptop- + # todo: fix 'Food-awesome.' to 'food awesome' => semeval-14-restaurant- + for i, (idxlist, o, s, aspect_token) in enumerate(aos): + for j, idx in enumerate(idxlist): tokens[idx] = aspect_token.split()[j].replace('"', '') + aos[i] = (idxlist, o, s) + return Review(id=id, sentences=[[str(t).lower() for t in tokens]], time=None, author=None, + aos=[aos], lempos=None, + parent=None, lang='eng_Latn', category=aos_cats) if aos else None + diff --git a/experiments/experiment_test-googletranslate.py b/experiments/experiment_test-googletranslate.py new file mode 100644 index 0000000..4286ccf --- /dev/null +++ b/experiments/experiment_test-googletranslate.py @@ -0,0 +1,133 @@ +"""Experiment on the test data.""" +import json +import os + +import numpy as np + +# LADy_eval +import pytrec_eval +import pandas as pd + +from cat.simple import get_scores, attention, rbf_attention +from cat.dataset import test +from reach import Reach +from sklearn.metrics import precision_recall_fscore_support +from collections import defaultdict, Counter +from itertools import product + + +GAMMA = .03 +BEST_ATT = {"n_noun": 980} +BEST_RBF = {"n_noun": 200} + +if __name__ == "__main__": + + # LADy_eval + metrics = ['P', 'recall', 'ndcg_cut', 'map_cut', 'success'] + topkstr = '1,5,10,100' + metrics_set = set() + for m in metrics: + metrics_set.add(f'{m}_{topkstr}') + datasets = [] + for d in ['googletranslate-twitter']: # 'googletranslate-SemEval-14-L' + for l in ['en', 'fa', 'zh-CN', 'de', 'ar', 'fr', 'es', 'fa.zh-CN.de.ar.fr.es']: # + if l == 'en': + datasets.append(f'{d}') + else: + datasets.append(f'{d}-{l}') + for dataset in datasets: + + output_path = f'../output-googletranslate/{dataset}' + if not os.path.isdir(output_path): + os.makedirs(output_path) + + mean_list = [pd.DataFrame() for i in range(0, 11)] + for f in range(5): + fold_path = f'{dataset}/train/{f}' + scores = defaultdict(dict) + r = Reach.load(f'../embeddings/{fold_path}/vecs_w2v.vec', + unk_word="") + d = json.load(open(f'../data/{fold_path}/nouns.json')) + + nouns = Counter() + for k, v in d.items(): + if k.lower() in r.items: + nouns[k.lower()] += v + + embedding_paths = [f'../embeddings/{fold_path}/vecs_w2v.vec'] + # bundles = ((rbf_attention, attention), embedding_paths) + bundles = ((rbf_attention, ), embedding_paths) + + for att, path in product(*bundles): + r = Reach.load(path, unk_word="") + + if att == rbf_attention: + candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"])) + else: + candidates, _ = zip(*nouns.most_common(BEST_ATT["n_noun"])) + + aspects = [[x] for x in candidates] + sorted_output = [] + for idx, (instances, y, label_set, subset_labels, gold, multi_labels) in enumerate(test(f, dataset)): + # output_path_hidden = f'{output_path}/{idx*10}' + # if not os.path.isdir(output_path_hidden): + # os.makedirs(output_path_hidden) + # print("label_set", label_set) + s = get_scores(instances, + aspects, + r, + subset_labels, + gamma=GAMMA, + remove_oov=False, + attention_func=att) + + # print("predicted", s) + # print("subset_labels", subset_labels) + # print("gold", list(gold)) + output = [[(label, value) for value, label in zip(sublist, subset_labels)] for sublist in s] + sorted_output = [sorted(sublist, key=lambda x: x[1], reverse=True) for sublist in output] + # print("output", sorted_output) + + qrel = dict() + run = dict() + + for i, word in enumerate(multi_labels): + q_key = 'q{}'.format(i) + # qrel[q_key] = {word: 1} + qrel[q_key] = {w: 1 for w in word} + + for i, sublist in enumerate(sorted_output): + q_key = 'q{}'.format(i) + run[q_key] = {} + for j, (word, _) in enumerate(sublist): + run[q_key][word] = len(sublist) - j + + # print("qrel: ", qrel) + # print("run: ", run) + + print(f'pytrec_eval for {metrics_set} for fold {f} with {idx*10} percent hidden aspect in dataset {dataset}...') + df = pd.DataFrame.from_dict(pytrec_eval.RelevanceEvaluator(qrel, metrics_set).evaluate(run)) + df_mean = df.mean(axis=1).to_frame('mean') + df_mean.to_csv(f'{output_path}/f{f}.model.ad.pred.{idx/10}.eval.mean.csv') + mean_list[idx] = pd.concat([mean_list[idx], df_mean], axis=1) + for i in range(0, 11): + # output_path_hidden = f'{output_path}/{i*10}' + mean_list[i].mean(axis=1).to_frame('mean').to_csv(f'{output_path}/model.ad.pred.eval.mean.{i/10}.csv') + # y_pred = s.argmax(1) + # f1_score = precision_recall_fscore_support(y, y_pred) + # f1_macro = precision_recall_fscore_support(y, + # y_pred, + # average="weighted") + # scores[(att, path)][idx] = (f1_score, f1_macro) + # + # att_score = {k: v for k, v in scores.items() if k[0] == attention} + # att_per_class = [[z[x][0][:-1] for x in range(3)] + # for z in att_score.values()] + # att_per_class = np.stack(att_per_class).mean(0) + # att_macro = np.mean([v[2][1][:-1] for v in att_score.values()], 0) + # + # rbf_score = {k: v for k, v in scores.items() if k[0] == rbf_attention} + # rbf_per_class = [[z[x][0][:-1] for x in range(3)] + # for z in rbf_score.values()] + # rbf_per_class = np.stack(rbf_per_class).mean(0) + # rbf_macro = np.mean([v[2][1][:-1] for v in rbf_score.values()], 0) diff --git a/experiments/experiment_test.py b/experiments/experiment_test.py index 4a0e39e..29677bd 100644 --- a/experiments/experiment_test.py +++ b/experiments/experiment_test.py @@ -9,7 +9,7 @@ import pandas as pd from cat.simple import get_scores, attention, rbf_attention -from cat.dataset import restaurants_test +from cat.dataset import test from reach import Reach from sklearn.metrics import precision_recall_fscore_support from collections import defaultdict, Counter @@ -23,76 +23,96 @@ if __name__ == "__main__": # LADy_eval - output_path = "../output/" - if not os.path.isdir(output_path): - os.makedirs(output_path) metrics = ['P', 'recall', 'ndcg_cut', 'map_cut', 'success'] topkstr = '1,5,10,100' metrics_set = set() for m in metrics: - metrics_set.add(f'{m}_{topkstr}') - - scores = defaultdict(dict) - r = Reach.load("../embeddings/toy_vecs_w2v.vec", - unk_word="") - d = json.load(open("../data/toy_nouns.json")) - - nouns = Counter() - for k, v in d.items(): - if k.lower() in r.items: - nouns[k.lower()] += v - - embedding_paths = ["../embeddings/toy_vecs_w2v.vec"] - # bundles = ((rbf_attention, attention), embedding_paths) - bundles = ((rbf_attention, ), embedding_paths) - - for att, path in product(*bundles): - r = Reach.load(path, unk_word="") - - if att == rbf_attention: - candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"])) - else: - candidates, _ = zip(*nouns.most_common(BEST_ATT["n_noun"])) - - aspects = [[x] for x in candidates] - sorted_output = [] - for idx, (instances, y, label_set, subset_labels, gold) in enumerate(restaurants_test()): - # print("label_set", label_set) - s = get_scores(instances, - aspects, - r, - subset_labels, - gamma=GAMMA, - remove_oov=False, - attention_func=att) - - # print("predicted", s) - # print("subset_labels", subset_labels) - # print("gold", list(gold)) - output = [[(label, value) for value, label in zip(sublist, subset_labels)] for sublist in s] - sorted_output = [sorted(sublist, key=lambda x: x[1], reverse=True) for sublist in output] - # print("output", sorted_output) - - qrel = dict() - run = dict() - - for i, word in enumerate(gold): - q_key = 'q{}'.format(i) - qrel[q_key] = {word: 1} - - for i, sublist in enumerate(sorted_output): - q_key = 'q{}'.format(i) - run[q_key] = {} - for j, (word, _) in enumerate(sublist): - run[q_key][word] = len(sublist) - j - - print("qrel: ", qrel) - print("run: ", run) - - print(f'pytrec_eval for {metrics_set} ...') - df = pd.DataFrame.from_dict(pytrec_eval.RelevanceEvaluator(qrel, metrics_set).evaluate(run)) - df_mean = df.mean(axis=1).to_frame('mean') - df_mean.to_csv(f'{output_path}pred.eval.mean.csv') + metrics_set.add(f'{m}_{topkstr}') + datasets = [] + for d in ['lowresource-2014r']: + for l in ['lao_Laoo', 'san_Deva']: + if l == 'eng': + datasets.append(f'{d}') + else: + datasets.append(f'{d}-{l}') + for dataset in datasets: + + output_path = f'../output-low-resource/{dataset}' + if not os.path.isdir(output_path): + os.makedirs(output_path) + + mean_list = [pd.DataFrame() for i in range(0, 11)] + for f in range(5): + fold_path = f'{dataset}/train/{f}' + scores = defaultdict(dict) + r = Reach.load(f'../embeddings/{fold_path}/vecs_w2v.vec', + unk_word="") + d = json.load(open(f'../data/{fold_path}/nouns.json')) + + nouns = Counter() + for k, v in d.items(): + if k.lower() in r.items: + nouns[k.lower()] += v + + embedding_paths = [f'../embeddings/{fold_path}/vecs_w2v.vec'] + # bundles = ((rbf_attention, attention), embedding_paths) + bundles = ((rbf_attention, ), embedding_paths) + + for att, path in product(*bundles): + r = Reach.load(path, unk_word="") + + if att == rbf_attention: + candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"])) + else: + candidates, _ = zip(*nouns.most_common(BEST_ATT["n_noun"])) + + aspects = [[x] for x in candidates] + sorted_output = [] + for idx, (instances, y, label_set, subset_labels, gold, multi_labels) in enumerate(test(f, dataset)): + # output_path_hidden = f'{output_path}/{idx*10}' + # if not os.path.isdir(output_path_hidden): + # os.makedirs(output_path_hidden) + # print("label_set", label_set) + s = get_scores(instances, + aspects, + r, + subset_labels, + gamma=GAMMA, + remove_oov=False, + attention_func=att) + + # print("predicted", s) + # print("subset_labels", subset_labels) + # print("gold", list(gold)) + output = [[(label, value) for value, label in zip(sublist, subset_labels)] for sublist in s] + sorted_output = [sorted(sublist, key=lambda x: x[1], reverse=True) for sublist in output] + # print("output", sorted_output) + + qrel = dict() + run = dict() + + for i, word in enumerate(multi_labels): + q_key = 'q{}'.format(i) + # qrel[q_key] = {word: 1} + qrel[q_key] = {w: 1 for w in word} + + for i, sublist in enumerate(sorted_output): + q_key = 'q{}'.format(i) + run[q_key] = {} + for j, (word, _) in enumerate(sublist): + run[q_key][word] = len(sublist) - j + + # print("qrel: ", qrel) + # print("run: ", run) + + print(f'pytrec_eval for {metrics_set} for fold {f} with {idx*10} percent hidden aspect in dataset {dataset}...') + df = pd.DataFrame.from_dict(pytrec_eval.RelevanceEvaluator(qrel, metrics_set).evaluate(run)) + df_mean = df.mean(axis=1).to_frame('mean') + df_mean.to_csv(f'{output_path}/f{f}.model.ad.pred.{idx/10}.eval.mean.csv') + mean_list[idx] = pd.concat([mean_list[idx], df_mean], axis=1) + for i in range(0, 11): + # output_path_hidden = f'{output_path}/{i*10}' + mean_list[i].mean(axis=1).to_frame('mean').to_csv(f'{output_path}/model.ad.pred.eval.mean.{i/10}.csv') # y_pred = s.argmax(1) # f1_score = precision_recall_fscore_support(y, y_pred) # f1_macro = precision_recall_fscore_support(y, diff --git a/experiments/preprocessing_embeddings-googletranslate.py b/experiments/preprocessing_embeddings-googletranslate.py new file mode 100644 index 0000000..da9b945 --- /dev/null +++ b/experiments/preprocessing_embeddings-googletranslate.py @@ -0,0 +1,41 @@ +"""Creating fragments takes a long time so we treat it as a +pre-processing step.""" +import logging +import os + +from gensim.models import Word2Vec +from cat.fragments import create_noun_counts +from cat.utils import conll2text + +logging.basicConfig(level=logging.INFO) + + +if __name__ == "__main__": + datasets = [] + for d in ['googletranslate-twitter']: + for l in ['en', 'fa', 'zh-CN', 'de', 'ar', 'fr', 'es', 'fa.zh-CN.de.ar.fr.es']: + if l == 'en': + datasets.append(f'{d}') + else: + datasets.append(f'{d}-{l}') + for dataset in datasets: + for f in range(5): + fold_path = f'{dataset}/train/{f}' + paths = [f'../data/{fold_path}/input.conllu'] + create_noun_counts(paths, f'../data/{fold_path}/nouns.json') + conll2text(paths, f'../data/{fold_path}/all_txt.txt') + corpus = [x.lower().strip().split() + for x in open(f'../data/{fold_path}/all_txt.txt', encoding='utf-8')] + + f = Word2Vec(corpus, + sg=0, + negative=5, + window=10, + vector_size=200, + min_count=2, + epochs=40, + workers=10) + embedding_path = f"../embeddings/{fold_path}" + if not os.path.isdir(embedding_path): + os.makedirs(embedding_path) + f.wv.save_word2vec_format(f'{embedding_path}/vecs_w2v.vec') diff --git a/experiments/preprocessing_embeddings.py b/experiments/preprocessing_embeddings.py index 76039ca..d86fa54 100644 --- a/experiments/preprocessing_embeddings.py +++ b/experiments/preprocessing_embeddings.py @@ -1,6 +1,7 @@ """Creating fragments takes a long time so we treat it as a pre-processing step.""" import logging +import os from gensim.models import Word2Vec from cat.fragments import create_noun_counts @@ -10,21 +11,31 @@ if __name__ == "__main__": + datasets = [] + for d in ['lowresource-2014r']: + for l in ['lao_Laoo', 'san_Deva']: + if l == 'eng': + datasets.append(f'{d}') + else: + datasets.append(f'{d}-{l}') + for dataset in datasets: + for f in range(5): + fold_path = f'{dataset}/train/{f}' + paths = [f'../data/{fold_path}/input.conllu'] + create_noun_counts(paths, f'../data/{fold_path}/nouns.json') + conll2text(paths, f'../data/{fold_path}/all_txt.txt') + corpus = [x.lower().strip().split() + for x in open(f'../data/{fold_path}/all_txt.txt', encoding='utf-8')] - paths = ["../data/input.conllu"] - create_noun_counts(paths, - "../data/toy_nouns.json") - conll2text(paths, "../data/toy_all_txt.txt") - corpus = [x.lower().strip().split() - for x in open("../data/toy_all_txt.txt")] - - f = Word2Vec(corpus, - sg=0, - negative=5, - window=10, - vector_size=200, - min_count=2, - epochs=5, - workers=10) - - f.wv.save_word2vec_format(f"../embeddings/toy_vecs_w2v.vec") + f = Word2Vec(corpus, + sg=0, + negative=5, + window=10, + vector_size=200, + min_count=2, + epochs=200, + workers=10) + embedding_path = f"../embeddings/{fold_path}" + if not os.path.isdir(embedding_path): + os.makedirs(embedding_path) + f.wv.save_word2vec_format(f'{embedding_path}/vecs_w2v.vec') diff --git a/text-to-CONLLu.py b/text-to-CONLLu.py index 53fc960..85a485f 100644 --- a/text-to-CONLLu.py +++ b/text-to-CONLLu.py @@ -129,4 +129,16 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + datasets = [] + for d in ['lowresource-2015', 'lowresource-2016', 'lowresource-2014l', 'lowresource-2014r']: # , 'googletranslate-SemEval-14-L']: + for l in ['lao_Laoo', 'san_Deva']: + if l == 'eng': + datasets.append(f'{d}') + else: + datasets.append(f'{d}-{l}') + for dataset in datasets: + for f in range(5): + sys.argv.append(f'data/{dataset}/train/{f}/train.txt') + sys.argv.append(f'data/{dataset}/train/{f}/input.conllu') + main() + del sys.argv[-2:] diff --git a/wrapper.py b/wrapper.py index 6934c74..4a8fc52 100644 --- a/wrapper.py +++ b/wrapper.py @@ -11,9 +11,8 @@ def load(reviews, splits): print('\nLoading reviews and preprocessing ...') - print('#' * 50) + print('_' * 50) try: - print('\nLoading reviews files ...') with open(f'{reviews}', 'rb') as f: reviews = pickle.load(f) with open(f'{splits}', 'r') as f: @@ -25,44 +24,129 @@ def load(reviews, splits): return reviews, splits -def preprocess(org_reviews): +def get_aos_augmented(review): + r = [] + if not review.aos: return r + for i, aos in enumerate(review.aos): r.append([([review.sentences[i][j] for j in a], [review.sentences[i][j] for j in o], s) for (a, o, s) in aos]) + return r + + +def preprocess(org_reviews, status, lang): reviews_list = [] label_list = [] for r in org_reviews: if not len(r.aos[0]): continue else: - for aos_instance in r.get_aos(): - for aos in aos_instance[0][0]: - reviews_list.append(r.get_txt()) - label_list.append(aos) - if r.augs: - for key, value in r.items(): - for aos_instance in r[key][1].get_aos(): - for aos in aos_instance[0][0]: - reviews_list.append(r[key][1].get_txt()) - label_list.append(aos) + + if status == 'test': + reviews_list.append(r.get_txt()) + label_list.append(r.get_aos()[0][0][0][0]) + + elif status == 'multi-test': # test should not be duplicated in case of having more than one aspect + reviews_list.append(r.get_txt()) + label_per_review = [] + for aos in r.get_aos()[0][0][0]: + label_per_review.append(aos) + label_list.append(label_per_review) + + else: # train should be duplicated in case of having more than one aspect + for aos in r.get_aos()[0][0][0]: + text = r.get_txt() + label = aos + reviews_list.append(text) + label_list.append(label) + + ''' + if len(r.get_aos()[0][0][0]) == 1: + text = r.get_txt() + label = r.get_aos()[0][0][0][0] + reviews_list.append(text) + label_list.append(label) + ''' + + if r.augs and status == 'train': # data for train can be augmented + + # if lang == 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn': + if lang == 'fa.zh-CN.de.ar.fr.es': + for key, value in r.augs.items(): + for aos_instance in get_aos_augmented(r.augs[key][1])[0][0][0]: + text = r.augs[key][1].get_txt() + label = aos_instance + reviews_list.append(text) + label_list.append(label) + else: + # for l in lang.split('.'): + # for aos_instance in get_aos_augmented(r.augs[l][1])[0][0][0]: + # text = r.augs[l][1].get_txt() + # label = aos_instance + # reviews_list.append(text) + # label_list.append(label) + for aos_instance in get_aos_augmented(r.augs[lang][1])[0][0][0]: + text = r.augs[lang][1].get_txt() + label = aos_instance + reviews_list.append(text) + label_list.append(label) + ''' + # for key, value in r.augs.items(): + # if len(get_aos_augmented(r.augs[key][1])) == 0: + # text = r.augs[key][1].get_txt() + # reviews_list.append(text) + # continue + # for aos_instance in r.augs[key][1].get_aos()[0]: + if lang == 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn': + for key, value in r.augs.items(): + if len(get_aos_augmented(r.augs[key][1])[0][0][0]) == 1: + text = r.augs[key][1].get_txt() + label = get_aos_augmented(r.augs[key][1])[0][0][0][0] + reviews_list.append(text) + # label_list.append(label) + elif len(get_aos_augmented(r.augs[lang][1])[0][0][0]) == 1: + text = r.augs[lang][1].get_txt() + label = get_aos_augmented(r.augs[lang][1])[0][0][0][0] + reviews_list.append(text) + # label_list.append(label) + ''' + return reviews_list, label_list -# python main.py -ds_name [YOUR_DATASET_NAME] -sgd_lr [YOUR_LEARNING_RATE_FOR_SGD] -win [YOUR_WINDOW_SIZE] -optimizer [YOUR_OPTIMIZER] -rnn_type [LSTM|GRU] -attention_type [bilinear|concat] def main(args): - if not os.path.isdir(f'{args.output}'): os.makedirs(f'{args.output}') + output_path = f'{args.output}/{args.dname}/' + if not os.path.isdir(output_path): os.makedirs(output_path) org_reviews, splits = load(args.reviews, args.splits) + + for f in range(5): + path = f'{output_path}train/{f}/' + if not os.path.isdir(path): os.makedirs(path) + train, label_list = preprocess(np.array(org_reviews)[splits['folds'][str(f)]['train']].tolist(), 'train', args.lang) + + with open(f'{path}train.txt', 'w', encoding='utf-8') as file: + for d in train: + file.write(d + '\n') + with open(f'{path}train_label.txt', 'w', encoding='utf-8') as file: + for d in label_list: + file.write(d + '\n') + test = np.array(org_reviews)[splits['test']].tolist() for h in range(0, 101, 10): - path = f'{args.output}/{h}/{args.dname}' - if not os.path.isdir(f'{args.output}/{h}'): - os.makedirs(f'{args.output}/{h}') + path = f'{output_path}/test/{h}/' + if not os.path.isdir(path): + os.makedirs(path) - preprocessed_test, label_list = preprocess(test) + preprocessed_test, label_list = preprocess(test, 'test', args.lang) - with open(f'{path}_test_label.txt', 'w') as file: + with open(f'{path}test_label.txt', 'w', encoding='utf-8') as file: for d in label_list: file.write(d + '\n') + _, labels_list = preprocess(test, 'multi-test', args.lang) + with open(f'{path}test_label_multi.txt', 'w', encoding='utf-8') as file: + for d in labels_list: + file.write(str(d) + '\n') + hp = h / 100 test_hidden = [] for t in range(len(test)): @@ -70,34 +154,48 @@ def main(args): test_hidden.append(test[t].hide_aspects()) else: test_hidden.append(test[t]) - preprocessed_test, label_list = preprocess(test_hidden) + preprocessed_test, label_list = preprocess(test_hidden, 'test', args.lang) - with open(f'{path}_test.txt', 'w') as file: + with open(f'{path}test.txt', 'w', encoding='utf-8') as file: for d in preprocessed_test: file.write(d + '\n') - train, label_list = preprocess(np.array(org_reviews)[splits['folds']['0']['train']].tolist()) - path = f'{args.output}/{args.dname}' - with open(f'{path}_train.txt', 'w') as file: - for d in train: - file.write(d + '\n') - with open(f'{path}_train_label.txt', 'w') as file: - for d in label_list: - file.write(d + '\n') if __name__ == '__main__': parser = argparse.ArgumentParser(description='CAt Wrapper') - parser.add_argument('--dname', dest='dname', type=str, default='toy') + parser.add_argument('--dname', dest='dname', type=str, default='SemEval-14-R') parser.add_argument('--reviews', dest='reviews', type=str, - default='data/reviews.pkl', + default='data/2015SB12/reviews.pes_Arab.pkl', help='raw dataset file path') parser.add_argument('--splits', dest='splits', type=str, - default='data/splits.json', + default='data/2015SB12/splits.json', help='raw dataset file path') parser.add_argument('--output', dest='output', type=str, - default='data/', + default='data', help='output path') + parser.add_argument('--lang', dest='lang', type=str, + default='eng', + help='language') args = parser.parse_args() - main(args) + # 'SemEval14L','SemEval14R', '2015SB12', '2016SB5' + # 'output-twitter-modified' + # 'googletranslate-2015SB12','googletranslate-2016SB5','googletranslate-SemEval-14-L' + # for dataset in ['googletranslate-2015SB12','googletranslate-2016SB5','googletranslate-SemEval-14-L','googletranslate-SemEval-14-R', 'googletranslate-twitter']: + for dataset in ['lowresource-2015', 'lowresource-2016', 'lowresource-2014l', 'lowresource-2014r']: + args.splits = f'data/{dataset}/splits.json' + # for lang in []:'eng', 'pes_Arab', 'zho_Hans', 'deu_Latn', 'arb_Arab', 'fra_Latn', 'spa_Latn', + # # 'pes_Arab.zho_Hans.deu_Latn.arb_Arab.fra_Latn.spa_Latn' + for lang in ['lao_Laoo', 'san_Deva']: + # if lang == 'en': + if lang == 'eng': + args.lang = lang + args.dname = f'{dataset}' + args.reviews = f'data/{dataset}/reviews.pkl' + else: + args.lang = lang + args.dname = f'{dataset}-{lang}' + args.reviews = f'data/{dataset}/reviews.{lang}.pkl' + print(args) + main(args)