From 934da2119d6ac96cd2ea76a59e14ce93e4b41a98 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 08:53:26 +0000 Subject: [PATCH 01/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 49 +++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 8bb192f3..1cdf1e6b 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import csv import logging import re @@ -28,7 +29,7 @@ def __init__(self, data_format: str) -> None: Args: data_format (str): The data format used. 'svm' for LibSVM format and 'txt' for LibMultiLabel format. """ - if not data_format in {'txt', 'svm'}: + if not data_format in {'txt', 'svm', 'dataframe'}: raise ValueError(f'unsupported data format {data_format}') self.data_format = data_format @@ -66,6 +67,8 @@ def load_data(self, training_file: str = None, if self.data_format == 'txt': data = self._load_txt(training_file, test_file, eval) + elif self.data_format == 'dataframe': + data = self._load_dataframe(training_file, test_file, eval) elif self.data_format == 'svm': data = self._load_svm(training_file, test_file, eval) @@ -89,10 +92,10 @@ def load_data(self, training_file: str = None, def _load_txt(self, training_file, test_file, eval) -> 'dict[str, dict]': datasets = defaultdict(dict) if test_file is not None: - test = read_libmultilabel_format(test_file) + test = read_libmultilabel_format_file(test_file) if not eval: - train = read_libmultilabel_format(training_file) + train = read_libmultilabel_format_file(training_file) self._generate_tfidf(train['text']) if self.classes or not self.include_test_labels: @@ -129,6 +132,30 @@ def _load_svm(self, training_file, test_file, eval) -> 'dict[str, dict]': datasets['test']['y'] = self.binarizer.transform(ty).astype('d') return dict(datasets) + def _load_dataframe(self, training_file, test_file, eval) -> 'dict[str, dict]': + datasets = defaultdict(dict) + if test_file is not None: + test = read_libmultilabel_format_dataframe(test_file) + + if not eval: + train = read_libmultilabel_format_dataframe(training_file) + self._generate_tfidf(train['text']) + + if self.classes or not self.include_test_labels: + self._generate_label_mapping(train['label'], self.classes) + else: + self._generate_label_mapping(train['label'] + test['label']) + datasets['train']['x'] = self.vectorizer.transform(train['text']) + datasets['train']['y'] = self.binarizer.transform( + train['label']).astype('d') + + if test_file is not None: + datasets['test']['x'] = self.vectorizer.transform(test['text']) + datasets['test']['y'] = self.binarizer.transform( + test['label']).astype('d') + + return dict(datasets) + def _generate_tfidf(self, texts): self.vectorizer = TfidfVectorizer() self.vectorizer.fit(texts) @@ -139,7 +166,21 @@ def _generate_label_mapping(self, labels, classes=None): self.binarizer.fit(labels) -def read_libmultilabel_format(path: str) -> 'dict[str,list[str]]': +def read_libmultilabel_format_dataframe(data: pd.DataFrame) -> 'dict[str,list[str]]': + data = data.astype(str) + if data.shape[1] == 2: + data.columns = ['label', 'text'] + data = data.reset_index() + elif data.shape[1] == 3: + data.columns = ['index', 'label', 'text'] + else: + raise ValueError(f'Expected 2 or 3 columns, got {data.shape[1]}.') + data['label'] = data['label'].map(lambda s: s.split()) + with open('test.json', 'w') as w: + json.dump(data.to_dict('list'), w) + return data.to_dict('list') + +def read_libmultilabel_format_file(path: str) -> 'dict[str,list[str]]': data = pd.read_csv(path, sep='\t', header=None, dtype=str, on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') From d1bf0802bef552300265d89b9e7adcadbd6cb380 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 09:22:56 +0000 Subject: [PATCH 02/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 1cdf1e6b..44e4fd97 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -1,6 +1,5 @@ from __future__ import annotations -import json import csv import logging import re @@ -236,10 +235,11 @@ def as_ints(str): except: raise ValueError( f'invalid svm format at line {i+1} of the file \'{file_path}\'') - + prob_x = scipy.frombuffer(prob_x, dtype='d') col_idx = scipy.frombuffer(col_idx, dtype='l') row_ptr = scipy.frombuffer(row_ptr, dtype='l') prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr)) return (prob_y, prob_x) + From 5a904b60cae43ae284bdb9f41c68185f42e2b050 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 09:24:17 +0000 Subject: [PATCH 03/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 44e4fd97..56a28091 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -235,11 +235,10 @@ def as_ints(str): except: raise ValueError( f'invalid svm format at line {i+1} of the file \'{file_path}\'') - + prob_x = scipy.frombuffer(prob_x, dtype='d') col_idx = scipy.frombuffer(col_idx, dtype='l') row_ptr = scipy.frombuffer(row_ptr, dtype='l') prob_x = sparse.csr_matrix((prob_x, col_idx, row_ptr)) return (prob_y, prob_x) - From 51d90ca97242416e1065155150cc816ca34b25fa Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 09:24:42 +0000 Subject: [PATCH 04/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 56a28091..1c5929d4 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -235,7 +235,6 @@ def as_ints(str): except: raise ValueError( f'invalid svm format at line {i+1} of the file \'{file_path}\'') - prob_x = scipy.frombuffer(prob_x, dtype='d') col_idx = scipy.frombuffer(col_idx, dtype='l') row_ptr = scipy.frombuffer(row_ptr, dtype='l') From 81d101a4bbb5ff29519ed2247eaa0f639c357b75 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 09:25:07 +0000 Subject: [PATCH 05/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 1c5929d4..633e7eb5 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -235,6 +235,7 @@ def as_ints(str): except: raise ValueError( f'invalid svm format at line {i+1} of the file \'{file_path}\'') + prob_x = scipy.frombuffer(prob_x, dtype='d') col_idx = scipy.frombuffer(col_idx, dtype='l') row_ptr = scipy.frombuffer(row_ptr, dtype='l') From 4e93e2363a21e1de86e6117fbdb5abb2db8909b0 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 09:25:45 +0000 Subject: [PATCH 06/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 633e7eb5..03093bdf 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -235,7 +235,7 @@ def as_ints(str): except: raise ValueError( f'invalid svm format at line {i+1} of the file \'{file_path}\'') - + prob_x = scipy.frombuffer(prob_x, dtype='d') col_idx = scipy.frombuffer(col_idx, dtype='l') row_ptr = scipy.frombuffer(row_ptr, dtype='l') From 74e401e45657ecaff2ca37875604b22d02ec790a Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 09:52:43 +0000 Subject: [PATCH 07/11] add api to read dataframe in libmultilabel format --- libmultilabel/linear/preprocessor.py | 50 ++++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 03093bdf..350204bf 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -33,8 +33,8 @@ def __init__(self, data_format: str) -> None: self.data_format = data_format - def load_data(self, training_file: str = None, - test_file: str = None, + def load_data(self, training_data: Union[str, pd.DataFrame] = None, + test_data: Union[str, pd.DataFrame] = None, eval: bool = False, label_file: str = None, include_test_labels: bool = False, @@ -42,8 +42,8 @@ def load_data(self, training_file: str = None, """Loads and preprocesses data. Args: - training_file (str): Training data file. Ignored if eval is True. Defaults to None. - test_file (str): Test data file. Ignored if test_file doesn't exist. Defaults to None. + training_data (str): Training data file or dataframe in LibMultiLabel format. Ignored if eval is True. Defaults to None. + test_data (str): Test data file or dataframe in LibMultiLabel format. Ignored if test_data doesn't exist. Defaults to None. eval (bool): If True, ignores training data and uses previously loaded state to preprocess test data. label_file (str, optional): Path to a file holding all labels. include_test_labels (bool, optional): Whether to include labels in the test dataset. Defaults to False. @@ -58,18 +58,18 @@ def load_data(self, training_file: str = None, with open(label_file, 'r') as fp: self.classes = sorted([s.strip() for s in fp.readlines()]) else: - if test_file is None and include_test_labels: + if test_data is None and include_test_labels: raise ValueError( f'Specified the inclusion of test labels but test file does not exist') self.classes = None self.include_test_labels = include_test_labels if self.data_format == 'txt': - data = self._load_txt(training_file, test_file, eval) + data = self._load_txt(training_data, test_data, eval) elif self.data_format == 'dataframe': - data = self._load_dataframe(training_file, test_file, eval) + data = self._load_dataframe(training_data, test_data, eval) elif self.data_format == 'svm': - data = self._load_svm(training_file, test_file, eval) + data = self._load_svm(training_data, test_data, eval) if 'train' in data: num_labels = data['train']['y'].getnnz(axis=1) @@ -77,24 +77,24 @@ def load_data(self, training_file: str = None, if num_no_label_data > 0: if remove_no_label_data: logging.info( - f'Remove {num_no_label_data} instances that have no labels from {training_file}.', + f'Remove {num_no_label_data} instances that have no labels from {training_data}.', extra={'collect': True}) data['train']['x'] = data['train']['x'][num_labels > 0] data['train']['y'] = data['train']['y'][num_labels > 0] else: logging.info( - f'Keep {num_no_label_data} instances that have no labels from {training_file}.', + f'Keep {num_no_label_data} instances that have no labels from {training_data}.', extra={'collect': True}) return data - def _load_txt(self, training_file, test_file, eval) -> 'dict[str, dict]': + def _load_txt(self, training_data, test_data, eval) -> 'dict[str, dict]': datasets = defaultdict(dict) - if test_file is not None: - test = read_libmultilabel_format_file(test_file) + if test_data is not None: + test = read_libmultilabel_format_file(test_data) if not eval: - train = read_libmultilabel_format_file(training_file) + train = read_libmultilabel_format_file(training_data) self._generate_tfidf(train['text']) if self.classes or not self.include_test_labels: @@ -105,20 +105,20 @@ def _load_txt(self, training_file, test_file, eval) -> 'dict[str, dict]': datasets['train']['y'] = self.binarizer.transform( train['label']).astype('d') - if test_file is not None: + if test_data is not None: datasets['test']['x'] = self.vectorizer.transform(test['text']) datasets['test']['y'] = self.binarizer.transform( test['label']).astype('d') return dict(datasets) - def _load_svm(self, training_file, test_file, eval) -> 'dict[str, dict]': + def _load_svm(self, training_data, test_data, eval) -> 'dict[str, dict]': datasets = defaultdict(dict) - if test_file is not None: - ty, tx = read_libsvm_format(test_file) + if test_data is not None: + ty, tx = read_libsvm_format(test_data) if not eval: - y, x = read_libsvm_format(training_file) + y, x = read_libsvm_format(training_data) if self.classes or not self.include_test_labels: self._generate_label_mapping(y, self.classes) else: @@ -126,18 +126,18 @@ def _load_svm(self, training_file, test_file, eval) -> 'dict[str, dict]': datasets['train']['x'] = x datasets['train']['y'] = self.binarizer.transform(y).astype('d') - if test_file is not None: + if test_data is not None: datasets['test']['x'] = tx datasets['test']['y'] = self.binarizer.transform(ty).astype('d') return dict(datasets) - def _load_dataframe(self, training_file, test_file, eval) -> 'dict[str, dict]': + def _load_dataframe(self, training_data, test_data, eval) -> 'dict[str, dict]': datasets = defaultdict(dict) - if test_file is not None: - test = read_libmultilabel_format_dataframe(test_file) + if test_data is not None: + test = read_libmultilabel_format_dataframe(test_data) if not eval: - train = read_libmultilabel_format_dataframe(training_file) + train = read_libmultilabel_format_dataframe(training_data) self._generate_tfidf(train['text']) if self.classes or not self.include_test_labels: @@ -148,7 +148,7 @@ def _load_dataframe(self, training_file, test_file, eval) -> 'dict[str, dict]': datasets['train']['y'] = self.binarizer.transform( train['label']).astype('d') - if test_file is not None: + if test_data is not None: datasets['test']['x'] = self.vectorizer.transform(test['text']) datasets['test']['y'] = self.binarizer.transform( test['label']).astype('d') From 89f42f97dfef3db39061b0033bfaa624a9979418 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Mon, 13 Feb 2023 11:14:43 +0000 Subject: [PATCH 08/11] update api document --- libmultilabel/linear/preprocessor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 350204bf..a094b721 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -26,7 +26,7 @@ def __init__(self, data_format: str) -> None: """Initializes the preprocessor. Args: - data_format (str): The data format used. 'svm' for LibSVM format and 'txt' for LibMultiLabel format. + data_format (str): The data format used. 'svm' for LibSVM format, 'txt' for LibMultiLabel format in file and 'dataframe' for LibMultiLabel format in dataframe . """ if not data_format in {'txt', 'svm', 'dataframe'}: raise ValueError(f'unsupported data format {data_format}') @@ -42,8 +42,8 @@ def load_data(self, training_data: Union[str, pd.DataFrame] = None, """Loads and preprocesses data. Args: - training_data (str): Training data file or dataframe in LibMultiLabel format. Ignored if eval is True. Defaults to None. - test_data (str): Test data file or dataframe in LibMultiLabel format. Ignored if test_data doesn't exist. Defaults to None. + training_data (Union[str, pd.DataFrame]): Training data file or dataframe in LibMultiLabel format. Ignored if eval is True. Defaults to None. + test_data (Union[str, pd.DataFrame]): Test data file or dataframe in LibMultiLabel format. Ignored if test_data doesn't exist. Defaults to None. eval (bool): If True, ignores training data and uses previously loaded state to preprocess test data. label_file (str, optional): Path to a file holding all labels. include_test_labels (bool, optional): Whether to include labels in the test dataset. Defaults to False. From a23b941682407c04bea605f8e82ef726320e7dff Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Tue, 14 Feb 2023 08:26:29 +0000 Subject: [PATCH 09/11] clear code --- libmultilabel/linear/preprocessor.py | 60 ++++++---------------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index a094b721..f9408e51 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -64,10 +64,8 @@ def load_data(self, training_data: Union[str, pd.DataFrame] = None, self.classes = None self.include_test_labels = include_test_labels - if self.data_format == 'txt': - data = self._load_txt(training_data, test_data, eval) - elif self.data_format == 'dataframe': - data = self._load_dataframe(training_data, test_data, eval) + if self.data_format == 'txt' or 'dataframe': + data = self._load_libmultilabel(training_data, test_data, eval) elif self.data_format == 'svm': data = self._load_svm(training_data, test_data, eval) @@ -88,13 +86,19 @@ def load_data(self, training_data: Union[str, pd.DataFrame] = None, return data - def _load_txt(self, training_data, test_data, eval) -> 'dict[str, dict]': + def _load_libmultilabel(self, training_data, test_data, eval) -> 'dict[str, dict]': datasets = defaultdict(dict) if test_data is not None: - test = read_libmultilabel_format_file(test_data) + if self.data_format == 'txt': + test_data = pd.read_csv(test_data, sep='\t', header=None, + on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') + test = read_libmultilabel_format(test_data) if not eval: - train = read_libmultilabel_format_file(training_data) + if self.data_format == 'txt': + training_data = pd.read_csv(training_data, sep='\t', header=None, + on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') + train = read_libmultilabel_format(training_data) self._generate_tfidf(train['text']) if self.classes or not self.include_test_labels: @@ -131,30 +135,6 @@ def _load_svm(self, training_data, test_data, eval) -> 'dict[str, dict]': datasets['test']['y'] = self.binarizer.transform(ty).astype('d') return dict(datasets) - def _load_dataframe(self, training_data, test_data, eval) -> 'dict[str, dict]': - datasets = defaultdict(dict) - if test_data is not None: - test = read_libmultilabel_format_dataframe(test_data) - - if not eval: - train = read_libmultilabel_format_dataframe(training_data) - self._generate_tfidf(train['text']) - - if self.classes or not self.include_test_labels: - self._generate_label_mapping(train['label'], self.classes) - else: - self._generate_label_mapping(train['label'] + test['label']) - datasets['train']['x'] = self.vectorizer.transform(train['text']) - datasets['train']['y'] = self.binarizer.transform( - train['label']).astype('d') - - if test_data is not None: - datasets['test']['x'] = self.vectorizer.transform(test['text']) - datasets['test']['y'] = self.binarizer.transform( - test['label']).astype('d') - - return dict(datasets) - def _generate_tfidf(self, texts): self.vectorizer = TfidfVectorizer() self.vectorizer.fit(texts) @@ -165,7 +145,7 @@ def _generate_label_mapping(self, labels, classes=None): self.binarizer.fit(labels) -def read_libmultilabel_format_dataframe(data: pd.DataFrame) -> 'dict[str,list[str]]': +def read_libmultilabel_format(data: pd.DataFrame) -> 'dict[str,list[str]]': data = data.astype(str) if data.shape[1] == 2: data.columns = ['label', 'text'] @@ -175,25 +155,9 @@ def read_libmultilabel_format_dataframe(data: pd.DataFrame) -> 'dict[str,list[st else: raise ValueError(f'Expected 2 or 3 columns, got {data.shape[1]}.') data['label'] = data['label'].map(lambda s: s.split()) - with open('test.json', 'w') as w: - json.dump(data.to_dict('list'), w) - return data.to_dict('list') -def read_libmultilabel_format_file(path: str) -> 'dict[str,list[str]]': - data = pd.read_csv(path, sep='\t', header=None, - dtype=str, - on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') - if data.shape[1] == 2: - data.columns = ['label', 'text'] - data = data.reset_index() - elif data.shape[1] == 3: - data.columns = ['index', 'label', 'text'] - else: - raise ValueError(f'Expected 2 or 3 columns, got {data.shape[1]}.') - data['label'] = data['label'].map(lambda s: s.split()) return data.to_dict('list') - def read_libsvm_format(file_path: str) -> 'tuple[list[list[int]], sparse.csr_matrix]': """Read multi-label LIBSVM-format data. From 644e1facd65359414c48745f3efa8ffa013726db Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Tue, 14 Feb 2023 08:30:03 +0000 Subject: [PATCH 10/11] clear code --- libmultilabel/linear/preprocessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index f9408e51..6e4dd413 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -155,7 +155,6 @@ def read_libmultilabel_format(data: pd.DataFrame) -> 'dict[str,list[str]]': else: raise ValueError(f'Expected 2 or 3 columns, got {data.shape[1]}.') data['label'] = data['label'].map(lambda s: s.split()) - return data.to_dict('list') def read_libsvm_format(file_path: str) -> 'tuple[list[list[int]], sparse.csr_matrix]': From 285f143175f9d71587dea6fabaac5d82f581ee17 Mon Sep 17 00:00:00 2001 From: Gordon119 Date: Thu, 16 Feb 2023 08:35:48 +0000 Subject: [PATCH 11/11] fix read_csv option --- libmultilabel/linear/preprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmultilabel/linear/preprocessor.py b/libmultilabel/linear/preprocessor.py index 6e4dd413..ba451ffb 100644 --- a/libmultilabel/linear/preprocessor.py +++ b/libmultilabel/linear/preprocessor.py @@ -91,13 +91,13 @@ def _load_libmultilabel(self, training_data, test_data, eval) -> 'dict[str, dict if test_data is not None: if self.data_format == 'txt': test_data = pd.read_csv(test_data, sep='\t', header=None, - on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') + error_bad_lines=False, warn_bad_lines=True, quoting=csv.QUOTE_NONE).fillna('') test = read_libmultilabel_format(test_data) if not eval: if self.data_format == 'txt': training_data = pd.read_csv(training_data, sep='\t', header=None, - on_bad_lines='skip', quoting=csv.QUOTE_NONE).fillna('') + error_bad_lines=False, warn_bad_lines=True, quoting=csv.QUOTE_NONE).fillna('') train = read_libmultilabel_format(training_data) self._generate_tfidf(train['text'])