ready for 0.0.6
Signed-off-by: Zhiyuan Chen <[email protected]>
ZhiyuanChen committed Dec 11, 2024
1 parent 8024dc7 commit d93b9e9
Showing 62 changed files with 1,644 additions and 342 deletions.
1 change: 1 addition & 0 deletions .codespell-whitelist.txt
@@ -1,3 +1,4 @@
datas
ser
marz
manuel
7 changes: 6 additions & 1 deletion docs/docs/about/license-faq.md
@@ -52,7 +52,12 @@ We also consider research papers and manuscripts a special form of documentation

Since research papers are considered a form of source code, publishers are legally required to open-source all materials on their server to comply with the _[License](license.md)_ if they publish papers using MultiMolecule. This is generally impractical for most publishers.

As a special exemption under section 7 of the _[License](license.md)_, we grant permission to publish research papers using MultiMolecule in fully open access journals, conferences, or preprint servers, provided all published manuscripts are made available under the [GNU Free Documentation License (GFDL)](https://www.gnu.org/licenses/fdl.html), or a [Creative Commons license](https://creativecommons.org), or an [OSI-approved license](https://opensource.org/licenses) that permits the sharing of manuscripts.
As a special exemption under section 7 of the _[License](license.md)_, we grant permission to publish research papers using MultiMolecule in fully open access journals, conferences, or preprint servers that do not charge authors any fee, provided all published manuscripts are made available under the [GNU Free Documentation License (GFDL)](https://www.gnu.org/licenses/fdl.html), a [Creative Commons license](https://creativecommons.org), or an [OSI-approved license](https://opensource.org/licenses) that permits the sharing of manuscripts.

As a special exemption under section 7 of the _[License](license.md)_, we grant permission to publish research papers using MultiMolecule in certain non-profit journals, conferences, or preprint servers. Currently, the non-profit journals, conferences, or preprint servers we allow include:

- All journals published by the American Association for the Advancement of Science (AAAS)
- eLife

For publishing in closed access journals or conferences, you must obtain a separate license from us. This typically involves co-authorship, a fee to support the project, or both. Contact us at [[email protected]](mailto:[email protected]) for more information.

7 changes: 6 additions & 1 deletion docs/docs/about/license-faq.zh.md
@@ -60,7 +60,12 @@

Since research papers are considered a form of source code, publishers must open-source all materials on their servers to comply with the _[License](license.zh.md)_ if they publish papers using MultiMolecule. This is impractical for most publishers.

As a special exemption under section 7 of the _[License](license.zh.md)_, we grant permission to publish research papers using MultiMolecule in fully open access journals, conferences, or preprint servers, provided all published manuscripts are made available under the [GNU Free Documentation License (GFDL)](https://www.gnu.org/licenses/fdl.html), a [Creative Commons license](https://creativecommons.org), or an [OSI-approved license](https://opensource.org/licenses) that permits the sharing of manuscripts.
As a special exemption under section 7 of the _[License](license.zh.md)_, we grant permission to publish research papers using MultiMolecule in fully open access journals, conferences, or preprint servers that do not charge authors any fee, provided all published manuscripts are made available under the [GNU Free Documentation License (GFDL)](https://www.gnu.org/licenses/fdl.html), a [Creative Commons license](https://creativecommons.org), or an [OSI-approved license](https://opensource.org/licenses) that permits the sharing of manuscripts.

As a special exemption under section 7 of the _[License](license.zh.md)_, we grant permission to publish research papers using MultiMolecule in certain non-profit journals, conferences, or preprint servers. Currently, the non-profit journals, conferences, or preprint servers we allow include:

- All journals published by the American Association for the Advancement of Science (AAAS)
- eLife

For publishing in closed access journals or conferences, you must obtain a separate license from us. This typically involves co-authorship, a fee to support the project, or both. Contact us at [[email protected]](mailto:[email protected]) for more information.

14 changes: 5 additions & 9 deletions multimolecule/__init__.py
@@ -111,19 +111,16 @@
HeadConfig,
HeadRegistry,
HeadTransformRegistry,
HeadTransformRegistryHF,
IdentityTransform,
LinearTransform,
MaskedLMHead,
MaskedLMHeadConfig,
NonLinearTransform,
PositionEmbeddingRegistry,
PositionEmbeddingRegistryHF,
PredictionHead,
RotaryEmbedding,
SequencePredictionHead,
SinusoidalEmbedding,
TokenHeadRegistryHF,
TokenKMerHead,
TokenPredictionHead,
)
@@ -132,9 +129,14 @@
from .utils import count_parameters

__all__ = [
"train",
"evaluate",
"infer",
"modeling_auto",
"modeling_outputs",
"Dataset",
"MultiMoleculeConfig",
"MultiMoleculeRunner",
"PreTrainedConfig",
"HeadConfig",
"BaseHeadConfig",
@@ -233,21 +235,15 @@
"HeadRegistry",
"PredictionHead",
"SequencePredictionHead",
"TokenHeadRegistryHF",
"TokenPredictionHead",
"TokenKMerHead",
"NucleotideHeadRegistryHF",
"NucleotidePredictionHead",
"NucleotideKMerHead",
"ContactPredictionHead",
"MaskedLMHead",
"HeadTransformRegistry",
"HeadTransformRegistryHF",
"LinearTransform",
"NonLinearTransform",
"IdentityTransform",
"PositionEmbeddingRegistry",
"PositionEmbeddingRegistryHF",
"RotaryEmbedding",
"SinusoidalEmbedding",
"Criterion",
10 changes: 9 additions & 1 deletion multimolecule/data/__init__.py
@@ -15,6 +15,14 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .dataset import Dataset
from .multitask import DistributedMultiTaskSampler, MultiTaskDataset, MultiTaskSampler
from .utils import no_collate

__all__ = ["Dataset", "no_collate"]
__all__ = [
"Dataset",
"MultiTaskDataset",
"MultiTaskSampler",
"DistributedMultiTaskSampler",
"no_collate",
]
105 changes: 66 additions & 39 deletions multimolecule/data/dataset.py
@@ -80,10 +80,13 @@ class Dataset(datasets.Dataset):
preprocess: Whether to preprocess the dataset.
Preprocessing involves pre-tokenizing the sequences using the tokenizer.
Defaults to `True`.
auto_rename_cols: Whether to automatically rename columns to standard names.
Only works when there is exactly one feature column / one label column.
You can control the naming through `multimolecule.defaults.SEQUENCE_COL_NAME` and
`multimolecule.defaults.LABEL_COL_NAME`.
auto_rename_sequence_col: Whether to automatically rename the sequence column to the standard name.
Only works when there is exactly one sequence column.
You can control the naming through `multimolecule.defaults.SEQUENCE_COL_NAME`.
For more refined control, use `column_names_map`.
auto_rename_label_col: Whether to automatically rename the label column to the standard name.
Only works when there is exactly one label column.
You can control the naming through `multimolecule.defaults.LABEL_COL_NAME`.
For more refined control, use `column_names_map`.
column_names_map: A mapping of column names to new column names.
This is useful for renaming columns to inputs that are expected by a model.
@@ -122,7 +125,8 @@ class Dataset(datasets.Dataset):
_discrete_map: Mapping

preprocess: bool = True
auto_rename_cols: bool = False
auto_rename_sequence_col: bool = True
auto_rename_label_col: bool = False
column_names_map: Mapping[str, str] | None = None
ignored_cols: List[str] = []

@@ -136,7 +140,8 @@ def __init__(
label_cols: List | None = None,
id_cols: List | None = None,
preprocess: bool | None = None,
auto_rename_cols: bool | None = None,
auto_rename_sequence_col: bool | None = None,
auto_rename_label_col: bool | None = None,
column_names_map: Mapping[str, str] | None = None,
truncation: bool | None = None,
max_seq_length: int | None = None,
@@ -149,8 +154,9 @@
fingerprint: str | None = None,
ignored_cols: List[str] | None = None,
):
self._tasks = NestedDict()
if tasks is not None:
self._tasks = NestedDict(tasks)
self.tasks = tasks
if discrete_map is not None:
self._discrete_map = discrete_map
arrow_table = self.build_table(
@@ -166,10 +172,12 @@
preprocess=preprocess,
truncation=truncation,
max_seq_length=max_seq_length,
auto_rename_cols=auto_rename_cols,
auto_rename_sequence_col=auto_rename_sequence_col,
auto_rename_label_col=auto_rename_label_col,
column_names_map=column_names_map,
)
self.ignored_cols = ignored_cols or self.id_cols
self.train = split == datasets.Split.TRAIN

def build_table(
self,
@@ -187,13 +195,13 @@ def build_table(
data = dl.load_pandas(data)
if isinstance(data, DataFrame):
data = data.loc[:, ~data.columns.str.contains("^Unnamed")]
data = pa.Table.from_pandas(data)
data = pa.Table.from_pandas(data, preserve_index=False)
elif isinstance(data, dict):
data = pa.Table.from_pydict(data)
elif isinstance(data, list):
data = pa.Table.from_pylist(data)
elif isinstance(data, DataFrame):
data = pa.Table.from_pandas(data)
data = pa.Table.from_pandas(data, preserve_index=False)
if feature_cols is not None and label_cols is not None:
data = data.select(feature_cols + label_cols)
data = self.process_nan(data, nan_process=nan_process, fill_value=fill_value)
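`build_table` funnels pandas DataFrames, dicts of columns, and lists of row dicts into a single pyarrow Table. A rough, dependency-free sketch of that normalization, with plain dicts of columns standing in for `pa.Table`:

```python
from collections.abc import Mapping


def to_columnar(data):
    """Normalize a dict of columns or a list of row dicts into one columnar
    dict, mirroring (in spirit) pa.Table.from_pydict / pa.Table.from_pylist."""
    if isinstance(data, Mapping):  # dict of columns
        return {k: list(v) for k, v in data.items()}
    if isinstance(data, (list, tuple)):  # list of row dicts
        keys = list(data[0]) if data else []
        return {k: [row.get(k) for row in data] for k in keys}
    raise TypeError(f"unsupported data type: {type(data)!r}")
```

Note the switch above to `pa.Table.from_pandas(data, preserve_index=False)`: by default pyarrow may serialize the DataFrame index as an extra column (e.g. `__index_level_0__`), which would otherwise leak into the dataset as a spurious feature.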
@@ -206,15 +214,17 @@ def post(
max_seq_length: int | None = None,
truncation: bool | None = None,
preprocess: bool | None = None,
auto_rename_cols: bool | None = None,
auto_rename_sequence_col: bool | None = None,
auto_rename_label_col: bool | None = None,
column_names_map: Mapping[str, str] | None = None,
) -> None:
r"""
Perform pre-processing steps after initialization.
It first identifies the special columns (sequence and structure columns) in the dataset.
Then it sets the feature and label columns based on the input arguments.
If `auto_rename_cols` is `True`, it will automatically rename the columns to model inputs.
If `auto_rename_sequence_col` is `True`, it will automatically rename the sequence column.
If `auto_rename_label_col` is `True`, it will automatically rename the label column.
Finally, it sets the [`transform`][datasets.Dataset.set_transform] function based on the `preprocess` flag.
"""
if tokenizer is None:
@@ -237,19 +247,24 @@ def post(
self.seq_length_offset += 1
if preprocess is not None:
self.preprocess = preprocess
if auto_rename_cols is not None:
self.auto_rename_cols = auto_rename_cols
if self.auto_rename_cols:
if column_names_map is not None:
raise ValueError("auto_rename_cols and column_names_map are mutually exclusive.")
if auto_rename_sequence_col is not None:
self.auto_rename_sequence_col = auto_rename_sequence_col
if auto_rename_label_col is not None:
self.auto_rename_label_col = auto_rename_label_col
if column_names_map is None:
column_names_map = {}
if len(self.feature_cols) == 1:
column_names_map[self.feature_cols[0]] = defaults.SEQUENCE_COL_NAME
if len(self.label_cols) == 1:
column_names_map[self.label_cols[0]] = defaults.LABEL_COL_NAME
if self.auto_rename_sequence_col:
if len(self.sequence_cols) != 1:
raise ValueError("auto_rename_sequence_col can only be used when there is exactly one sequence column.")
column_names_map[self.sequence_cols[0]] = defaults.SEQUENCE_COL_NAME # type: ignore[index]
if self.auto_rename_label_col:
if len(self.label_cols) != 1:
raise ValueError("auto_rename_label_col can only be used when there is exactly one label column.")
column_names_map[self.label_cols[0]] = defaults.LABEL_COL_NAME # type: ignore[index]
self.column_names_map = column_names_map
if self.column_names_map:
self.rename_columns(self.column_names_map)
self.infer_tasks()

if self.preprocess:
self.update(self.map(self.tokenization))
@@ -258,7 +273,7 @@ def post(
if self.discrete_map:
self.update(self.map(self.map_discrete))
fn_kwargs = {
"columns": [name for name, task in self.tasks.items() if task.level in ["nucleotide", "contact"]],
"columns": [name for name, task in self.tasks.items() if task.level in ["token", "contact"]],
"max_seq_length": self.max_seq_length - self.seq_length_offset,
}
if self.truncation and 0 < self.max_seq_length < 2**32:
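Per the `fn_kwargs` above, only label columns whose task level is `token` or `contact` need to be truncated together with the sequence, and the budget is reduced by `seq_length_offset` (the number of special tokens the tokenizer adds). A hedged sketch of such a truncation step; the column name `sequence` and the offset of 2 are assumptions for illustration:

```python
def truncate_example(example, token_level_cols, max_seq_length, seq_length_offset=2):
    """Truncate the sequence and every token-aligned label column consistently.

    seq_length_offset=2 assumes room for two special tokens (e.g. <cls>/<eos>);
    the real offset is derived from the tokenizer.
    """
    budget = max_seq_length - seq_length_offset
    out = dict(example)
    out["sequence"] = example["sequence"][:budget]
    for col in token_level_cols:
        # Token-level labels must stay aligned with the truncated sequence.
        out[col] = example[col][:budget]
    return out
```

Sequence-level labels are deliberately left untouched: they describe the whole example, not individual tokens, so truncation does not affect them.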
@@ -297,20 +312,23 @@ def collate(self, col: str, data: Any) -> Tensor | NestedTensor | None:
except ValueError:
return NestedTensor(data)

def infer_tasks(self, tasks: Mapping | None = None, sequence_col: str | None = None) -> NestedDict:
self._tasks = tasks or NestedDict()
def infer_tasks(self, sequence_col: str | None = None) -> NestedDict:
for col in self.label_cols:
if col not in self.tasks:
if col in self.secondary_structure_cols:
task = Task(TaskType.Binary, level=TaskLevel.Contact, num_labels=1)
self._tasks[col] = task # type: ignore[index]
warn(
f"Secondary structure columns are assumed to be {task}."
" Please explicitly specify the task if this is not the case."
)
else:
self._tasks[col] = self.infer_task(col, sequence_col) # type: ignore[index]
return self._tasks
if col in self.tasks:
continue
if col in self.secondary_structure_cols:
task = Task(TaskType.Binary, level=TaskLevel.Contact, num_labels=1)
self.tasks[col] = task # type: ignore[index]
warn(
f"Secondary structure columns are assumed to be {task}. "
"Please explicitly specify the task if this is not the case."
)
else:
try:
self.tasks[col] = self.infer_task(col, sequence_col)  # type: ignore[index]
except ValueError as e:
raise ValueError(f"Unable to infer task for column {col}.") from e
return self.tasks
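`infer_task` itself is not shown in this hunk. As an illustration only, inferring a task type from raw label values might look like the following; the real implementation also consults the sequence column to decide the task level (sequence vs. token vs. contact):

```python
from numbers import Real


def infer_task_kind(values):
    """Guess (task_type, num_labels) from raw label values. Illustrative only."""
    uniques = set(values)
    # Only 0/1 (or booleans) present: binary classification.
    if uniques <= {0, 1, True, False}:
        return "binary", 1
    # All integer-valued numbers: multiclass over the observed classes.
    if all(
        isinstance(v, Real) and not isinstance(v, bool) and float(v).is_integer()
        for v in uniques
    ):
        return "multiclass", len(uniques)
    # Anything else numeric falls back to regression.
    return "regression", 1
```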

def infer_task(self, label_col: str, sequence_col: str | None = None) -> Task:
if sequence_col is None:
@@ -346,8 +364,8 @@ def identify_special_cols(
all_cols = self.data.column_names
self._id_cols = id_cols or [i for i in all_cols if i in defaults.ID_COL_NAMES]

string_cols = [k for k, v in self.features.items() if k not in self.id_cols and v.dtype == "string"]
self._sequence_cols = [i for i in string_cols if i in defaults.SEQUENCE_COL_NAMES]
string_cols: list[str] = [k for k, v in self.features.items() if k not in self.id_cols and v.dtype == "string"]
self._sequence_cols = [i for i in string_cols if i.lower() in defaults.SEQUENCE_COL_NAMES]
self._secondary_structure_cols = [i for i in string_cols if i in defaults.SECONDARY_STRUCTURE_COL_NAMES]

data_cols = [i for i in all_cols if i not in self.id_cols]
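The change to `i.lower() in defaults.SEQUENCE_COL_NAMES` makes sequence-column detection case-insensitive, so a column named `Sequence` is now recognized. A small sketch with hypothetical stand-ins for the `defaults` name lists:

```python
# Hypothetical stand-ins for multimolecule.defaults name lists (assumption).
ID_COL_NAMES = {"id", "idx", "index"}
SEQUENCE_COL_NAMES = {"sequence", "seq", "rna", "dna"}


def find_sequence_cols(features):
    """features maps column name -> dtype string; match names case-insensitively."""
    string_cols = [k for k, v in features.items() if k not in ID_COL_NAMES and v == "string"]
    return [c for c in string_cols if c.lower() in SEQUENCE_COL_NAMES]
```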
@@ -404,7 +422,7 @@ def rename_columns(self, column_mapping: Mapping[str, str], new_fingerprint: str
self._label_cols = [column_mapping.get(i, i) for i in self.label_cols]
self._sequence_cols = [column_mapping.get(i, i) for i in self.sequence_cols]
self._secondary_structure_cols = [column_mapping.get(i, i) for i in self.secondary_structure_cols]
self._tasks = {column_mapping.get(k, k): v for k, v in self.tasks.items()}
self.tasks = {column_mapping.get(k, k): v for k, v in self.tasks.items()}
return self

def rename_column(
@@ -418,7 +436,7 @@ def rename_column(
self._secondary_structure_cols = [
new_column_name if i == original_column_name else i for i in self.secondary_structure_cols
]
self._tasks = {new_column_name if k == original_column_name else k: v for k, v in self.tasks.items()}
self.tasks = {new_column_name if k == original_column_name else k: v for k, v in self.tasks.items()}
return self

def process_nan(self, data: Table, nan_process: str | None, fill_value: str | int | float = 0) -> Table:
@@ -470,9 +488,18 @@ def secondary_structure_cols(self) -> List:
@property
def tasks(self) -> NestedDict:
if not hasattr(self, "_tasks"):
self._tasks = NestedDict()
return self.infer_tasks()
return self._tasks

@tasks.setter
def tasks(self, tasks: Mapping):
self._tasks = NestedDict()
for name, task in tasks.items():
if not isinstance(task, Task):
task = Task(**task)
self._tasks[name] = task
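The new `tasks` setter means callers can assign plain mappings and still read back proper `Task` objects. The property/setter coercion pattern in isolation, with a stand-in `Task` dataclass rather than the library's class:

```python
from dataclasses import dataclass


@dataclass
class Task:  # stand-in for multimolecule's Task (assumption)
    type: str
    level: str = "sequence"
    num_labels: int = 1


class HasTasks:
    @property
    def tasks(self) -> dict:
        # Lazily initialize so reads before any assignment still work.
        if not hasattr(self, "_tasks"):
            self._tasks = {}
        return self._tasks

    @tasks.setter
    def tasks(self, tasks: dict):
        # Coerce plain dicts into Task instances on assignment.
        self._tasks = {
            name: task if isinstance(task, Task) else Task(**task)
            for name, task in tasks.items()
        }
```

This keeps internal code free to assume `Task` instances everywhere, while the public interface accepts either form.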

@property
def discrete_map(self) -> Mapping:
if not hasattr(self, "_discrete_map"):