Add missing models to docs (#419)
* Add AdapterModel to Model docs

* Add NanotronLightevalModel to Model docs

* Fix nanotron imports and type hints

* Add VLLMModel to Model docs

---------

Co-authored-by: Clémentine Fourrier <[email protected]>
albertvillanova and clefourrier authored Dec 4, 2024
1 parent 6e2754e commit 3929825
Showing 3 changed files with 22 additions and 22 deletions.
19 changes: 8 additions & 11 deletions docs/source/package_reference/models.mdx
@@ -7,9 +7,8 @@
 ## Accelerate and Transformers Models
 ### BaseModel
 [[autodoc]] models.base_model.BaseModel
-[//]: # (TODO: Fix import error)
-[//]: # (### AdapterModel)
-[//]: # ([[autodoc]] models.adapter_model.AdapterModel)
+### AdapterModel
+[[autodoc]] models.adapter_model.AdapterModel
 ### DeltaModel
 [[autodoc]] models.delta_model.DeltaModel

@@ -19,12 +18,10 @@
 ### ModelClient
 [[autodoc]] models.tgi_model.ModelClient

-[//]: # (TODO: Fix import error)
-[//]: # (## Nanotron Model)
-[//]: # (### NanotronLightevalModel)
-[//]: # ([[autodoc]] models.nanotron_model.NanotronLightevalModel)
+## Nanotron Model
+### NanotronLightevalModel
+[[autodoc]] models.nanotron_model.NanotronLightevalModel

-[//]: # (TODO: Fix import error)
-[//]: # (## VLLM Model)
-[//]: # (### VLLMModel)
-[//]: # ([[autodoc]] models.vllm_model.VLLMModel)
+## VLLM Model
+### VLLMModel
+[[autodoc]] models.vllm_model.VLLMModel
19 changes: 11 additions & 8 deletions src/lighteval/config/lighteval_config.py
@@ -23,20 +23,23 @@
 from dataclasses import dataclass
 from typing import Dict, Optional, Union

-from nanotron.config import Config
-from nanotron.config.parallelism_config import ParallelismArgs
-from nanotron.generation.sampler import SamplerType
-from nanotron.logging import get_logger
+from lighteval.utils.imports import is_nanotron_available


-logger = get_logger(__name__)
+if is_nanotron_available():
+    from nanotron.config import Config
+    from nanotron.config.parallelism_config import ParallelismArgs
+    from nanotron.generation.sampler import SamplerType
+    from nanotron.logging import get_logger
+
+    logger = get_logger(__name__)

 DEFAULT_GENERATION_SEED = 42


 @dataclass
 class GenerationArgs:
-    sampler: Optional[Union[str, SamplerType]] = None
+    sampler: Optional[Union[str, "SamplerType"]] = None
     temperature: Optional[float] = None
     top_k: Optional[int] = None
     top_p: Optional[float] = None
@@ -89,12 +92,12 @@ class LightEvalConfig:

     logging: LightEvalLoggingArgs
     tasks: LightEvalTasksArgs
-    parallelism: ParallelismArgs
+    parallelism: "ParallelismArgs"
     batch_size: int = 0
     generation: Optional[Union[GenerationArgs, Dict[str, GenerationArgs]]] = None


 @dataclass
 class FullNanotronConfig:
     lighteval_config: LightEvalConfig
-    nanotron_config: Config
+    nanotron_config: "Config"
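
For readers skimming the diff, here is a minimal, self-contained sketch of the pattern this change adopts (assuming only the names visible above, is_nanotron_available and nanotron's SamplerType): the nanotron imports run only when the package is installed, and the type hints become string forward references so the module imports cleanly without nanotron. This is an illustration, not the actual lighteval_config.py.

# Sketch of the optional-dependency pattern used in this diff (assumed names).
from dataclasses import dataclass
from typing import Optional, Union

from lighteval.utils.imports import is_nanotron_available

if is_nanotron_available():
    # Imported only when nanotron is present, so plain `import lighteval`
    # (and the doc builder's [[autodoc]] collection) no longer fails.
    from nanotron.generation.sampler import SamplerType


@dataclass
class GenerationArgs:
    # The quoted "SamplerType" is a forward reference; it is only resolved when
    # the annotation is inspected, so this class is usable without nanotron.
    sampler: Optional[Union[str, "SamplerType"]] = None
    temperature: Optional[float] = None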
6 changes: 3 additions & 3 deletions src/lighteval/models/nanotron_model.py
@@ -78,7 +78,7 @@
     from nanotron.serialize import load_weights
     from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters

-logger = logging.get_logger(__name__)
+    logger = logging.get_logger(__name__)


 class NanotronLightevalModel(LightevalModel):
@@ -90,7 +90,7 @@ def __init__(
         self,
         checkpoint_path: str,
         nanotron_config: FullNanotronConfig,
-        parallel_context: ParallelContext,
+        parallel_context: "ParallelContext",
         max_gen_toks: Optional[int] = 256,
         max_length: Optional[int] = None,
         add_special_tokens: Optional[bool] = True,
@@ -591,7 +591,7 @@ def prepare_batch(
             input_ids=input_ids, input_mask=input_mask, input_lengths=input_lengths, truncated=truncated, padded=padded
         )

-    def gather(self, output_tensor: torch.Tensor, process_group: dist.ProcessGroup = None) -> torch.Tensor:
+    def gather(self, output_tensor: torch.Tensor, process_group: "dist.ProcessGroup" = None) -> torch.Tensor:
         """Gather together tensors of (possibly) various size spread on separate GPUs (first exchange the lengths and then pad and gather)"""
         if process_group is None:
             process_group = self.parallel_context.dp_pg
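
The gather docstring touched above describes a pad-then-gather scheme for tensors whose first dimensions differ across ranks. Below is a standalone sketch of that idea using plain torch.distributed; the helper name and the first-dimension convention are assumptions for illustration, not lighteval's actual implementation.

# Hedged sketch of "exchange lengths, then pad and gather" with torch.distributed.
from typing import Optional

import torch
import torch.distributed as dist


def gather_variable_length(t: torch.Tensor, group: Optional[dist.ProcessGroup] = None) -> torch.Tensor:
    world_size = dist.get_world_size(group=group)
    # 1) Exchange lengths so every rank knows the largest first dimension.
    local_len = torch.tensor([t.shape[0]], device=t.device, dtype=torch.long)
    all_lens = [torch.zeros_like(local_len) for _ in range(world_size)]
    dist.all_gather(all_lens, local_len, group=group)
    max_len = int(torch.stack(all_lens).max())
    # 2) Pad the local tensor to that common length.
    padded = torch.zeros((max_len, *t.shape[1:]), dtype=t.dtype, device=t.device)
    padded[: t.shape[0]] = t
    # 3) Gather the equally sized padded tensors, then trim the padding back off.
    gathered = [torch.zeros_like(padded) for _ in range(world_size)]
    dist.all_gather(gathered, padded, group=group)
    return torch.cat([g[: int(n)] for g, n in zip(gathered, all_lens)], dim=0)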