Skip to content

Commit

Permalink
Merge branch 'main' of github.com:iamgroot42/mimir into michael/default_load_hf
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael C. Duan committed Feb 20, 2024
2 parents 2fbe889 + c413117 commit 94aaa97
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 133 deletions.
6 changes: 3 additions & 3 deletions docs/config.html
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ <h1 class="title">Module <code>mimir.config</code></h1>
&#34;Dump data to cache? Exits program after dumping&#34;
load_from_cache: Optional[bool] = False
&#34;&#34;&#34;Load data from cache?&#34;&#34;&#34;
load_from_hf: Optional[bool] = False
load_from_hf: Optional[bool] = True
&#34;&#34;&#34;Load data from HuggingFace?&#34;&#34;&#34;
blackbox_attacks: Optional[List[str]] = field(
default_factory=lambda: None
Expand Down Expand Up @@ -352,7 +352,7 @@ <h3>Class variables</h3>
</dd>
<dt id="mimir.config.ExperimentConfig"><code class="flex name class">
<span>class <span class="ident">ExperimentConfig</span></span>
<span>(</span><span>experiment_name: str, base_model: str, dataset_member: str, dataset_nonmember: str, output_name: str = None, dataset_nonmember_other_sources: Optional[List[str]] = &lt;factory&gt;, pretokenized: Optional[bool] = False, revision: Optional[str] = None, presampled_dataset_member: Optional[str] = None, presampled_dataset_nonmember: Optional[str] = None, token_frequency_map: Optional[str] = None, dataset_key: Optional[str] = None, specific_source: Optional[str] = None, full_doc: Optional[bool] = False, max_substrs: Optional[int] = 20, dump_cache: Optional[bool] = False, load_from_cache: Optional[bool] = False, load_from_hf: Optional[bool] = False, blackbox_attacks: Optional[List[str]] = &lt;factory&gt;, tokenization_attack: Optional[bool] = False, quantile_attack: Optional[bool] = False, n_samples: Optional[int] = 200, max_tokens: Optional[int] = 512, max_data: Optional[int] = 5000, min_words: Optional[int] = 100, max_words: Optional[int] = 200, max_words_cutoff: Optional[bool] = True, batch_size: Optional[int] = 50, chunk_size: Optional[int] = 20, scoring_model_name: Optional[str] = None, top_k: Optional[int] = 40, do_top_k: Optional[bool] = False, top_p: Optional[float] = 0.96, do_top_p: Optional[bool] = False, pre_perturb_pct: Optional[float] = 0.0, pre_perturb_span_length: Optional[int] = 5, tok_by_tok: Optional[bool] = False, fpr_list: Optional[List[float]] = &lt;factory&gt;, random_seed: Optional[int] = 0, ref_config: Optional[<a title="mimir.config.ReferenceConfig" href="#mimir.config.ReferenceConfig">ReferenceConfig</a>] = None, neighborhood_config: Optional[<a title="mimir.config.NeighborhoodConfig" href="#mimir.config.NeighborhoodConfig">NeighborhoodConfig</a>] = None, env_config: Optional[<a title="mimir.config.EnvironmentConfig" href="#mimir.config.EnvironmentConfig">EnvironmentConfig</a>] = None, openai_config: Optional[<a title="mimir.config.OpenAIConfig" href="#mimir.config.OpenAIConfig">OpenAIConfig</a>] = None)</span>
<span>(</span><span>experiment_name: str, base_model: str, dataset_member: str, dataset_nonmember: str, output_name: str = None, dataset_nonmember_other_sources: Optional[List[str]] = &lt;factory&gt;, pretokenized: Optional[bool] = False, revision: Optional[str] = None, presampled_dataset_member: Optional[str] = None, presampled_dataset_nonmember: Optional[str] = None, token_frequency_map: Optional[str] = None, dataset_key: Optional[str] = None, specific_source: Optional[str] = None, full_doc: Optional[bool] = False, max_substrs: Optional[int] = 20, dump_cache: Optional[bool] = False, load_from_cache: Optional[bool] = False, load_from_hf: Optional[bool] = True, blackbox_attacks: Optional[List[str]] = &lt;factory&gt;, tokenization_attack: Optional[bool] = False, quantile_attack: Optional[bool] = False, n_samples: Optional[int] = 200, max_tokens: Optional[int] = 512, max_data: Optional[int] = 5000, min_words: Optional[int] = 100, max_words: Optional[int] = 200, max_words_cutoff: Optional[bool] = True, batch_size: Optional[int] = 50, chunk_size: Optional[int] = 20, scoring_model_name: Optional[str] = None, top_k: Optional[int] = 40, do_top_k: Optional[bool] = False, top_p: Optional[float] = 0.96, do_top_p: Optional[bool] = False, pre_perturb_pct: Optional[float] = 0.0, pre_perturb_span_length: Optional[int] = 5, tok_by_tok: Optional[bool] = False, fpr_list: Optional[List[float]] = &lt;factory&gt;, random_seed: Optional[int] = 0, ref_config: Optional[<a title="mimir.config.ReferenceConfig" href="#mimir.config.ReferenceConfig">ReferenceConfig</a>] = None, neighborhood_config: Optional[<a title="mimir.config.NeighborhoodConfig" href="#mimir.config.NeighborhoodConfig">NeighborhoodConfig</a>] = None, env_config: Optional[<a title="mimir.config.EnvironmentConfig" href="#mimir.config.EnvironmentConfig">EnvironmentConfig</a>] = None, openai_config: Optional[<a title="mimir.config.OpenAIConfig" href="#mimir.config.OpenAIConfig">OpenAIConfig</a>] = None)</span>
</code></dt>
<dd>
<div class="desc"><p>Config for attacks</p></div>
Expand Down Expand Up @@ -403,7 +403,7 @@ <h3>Class variables</h3>
&#34;Dump data to cache? Exits program after dumping&#34;
load_from_cache: Optional[bool] = False
&#34;&#34;&#34;Load data from cache?&#34;&#34;&#34;
load_from_hf: Optional[bool] = False
load_from_hf: Optional[bool] = True
&#34;&#34;&#34;Load data from HuggingFace?&#34;&#34;&#34;
blackbox_attacks: Optional[List[str]] = field(
default_factory=lambda: None
Expand Down
42 changes: 14 additions & 28 deletions docs/custom_datasets.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ <h1 class="title">Module <code>mimir.custom_datasets</code></h1>

DATASETS = [&#39;writing&#39;, &#39;english&#39;, &#39;german&#39;, &#39;pubmed&#39;]

SOURCES_UPLOADED = [
&#34;arxiv&#34;,
&#34;dm_mathematics&#34;,
&#34;github&#34;,
&#34;hackernews&#34;,
&#34;pile_cc&#34;,
&#34;pubmed_central&#34;,
&#34;wikipedia_(en)&#34;,
&#34;full_pile&#34;,
&#34;c4&#34;,
&#34;temporal_arxiv&#34;,
&#34;temporal_wiki&#34;
]


def load_pubmed(cache_dir):
data = datasets.load_dataset(&#39;pubmed_qa&#39;, &#39;pqa_labeled&#39;, split=&#39;train&#39;, cache_dir=cache_dir)
Expand Down Expand Up @@ -70,20 +84,6 @@ <h1 class="title">Module <code>mimir.custom_datasets</code></h1>
if not filename.startswith(&#34;the_pile&#34;):
raise ValueError(f&#34;HuggingFace data only available for The Pile.&#34;)

SOURCES_UPLOADED = [
&#34;arxiv&#34;,
&#34;dm_mathematics&#34;,
&#34;github&#34;,
&#34;hackernews&#34;,
&#34;pile_cc&#34;,
&#34;pubmed_central&#34;,
&#34;wikipedia_(en)&#34;,
&#34;full_pile&#34;,
&#34;c4&#34;,
&#34;temporal_arxiv&#34;,
&#34;temporal_wiki&#34;
]

for source in SOURCES_UPLOADED:
# Got a match
if source in filename and filename.startswith(f&#34;the_pile_{source}&#34;):
Expand Down Expand Up @@ -292,20 +292,6 @@ <h2 class="section-title" id="header-functions">Functions</h2>
if not filename.startswith(&#34;the_pile&#34;):
raise ValueError(f&#34;HuggingFace data only available for The Pile.&#34;)

SOURCES_UPLOADED = [
&#34;arxiv&#34;,
&#34;dm_mathematics&#34;,
&#34;github&#34;,
&#34;hackernews&#34;,
&#34;pile_cc&#34;,
&#34;pubmed_central&#34;,
&#34;wikipedia_(en)&#34;,
&#34;full_pile&#34;,
&#34;c4&#34;,
&#34;temporal_arxiv&#34;,
&#34;temporal_wiki&#34;
]

for source in SOURCES_UPLOADED:
# Got a match
if source in filename and filename.startswith(f&#34;the_pile_{source}&#34;):
Expand Down
38 changes: 34 additions & 4 deletions docs/data_utils.html
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ <h1 class="title">Module <code>mimir.data_utils</code></h1>


class Data:
&#34;&#34;&#34;
Data class to load and cache datasets.
&#34;&#34;&#34;
def __init__(self, name,
config: ExperimentConfig,
presampled: str = None,
Expand All @@ -65,6 +68,9 @@ <h1 class="title">Module <code>mimir.data_utils</code></h1>
model: str = &#34;bert&#34;,
in_place_swap: bool = False,
):
&#34;&#34;&#34;
Load neighbors from cache (local or from HF)
&#34;&#34;&#34;
data_split = &#34;train&#34; if train else &#34;test&#34;
data_split += &#34;_neighbors&#34;
filename = self._get_name_to_save() + &#34;_neighbors_{}_{}&#34;.format(
Expand Down Expand Up @@ -92,6 +98,9 @@ <h1 class="title">Module <code>mimir.data_utils</code></h1>
model: str = &#34;bert&#34;,
in_place_swap: bool = False,
):
&#34;&#34;&#34;
Dump neighbors to local cache.
&#34;&#34;&#34;
data_split = &#34;train&#34; if train else &#34;test&#34;
data_split += &#34;_neighbors&#34;
filename = self._get_name_to_save() + &#34;_neighbors_{}_{}&#34;.format(
Expand Down Expand Up @@ -338,6 +347,9 @@ <h1 class="title">Module <code>mimir.data_utils</code></h1>


def sourcename_process(x: str):
&#34;&#34;&#34;
Helper function to process source name.
&#34;&#34;&#34;
return x.replace(&#34; &#34;, &#34;_&#34;).replace(&#34;-&#34;, &#34;_&#34;).lower()


Expand Down Expand Up @@ -398,12 +410,15 @@ <h2 class="section-title" id="header-functions">Functions</h2>
<span>def <span class="ident">sourcename_process</span></span>(<span>x: str)</span>
</code></dt>
<dd>
<div class="desc"></div>
<div class="desc"><p>Helper function to process source name.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">def sourcename_process(x: str):
&#34;&#34;&#34;
Helper function to process source name.
&#34;&#34;&#34;
return x.replace(&#34; &#34;, &#34;_&#34;).replace(&#34;-&#34;, &#34;_&#34;).lower()</code></pre>
</details>
</dd>
Expand Down Expand Up @@ -476,12 +491,15 @@ <h2 class="section-title" id="header-classes">Classes</h2>
<span>(</span><span>name, config: <a title="mimir.config.ExperimentConfig" href="config.html#mimir.config.ExperimentConfig">ExperimentConfig</a>, presampled: str = None, name_key_mapping: dict = {'the_pile': 'text', 'xsum': 'document'})</span>
</code></dt>
<dd>
<div class="desc"></div>
<div class="desc"><p>Data class to load and cache datasets.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
</summary>
<pre><code class="python">class Data:
&#34;&#34;&#34;
Data class to load and cache datasets.
&#34;&#34;&#34;
def __init__(self, name,
config: ExperimentConfig,
presampled: str = None,
Expand All @@ -508,6 +526,9 @@ <h2 class="section-title" id="header-classes">Classes</h2>
model: str = &#34;bert&#34;,
in_place_swap: bool = False,
):
&#34;&#34;&#34;
Load neighbors from cache (local or from HF)
&#34;&#34;&#34;
data_split = &#34;train&#34; if train else &#34;test&#34;
data_split += &#34;_neighbors&#34;
filename = self._get_name_to_save() + &#34;_neighbors_{}_{}&#34;.format(
Expand Down Expand Up @@ -535,6 +556,9 @@ <h2 class="section-title" id="header-classes">Classes</h2>
model: str = &#34;bert&#34;,
in_place_swap: bool = False,
):
&#34;&#34;&#34;
Dump neighbors to local cache.
&#34;&#34;&#34;
data_split = &#34;train&#34; if train else &#34;test&#34;
data_split += &#34;_neighbors&#34;
filename = self._get_name_to_save() + &#34;_neighbors_{}_{}&#34;.format(
Expand Down Expand Up @@ -739,7 +763,7 @@ <h3>Methods</h3>
<span>def <span class="ident">dump_neighbors</span></span>(<span>self, data, train: bool, num_neighbors: int, model: str = 'bert', in_place_swap: bool = False)</span>
</code></dt>
<dd>
<div class="desc"></div>
<div class="desc"><p>Dump neighbors to local cache.</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
Expand All @@ -752,6 +776,9 @@ <h3>Methods</h3>
model: str = &#34;bert&#34;,
in_place_swap: bool = False,
):
&#34;&#34;&#34;
Dump neighbors to local cache.
&#34;&#34;&#34;
data_split = &#34;train&#34; if train else &#34;test&#34;
data_split += &#34;_neighbors&#34;
filename = self._get_name_to_save() + &#34;_neighbors_{}_{}&#34;.format(
Expand Down Expand Up @@ -967,7 +994,7 @@ <h3>Methods</h3>
<span>def <span class="ident">load_neighbors</span></span>(<span>self, train: bool, num_neighbors: int, model: str = 'bert', in_place_swap: bool = False)</span>
</code></dt>
<dd>
<div class="desc"></div>
<div class="desc"><p>Load neighbors from cache (local or from HF)</p></div>
<details class="source">
<summary>
<span>Expand source code</span>
Expand All @@ -979,6 +1006,9 @@ <h3>Methods</h3>
model: str = &#34;bert&#34;,
in_place_swap: bool = False,
):
&#34;&#34;&#34;
Load neighbors from cache (local or from HF)
&#34;&#34;&#34;
data_split = &#34;train&#34; if train else &#34;test&#34;
data_split += &#34;_neighbors&#34;
filename = self._get_name_to_save() + &#34;_neighbors_{}_{}&#34;.format(
Expand Down
Loading

0 comments on commit 94aaa97

Please sign in to comment.