Skip to content

Commit

Permalink
Release 1.0.0
Browse files Browse the repository at this point in the history
Release 1.0.0
  • Loading branch information
PrinsINT authored Jun 4, 2024
2 parents 35171aa + 27c4142 commit 3dbc5fa
Show file tree
Hide file tree
Showing 22 changed files with 784 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "galahad-corpus-data"]
path = galahad-corpus-data
url = https://github.com/INL/galahad-corpus-data
[submodule "galahad-taggers-dockerized"]
path = galahad-taggers-dockerized
url = https://github.com/INL/galahad-taggers-dockerized
2 changes: 1 addition & 1 deletion Readme.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# galahad-train-battery (0.9.1)
# galahad-train-battery (1.0.0)
Python program for training linguistic annotation taggers based on a configuration file and a list of datasets. It prepares the resulting trained models for dockerization and adds relevant metadata. It is tagger-software-agnostic, as long as a simple Python shell is built around the tagger.

### GaLAHaD-related Repositories
Expand Down
4 changes: 2 additions & 2 deletions codemeta.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"@context": "https://w3id.org/codemeta/3.0",
"@type": "SoftwareSourceCode",
"version": "0.9.1",
"dateModified": "2024-05-31",
"version": "1.0.0",
"dateModified": "2024-06-04",
"dateCreated": "2024-05-31",
"datePublished": "2024-05-31",
"applicationCategory": [
Expand Down
File renamed without changes.
14 changes: 14 additions & 0 deletions configs/pie/TDN-1400-1600/datasets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "1400-1600",
"datasets": [
"dbnl-excerpts-15",
"dbnl-excerpts-16",
"dictionary-quotations-15",
"dictionary-quotations-16",
"clvn"
],
"tagset": "TDN-Core",
"version": "1.0",
"eraFrom": "1400",
"eraTo": "1600"
}
113 changes: 113 additions & 0 deletions configs/pie/TDN-1600-1900/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"verbose": true,
"report_freq": 10000,
"modelname": "DEFINE_IN_ENVIRONMENT",
"modelpath": "DEFINE_IN_ENVIRONMENT",
"input_path": "DEFINE_IN_ENVIRONMENT",
"dev_path": "DEFINE_IN_ENVIRONMENT",
"test_path": "",
"breakline_ref": "pos",
"breakline_data": ".$",
"max_sent_len": 35,
"max_sents": 1000000,
"word_max_size": 50000,
"word_min_freq": 1,
"word_lower": false,
"char_max_size": 500,
"char_min_freq": 1,
"char_lower": false,
"char_eos": true,
"char_bos": true,
"utfnorm": false,
"utfnorm_type": "NFKD",
"drop_diacritics": false,
"header": false,
"sep": "\t",
"tasks_order": [
"pos",
"lemma"
],
"tasks": [
{
"name": "lemma",
"level": "char",
"decoder": "attentional",
"context": "sentence",
"layer": -1,
"settings": {
"bos": true,
"eos": true,
"lower": true,
"target": "lemma"
},
"target": true,
"default": "copy",
"read_only": false
},
{
"name": "pos",
"level": "token",
"decoder": "linear",
"context": "sentence",
"layer": -1,
"settings": {
"lower": false,
"target": "pos"
},
"target": false,
"default": "copy",
"read_only": false
}
],
"task_defaults": {
"level": "token",
"layer": -1,
"decoder": "linear",
"context": "sentence"
},
"patience": 5,
"factor": 0.5,
"threshold": 0,
"min_weight": 0,
"include_lm": true,
"lm_shared_softmax": true,
"lm_schedule": {
"patience": 2,
"factor": 0.5,
"weight": 0.2,
"mode": "min"
},
"buffer_size": 10000,
"cache_dataset": false,
"minimize_pad": false,
"epochs": 100,
"batch_size": 25,
"shuffle": true,
"device": "cuda",
"run_test": false,
"pretrain_embeddings": false,
"load_pretrained_embeddings": "",
"load_pretrained_encoder": "",
"freeze_embeddings": false,
"dropout": 0.25,
"word_dropout": 0,
"optimizer": "Adam",
"clip_norm": 5.0,
"lr": 0.001,
"lr_factor": 0.75,
"min_lr": 0.000001,
"lr_patience": 2,
"checks_per_epoch": 1,
"wemb_dim": 0,
"cemb_dim": 300,
"cemb_type": "rnn",
"custom_cemb_cell": false,
"cemb_layers": 2,
"merge_type": "concat",
"scorer": "general",
"linear_layers": 1,
"hidden_size": 150,
"num_layers": 1,
"cell": "GRU",
"init_rnn": "default"
}
17 changes: 17 additions & 0 deletions configs/pie/TDN-1600-1900/datasets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "1600-1900",
"datasets": [
"dbnl-excerpts-17",
"dbnl-excerpts-18",
"dbnl-excerpts-19",
"dictionary-quotations-17",
"dictionary-quotations-18",
"dictionary-quotations-19",
"couranten",
"letters-as-loot"
],
"tagset": "TDN-Core",
"version": "1.0",
"eraFrom": "1600",
"eraTo": "1900"
}
113 changes: 113 additions & 0 deletions configs/pie/TDN-ALL/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
{
"verbose": true,
"report_freq": 10000,
"modelname": "DEFINE_IN_ENVIRONMENT",
"modelpath": "DEFINE_IN_ENVIRONMENT",
"input_path": "DEFINE_IN_ENVIRONMENT",
"dev_path": "DEFINE_IN_ENVIRONMENT",
"test_path": "",
"breakline_ref": "pos",
"breakline_data": ".$",
"max_sent_len": 35,
"max_sents": 1000000,
"word_max_size": 50000,
"word_min_freq": 1,
"word_lower": false,
"char_max_size": 500,
"char_min_freq": 1,
"char_lower": false,
"char_eos": true,
"char_bos": true,
"utfnorm": false,
"utfnorm_type": "NFKD",
"drop_diacritics": false,
"header": false,
"sep": "\t",
"tasks_order": [
"pos",
"lemma"
],
"tasks": [
{
"name": "lemma",
"level": "char",
"decoder": "attentional",
"context": "sentence",
"layer": -1,
"settings": {
"bos": true,
"eos": true,
"lower": true,
"target": "lemma"
},
"target": true,
"default": "copy",
"read_only": false
},
{
"name": "pos",
"level": "token",
"decoder": "linear",
"context": "sentence",
"layer": -1,
"settings": {
"lower": false,
"target": "pos"
},
"target": false,
"default": "copy",
"read_only": false
}
],
"task_defaults": {
"level": "token",
"layer": -1,
"decoder": "linear",
"context": "sentence"
},
"patience": 5,
"factor": 0.5,
"threshold": 0,
"min_weight": 0,
"include_lm": true,
"lm_shared_softmax": true,
"lm_schedule": {
"patience": 2,
"factor": 0.5,
"weight": 0.2,
"mode": "min"
},
"buffer_size": 10000,
"cache_dataset": false,
"minimize_pad": false,
"epochs": 100,
"batch_size": 25,
"shuffle": true,
"device": "cuda",
"run_test": false,
"pretrain_embeddings": false,
"load_pretrained_embeddings": "",
"load_pretrained_encoder": "",
"freeze_embeddings": false,
"dropout": 0.25,
"word_dropout": 0,
"optimizer": "Adam",
"clip_norm": 5.0,
"lr": 0.001,
"lr_factor": 0.75,
"min_lr": 0.000001,
"lr_patience": 2,
"checks_per_epoch": 1,
"wemb_dim": 0,
"cemb_dim": 300,
"cemb_type": "rnn",
"custom_cemb_cell": false,
"cemb_layers": 2,
"merge_type": "concat",
"scorer": "general",
"linear_layers": 1,
"hidden_size": 150,
"num_layers": 1,
"cell": "GRU",
"init_rnn": "default"
}
22 changes: 22 additions & 0 deletions configs/pie/TDN-ALL/datasets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"name": "ALL",
"datasets": [
"dbnl-excerpts-15",
"dbnl-excerpts-16",
"dbnl-excerpts-17",
"dbnl-excerpts-18",
"dbnl-excerpts-19",
"dictionary-quotations-15",
"dictionary-quotations-16",
"dictionary-quotations-17",
"dictionary-quotations-18",
"dictionary-quotations-19",
"clvn",
"couranten",
"letters-as-loot"
],
"tagset": "TDN-Core",
"version": "1.0",
"eraFrom": "1400",
"eraTo": "1900"
}
Loading

0 comments on commit 3dbc5fa

Please sign in to comment.