config_example.yaml

# Mode selector; this example uses 'train'.
mode: 'train'
# Cache folder path. The original inline comment lists "/backup/IR/cache"
# as an alternative value.
cache_folder: 'data'
corpora:
    # Name given to this corpus.
    name: 'small_bioasq'
    # Path of the corpus archive. Alternative value from the original comment:
    # /backup/pubmed_archive_json/pubmed_small.tar.gz
    # NOTE(review): the path uses a backslash separator, which is
    # Windows-specific - confirm the intended platform.
    folder: 'data\pubmed_small.tar.gz'
    # (optional) default is false
    files_are_compressed: true
queries:
    # Training query file. Alternative value from the original comment:
    # /backup/BioASQ-training7b
    train_file: 'data\small_train.json'
    # Validation query file.
    validation_file: 'data\small_validation.json'
# Sequence of pipeline stages. Each entry is a single-key mapping: the key
# names the stage and its value holds that stage's settings.
pipeline:
    - BM25_ES:
        top_k: 2500
        # Placeholder value - replace with a real "ip:port" address.
        address: "<ip:port>"
        tokenizer:
            Regex:
                n_process: 20
                stem: true
        evaluation: true
    - DeepRank:
        top_k: 10
        evaluation: true
        tokenizer:
            Regex:
                n_process: 20
                stem: false
                # Backslash separator is Windows-specific; single quotes keep
                # the backslash literal without escaping.
                sw_file: 'tokenizers\stop_words.json'
                # "sw" stands for stop words: one flag for queries, one for
                # articles.
                queries_sw: true
                articles_sw: false
        embedding:
            FastText:
                trainable: false
                path: 'data\fast_text.bin'
        input_network:
            Q: 13  # maximum number of query tokens
            P: 5  # maximum number of snippets per query token
            S: 15  # maximum number of snippet tokens
        measure_network:
            MeasureNetwork:
                activation: "selu"
                filters: 100
                # Canonical lowercase boolean, matching every other flag in
                # this file.
                gru_bidirectional: false
                gru_dim: 56
                kernel:
                    - 3  # x
                    - 3  # y
        aggregation_network:
            AggregationNetwork:
                activation: "selu"
        hyperparameters:
            epoch: 120
            batch_size: 128
            optimizer:
                name: "adadelta"  # (optional) default is AdaDelta
                learning_rate: 2
            l2_regularization: 0.0001  # (optional) default is 0.0001
            num_partially_positive_samples: 3
            num_negative_samples: 4