Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LC-501: standardize stage properties to camelCase #203

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion application-example.conf
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ pipelines: [
dest: ["output1", "output2", "output3"],

# stage-specific parameter
update_mode: "overwrite"
updateMode: "overwrite"
}
]
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
public enum UpdateMode {
APPEND("append"), OVERWRITE("overwrite"), SKIP("skip");

public static final String CONFIG_PATH = "update_mode";
public static final String CONFIG_PATH = "updateMode";
public static final UpdateMode DEFAULT = OVERWRITE;

private String text;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ public class AddRandomField extends Stage {
private List<String> uniqueValues;

public AddRandomField(Config config) throws StageException {
super(config, new StageSpec().withOptionalProperties("input_data_path", "field_name", "range_size", "min_num_of_terms",
"max_num_of_terms", "is_nested"));
this.inputDataPath = ConfigUtils.getOrDefault(config, "input_data_path", null);
this.fieldName = ConfigUtils.getOrDefault(config, "field_name", "data");
this.minNumOfTerms = ConfigUtils.getOrDefault(config, "min_num_of_terms", null);
this.maxNumOfTerms = ConfigUtils.getOrDefault(config, "max_num_of_terms", null);
this.isNested = ConfigUtils.getOrDefault(config, "is_nested", false);
super(config, new StageSpec().withOptionalProperties("inputDataPath", "fieldName", "rangeSize", "minNumOfTerms",
"maxNumOfTerms", "isNested"));
this.inputDataPath = ConfigUtils.getOrDefault(config, "inputDataPath", null);
this.fieldName = ConfigUtils.getOrDefault(config, "fieldName", "data");
this.minNumOfTerms = ConfigUtils.getOrDefault(config, "minNumOfTerms", null);
this.maxNumOfTerms = ConfigUtils.getOrDefault(config, "maxNumOfTerms", null);
this.isNested = ConfigUtils.getOrDefault(config, "isNested", false);
this.dataArr = null;
this.rangeSize = null;
this.uniqueValues = null;
Expand All @@ -86,7 +86,7 @@ public AddRandomField(Config config) throws StageException {
@Override
public void start() throws StageException {
this.dataArr = this.inputDataPath != null ? getFileData(this.inputDataPath) : null;
this.rangeSize = ConfigUtils.getOrDefault(config, "range_size",
this.rangeSize = ConfigUtils.getOrDefault(config, "rangeSize",
this.dataArr != null ? this.dataArr.size() : this.maxNumOfTerms);
this.uniqueValues = getUniqueValues(this.dataArr != null, this.dataArr);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
* - regex (String) : A regex expression to find matches for. Matches will be extracted and placed in the destination fields.
* If the regex includes capturing groups, the value of the first group will be used.
* <br>
* - update_mode (String. Optional) : Determines how writing will be handling if the destination field is already populated.
* - updateMode (String, Optional) : Determines how writing will be handled if the destination field is already populated.
* <br>
* Can be 'overwrite', 'append' or 'skip'. Defaults to 'overwrite'.
* - ignore_case (Boolean, Optional) : Determines whether the regex matcher should ignore case. Defaults to false.
* - ignoreCase (Boolean, Optional) : Determines whether the regex matcher should ignore case. Defaults to false.
* <br>
* - multiline (Boolean, Optional) : Determines whether the regex matcher should allow matches across multiple lines. Defaults to false.
* <br>
Expand All @@ -52,14 +52,14 @@ public class ApplyRegex extends Stage {

public ApplyRegex(Config config) {
super(config, new StageSpec().withRequiredProperties("source", "dest", "regex")
.withOptionalProperties("update_mode", "ignore_case", "multiline", "dotall", "literal"));
.withOptionalProperties("updateMode", "ignoreCase", "multiline", "dotall", "literal"));

this.sourceFields = config.getStringList("source");
this.destFields = config.getStringList("dest");
this.regexExpr = config.getString("regex");
this.updateMode = UpdateMode.fromConfig(config);

this.ignoreCase = ConfigUtils.getOrDefault(config, "ignore_case", false);
this.ignoreCase = ConfigUtils.getOrDefault(config, "ignoreCase", false);
this.multiline = ConfigUtils.getOrDefault(config, "multiline", false);
this.dotall = ConfigUtils.getOrDefault(config, "dotall", false);
this.literal = ConfigUtils.getOrDefault(config, "literal", false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ public class Base64Decode extends Stage {
private String outputField;

public Base64Decode(Config config) {
super(config, new StageSpec().withRequiredProperties("input_field", "output_field"));
inputField = config.getString("input_field");
outputField = config.getString("output_field");
super(config, new StageSpec().withRequiredProperties("inputField", "outputField"));
inputField = config.getString("inputField");
outputField = config.getString("outputField");
}

@Override
Expand Down
68 changes: 34 additions & 34 deletions lucille-core/src/main/java/com/kmwllc/lucille/stage/ChunkText.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,25 @@
* - source (String) : the field whose text this stage will split into chunks.
* - dest (String, optional): the name of the field that will hold the chunk contents in the children documents.
* Defaults to "chunk".
* - chunking_method (Type Enum, optional) : how to split contents in source. Defaults to Sentence chunking
* - chunkingMethod (Type Enum, optional) : how to split contents in source. Defaults to Sentence chunking
* 1. fixed chunking ("fixed"): split by variable lengthToSplit
* 2. paragraph chunking ("paragraph"): split by 2 consecutive line break sequence (\n, \r, \r\n) with optional whitespaces between,
* e.g. \n\n \n \n
* 3. sentence chunking ("sentence"): use openNLP sentence model for splitting
* 4. custom chunking ("custom"): regex option in config required, used to split content
* - regex (String, only for custom chunking): regEx that will be used to split chunks
* - length_to_split (Integer, only for fixed chunking): length of characters of each initial chunk before processing
* - pre_merge_min_chunk_len (Integer, optional): removes and append chunk to the neighboring chunk if below given number of characters,
* - lengthToSplit (Integer, only for fixed chunking): length of characters of each initial chunk before processing
* - preMergeMinChunkLen (Integer, optional): removes a chunk and appends it to a neighboring chunk if it is shorter than the given
* number of characters; by default it is appended to the next chunk.
* - pre_merge_max_chunk_len (Integer, optional): truncates the chunks if over given amount, applies before merging and overlapping
* - chunks_to_merge (Integer, optional) : how many chunks to merge into the final new Chunk before overlapping is taken place.
* - preMergeMaxChunkLen (Integer, optional): truncates the chunks if over given amount, applies before merging and overlapping
* - chunksToMerge (Integer, optional) : how many chunks to merge into the final new Chunk before overlapping is taken place.
* defaults to 1, keeping the chunks as they were after splitting.
* e.g. chunks_to_merge: 2 -> { chunk1/chunk2, chunk3/chunk4, chunk5/chunk6}
* - overlap_percentage (Integer, optional) : adds on neighboring chunk's content based on percentage of current chunk, defaults to 0
* - chunks_to_overlap (Integer, optional) : indicate the number of overlap of smaller chunks to overlap while merging into final chunk
* e.g. chunks_to_overlap: 1 -> { chunk1/chunk2/chunk3, chunk3/chunk4/chunk5, chunk5/chunk6/chunk7}
* chunks_to_overlap: 2 -> { chunk1/chunk2/chunk3, chunk2/chunk3/chunk4, chunk3/chunk4/chunk5}
* - character_limit (Integer, optional) : hard limit number of characters in the final chunk. Truncate rest. Performed after
* e.g. chunksToMerge: 2 -> { chunk1/chunk2, chunk3/chunk4, chunk5/chunk6}
* - overlapPercentage (Integer, optional) : adds on neighboring chunk's content based on percentage of current chunk, defaults to 0
* - chunksToOverlap (Integer, optional) : the number of neighboring smaller chunks to overlap with when merging into the final chunk
* e.g. chunksToOverlap: 1 -> { chunk1/chunk2/chunk3, chunk3/chunk4/chunk5, chunk5/chunk6/chunk7}
* chunksToOverlap: 2 -> { chunk1/chunk2/chunk3, chunk2/chunk3/chunk4, chunk3/chunk4/chunk5}
* - characterLimit (Integer, optional) : hard limit number of characters in the final chunk. Truncate rest. Performed after
* merging & overlapping if they are set.
*
* - child document fields:
Expand All @@ -53,25 +53,25 @@
* - "offset" : number of character offset from start of document
* - "length" : number of characters in this chunk
* - "chunk_number" : chunk number
* - "total_chunk_number" : total chunk number produced from parent document
* - "total_chunks" : total chunk number produced from parent document
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[question] Should these document field names be camel case too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kiratraynor To answer both of your questions: I had the same thought, and the short answer is that we can, but it would make writing a conversion script a nightmare. Some external libraries (like Tika for TikaExtractor) produce snake_case field names on Lucille documents, so any stage further down the pipeline that takes such a field name (like source: "field_produced_by_tika") cannot be changed by the conversion script. There is also the case where CSVConnector, SolrConnector, VFSConnector, and DatabaseConnector could produce fields in snake_case, meaning the script would have to know which fields not to convert, especially for CSVConnector and DatabaseConnector, whose field names depend on the input.

* - "chunk" : the chunk contents. field name can be changed with config option "dest"
*
* e.g. of paragraph chunking configuration, with a minimum size of 50 characters per chunk
* {
* source: "text"
* chunking_method: "paragraph"
* pre_merge_min_chunk_len: 50
* clean_chunks: true
* chunkingMethod: "paragraph"
* preMergeMinChunkLen: 50
* cleanChunks: true
* }
*
* e.g. of sentence chunking configuration with 5 sentences per chunk and 1 sentence of overlap, with a limit of 2000 characters
* {
* source: "text"
* chunking_method: "sentence"
* chunks_to_merge: 5
* chunks_to_overlap: 1
* clean_chunks: true
* character_limit: 2000
* chunkingMethod: "sentence"
* chunksToMerge: 5
* chunksToOverlap: 1
* cleanChunks: true
* characterLimit: 2000
* }
*/

Expand All @@ -94,25 +94,25 @@ public class ChunkText extends Stage {

public ChunkText(Config config) throws StageException {
super(config, new StageSpec()
.withOptionalProperties("chunking_method", "chunks_to_merge", "dest", "regex", "character_limit",
"clean_chunks", "overlap_percentage", "length_to_split", "pre_merge_min_chunk_len", "pre_merge_max_chunk_len",
"chunks_to_overlap")
.withOptionalProperties("chunkingMethod", "chunksToMerge", "dest", "regex", "characterLimit",
"cleanChunks", "overlapPercentage", "lengthToSplit", "preMergeMinChunkLen", "preMergeMaxChunkLen",
"chunksToOverlap")
.withRequiredProperties("source"));
this.source = config.getString("source");
this.dest = config.hasPath("dest") ? config.getString("dest") : "chunk";
this.method = ChunkingMethod.fromConfig(config);
this.regEx = config.hasPath("regex") ? config.getString("regex") : "";
this.lengthToSplit = config.hasPath("length_to_split") && config.getInt("length_to_split") > 0
? config.getInt("length_to_split") : null;
this.cleanChunks = config.hasPath("clean_chunks") ? config.getBoolean("clean_chunks") : false;
this.preMergeMinChunkLen = config.hasPath("pre_merge_min_chunk_len") && config.getInt("pre_merge_min_chunk_len") > 0
? config.getInt("pre_merge_min_chunk_len") : -1;
this.preMergeMaxChunkLen = config.hasPath("pre_merge_max_chunk_len") && config.getInt("pre_merge_max_chunk_len") > 0
? config.getInt("pre_merge_max_chunk_len") : -1;
this.chunksToMerge = config.hasPath("chunks_to_merge") ? config.getInt("chunks_to_merge") : 1;
this.chunksToOverlap = config.hasPath("chunks_to_overlap") ? config.getInt("chunks_to_overlap") : null;
this.overlapPercentage = config.hasPath("overlap_percentage") ? config.getInt("overlap_percentage") : 0;
this.characterLimit = config.hasPath("character_limit") ? config.getInt("character_limit") : -1;
this.lengthToSplit = config.hasPath("lengthToSplit") && config.getInt("lengthToSplit") > 0
? config.getInt("lengthToSplit") : null;
this.cleanChunks = config.hasPath("cleanChunks") ? config.getBoolean("cleanChunks") : false;
this.preMergeMinChunkLen = config.hasPath("preMergeMinChunkLen") && config.getInt("preMergeMinChunkLen") > 0
? config.getInt("preMergeMinChunkLen") : -1;
this.preMergeMaxChunkLen = config.hasPath("preMergeMaxChunkLen") && config.getInt("preMergeMaxChunkLen") > 0
? config.getInt("preMergeMaxChunkLen") : -1;
this.chunksToMerge = config.hasPath("chunksToMerge") ? config.getInt("chunksToMerge") : 1;
this.chunksToOverlap = config.hasPath("chunksToOverlap") ? config.getInt("chunksToOverlap") : null;
this.overlapPercentage = config.hasPath("overlapPercentage") ? config.getInt("overlapPercentage") : 0;
this.characterLimit = config.hasPath("characterLimit") ? config.getInt("characterLimit") : -1;
if (chunksToMerge < 1) {
throw new StageException("Chunks to merge configuration must be greater than 1 if merging chunks is desired or equal to 1 if undesired.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
*
* - source (List&lt;String&gt;) : list of source field names
* - dest (String) : Destination field. This Stage only supports supplying a single destination field.
* - format_string (String) : The format String, which will have field values substituted into its placeholders
* - formatString (String) : The format String, which will have field values substituted into its placeholders
* - defaultInputs (Map&lt;String, String&gt;, Optional) : Mapping of input fields to a default value. You do not have to
* supply a default for every input field, if a default is not provided, the default behavior will be to leave the
* wildcard for the field in place. Defaults to an empty Map.
* - update_mode (String, Optional) : Determines how writing will be handling if the destination field is already populated.
* - updateMode (String, Optional) : Determines how writing will be handled if the destination field is already populated.
* Can be 'overwrite', 'append' or 'skip'. Defaults to 'overwrite'.
*/
public class Concatenate extends Stage {
Expand All @@ -37,14 +37,14 @@ public class Concatenate extends Stage {

public Concatenate(Config config) {
super(config, new StageSpec()
.withRequiredProperties("dest", "format_string")
.withOptionalProperties("update_mode")
.withOptionalParents("default_inputs"));
.withRequiredProperties("dest", "formatString")
.withOptionalProperties("updateMode")
.withOptionalParents("defaultInputs"));

this.destField = config.getString("dest");
this.formatStr = config.getString("format_string");
this.defaultInputs = config.hasPath("default_inputs") ?
config.getConfig("default_inputs").root().unwrapped() : new HashMap<>();
this.formatStr = config.getString("formatString");
this.defaultInputs = config.hasPath("defaultInputs") ?
config.getConfig("defaultInputs").root().unwrapped() : new HashMap<>();
// defaultInputs = set.stream().collect(Collectors.toMap(Entry::getKey, Entry::getValue));
this.updateMode = UpdateMode.fromConfig(config);
this.fields = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* - source (List&lt;String&gt;) : list of source field names
* - dest (List&lt;String&gt;) : list of destination field names. You can either supply the same number of source and destination fields
* for a 1-1 mapping of results or supply one destination field for all of the source fields to be mapped into.
* - update_mode (String, Optional) : Determines how writing will be handling if the destination field is already populated.
* - updateMode (String, Optional) : Determines how writing will be handled if the destination field is already populated.
* Can be 'overwrite', 'append' or 'skip'. Defaults to 'overwrite'.
*/
public class CopyFields extends Stage {
Expand All @@ -33,7 +33,7 @@ public class CopyFields extends Stage {
public CopyFields(Config config) {
super(config, new StageSpec()
.withRequiredProperties("source", "dest")
.withOptionalProperties("update_mode"));
.withOptionalProperties("updateMode"));
this.sourceFields = config.getStringList("source");
this.destFields = config.getStringList("dest");
this.updateMode = UpdateMode.fromConfig(config);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* - dest (List&lt;String&gt;) : list of destination field names. You can either supply the same number of source and destination fields
* for a 1-1 mapping of results or supply one destination field for all of the source fields to be mapped into.
* - maxLength (Integer) : The maximum number of characters to include in the extracted teaser.
* - update_mode (String, Optional) : Determines how writing will be handling if the destination field is already populated.
* - updateMode (String, Optional) : Determines how writing will be handled if the destination field is already populated.
* Can be 'overwrite', 'append' or 'skip'. Defaults to 'overwrite'.
*/
public class CreateStaticTeaser extends Stage {
Expand All @@ -34,7 +34,7 @@ public class CreateStaticTeaser extends Stage {
public CreateStaticTeaser(Config config) {
super(config, new StageSpec()
.withRequiredProperties("source", "dest", "maxLength")
.withOptionalProperties("update_mode"));
.withOptionalProperties("updateMode"));

this.sourceFields = config.getStringList("source");
this.destFields = config.getStringList("dest");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
* - source (List&lt;String&gt;) : List of source field names.
* - dest (List&lt;String&gt;) : List of destination field names. You can either supply the same number of source and destination fields
* for a 1-1 mapping of results or supply one destination field for all of the source fields to be mapped into.
* - min_length (Integer) : The min length of Strings to be considered for language detection. Shorter Strings will be ignored.
* - max_length (Integer) : The max length of Strings to be considered for language detection. Longer Strings will be truncated.
* - min_probability (Double) : The min probability for a language result to be considered valid. Results below this threshold
* - minLength (Integer) : The min length of Strings to be considered for language detection. Shorter Strings will be ignored.
* - maxLength (Integer) : The max length of Strings to be considered for language detection. Longer Strings will be truncated.
* - minProbability (Double) : The min probability for a language result to be considered valid. Results below this threshold
* will be ignored.
*/
public class DetectLanguage extends Stage {
Expand All @@ -52,17 +52,17 @@ public class DetectLanguage extends Stage {
private Detector detector;

public DetectLanguage(Config config) {
super(config, new StageSpec().withRequiredProperties("source", "language_field")
.withOptionalProperties("language_confidence_field", "min_length", "max_length",
"min_probability", "update_mode"));
super(config, new StageSpec().withRequiredProperties("source", "languageField")
.withOptionalProperties("languageConfidenceField", "minLength", "maxLength",
"minProbability", "updateMode"));

this.sourceFields = config.getStringList("source");
this.languageField = config.getString("language_field");
this.languageConfidenceField = config.hasPath("language_confidence_field") ?
config.getString("language_confidence_field") : "languageConfidence";
this.minLength = config.hasPath("min_length") ? config.getInt("min_length") : 50;
this.maxLength = config.hasPath("max_length") ? config.getInt("max_length") : 10_000;
this.minProbability = config.hasPath("min_probability") ? config.getDouble("min_probability") : .95;
this.languageField = config.getString("languageField");
this.languageConfidenceField = config.hasPath("languageConfidenceField") ?
config.getString("languageConfidenceField") : "languageConfidence";
this.minLength = config.hasPath("minLength") ? config.getInt("minLength") : 50;
this.maxLength = config.hasPath("maxLength") ? config.getInt("maxLength") : 10_000;
this.minProbability = config.hasPath("minProbability") ? config.getDouble("minProbability") : .95;
this.updateMode = UpdateMode.fromConfig(config);
}

Expand Down
Loading
Loading