diff --git a/docssrc/source/tutorials/custom_cleaner/custom_cleaner.ipynb b/docssrc/source/tutorials/custom_cleaner/custom_cleaner.ipynb index 754476f6d..30c6811bf 100644 --- a/docssrc/source/tutorials/custom_cleaner/custom_cleaner.ipynb +++ b/docssrc/source/tutorials/custom_cleaner/custom_cleaner.ipynb @@ -409,7 +409,7 @@ "This can be done as follows\n", "\n", "\n", - "You can see individual cleaning functions in `lightwood.data.cleaner`. If you want to entirely replace a cleaning technique given a particular data-type, we invite you to change `lightwood.data.cleaner.get_cleaning_func` using the argument `custom_cleaning_functions`; in this dictionary, for a datatype (specified in `api.dtype`), you can assign your own function to override our defaults. " + "You can see individual cleaning functions in `lightwood.data.cleaner`. If you want to entirely replace a cleaning technique given a particular data-type, we invite you to change `lightwood.data.cleaner.get_cleaning_func` using the argument `custom_cleaning_functions`; in this dictionary, for a datatype (specified in `type_infer.dtype`), you can assign your own function to override our defaults." ] }, { @@ -438,8 +438,8 @@ "\n", "import numpy as np\n", "import pandas as pd\n", + "from type_infer.dtype import dtype\n", "\n", - "from lightwood.api.dtype import dtype\n", "from lightwood.helpers import text\n", "from lightwood.helpers.log import log\n", "from lightwood.api.types import TimeseriesSettings\n", diff --git a/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb b/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb index 736ee4a90..10f9a14e6 100644 --- a/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb +++ b/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb @@ -239,30 +239,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1462817:Dropping features: []\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Analyzing a sample of 6920\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:from a total population of 10668, this is equivalent to 64.9% of your data.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Using 7 processes to deduct types.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: model\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: year\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: price\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: transmission\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: fuelType\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: mileage\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: tax\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column year has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column price has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: mpg\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Infering type for: engineSize\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column tax has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column mileage has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column engineSize has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column mpg has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column transmission has data type 
categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column fuelType has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Column model has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Starting statistical analysis\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Finished statistical analysis\u001b[0m\n" + "\u001B[32mINFO:lightwood-1462817:Dropping features: []\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Analyzing a sample of 6920\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:from a total population of 10668, this is equivalent to 64.9% of your data.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Using 7 processes to deduct types.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: model\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: year\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: price\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: transmission\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: fuelType\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: mileage\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: tax\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column year has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column price has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: mpg\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Infering type for: engineSize\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column tax has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column mileage has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column engineSize has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column mpg has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column transmission has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column fuelType has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Column model has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Starting statistical analysis\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Finished statistical analysis\u001B[0m\n" ] } ], @@ -748,39 +748,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1462817:Performing statistical analysis on data\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Starting statistical analysis\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Finished statistical analysis\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1462817: `analyze_data` runtime: 0.14 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Cleaning the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1462817: `preprocess` runtime: 0.05 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Splitting the data into train/test\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1462817: `split` runtime: 0.0 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Preparing the encoders\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 1\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 2\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 3\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 4\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 5\u001b[0m\n", 
- "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 6\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 7\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 8\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Encoder prepping dict length of: 9\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Categories Detected = 1\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Categories Detected = 1\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Categories Detected = 1\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: price\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: model\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: year\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: transmission\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: mileage\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: fuelType\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: tax\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: mpg\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Done running for: engineSize\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1462817: `prepare` runtime: 0.16 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1462817:Featurizing the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1462817: `featurize` runtime: 0.0 seconds\u001b[0m\n" + "\u001B[32mINFO:lightwood-1462817:Performing statistical analysis on data\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Starting statistical analysis\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Finished statistical analysis\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1462817: `analyze_data` runtime: 0.14 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Cleaning the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1462817: `preprocess` runtime: 0.05 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Splitting the data into train/test\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1462817: `split` runtime: 0.0 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Preparing the encoders\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 1\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 2\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 3\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 4\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 5\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 6\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 7\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 8\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 9\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: price\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: model\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: year\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: transmission\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: mileage\u001B[0m\n", + 
"\u001B[32mINFO:lightwood-1462817:Done running for: fuelType\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: tax\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: mpg\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Done running for: engineSize\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1462817: `prepare` runtime: 0.16 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1462817:Featurizing the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1462817: `featurize` runtime: 0.0 seconds\u001B[0m\n" ] } ], @@ -967,4 +967,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docssrc/source/tutorials/custom_mixer/custom_mixer.ipynb b/docssrc/source/tutorials/custom_mixer/custom_mixer.ipynb index dba1e24f7..18b7b2ea6 100644 --- a/docssrc/source/tutorials/custom_mixer/custom_mixer.ipynb +++ b/docssrc/source/tutorials/custom_mixer/custom_mixer.ipynb @@ -67,7 +67,7 @@ "from lightwood.mixer import BaseMixer\n", "from lightwood.api.types import PredictionArguments\n", "from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs\n", - "from lightwood import dtype\n", + "from type_infer.dtype import dtype\n", "from lightwood.encoder import BaseEncoder\n", "\n", "import torch\n", @@ -144,40 +144,40 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1468487:Dropping features: []\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Analyzing a sample of 298\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:from a total population of 303, this is equivalent to 98.3% of your data.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Using 7 processes to deduct types.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: cp\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: age\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: sex\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: trestbps\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: chol\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: restecg\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: fbs\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column cp has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column chol has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column sex has data type binary\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column restecg has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column trestbps has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column age has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column fbs has data type binary\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: thalach\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: exang\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: ca\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: slope\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: oldpeak\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column thalach has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: thal\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column exang has data type binary\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column ca has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Infering type for: 
target\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column thal has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column target has data type binary\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column oldpeak has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Column slope has data type categorical\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Starting statistical analysis\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Finished statistical analysis\u001b[0m\n" + "\u001B[32mINFO:lightwood-1468487:Dropping features: []\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Analyzing a sample of 298\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:from a total population of 303, this is equivalent to 98.3% of your data.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Using 7 processes to deduct types.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: cp\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: age\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: sex\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: trestbps\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: chol\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: restecg\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: fbs\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column cp has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column chol has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column sex has data type binary\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column restecg has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column trestbps has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column age has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column fbs has data type binary\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: thalach\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: exang\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: ca\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: slope\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: oldpeak\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column thalach has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: thal\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column exang has data type binary\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column ca has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Infering type for: target\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column thal has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column target has data type binary\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column oldpeak has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Column slope has data type categorical\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Starting statistical analysis\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Finished statistical analysis\u001B[0m\n" ] }, { @@ -428,66 +428,66 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1468487:Dropping features: []\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Performing statistical analysis on data\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Starting statistical 
analysis\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Finished statistical analysis\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `analyze_data` runtime: 0.01 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Cleaning the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `preprocess` runtime: 0.01 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Splitting the data into train/test\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `split` runtime: 0.01 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Preparing the encoders\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 1\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 2\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 3\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 4\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 5\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 6\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 7\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 8\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 9\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 10\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 11\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 12\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 13\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoder prepping dict length of: 14\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: target\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: age\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: sex\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: cp\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: trestbps\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: chol\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: fbs\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: restecg\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: thalach\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: exang\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: oldpeak\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: slope\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: ca\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Done running for: thal\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `prepare` runtime: 0.17 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Featurizing the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `featurize` runtime: 0.0 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Training the mixers\u001b[0m\n", - 
"\u001b[32mINFO:lightwood-1468487:Ensembling the mixer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Mixer: RandomForestMixer got accuracy: 0.8348214285714286\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Picked best mixer: RandomForestMixer\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `fit` runtime: 1.39 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Analyzing the ensemble of mixers\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:The block ICP is now running its analyze() method\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:The block AccStats is now running its analyze() method\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:The block ConfStats is now running its analyze() method\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `analyze_ensemble` runtime: 0.02 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Adjustment on validation requested.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Updating the mixers\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `adjust` runtime: 0.0 seconds\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `learn` runtime: 1.62 seconds\u001b[0m\n" + "\u001B[32mINFO:lightwood-1468487:Dropping features: []\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Performing statistical analysis on data\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Starting statistical analysis\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Finished statistical analysis\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `analyze_data` runtime: 0.01 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Cleaning the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `preprocess` runtime: 0.01 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Splitting the data into train/test\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `split` runtime: 0.01 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Preparing the encoders\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 1\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 2\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 3\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 4\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 5\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 6\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 7\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 8\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 9\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 10\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 11\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 12\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 13\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoder prepping dict length of: 14\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Encoding UNKNOWN categories as index 0\u001B[0m\n", 
+ "\u001B[32mINFO:lightwood-1468487:Done running for: target\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: age\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: sex\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: cp\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: trestbps\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: chol\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: fbs\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: restecg\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: thalach\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: exang\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: oldpeak\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: slope\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: ca\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Done running for: thal\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `prepare` runtime: 0.17 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Featurizing the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `featurize` runtime: 0.0 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Training the mixers\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Ensembling the mixer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Mixer: RandomForestMixer got accuracy: 0.8348214285714286\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Picked best mixer: RandomForestMixer\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `fit` runtime: 1.39 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Analyzing the ensemble of mixers\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:The block ICP is now running its analyze() method\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:The block AccStats is now running its analyze() method\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:The block ConfStats is now running its analyze() method\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `analyze_ensemble` runtime: 0.02 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Adjustment on validation requested.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Updating the mixers\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `adjust` runtime: 0.0 seconds\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `learn` runtime: 1.62 seconds\u001B[0m\n" ] } ], @@ -518,17 +518,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1468487:Dropping features: []\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Cleaning the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `preprocess` runtime: 0.0 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:Featurizing the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1468487: `featurize` runtime: 0.0 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:The block ICP is now running its explain() method\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:The block AccStats is now running its explain() method\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:AccStats.explain() has not been implemented, no modifications will be done to the data insights.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:The block ConfStats is now running its explain() method\u001b[0m\n", - "\u001b[32mINFO:lightwood-1468487:ConfStats.explain() has not been implemented, no modifications will be done to the data insights.\u001b[0m\n", - 
"\u001b[37mDEBUG:lightwood-1468487: `predict` runtime: 0.03 seconds\u001b[0m\n" + "\u001B[32mINFO:lightwood-1468487:Dropping features: []\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Cleaning the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `preprocess` runtime: 0.0 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:Featurizing the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `featurize` runtime: 0.0 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:The block ICP is now running its explain() method\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:The block AccStats is now running its explain() method\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:AccStats.explain() has not been implemented, no modifications will be done to the data insights.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:The block ConfStats is now running its explain() method\u001B[0m\n", + "\u001B[32mINFO:lightwood-1468487:ConfStats.explain() has not been implemented, no modifications will be done to the data insights.\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1468487: `predict` runtime: 0.03 seconds\u001B[0m\n" ] }, { @@ -577,4 +577,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/docssrc/source/tutorials/custom_splitter/custom_splitter.ipynb b/docssrc/source/tutorials/custom_splitter/custom_splitter.ipynb index 0251e7dc1..0218297c7 100644 --- a/docssrc/source/tutorials/custom_splitter/custom_splitter.ipynb +++ b/docssrc/source/tutorials/custom_splitter/custom_splitter.ipynb @@ -372,74 +372,74 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1467152:Dropping features: []\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Analyzing a sample of 18424\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:from a total population of 284807, this is equivalent to 6.5% of your data.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Using 7 processes to deduct types.\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: Time\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V2\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V4\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V6\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V8\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V10\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V12\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column Time has data type integer\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V1\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V6 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V7\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V2 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V3\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V8 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V9\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V4 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V5\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V10 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V11\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V12 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V13\u001b[0m\n", - 
"\u001b[32mINFO:lightwood-1467152:Column V7 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V1 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V14\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V16\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V5 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V18\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V3 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V20\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V9 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V14 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V15\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V22\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V11 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V24\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V13 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V26\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V16 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V17\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V18 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V19\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V17 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V28\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V20 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V21\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V22 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V24 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V25\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V23\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V15 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: Class\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V26 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: V27\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column Class has data type binary\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V23 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V28 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Infering type for: Amount\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V19 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V21 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V27 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column V25 has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Column Amount has data type float\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Starting statistical analysis\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Finished statistical analysis\u001b[0m\n" + "\u001B[32mINFO:lightwood-1467152:Dropping features: []\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Analyzing a sample of 18424\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:from a total population of 284807, this is equivalent to 6.5% of 
your data.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Using 7 processes to deduct types.\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: Time\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V2\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V4\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V6\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V8\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V10\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V12\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column Time has data type integer\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V1\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V6 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V7\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V2 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V3\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V8 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V9\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V4 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V5\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V10 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V11\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V12 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V13\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V7 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V1 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V14\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V16\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V5 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V18\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V3 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V20\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V9 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V14 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V15\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V22\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V11 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V24\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V13 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V26\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V16 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V17\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V18 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V19\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V17 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V28\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V20 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V21\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V22 has data type 
float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V24 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V25\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V23\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V15 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: Class\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V26 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: V27\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column Class has data type binary\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V23 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V28 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Infering type for: Amount\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V19 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V21 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V27 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column V25 has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Column Amount has data type float\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Starting statistical analysis\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Finished statistical analysis\u001B[0m\n" ] } ], @@ -511,7 +511,7 @@ "source": [ "%%writefile MyCustomSplitter.py\n", "\n", - "from lightwood.api.dtype import dtype\n", + "from type_infer.dtype import dtype\n", "import pandas as pd\n", "import numpy as np\n", "from typing import List, Dict\n", @@ -1191,9 +1191,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32mINFO:lightwood-1467152:Cleaning the data\u001b[0m\n", - "\u001b[37mDEBUG:lightwood-1467152: `preprocess` runtime: 7.83 seconds\u001b[0m\n", - "\u001b[32mINFO:lightwood-1467152:Splitting the data into train/test\u001b[0m\n", + "\u001B[32mINFO:lightwood-1467152:Cleaning the data\u001B[0m\n", + "\u001B[37mDEBUG:lightwood-1467152: `preprocess` runtime: 7.83 seconds\u001B[0m\n", + "\u001B[32mINFO:lightwood-1467152:Splitting the data into train/test\u001B[0m\n", "\n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", @@ -1204,7 +1204,7 @@ "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - "\u001b[37mDEBUG:lightwood-1467152: `split` runtime: 2.04 seconds\u001b[0m\n" + "\u001B[37mDEBUG:lightwood-1467152: `split` runtime: 2.04 seconds\u001B[0m\n" ] } ], @@ -1311,4 +1311,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/docssrc/source/tutorials/tutorial_data_analysis/tutorial_data_analysis.ipynb b/docssrc/source/tutorials/tutorial_data_analysis/tutorial_data_analysis.ipynb index 3c4e5d1bd..6d9a00655 100644 --- a/docssrc/source/tutorials/tutorial_data_analysis/tutorial_data_analysis.ipynb +++ b/docssrc/source/tutorials/tutorial_data_analysis/tutorial_data_analysis.ipynb @@ -245,7 +245,7 @@ "source": [ "Notice how, even though we only defined what the `target` was, there are a bunch of additional parameters that have been assigned a default value. 
That is fine for our purposes, but remember that you can set any of these according to your own predictive needs.\n", "\n", - "We also need to infer the type of each column. There is a method for this, `infer_types`, that we can use:" + "We also need to infer the type of each column. There is a method for this, `type_infer.infer.infer_types`, that we can use:" ] }, { @@ -295,8 +295,8 @@ } ], "source": [ - "from lightwood.data import infer_types\n", - "from lightwood.api.types import TypeInformation\n", + "from type_infer.infer import infer_types\n", + "from type_infer.base import TypeInformation\n", "\n", "type_information = infer_types(df, problem_definition.pct_invalid)\n", "\n", diff --git a/lightwood/__about__.py b/lightwood/__about__.py index 6a45fe23c..dcdf8d0db 100755 --- a/lightwood/__about__.py +++ b/lightwood/__about__.py @@ -1,6 +1,6 @@ __title__ = 'lightwood' __package_name__ = 'lightwood' -__version__ = '22.10.4.2' +__version__ = '22.11.2.0' __description__ = "Lightwood is a toolkit for automatic machine learning model building" __email__ = "community@mindsdb.com" __author__ = 'MindsDB Inc' diff --git a/lightwood/__init__.py b/lightwood/__init__.py index 674a97fa3..1e6458c07 100644 --- a/lightwood/__init__.py +++ b/lightwood/__init__.py @@ -4,9 +4,9 @@ from lightwood.api import __all__ as api_all_list from lightwood.api import * # noqa from lightwood import data -from lightwood.data import infer_types, statistical_analysis +from lightwood.data import statistical_analysis from lightwood.__about__ import __package_name__ as name, __version__ os.environ['TOKENIZERS_PARALLELISM'] = 'true' -__all__ = ['data', 'infer_types', 'statistical_analysis', 'name', '__version__', *api_all_list] +__all__ = ['data', 'statistical_analysis', 'name', '__version__', *api_all_list] diff --git a/lightwood/analysis/helpers/acc_stats.py b/lightwood/analysis/helpers/acc_stats.py index 8ca07e9a3..b3ccc6d61 100644 --- a/lightwood/analysis/helpers/acc_stats.py +++ b/lightwood/analysis/helpers/acc_stats.py @@ -5,7 +5,7 @@ import numpy as np from sklearn.metrics import confusion_matrix -from lightwood.api.dtype import dtype +from type_infer.dtype import dtype from lightwood.analysis.base import BaseAnalysisBlock from lightwood.helpers.general import evaluate_accuracy from lightwood.helpers.log import log diff --git a/lightwood/analysis/nc/calibrate.py b/lightwood/analysis/nc/calibrate.py index 95c66594c..97dfdfb79 100644 --- a/lightwood/analysis/nc/calibrate.py +++ b/lightwood/analysis/nc/calibrate.py @@ -7,7 +7,7 @@ import pandas as pd from sklearn.preprocessing import OneHotEncoder -from lightwood.api.dtype import dtype +from type_infer.dtype import dtype from lightwood.api.types import PredictionArguments from lightwood.helpers.ts import add_tn_num_conf_bounds, add_tn_cat_conf_bounds, get_ts_groups diff --git a/lightwood/analysis/nc/norm.py b/lightwood/analysis/nc/norm.py index 3a7c32a4f..cde508c6c 100644 --- a/lightwood/analysis/nc/norm.py +++ b/lightwood/analysis/nc/norm.py @@ -6,7 +6,7 @@ from sklearn.linear_model import Ridge from sklearn.metrics import mean_absolute_error -from lightwood.api.dtype import dtype +from type_infer.dtype import dtype from lightwood.mixer import BaseMixer from lightwood.api.types import PredictionArguments from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs diff --git a/lightwood/analysis/nc/util.py b/lightwood/analysis/nc/util.py index fc2554f0d..1fc6dba1e 100644 --- a/lightwood/analysis/nc/util.py +++ b/lightwood/analysis/nc/util.py @@ -5,7 +5,7 @@ import 
pandas as pd from torch.nn.functional import softmax -from lightwood.api.dtype import dtype +from type_infer.dtype import dtype def t_softmax(x, t=1.0, axis=1): diff --git a/lightwood/api/__init__.py b/lightwood/api/__init__.py index 919555c57..05b130ccd 100644 --- a/lightwood/api/__init__.py +++ b/lightwood/api/__init__.py @@ -1,7 +1,6 @@ -from lightwood.api.dtype import dtype +from type_infer.dtype import dtype from lightwood.api.types import ( JsonAI, - TypeInformation, StatisticalAnalysis, ProblemDefinition, TimeseriesSettings, @@ -29,7 +28,6 @@ "code_from_json_ai", "json_ai_from_problem", "JsonAI", - "TypeInformation", "StatisticalAnalysis", "ProblemDefinition", "TimeseriesSettings", diff --git a/lightwood/api/dtype.py b/lightwood/api/dtype.py deleted file mode 100644 index 59ca35672..000000000 --- a/lightwood/api/dtype.py +++ /dev/null @@ -1,48 +0,0 @@ -class dtype: - """ - Definitions of all data types currently supported. Dtypes currently supported include: - - - - **Numerical**: Data that should be represented in the form of a number. Currently ``integer``, ``float``, and ``quantity`` are supported. - - **Categorical**: Data that represents a class or label and is discrete. Currently ``binary``, ``categorical``, and ``tags`` are supported. - - **Date/Time**: Time-series data that is temporal/sequential. Currently ``date``, and ``datetime`` are supported. - - **Text**: Data that can be considered as language information. Currently ``short_text``, and ``rich_text`` are supported. Short text has a small vocabulary (~ 100 words) and is generally a limited number of characters. Rich text is anything with greater complexity. - - **Complex**: Data types that require custom techniques. Currently ``audio``, ``video`` and ``image`` are available, but highly experimental. - - **Array**: Data in the form of a sequence where order must be preserved. ``tsarray`` dtypes are for "normal" columns that will be transformed to arrays at a row-level because they will be treated as time series. - - **Miscellaneous**: Miscellaneous data descriptors include ``empty``, an explicitly unknown value versus ``invalid``, a data type not currently supported. - - Custom data types may be implemented here as a flag for subsequent treatment and processing. You are welcome to include your own definitions, so long as they do not override the existing type names (alternatively, if you do, please edit subsequent parts of the preprocessing pipeline to correctly indicate how you want to deal with these data types). 
- """ # noqa - - # Numerical type data - integer = "integer" - float = "float" - quantity = "quantity" - - # Categorical type data - binary = "binary" - categorical = "categorical" - tags = "tags" - - # Dates and Times (time-series) - date = "date" - datetime = "datetime" - - # Text - short_text = "short_text" - rich_text = "rich_text" - - # Complex Data types - image = "image" - audio = "audio" - video = "video" - - # Series/Sequences - num_array = "num_array" - cat_array = "cat_array" - num_tsarray = 'num_tsarray' - cat_tsarray = 'cat_tsarray' - - # Misc (Unk/NaNs) - empty = "empty" - invalid = "invalid" diff --git a/lightwood/api/high_level.py b/lightwood/api/high_level.py index c4d8ce2f1..14de7f8f9 100644 --- a/lightwood/api/high_level.py +++ b/lightwood/api/high_level.py @@ -5,7 +5,7 @@ import pandas as pd from lightwood.api.types import DataAnalysis, JsonAI, ProblemDefinition from lightwood.data import statistical_analysis -from lightwood.data import infer_types +from type_infer.infer import infer_types from lightwood.api.predictor import PredictorInterface from lightwood.api.json_ai import generate_json_ai import tempfile diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 2f19a2b98..3768c603e 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1,11 +1,11 @@ # TODO: _add_implicit_values unit test ensures NO changes for a fully specified file. from copy import deepcopy +from type_infer.base import TypeInformation from lightwood.helpers.templating import call, inline_dict, align from lightwood.helpers.templating import _consolidate_analysis_blocks from lightwood.api import dtype from lightwood.api.types import ( JsonAI, - TypeInformation, StatisticalAnalysis, ProblemDefinition, ) @@ -183,7 +183,7 @@ def generate_json_ai( """ Given ``TypeInformation``, ``StatisticalAnalysis``, and the ``ProblemDefinition``, generate a JSON config file with the necessary elements of the ML pipeline populated. - :param TypeInformation: Specifies what data types each column within the dataset are + :param TypeInformation: Specifies what data types each column within the dataset are. Generated by `mindsdb/type_infer`. :param statistical_analysis: :param problem_definition: Specifies details of the model training/building procedure, as defined by ``ProblemDefinition`` diff --git a/lightwood/api/types.py b/lightwood/api/types.py index 2453ce05e..9e54c3073 100644 --- a/lightwood/api/types.py +++ b/lightwood/api/types.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union import sys -from lightwood.api.dtype import dtype +from type_infer.dtype import dtype if sys.version_info >= (3, 8): from typing import TypedDict @@ -14,6 +14,7 @@ from lightwood.helpers.log import log from dataclasses_json import dataclass_json from dataclasses_json.core import _asdict, Json +from type_infer.base import TypeInformation import json @@ -30,29 +31,6 @@ class Module(TypedDict): args: Dict[str, str] -@dataclass_json -@dataclass -class TypeInformation: - """ - For a dataset, provides information on columns types, how they're used, and any other potential identifiers. - - TypeInformation is generated within ``data.infer_types``, where small samples of each column are evaluated in a custom framework to understand what kind of data type the model is. The user may override data types, but it is recommended to do so within a JSON-AI config file. - - :param dtypes: For each column's name, the associated data type inferred. 
-    :param additional_info: Any possible sub-categories or additional descriptive information.
-    :param identifiers: Columns within the dataset highly suspected of being identifiers or IDs. These do not contain informatic value, therefore will be ignored in subsequent training/analysis procedures unless manually indicated.
-    """ # noqa
-
-    dtypes: Dict[str, str]
-    additional_info: Dict[str, object]
-    identifiers: Dict[str, str]
-
-    def __init__(self):
-        self.dtypes = dict()
-        self.additional_info = dict()
-        self.identifiers = dict()
-
-
 @dataclass_json
 @dataclass
 class StatisticalAnalysis:
diff --git a/lightwood/data/__init__.py b/lightwood/data/__init__.py
index 1fe88866c..b164142b3 100644
--- a/lightwood/data/__init__.py
+++ b/lightwood/data/__init__.py
@@ -1,4 +1,3 @@
-from lightwood.data.infer_types import infer_types
 from lightwood.data.statistical_analysis import statistical_analysis
 from lightwood.data.cleaner import cleaner
 from lightwood.data.splitter import splitter
@@ -6,5 +5,5 @@
 from lightwood.data.timeseries_analyzer import timeseries_analyzer
 from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs

-__all__ = ['infer_types', 'statistical_analysis', 'cleaner', 'splitter', 'transform_timeseries', 'timeseries_analyzer',
+__all__ = ['statistical_analysis', 'cleaner', 'splitter', 'transform_timeseries', 'timeseries_analyzer',
            'EncodedDs', 'ConcatedEncodedDs']
diff --git a/lightwood/data/cleaner.py b/lightwood/data/cleaner.py
index a576ce105..b7cba6e72 100644
--- a/lightwood/data/cleaner.py
+++ b/lightwood/data/cleaner.py
@@ -7,12 +7,12 @@
 import numpy as np
 import pandas as pd

-from lightwood.api.dtype import dtype
-from lightwood.helpers import text
+from type_infer.dtype import dtype
+from type_infer.helpers import is_nan_numeric, clean_float
+
 from lightwood.helpers.log import log
 from lightwood.helpers.imputers import BaseImputer
 from lightwood.api.types import TimeseriesSettings
-from lightwood.helpers.numeric import is_nan_numeric


 def cleaner(
@@ -244,7 +244,7 @@ def _clean_float(element: object) -> Optional[float]:
     Given an element, converts it into float numeric format. If element is NaN, or inf, then returns None.
     """
     try:
-        cleaned_float = text.clean_float(element)
+        cleaned_float = clean_float(element)
         if is_nan_numeric(cleaned_float):
             return None
         return cleaned_float
diff --git a/lightwood/data/infer_types.py b/lightwood/data/infer_types.py
deleted file mode 100644
index 15ef9597b..000000000
--- a/lightwood/data/infer_types.py
+++ /dev/null
@@ -1,432 +0,0 @@
-from collections import Counter
-import random
-from typing import List
-import dateutil
-from scipy.stats import norm
-import pandas as pd
-import numpy as np
-import imghdr
-import sndhdr
-import multiprocessing as mp
-from lightwood.api.types import TypeInformation
-from lightwood.api.dtype import dtype
-from lightwood.helpers.parallelism import get_nr_procs
-from lightwood.helpers.text import (get_identifier_description_mp, cast_string_to_python_type, get_language_dist,
-                                    analyze_sentences)
-from lightwood.helpers.log import log
-import re
-from lightwood.helpers.numeric import is_nan_numeric
-from lightwood.helpers.seed import seed
-
-
-# @TODO: hardcode for distance, time, subunits of currency (e.g. cents) and other common units
-# @TODO: The json ml will contain the pattern we want to extract out of our quantity column, for the user modify (unit+multiplier) # noqa
-# @TODO: Add tests with plenty of examples
-def get_quantity_col_info(col_data: List[object]) -> str:
-    char_const = None
-    nr_map = set()
-    for val in col_data:
-        val = str(val)
-        char_part = re.sub("[0-9.,]", '', val)
-        numeric_bit = re.sub("[^0-9.,]", '', val).replace(',', '.')
-
-        if len(char_part) == 0:
-            char_part = None
-
-        if len(re.sub("[^0-9]", '', numeric_bit)) == 0 or numeric_bit.count('.') > 1:
-            numeric_bit = None
-        else:
-            numeric_bit = float(numeric_bit)
-
-        if numeric_bit is None:
-            return False, None
-        else:
-            nr_map.add(numeric_bit)
-
-        if char_const is None:
-            char_const = char_part
-
-        if char_part is None or char_part != char_const:
-            return False, None
-
-    if len(nr_map) > 20 and len(nr_map) > len(col_data) / 200:
-        return True, {char_const: {
-            'multiplier': 1
-        }}
-    else:
-        return False, None
-
-
-def get_binary_type(element: object) -> str:
-    try:
-        is_img = imghdr.what(element)
-        if is_img is not None:
-            return dtype.image
-
-        # @TODO: currently we don differentiate between audio and video
-        is_audio = sndhdr.what(element)
-        # apparently `sndhdr` is really bad..
-        for audio_ext in ['.wav', '.mp3']:
-            if element.endswith(audio_ext):
-                is_audio = True
-        if is_audio is not None:
-            return dtype.audio
-    except Exception:
-        # Not a file or file doesn't exist
-        return None
-
-
-def get_numeric_type(element: object) -> str:
-    """ Returns the subtype inferred from a number string, or False if its not a number"""
-    string_as_nr = cast_string_to_python_type(str(element))
-
-    try:
-        if string_as_nr == int(string_as_nr):
-            string_as_nr = int(string_as_nr)
-    except Exception:
-        pass
-
-    if isinstance(string_as_nr, float):
-        return dtype.float
-    elif isinstance(string_as_nr, int):
-        return dtype.integer
-    else:
-        try:
-            if is_nan_numeric(element):
-                return dtype.integer
-            else:
-                return None
-        except Exception:
-            return None
-
-
-def type_check_sequence(element: object) -> str:
-    dtype_guess = None
-
-    if isinstance(element, List):
-        all_nr = all([get_numeric_type(ele) for ele in element])
-        if all_nr:
-            dtype_guess = dtype.num_array
-        else:
-            dtype_guess = dtype.cat_array
-    else:
-        for sep_char in [',', '\t', '|', ' ']:
-            all_nr = True
-            if '[' in element:
-                ele_arr = element.rstrip(']').lstrip('[').split(sep_char)
-            else:
-                ele_arr = element.rstrip(')').lstrip('(').split(sep_char)
-
-            for ele in ele_arr:
-                if not get_numeric_type(ele):
-                    all_nr = False
-                    break
-
-            if len(ele_arr) > 1 and all_nr:
-                dtype_guess = dtype.num_array
-
-    return dtype_guess
-
-
-def type_check_date(element: object) -> str:
-    try:
-        dt = dateutil.parser.parse(str(element))
-
-        # Not accurate 100% for a single datetime str, but should work in aggregate
-        if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16:
-            return dtype.date
-        else:
-            return dtype.datetime
-
-    except ValueError:
-        return None
-
-
-def count_data_types_in_column(data):
-    dtype_counts = Counter()
-
-    type_checkers = [get_numeric_type,
-                     type_check_sequence,
-                     get_binary_type,
-                     type_check_date]
-
-    for element in data:
-        for type_checker in type_checkers:
-            try:
-                dtype_guess = type_checker(element)
-            except Exception:
-                dtype_guess = None
-            if dtype_guess is not None:
-                dtype_counts[dtype_guess] += 1
-                break
-        else:
-            dtype_counts[dtype.invalid] += 1
-
-    return dtype_counts
-
-
-def get_column_data_type(arg_tup):
-    """
-    Provided the column data, define its data type and data subtype.
-
-    :param data: an iterable containing a sample of the data frame
-    :param full_data: an iterable containing the whole column of a data frame
-
-    :return: type and type distribution, we can later use type_distribution to determine data quality
-    NOTE: type distribution is the count that this column has for belonging cells to each DATA_TYPE
-    """
-    data, full_data, col_name, pct_invalid = arg_tup
-    log.info(f'Infering type for: {col_name}')
-    additional_info = {'other_potential_dtypes': []}
-
-    warn = []
-    info = []
-    if len(data) == 0:
-        warn.append(f'Column {col_name} has no data in it. ')
-        warn.append(f'Please remove {col_name} from the training file or fill in some of the values !')
-        return None, None, additional_info, warn, info
-
-    dtype_counts = count_data_types_in_column(data)
-
-    known_dtype_dist = {k: v for k, v in dtype_counts.items()}
-    if dtype.float in known_dtype_dist and dtype.integer in known_dtype_dist:
-        known_dtype_dist[dtype.float] += known_dtype_dist[dtype.integer]
-        del known_dtype_dist[dtype.integer]
-
-    if dtype.datetime in known_dtype_dist and dtype.date in known_dtype_dist:
-        known_dtype_dist[dtype.datetime] += known_dtype_dist[dtype.date]
-        del known_dtype_dist[dtype.date]
-
-    max_known_dtype, max_known_dtype_count = max(
-        known_dtype_dist.items(),
-        key=lambda kv: kv[1]
-    )
-
-    actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data)
-    if max_known_dtype is None or max_known_dtype == dtype.invalid:
-        curr_dtype = None
-    elif actual_pct_invalid > pct_invalid:
-        if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * pct_invalid:
-            curr_dtype = max_known_dtype
-        else:
-            curr_dtype = None
-    else:
-        curr_dtype = max_known_dtype
-
-    nr_vals = len(full_data)
-    nr_distinct_vals = len(set([str(x) for x in full_data]))
-
-    # Is it a quantity?
-    if curr_dtype not in (dtype.datetime, dtype.date):
-        is_quantity, quantitiy_info = get_quantity_col_info(full_data)
-        if is_quantity:
-            additional_info['quantitiy_info'] = quantitiy_info
-            curr_dtype = dtype.quantity
-            known_dtype_dist = {
-                dtype.quantity: nr_vals
-            }
-
-    # Check for Tags subtype
-    if curr_dtype not in (dtype.quantity, dtype.num_array):
-        lengths = []
-        unique_tokens = set()
-
-        can_be_tags = False
-        if all(isinstance(x, str) for x in data):
-            can_be_tags = True
-            delimiter = ','
-            for item in data:
-                item_tags = [t.strip() for t in item.split(delimiter)]
-                lengths.append(len(item_tags))
-                unique_tokens = unique_tokens.union(set(item_tags))
-
-        # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells # noqa
-        if (can_be_tags and np.mean(lengths) > 1.3 and
-                6 <= len(unique_tokens) <= 30 and
-                len(unique_tokens) / np.mean(lengths) < (len(data) / 4)):
-            curr_dtype = dtype.tags
-
-    # Categorical based on unique values
-    if curr_dtype not in (dtype.date, dtype.datetime, dtype.tags, dtype.cat_array):
-        if curr_dtype in (dtype.integer, dtype.float):
-            is_categorical = nr_distinct_vals < 10
-        else:
-            is_categorical = nr_distinct_vals < min(max((nr_vals / 100), 10), 3000)
-
-        if is_categorical:
-            if curr_dtype is not None:
-                additional_info['other_potential_dtypes'].append(curr_dtype)
-            curr_dtype = dtype.categorical
-
-    # If curr_data_type is still None, then it's text or category
-    if curr_dtype is None:
-        log.info(f'Doing text detection for column: {col_name}')
-        lang_dist = get_language_dist(data)
-
-        # Normalize lang probabilities
-        for lang in lang_dist:
-            lang_dist[lang] /= len(data)
-
-        # If most cells are unknown language then it's categorical
-        if lang_dist['Unknown'] > 0.5:
-            curr_dtype = dtype.categorical
-        else:
-            nr_words, word_dist, nr_words_dist = analyze_sentences(data)
-
-            if 1 in nr_words_dist and nr_words_dist[1] == nr_words:
-                curr_dtype = dtype.categorical
-            else:
-                if len(word_dist) > 500 and nr_words / len(data) > 5:
-                    curr_dtype = dtype.rich_text
-                else:
-                    curr_dtype = dtype.short_text
-
-        return curr_dtype, {curr_dtype: len(data)}, additional_info, warn, info
-
-    if curr_dtype in [dtype.categorical, dtype.rich_text, dtype.short_text, dtype.cat_array]:
-        known_dtype_dist = {curr_dtype: len(data)}
-
-    if nr_distinct_vals < 3 and curr_dtype == dtype.categorical:
-        curr_dtype = dtype.binary
-        known_dtype_dist[dtype.binary] = known_dtype_dist[dtype.categorical]
-        del known_dtype_dist[dtype.categorical]
-
-    log.info(f'Column {col_name} has data type {curr_dtype}')
-    return curr_dtype, known_dtype_dist, additional_info, warn, info
-
-
-def calculate_sample_size(
-    population_size,
-    margin_error=.05,
-    confidence_level=.99,
-    sigma=1 / 2
-):
-    """
-    Calculate the minimal sample size to use to achieve a certain
-    margin of error and confidence level for a sample estimate
-    of the population mean.
-    Inputs
-    -------
-    population_size: integer
-        Total size of the population that the sample is to be drawn from.
-    margin_error: number
-        Maximum expected difference between the true population parameter,
-        such as the mean, and the sample estimate.
-    confidence_level: number in the interval (0, 1)
-        If we were to draw a large number of equal-size samples
-        from the population, the true population parameter
-        should lie within this percentage
-        of the intervals (sample_parameter - e, sample_parameter + e)
-        where e is the margin_error.
-    sigma: number
-        The standard deviation of the population. For the case
-        of estimating a parameter in the interval [0, 1], sigma=1/2
-        should be sufficient.
-    """
-    alpha = 1 - (confidence_level)
-    # dictionary of confidence levels and corresponding z-scores
-    # computed via norm.ppf(1 - (alpha/2)), where norm is
-    # a normal distribution object in scipy.stats.
-    # Here, ppf is the percentile point function.
-    zdict = {
-        .90: 1.645,
-        .91: 1.695,
-        .99: 2.576,
-        .97: 2.17,
-        .94: 1.881,
-        .93: 1.812,
-        .95: 1.96,
-        .98: 2.326,
-        .96: 2.054,
-        .92: 1.751
-    }
-    if confidence_level in zdict:
-        z = zdict[confidence_level]
-    else:
-        # Inf fix
-        if alpha == 0.0:
-            alpha += 0.001
-        z = norm.ppf(1 - (alpha / 2))
-    N = population_size
-    M = margin_error
-    numerator = z**2 * sigma**2 * (N / (N - 1))
-    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
-    return numerator / denom
-
-
-def sample_data(df: pd.DataFrame):
-    population_size = len(df)
-    if population_size <= 50:
-        sample_size = population_size
-    else:
-        sample_size = int(round(calculate_sample_size(population_size, 0.01, 1 - 0.005)))
-
-    population_size = len(df)
-    input_data_sample_indexes = random.sample(range(population_size), sample_size)
-    return df.iloc[input_data_sample_indexes]
-
-
-def infer_types(data: pd.DataFrame, pct_invalid: float, seed_nr: int = 420, mp_cutoff: int = 1e4) -> TypeInformation:
-    seed(seed_nr)
-    type_information = TypeInformation()
-    sample_df = sample_data(data)
-    sample_size = len(sample_df)
-    population_size = len(data)
-    log.info(f'Analyzing a sample of {sample_size}')
-    log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa
-
-    nr_procs = get_nr_procs(data)
-    if data.size > mp_cutoff and nr_procs > 1:
-        log.info(f'Using {nr_procs} processes to deduct types.')
-        pool = mp.Pool(processes=nr_procs)
-        # Make type `object` so that dataframe cells can be python lists
-        answer_arr = pool.map(get_column_data_type, [
-            (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
-        ])
-        pool.close()
-        pool.join()
-    else:
-        answer_arr = []
-        for x in sample_df.columns.values:
-            answer_arr.append(get_column_data_type([sample_df[x].dropna(), data[x], x, pct_invalid]))
-
-    for i, col_name in enumerate(sample_df.columns.values):
-        (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i]
-
-        for msg in warn:
-            log.warning(msg)
-        for msg in info:
-            log.info(msg)
-
-        if data_dtype is None:
-            data_dtype = dtype.invalid
-
-        type_information.dtypes[col_name] = data_dtype
-        type_information.additional_info[col_name] = {
-            'dtype_dist': data_dtype_dist
-        }
-
-    if data.size > mp_cutoff and nr_procs > 1:
-        pool = mp.Pool(processes=nr_procs)
-        answer_arr = pool.map(get_identifier_description_mp, [
-            (data[x], x, type_information.dtypes[x])
-            for x in sample_df.columns.values
-        ])
-        pool.close()
-        pool.join()
-    else:
-        answer_arr = []
-        for x in sample_df.columns.values:
-            answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]])
-            answer_arr.append(answer)
-
-    for i, col_name in enumerate(sample_df.columns.values):
-        # work with the full data
-        if answer_arr[i] is not None:
-            log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"')
-            type_information.identifiers[col_name] = answer_arr[i]
-
-    # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere
-
-    return type_information
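The module deleted above now lives in the standalone `type_infer` package, which this diff adds as a dependency in `requirements.txt` further down. As a quick orientation aid, a minimal sketch of the replacement helpers, assuming the `type_infer` copies keep the semantics of the lightwood implementations being removed here (the import path is the one this diff introduces in `lightwood/data/cleaner.py` above):

    from type_infer.helpers import clean_float, is_nan_numeric

    assert is_nan_numeric('nan')          # castable to float, yet not an actual number
    assert not is_nan_numeric('banana')   # not castable to float at all
    assert clean_float('1,5') == 1.5      # comma decimal separators get normalised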
diff --git a/lightwood/data/splitter.py b/lightwood/data/splitter.py
index eb167998a..a2609292f 100644
--- a/lightwood/data/splitter.py
+++ b/lightwood/data/splitter.py
@@ -3,9 +3,9 @@
 import numpy as np
 import pandas as pd

+from type_infer.dtype import dtype
 from lightwood.helpers.log import log
-from lightwood.api.dtype import dtype
 from lightwood.api.types import TimeseriesSettings
diff --git a/lightwood/data/statistical_analysis.py b/lightwood/data/statistical_analysis.py
index 2fb48aa35..6e494a1b0 100644
--- a/lightwood/data/statistical_analysis.py
+++ b/lightwood/data/statistical_analysis.py
@@ -10,7 +10,7 @@
 from lightwood.helpers.seed import seed
 from lightwood.data.cleaner import cleaner
 from lightwood.helpers.log import log
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 from scipy.stats import entropy
 from lightwood.data.cleaner import _clean_float
diff --git a/lightwood/data/timeseries_analyzer.py b/lightwood/data/timeseries_analyzer.py
index 28d2f7525..0f723522b 100644
--- a/lightwood/data/timeseries_analyzer.py
+++ b/lightwood/data/timeseries_analyzer.py
@@ -9,7 +9,7 @@
 from sktime.transformations.series.detrend import ConditionalDeseasonalizer

 from lightwood.api.types import TimeseriesSettings
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 from lightwood.helpers.ts import get_ts_groups, get_delta, get_group_matches, Differencer
 from lightwood.helpers.log import log
 from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers
diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py
index b6afb673f..251fd1ae6 100644
--- a/lightwood/encoder/numeric/numeric.py
+++ b/lightwood/encoder/numeric/numeric.py
@@ -6,7 +6,7 @@
 from lightwood.encoder.base import BaseEncoder
 from lightwood.helpers.log import log
 from lightwood.helpers.general import is_none
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype


 class NumericEncoder(BaseEncoder):
diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py
index 1fd06fb6c..e7f68186e 100644
--- a/lightwood/encoder/text/short.py
+++ b/lightwood/encoder/text/short.py
@@ -2,7 +2,7 @@
 import torch
 from lightwood.encoder import BaseEncoder
 from lightwood.encoder.categorical import CategoricalAutoEncoder
-from lightwood.helpers.text import tokenize_text
+from type_infer.helpers import tokenize_text
 from lightwood.helpers.torch import concat_vectors_and_pad, average_vectors
 import pandas as pd
diff --git a/lightwood/encoder/time_series/helpers/common.py b/lightwood/encoder/time_series/helpers/common.py
index 87bec3133..038695ad0 100644
--- a/lightwood/encoder/time_series/helpers/common.py
+++ b/lightwood/encoder/time_series/helpers/common.py
@@ -2,7 +2,7 @@
 import pandas as pd

 from lightwood.api.types import TimeseriesSettings
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 from lightwood.encoder.helpers import MinMaxNormalizer, CatNormalizer
 from lightwood.helpers.ts import get_group_matches
diff --git a/lightwood/ensemble/best_of.py b/lightwood/ensemble/best_of.py
index 00d08b393..1b627bf10 100644
--- a/lightwood/ensemble/best_of.py
+++ b/lightwood/ensemble/best_of.py
@@ -4,7 +4,7 @@
 import pandas as pd

 from lightwood.helpers.log import log
-from lightwood.helpers.numeric import is_nan_numeric
+from type_infer.helpers import is_nan_numeric
 from lightwood.mixer.base import BaseMixer
 from lightwood.ensemble.base import BaseEnsemble
 from lightwood.api.types import PredictionArguments, SubmodelData
diff --git a/lightwood/ensemble/mode_ensemble.py b/lightwood/ensemble/mode_ensemble.py
index 4f0d407c7..9f74e91c2 100644
--- a/lightwood/ensemble/mode_ensemble.py
+++ b/lightwood/ensemble/mode_ensemble.py
@@ -9,7 +9,7 @@
 from lightwood.data.encoded_ds import EncodedDs
 from lightwood import dtype
 from lightwood.helpers.general import evaluate_accuracy
-from lightwood.helpers.numeric import is_nan_numeric
+from type_infer.helpers import is_nan_numeric
 from lightwood.helpers.log import log
diff --git a/lightwood/ensemble/ts_stacked_ensemble.py b/lightwood/ensemble/ts_stacked_ensemble.py
index 12ccce5bf..69ed051a0 100644
--- a/lightwood/ensemble/ts_stacked_ensemble.py
+++ b/lightwood/ensemble/ts_stacked_ensemble.py
@@ -11,7 +11,7 @@
 from lightwood.ensemble.stacked_ensemble import StackedEnsemble
 from lightwood.encoder.array.ts_num_array import TsArrayNumericEncoder
 from lightwood.api.types import PredictionArguments
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 from lightwood.data.encoded_ds import EncodedDs
 from lightwood.helpers.log import log
diff --git a/lightwood/ensemble/weighted_mean_ensemble.py b/lightwood/ensemble/weighted_mean_ensemble.py
index c7853492e..70307b23a 100644
--- a/lightwood/ensemble/weighted_mean_ensemble.py
+++ b/lightwood/ensemble/weighted_mean_ensemble.py
@@ -4,7 +4,7 @@
 import pandas as pd

 from lightwood.helpers.log import log
-from lightwood.helpers.numeric import is_nan_numeric
+from type_infer.helpers import is_nan_numeric
 from lightwood.mixer.base import BaseMixer
 from lightwood.ensemble.base import BaseEnsemble
 from lightwood.api.types import PredictionArguments
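Every hunk in this stretch of data, encoder, and ensemble modules is the same mechanical swap, which the remaining files below repeat as well; the whole migration for a typical module is just (old imports shown as comments):

    # from lightwood.api.dtype import dtype                  # removed
    # from lightwood.helpers.numeric import is_nan_numeric   # removed
    from type_infer.dtype import dtype              # noqa: F401
    from type_infer.helpers import is_nan_numeric   # noqa: F401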
diff --git a/lightwood/helpers/__init__.py b/lightwood/helpers/__init__.py
index 153fb18ce..47ba7b9ce 100644
--- a/lightwood/helpers/__init__.py
+++ b/lightwood/helpers/__init__.py
@@ -7,20 +7,18 @@
     add_tn_cat_conf_bounds
 from lightwood.helpers.io import read_from_path_or_url
 from lightwood.helpers.parallelism import get_nr_procs, mut_method_call, run_mut_method
-from lightwood.helpers.numeric import is_nan_numeric, filter_nan_and_none
+from lightwood.helpers.numeric import filter_nan_and_none
 from lightwood.helpers.seed import seed
-from lightwood.helpers.text import tokenize_text, analyze_sentences, decontracted, contains_alnum,\
-    get_identifier_description, get_identifier_description_mp, get_pct_auto_increment, extract_digits, isascii,\
-    hashtext, splitRecursive, cast_string_to_python_type, gen_chars, clean_float, word_tokenize, get_language_dist
+from lightwood.helpers.text import get_pct_auto_increment, extract_digits, isascii,\
+    hashtext, splitRecursive, gen_chars, word_tokenize
 from lightwood.helpers.torch import average_vectors, concat_vectors_and_pad, LightwoodAutocast


 __all__ = ['to_binary', 'f1_score', 'recall_score', 'precision_score', 'r2_score', 'is_cuda_compatible', 'get_devices',
            'get_group_matches', 'get_ts_groups', 'mase', 'is_none', 'evaluate_accuracy', 'evaluate_num_array_accuracy',
-           'evaluate_array_accuracy', 'evaluate_cat_array_accuracy', 'bounded_ts_accuracy',
+           'evaluate_array_accuracy', 'evaluate_cat_array_accuracy', 'bounded_ts_accuracy', 'get_pct_auto_increment',
            'evaluate_multilabel_accuracy', 'evaluate_regression_accuracy', 'read_from_path_or_url', 'get_nr_procs',
-           'mut_method_call', 'run_mut_method', 'tokenize_text', 'analyze_sentences', 'decontracted', 'contains_alnum',
-           'get_identifier_description', 'get_identifier_description_mp', 'get_pct_auto_increment',
+           'mut_method_call', 'run_mut_method', 'extract_digits', 'isascii',
            'get_inferred_timestamps', 'add_tn_num_conf_bounds', 'add_tn_cat_conf_bounds',
-           'hashtext', 'splitRecursive', 'cast_string_to_python_type', 'gen_chars', 'clean_float', 'word_tokenize',
-           'get_language_dist', 'average_vectors', 'concat_vectors_and_pad', 'LightwoodAutocast', 'is_nan_numeric', 'filter_nan_and_none', 'seed'] # noqa
+           'hashtext', 'splitRecursive', 'gen_chars', 'word_tokenize',
+           'average_vectors', 'concat_vectors_and_pad', 'LightwoodAutocast', 'filter_nan_and_none', 'seed'] # noqa
diff --git a/lightwood/helpers/general.py b/lightwood/helpers/general.py
index ff54fdfad..78051ce1c 100644
--- a/lightwood/helpers/general.py
+++ b/lightwood/helpers/general.py
@@ -4,8 +4,8 @@
 import numpy as np
 import pandas as pd
 from sklearn.metrics import r2_score, f1_score, mean_absolute_error, balanced_accuracy_score
+from type_infer.helpers import is_nan_numeric
 from sktime.performance_metrics.forecasting import mean_absolute_percentage_error
-from lightwood.helpers.numeric import is_nan_numeric
 from lightwood.helpers.ts import get_group_matches
diff --git a/lightwood/helpers/numeric.py b/lightwood/helpers/numeric.py
index eac03d643..037ecaa71 100644
--- a/lightwood/helpers/numeric.py
+++ b/lightwood/helpers/numeric.py
@@ -1,24 +1,5 @@
 from typing import Iterable
-
-
-def is_nan_numeric(value: object) -> bool:
-    """
-    Determines if **value** might be `nan` or `inf` or some other numeric value (i.e. which can be cast as `float`) that is not actually a number.
-    """ # noqa
-
-    try:
-        value = str(value)
-        value = float(value)
-    except Exception:
-        return False
-
-    try:
-        if isinstance(value, float):
-            a = int(value) # noqa
-            isnan = False
-    except Exception:
-        isnan = True
-    return isnan
+from type_infer.helpers import is_nan_numeric


 def filter_nan_and_none(series: Iterable) -> list:
diff --git a/lightwood/helpers/templating.py b/lightwood/helpers/templating.py
index 80e30056f..f2987d882 100644
--- a/lightwood/helpers/templating.py
+++ b/lightwood/helpers/templating.py
@@ -2,7 +2,7 @@

 import numpy as np

-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype


 def is_allowed(v):
diff --git a/lightwood/helpers/text.py b/lightwood/helpers/text.py
index d261bb600..a17f2f2c5 100644
--- a/lightwood/helpers/text.py
+++ b/lightwood/helpers/text.py
@@ -8,18 +8,9 @@
  * permission of MindsDB Inc
  *******************************************************
 """
-from collections import Counter, defaultdict
-import string
 import json
-import re
 import hashlib
-from typing import Iterable
-import numpy as np
-import scipy.stats as st
-from langid.langid import LanguageIdentifier
-from langid.langid import model as langid_model
 import nltk
-from lightwood.api.dtype import dtype


 try:
@@ -34,57 +25,6 @@
 nltk.download('stopwords', quiet=True)


-def get_language_dist(data):
-    lang_dist = defaultdict(lambda: 0)
-    lang_dist['Unknown'] = 0
-    lang_probs_cache = dict()
-    identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True)
-    for text in data:
-        text = str(text)
-        text = ''.join([c for c in text if c not in string.punctuation])
-        if text not in lang_probs_cache:
-            try:
-                lang_probs = identifier.classify(text)
-            except Exception:
-                lang_probs = []
-            lang_probs_cache[text] = lang_probs
-
-        lang_probs = lang_probs_cache[text]
-        if len(lang_probs) > 0 and lang_probs[1] > 10 * (1 / len(identifier.nb_classes)):
-            lang_dist[lang_probs[0]] += 1
-        else:
-            lang_dist['Unknown'] += 1
-
-    return dict(lang_dist)
-
-
-def analyze_sentences(data):
-    """
-    :param data: list of str
-
-    :returns:
-    tuple(
-        int: nr words total,
-        dict: word_dist,
-        dict: nr_words_dist
-    )
-    """
-    nr_words = 0
-    word_dist = defaultdict(int)
-    nr_words_dist = defaultdict(int)
-    stop_words = set(stopwords.words('english'))
-    for text in map(str, data):
-        text = text.lower()
-        tokens = tokenize_text(text)
-        tokens_no_stop = [x for x in tokens if x not in stop_words]
-        nr_words_dist[len(tokens)] += 1
-        nr_words += len(tokens)
-        for tok in tokens_no_stop:
-            word_dist[tok] += 1
-
-    return nr_words, dict(word_dist), dict(nr_words_dist)
-
-
 def word_tokenize(string):
     sep_tag = '{#SEP#}'
     for separator in WORD_SEPARATORS:
@@ -95,23 +35,6 @@ def word_tokenize(string):
     return num_words


-def clean_float(val):
-    if isinstance(val, (int, float)):
-        return float(val)
-
-    if isinstance(val, float):
-        return val
-
-    val = str(val).strip(' ')
-    val = val.replace(',', '.')
-    val = val.rstrip('"').lstrip('"')
-
-    if val in ('', '.', 'None', 'nan'):
-        return None
-
-    return float(val)
-
-
 def gen_chars(length, character):
     """
     # lambda to Generates a string consisting of `length` consiting of repeating `character`
@@ -122,24 +45,6 @@ def gen_chars(length, character):
     return ''.join([character for _ in range(length)])


-def cast_string_to_python_type(string):
-    """ Returns None, an integer, float or a string from a string"""
-    if string is None or string == '':
-        return None
-
-    if string.isnumeric():
-        # Did you know you can write fractions in unicode, and they are numeric but can't be cast to integers !?
-        try:
-            return int(string)
-        except Exception:
-            return None
-
-    try:
-        return clean_float(string)
-    except Exception:
-        return string
-
-
 def splitRecursive(word, tokens):
     words = [str(word)]
     for token in tokens:
@@ -156,24 +61,6 @@ def hashtext(cell):
     return hashlib.md5(text.encode('utf8')).hexdigest()


-def _is_foreign_key_name(name):
-    for endings in ['id', 'ID', 'Id']:
-        for add in ['-', '_', ' ']:
-            if name.endswith(add + endings):
-                return True
-    for endings in ['ID', 'Id']:
-        if name.endswith(endings):
-            return True
-    return False
-
-
-def _is_identifier_name(name):
-    for keyword in ['account', 'uuid', 'identifier', 'user']:
-        if keyword in name:
-            return True
-    return False
-
-
 def isascii(string):
     """
     Used instead of str.isascii because python 3.6 doesn't have that
@@ -203,94 +90,3 @@ def get_pct_auto_increment(data):
             prev_nr = nr

     return increase_by_one / (len(data) - 1)
-
-
-def get_identifier_description_mp(arg_tup):
-    data, column_name, data_dtype = arg_tup
-    return get_identifier_description(data, column_name, data_dtype)
-
-
-def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype):
-    data = list(data)
-    if isinstance(data[0], list):
-        nr_unique = len(set(tuple(x) for x in data))
-    else:
-        nr_unique = len(set(data))
-
-    if nr_unique == 1:
-        return 'No Information'
-
-    unique_pct = nr_unique / len(data)
-
-    spaces = [len(str(x).split(' ')) - 1 for x in data]
-    mean_spaces = np.mean(spaces)
-
-    # Detect auto incrementing index
-    # -- some cases where I guess people do want to use this for learning, so ignoring this check for now...
-    # if data_dtype == dtype.integer:
-    #     if get_pct_auto_increment(data) > 0.98 and unique_pct > 0.99:
-    #         return 'Auto-incrementing identifier'
-
-    # Detect hash
-    all_same_length = all(len(str(data[0])) == len(str(x)) for x in data)
-    uuid_charset = set('0123456789abcdefABCDEF-')
-    all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data)
-    is_uuid = all_uuid_charset and all_same_length
-
-    if all_same_length and len(data) == nr_unique and data_dtype not in (dtype.integer, dtype.float):
-        str_data = [str(x) for x in data]
-        randomness_per_index = []
-        for i, _ in enumerate(str_data[0]):
-            N = len(set(x[i] for x in str_data))
-            S = st.entropy([*Counter(x[i] for x in str_data).values()])
-            randomness_per_index.append(S / np.log(N))
-
-        if np.mean(randomness_per_index) > 0.95:
-            return 'Hash-like identifier'
-
-    # Detect foreign key
-    if data_dtype == dtype.integer:
-        if _is_foreign_key_name(column_name):
-            return 'Foreign key'
-
-    if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary):
-        if unique_pct > 0.98:
-            if is_uuid:
-                return 'UUID'
-            else:
-                return 'Unknown identifier'
-
-    # Everything is unique and it's too short to be rich text
-    if data_dtype in (dtype.categorical, dtype.binary, dtype.short_text, dtype.rich_text) and \
-            unique_pct > 0.99999 and mean_spaces < 1:
-        return 'Unknown identifier'
-
-    return None
-
-
-def contains_alnum(text):
-    for c in text:
-        if c.isalnum():
-            return True
-    return False
-
-
-def decontracted(phrase):
-    # specific
-    phrase = re.sub(r"won\'t", "will not", phrase)
-    phrase = re.sub(r"can\'t", "can not", phrase)
-
-    # general
-    phrase = re.sub(r"n\'t", " not", phrase)
-    phrase = re.sub(r"\'re", " are", phrase)
-    phrase = re.sub(r"\'s", " is", phrase)
-    phrase = re.sub(r"\'d", " would", phrase)
-    phrase = re.sub(r"\'ll", " will", phrase)
-    phrase = re.sub(r"\'t", " not", phrase)
-    phrase = re.sub(r"\'ve", " have", phrase)
-    phrase = re.sub(r"\'m", " am", phrase)
-    return phrase
-
-
-def tokenize_text(text):
-    return [t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)]
diff --git a/lightwood/mixer/gluonts.py b/lightwood/mixer/gluonts.py
index 1136e7b2e..233ccb8a1 100644
--- a/lightwood/mixer/gluonts.py
+++ b/lightwood/mixer/gluonts.py
@@ -1,3 +1,4 @@
+import importlib
 from copy import deepcopy
 from typing import Dict, Union

@@ -10,6 +11,7 @@
 from gluonts.model.deepar import DeepAREstimator  # @TODO: support for other estimators
 from gluonts.mx import Trainer
 from gluonts.mx.trainer.callback import TrainingHistory
+from gluonts.mx.distribution.student_t import StudentTOutput

 from lightwood.helpers.log import log
 from lightwood.mixer.base import BaseMixer
@@ -35,6 +37,7 @@ def __init__(
             ts_analysis: Dict,
             n_epochs: int = 10,
             early_stop_patience: int = 3,
+            distribution_output: str = '',
             seed: int = 0,
     ):
         """
@@ -66,6 +69,13 @@
         self.patience = early_stop_patience
         self.seed = seed

+        dist_module = importlib.import_module('.'.join(['gluonts.mx.distribution',
+                                                        *distribution_output.split(".")[:-1]]))
+        try:
+            self.distribution = getattr(dist_module, distribution_output.split(".")[-1])()
+        except AttributeError:
+            self.distribution = StudentTOutput()  # use StudentTOutput when the provided distribution does not exist
+
     def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         """
         Fits the model.
         """ # noqa
         log.info('Started fitting GluonTS forecasting model')
@@ -77,6 +87,7 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
         estimator = DeepAREstimator(
             freq=train_ds.freq,
             prediction_length=self.horizon,
+            distr_output=self.distribution,
             trainer=Trainer(epochs=self.n_epochs, callbacks=[EarlyStop(patience=self.patience)])
         )
         self.model = estimator.train(train_ds)
diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py
index 3a821cd19..7a3dc2ea2 100644
--- a/lightwood/mixer/random_forest.py
+++ b/lightwood/mixer/random_forest.py
@@ -6,8 +6,9 @@
 import optuna
 from typing import Dict, Union
 from optuna import trial as trial_module
-from sklearn.model_selection import check_cv, cross_val_predict
+from sklearn import clone
 from sklearn.metrics import mean_squared_error
+from sklearn.model_selection import check_cv, cross_val_predict
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

 from lightwood.api import dtype
@@ -96,18 +97,20 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None:
             train_data = ConcatedEncodedDs([train_data, dev_data])

         # initialize the model
+        init_params = {
+            'n_estimators': 50,
+            'max_depth': 5,
+            'max_features': 1.,
+            'bootstrap': True,
+            'n_jobs': -1,
+            'random_state': 0
+        }
+
         if self.is_classifier:
             X = train_data.get_encoded_data(include_target=False)
             Y = train_data.get_column_original_data(self.target)

-            self.model = RandomForestClassifier(
-                n_estimators=50,
-                max_depth=5,
-                max_features=1.,
-                bootstrap=True,
-                n_jobs=-1,
-                random_state=0
-            )
+            self.model = RandomForestClassifier(**init_params)

             self.model.fit(X, Y)  # sample_weight
@@ -116,14 +119,7 @@
             X = train_data.get_encoded_data(include_target=False)
             Y = train_data.get_encoded_column_data(self.target)

-            self.model = RandomForestRegressor(
-                n_estimators=50,
-                max_depth=5,
-                max_features=1.,
-                bootstrap=True,
-                n_jobs=-1,
-                random_state=0
-            )
+            self.model = RandomForestRegressor(**init_params)

             self.model.fit(X, Y)  # sample_weight
@@ -136,7 +132,7 @@ def objective(trial: trial_module.Trial):
                 ["gini", "entropy"]) if self.is_classifier else 'squared_error'

             params = {
-                'n_estimators': trial.suggest_int('num_estimators', 2, 512),
+                'n_estimators': trial.suggest_int('n_estimators', 2, 512),
                 'max_depth': trial.suggest_int('max_depth', 2, 15),
                 'min_samples_split': trial.suggest_int("min_samples_split", 2, 20),
                 'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 20),
@@ -153,14 +149,27 @@
             return score

         elapsed = time.time() - started
-        num_trials = max(min(int(self.stop_after / elapsed) - 1, self.num_trials), 0)
+        num_trials = max(min(int(self.stop_after / elapsed) - 2, self.num_trials), 0)
         if self.use_optuna:
             log.info(f'The number of trials (Optuna) is {num_trials}.')

         if self.use_optuna and num_trials > 0:
+            init_score = metric(Y, getattr(self.model, predict_method)(X))
+
             study = optuna.create_study(direction='minimize')
             study.optimize(objective, n_trials=num_trials)
-            log.info(f'RandomForest parameters of the best trial: {study.best_params}')
+
+            opt_model = clone(self.model)
+            opt_model.set_params(**study.best_params)
+            opt_model.fit(X, Y)
+            optuna_score = metric(Y, getattr(opt_model, predict_method)(X))
+            log.info(f'init_score: {init_score}, optuna_score: {optuna_score}')
+
+            if init_score <= optuna_score:
+                self.model.set_params(**init_params)
+            else:
+                self.model = opt_model
+                log.info(f'RandomForest parameters of the best trial: {study.best_params}')

         # evaluate model effects
         if self.fit_on_dev:
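Two behavioural notes on the mixer changes above, with minimal sketches rather than definitive implementations.

First, renaming the Optuna key from 'num_estimators' to 'n_estimators' is what lets `study.best_params` be fed straight into sklearn's `set_params` on a cloned forest, and the tuned clone only replaces the initial model when it actually scores better. Schematically (simplified to a regressor scored with MSE; `model`, `X`, `Y` and `best_params` are assumed in scope, as in `fit()` above):

    from sklearn import clone
    from sklearn.metrics import mean_squared_error

    init_score = mean_squared_error(Y, model.predict(X))
    candidate = clone(model).set_params(**best_params)  # keys must match sklearn argument names
    candidate.fit(X, Y)
    if mean_squared_error(Y, candidate.predict(X)) < init_score:
        model = candidate  # keep the tuned forest only if it beats the initial one

Second, the GluonTS mixer's new `distribution_output` string is resolved as a dotted path under `gluonts.mx.distribution`, with `StudentTOutput` as the fallback. The constructor logic, factored out as a standalone sketch (the 'gaussian.GaussianOutput' path in the comment is an assumption about gluonts' module layout, not something this diff pins down):

    import importlib
    from gluonts.mx.distribution.student_t import StudentTOutput

    def resolve_distribution(distribution_output: str):
        # e.g. 'gaussian.GaussianOutput' -> import gluonts.mx.distribution.gaussian
        # and instantiate GaussianOutput(); unresolvable names fall back to StudentTOutput().
        module = importlib.import_module(
            '.'.join(['gluonts.mx.distribution', *distribution_output.split('.')[:-1]]))
        try:
            return getattr(module, distribution_output.split('.')[-1])()
        except AttributeError:
            return StudentTOutput()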
diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py
index 6eda7420b..021747d9c 100644
--- a/lightwood/mixer/regression.py
+++ b/lightwood/mixer/regression.py
@@ -4,7 +4,7 @@
 from sklearn.linear_model import Ridge

 from lightwood.helpers.log import log
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 from lightwood.mixer import BaseMixer
 from lightwood.encoder.base import BaseEncoder
 from lightwood.api.types import PredictionArguments
diff --git a/requirements.txt b/requirements.txt
index 6255315b9..3ad43b0d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
+type_infer>=0.0.6
 numpy >=1.18.0,<=1.22.0
-NLTK >=3,<3.6
+nltk >=3,<3.6
 python-dateutil <2.8.1,>=2.1
 pandas >=1.1.5,<=1.3.3
 schema >=0.6.8
diff --git a/tests/integration/advanced/test_timeseries.py b/tests/integration/advanced/test_timeseries.py
index fc1034eb5..c52164890 100644
--- a/tests/integration/advanced/test_timeseries.py
+++ b/tests/integration/advanced/test_timeseries.py
@@ -463,31 +463,31 @@ def test_8_time_series_double_grouped_regression(self):
         target = 'MA'
         order_by = 'saledate'
         window = 8
-        horizon = 4
-        train, _, test = stratify(data, pct_train=0.8, pct_dev=0, pct_test=0.2, stratify_on=gby, seed=1,
-                                  reshuffle=False)
-        jai = json_ai_from_problem(train,
-                                   ProblemDefinition.from_dict({'target': target,
-                                                                'time_aim': 30,
-                                                                'timeseries_settings': {
-                                                                    'group_by': gby,
-                                                                    'horizon': horizon,
-                                                                    'order_by': order_by,
-                                                                    'window': window
-                                                                }}))
-        code = code_from_json_ai(jai)
-        pred = predictor_from_code(code)
+        for horizon in [1, 4]:
+            train, _, test = stratify(data, pct_train=0.8, pct_dev=0, pct_test=0.2, stratify_on=gby, seed=1,
+                                      reshuffle=False)
+            jai = json_ai_from_problem(train,
+                                       ProblemDefinition.from_dict({'target': target,
+                                                                    'time_aim': 30,  # short time aim
+                                                                    'timeseries_settings': {
+                                                                        'group_by': gby,
+                                                                        'horizon': horizon,
+                                                                        'order_by': order_by,
+                                                                        'window': window
+                                                                    }}))
+            code = code_from_json_ai(jai)
+            pred = predictor_from_code(code)

-        # Test with a short time aim with inferring mode, check timestamps are further into the future than test dates
-        test['__mdb_forecast_offset'] = 1
-        train_and_check_time_aim(pred, train, ignore_time_aim=True)
-        preds = pred.predict(test)
-        self.check_ts_prediction_df(preds, horizon, [order_by])
+            # Test with inferring mode, check timestamps are further into the future than test dates
+            test['__mdb_forecast_offset'] = 1
+            train_and_check_time_aim(pred, train, ignore_time_aim=True)
+            preds = pred.predict(test)
+            self.check_ts_prediction_df(preds, horizon, [order_by])

-        for idx, row in preds.iterrows():
-            row[f'order_{order_by}'] = [row[f'order_{order_by}']] if horizon == 1 else row[f'order_{order_by}']
-            for timestamp in row[f'order_{order_by}']:
-                assert timestamp > pd.to_datetime(test[order_by]).max().timestamp()
+            for idx, row in preds.iterrows():
+                row[f'order_{order_by}'] = [row[f'order_{order_by}']] if horizon == 1 else row[f'order_{order_by}']
+                for timestamp in row[f'order_{order_by}']:
+                    assert timestamp > pd.to_datetime(test[order_by]).max().timestamp()

     def test_9_ts_dedupe(self):
         """ Test time series de-duplication procedures """
diff --git a/tests/integration/basic/test_regression.py b/tests/integration/basic/test_regression.py
index 8e66799c7..5784c2c0c 100644
--- a/tests/integration/basic/test_regression.py
+++ b/tests/integration/basic/test_regression.py
@@ -1,4 +1,4 @@
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 import unittest
 import pandas as pd
 from sklearn.metrics import r2_score
diff --git a/tests/unit_tests/data/test_infer_types.py b/tests/unit_tests/data/test_infer_types.py
deleted file mode 100644
index fb5969a91..000000000
--- a/tests/unit_tests/data/test_infer_types.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import unittest
-
-from lightwood.api.dtype import dtype
-from lightwood.data.infer_types import type_check_date
-
-
-class TestTypeInference(unittest.TestCase):
-    def test_0_type_check_dates(self):
-        self.assertEqual(type_check_date('31/12/2010'), dtype.date)
diff --git a/tests/unit_tests/encoder/text/test_pretrained.py b/tests/unit_tests/encoder/text/test_pretrained.py
index 3f9484771..e6cf1f1cb 100644
--- a/tests/unit_tests/encoder/text/test_pretrained.py
+++ b/tests/unit_tests/encoder/text/test_pretrained.py
@@ -6,7 +6,7 @@
 from sklearn.metrics import accuracy_score
 from lightwood.encoder import BinaryEncoder, NumericEncoder
 from lightwood.encoder.text import PretrainedLangEncoder
-from lightwood.api.dtype import dtype
+from type_infer.dtype import dtype
 import pandas as pd
 import os
 import pathlib
diff --git a/tests/unit_tests/encoder/text/test_short.py b/tests/unit_tests/encoder/text/test_short.py
index 9a261513f..1cb669118 100644
--- a/tests/unit_tests/encoder/text/test_short.py
+++ b/tests/unit_tests/encoder/text/test_short.py
@@ -1,7 +1,7 @@
 import random
 import unittest
 from lightwood.encoder.text.short import ShortTextEncoder
-from lightwood.helpers.text import tokenize_text
+from type_infer.helpers import tokenize_text
 import torch

 VOCAB = [
@@ -76,13 +76,6 @@ def generate_sentences(min_, max_, vocab_size):

 class TestShortTextEncoder(unittest.TestCase):
-    def test_get_tokens(self):
-        sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world']
-        for sent in sentences:
-            assert tokenize_text(sent) == ['hello', 'world']
-
-        assert tokenize_text("don't wouldn't") == ['do', 'not', 'would', 'not']
-
     def test_smallvocab_target_auto_mode(self):
         priming_data = generate_sentences(2, 6, vocab_size=99)
         test_data = random.sample(priming_data, len(priming_data) // 5)
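The `test_get_tokens` case deleted just above still documents the tokenizer's contract. Assuming `type_infer`'s `tokenize_text` matches the implementation removed from `lightwood/helpers/text.py` earlier in this diff (lower-casing, contraction expansion via `decontracted`, dropping non-alphanumeric tokens), the same assertions should hold against the new import:

    from type_infer.helpers import tokenize_text

    assert tokenize_text('hello, world!') == ['hello', 'world']
    assert tokenize_text("don't wouldn't") == ['do', 'not', 'would', 'not']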