diff --git a/rules/S6973/metadata.json b/rules/S6973/metadata.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/rules/S6973/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S6973/python/metadata.json b/rules/S6973/python/metadata.json new file mode 100644 index 00000000000..2dd11141c81 --- /dev/null +++ b/rules/S6973/python/metadata.json @@ -0,0 +1,24 @@ +{ + "title": "Important hyperparameters should be specified for Scikit-Learn estimators", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-6973", + "sqKey": "S6973", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "unknown", + "code": { + "impacts": { + "MAINTAINABILITY": "HIGH", + "RELIABILITY": "LOW" + }, + "attribute": "CONVENTIONAL" + } +} diff --git a/rules/S6973/python/rule.adoc b/rules/S6973/python/rule.adoc new file mode 100644 index 00000000000..92109deff49 --- /dev/null +++ b/rules/S6973/python/rule.adoc @@ -0,0 +1,139 @@ +This rule raises an issue when a Scikit-learn model is instantiated without specifying the important hyperparameters. + + + +== Why is this an issue? + +When instantiating a Scikit-learn estimator, it will use default values for the hyperparameters that are not specified. +Relying on the default values can lead to non-reproducible results across diffferent versions of the library. + +Furthermore, the default values might not be the best choice for the specific problem at hand and can lead to suboptimal performance. + +Here are the estimators and the parameters considered by this rule : +[cols="1,1"] +|=== +|*Estimator* +|*Hyperparameters* + +|AdaBoostClassifier +|learning_rate +|AdaBoostRegressor +|learning_rate +|GradientBoostingClassifier +|learning_rate +|GradientBoostingRegressor +|learning_rate +|HistGradientBoostingClassifier +|learning_rate +|HistGradientBoostingRegressor +|learning_rate +|RandomForestClassifier +|min_samples_leaf, max_features +|RandomForestRegressor +|min_samples_leaf, max_features +|ElasticNet +|alpha, l1_ratio +|NearestNeighbors +|n_neighbors +|KNeighborsClassifier +|n_neighbors +|KNeighborsRegressor +|n_neighbors +|NuSVC +|nu, kernel, gamma +|NuSVR +|C, kernel, gamma +|SVC +|C, kernel, gamma +|SVR +|C, kernel, gamma +|DecisionTreeClassifier +|ccp_alpha +|DecisionTreeRegressor +|ccp_alpha + +|MLPClassifier +|hidden_layer_sizes +|MLPRegressor +|hidden_layer_sizes + +|PolynomialFeatures +|degree, interaction_only +|=== + +== How to fix it +Specify the hyperparameters when instantiating the estimator. + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +from sklearn.neighbors import KNeighborsClassifier + +clf = KNeighborsClassifier() # Noncompliant : n_neighbors is not specified, different values can change the behaviour of the predictor significantly +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +from sklearn.neighbors import KNeighborsClassifier + +clf = KNeighborsClassifier( # Compliant + n_neighbors=5 +) +---- + +ifdef::env-github,rspecator-view[] + +(visible only on this page) + +== Implementation specification + +Implementation will be quite tricky if we want to avoid false positives. + +Abort if : + +- In a Pipeline/make_pipeline used for hyperparameter search + +https://github.com/SERG-Delft/dslinter/blob/main/dslinter/checkers/hyperparameters_scikitlearn.py#L48-L70[List of DSLinter estimators] + +Possible baby step : only check for some estimators ( for exemple the meta-learners) + +Ignore parameters : + +- n_jobs + +- that ends in `param` ? + +=== Message + +Specify all hyperparameters when instantiating a Scikit-learn estimator. + +=== Issue location + +Primary : name of the estimator + +No secondary location +=== Quickfix + +There is a possible quickfix : add all the missing parameters at their default values + +endif::env-github,rspecator-view[] + +== Resources + +=== Articles & blog posts + +* Probst, P., Boulesteix, A. L., & Bischl, B. (2019). Tunability: Importance of + Hyperparameters of Machine Learning Algorithms. Journal of Machine Learning Research, + 20(53), 1-32. +* van Rijn, J. N., & Hutter, F. (2018, July). Hyperparameter importance across datasets. + In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & + Data Mining (pp. 2367-2376). + +=== External coding guidelines +* Code Smells for Machine Learning Applications - https://hynn01.github.io/ml-smells/posts/codesmells/11-hyperparameter-not-explicitly-set/[Hyperparameter not Explicitly Set] +