ML2SQL compiles trained machine learning models (DecisionTree, RandomForest, AdaBoost, XGBoost, LGBM) into pure SQL inference queries, allowing model predictions to run directly inside the database without deploying a separate inference service.
Core workflow:
Train model → Export model structure → Generate SQL → Run inference in database
| Feature | DecisionTree | RandomForest | AdaBoost | XGBoost | LGBM (official) |
|---|---|---|---|---|---|
| Categorical feature support | ✅ | ✅ | ✅ | ✅ | ✅ |
| Binary classification | ✅ | ✅ | ✅ | ✅ | ✅ |
| Multi-class classification | ✅ | ✅ | ✅ | ❌ | ✅ |
| Sample weight support | ✅ | ✅ | |||
| Early stopping support | ✅ | ✅ | ✅ | ✅ | |
| Feature subsampling | ✅ | ✅ | ✅ | ||
| Row subsampling | ✅ | ✅ | ✅ | ||
| Pruning support | ✅ | ✅ | ✅ | ||
| Export to SQL inference | ✅ | ✅ | ✅ | ✅ | ✅ |
| Export to JSON | ✅ | ✅ | ✅ | ✅ | ❌ |
| Visualization (dot) | ✅ | ✅ | ✅ | ✅ | ✅ |
pip install -e .Train with the official LightGBM, then use LightGBMManualPredictor to compile the model into SQL.
from Classifiers.BoostClassifier import LightGBMManualPredictor from utils import generate_synthetic_data import lightgbm as lgb from sklearn.metrics import accuracy_score if __name__ == "__main__": df = generate_synthetic_data(n=200000) test_df = generate_synthetic_data(n=100, seed=43) X = df.drop(columns=["play"]) y = (df["play"] == "Yes").astype(int) X_test = test_df.drop(columns=["play"]) y_test = (test_df["play"] == "Yes").astype(int) cat_cols = ["humidity", "wind", "outlook"] for col in cat_cols: X[col] = X[col].astype("category").cat.set_categories(sorted(X[col].unique())) X_test[col] = X_test[col].astype("category").cat.set_categories(sorted(X[col].unique())) feature_cat_mapping = { col: {i: cat for i, cat in enumerate(X[col].cat.categories)} for col in cat_cols } lgbm = lgb.LGBMClassifier(max_depth=3, n_estimators=3, learning_rate=0.3) lgbm.fit(X, y, categorical_feature=cat_cols) print(f"LightGBM Accuracy: {accuracy_score(y_test, lgbm.predict(X_test)):.4f}") lgbm.booster_.save_model("lgb_model.txt") model = lgb.Booster(model_file="lgb_model.txt") feature_names = X.columns.tolist() predictor = LightGBMManualPredictor(model.dump_model(), feature_names, feature_cat_mapping) sql = predictor.generate_sql(threshold=0.5) print(sql)
Example SQL output (click to expand)
WITH predictions AS ( SELECT id, CASE WHEN outlook = 'Overcast' THEN CASE WHEN cloud_cover <= 60.5 THEN 0.2905411340700754 ELSE CASE WHEN wind = 'Weak' THEN -0.3534347768742837 ELSE 0.2905411340700754 END END ELSE CASE WHEN temperature <= 32.5 THEN -0.9885879150965038 ELSE CASE WHEN wind = 'Weak' THEN -0.9885879150965038 ELSE 0.2905411340700754 END END END AS tree_1_score, -- tree_2_score, tree_3_score ... FROM input_data ), logits AS ( SELECT id, 0.0 + (tree_1_score + tree_2_score + tree_3_score) AS logit FROM predictions ) SELECT id, 1.0 / (1.0 + EXP(-logit)) AS probability, CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction FROM logits;
A custom-implemented Linear Boosting classifier. The generated SQL is a linear formula with high readability.
from Classifiers.BoostClassifier import LinearBoostingClassifier from utils import generate_linear_sql, generate_synthetic_data from sklearn.metrics import accuracy_score if __name__ == "__main__": data = generate_synthetic_data(n=200000, seed=42) X = data.drop(columns=["play"]) y = (data["play"] == "Yes").astype(int) test_df = generate_synthetic_data(n=100, seed=43) X_test = test_df.drop(columns=["play"]) y_test = (test_df["play"] == "Yes").astype(int) clf = LinearBoostingClassifier( n_estimators=200, learning_rate=0.3, early_stopping_rounds=20, colsample_bytree=1, subsample=0.8, ) clf.fit(X, y, (X_test, y_test)) print("Accuracy:", accuracy_score(y_test, clf.predict(X_test))) clf.save("linear_boost_model.json") model = LinearBoostingClassifier.load("linear_boost_model.json") # Generate full SQL (each tree expanded independently) sql = generate_linear_sql(model) print(sql) # Generate merged SQL (sum all tree coefficients, more concise) sql = generate_merged_linear_sql(model) print(sql) # Feature importance features = model.compute_feature_contributions(X_test, plot=True, normalize=True) # Example output: # temperature 0.492538 # outlook_Sunny 0.162274 # wind_Weak 0.096765 # Per-sample feature contributions per_sample = model.per_sample_contributions(X_test) # per_sample['raw'], per_sample['normalized'], per_sample['signed']
Example SQL output — Full version (click to expand)
WITH linear_score AS ( SELECT id, -0.508096 + 0.3 * (0.315567 + 0.090054 * temperature + -0.005837 * cloud_cover + 0.321502 * humidity_Low + -0.979199 * wind_Weak + -3.287133 * outlook_Rainy + -3.289431 * outlook_Sunny) + 0.3 * (-0.166722 + 0.077027 * temperature + -0.004919 * cloud_cover + 0.271886 * humidity_Low + -0.833306 * wind_Weak + -2.371971 * outlook_Rainy + -2.369790 * outlook_Sunny) + -- more trees... AS raw_score FROM input_data ) SELECT *, 1 / (1 + EXP(-raw_score)) AS prob, CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction FROM linear_score;
Example SQL output — Merged version (click to expand)
WITH linear_score AS ( SELECT id, -1.499403 + -0.011357 * cloud_cover + 0.619758 * humidity_Low + -4.512565 * outlook_Rainy + -4.519462 * outlook_Sunny + 0.177894 * temperature + -1.979982 * wind_Weak AS raw_score FROM input_data ) SELECT *, 1 / (1 + EXP(-raw_score)) AS prob, CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction FROM linear_score;
A custom-implemented XGBoost classifier.
from Classifiers.BoostClassifier import XGBoostClassifier from utils import generate_synthetic_data, generate_xgboost_sql if __name__ == "__main__": data = generate_synthetic_data(n=100, seed=42) X = data.drop(columns=["play"]) y = (data["play"] == "Yes").astype(int) test_df = generate_synthetic_data(n=100, seed=43) X_test = test_df.drop(columns=["play"]) y_test = (test_df["play"] == "Yes").astype(int) feature_types = { "temperature": "numerical", "humidity": "categorical", "wind": "categorical", "outlook": "categorical", "cloud_cover": "numerical" } clf = XGBoostClassifier( n_estimators=15, max_depth=3, learning_rate=0.3, reg_lambda=1.0, feature_types=feature_types, colsample_bytree=1.0, subsample=1.0, ) clf.fit(X, y, (X_test, y_test)) from sklearn.metrics import accuracy_score print("Accuracy:", accuracy_score(y_test, clf.predict(X_test))) clf.save("xgboost_model.json") model = XGBoostClassifier.load("xgboost_model.json") sql = generate_xgboost_sql(model) print(sql)
Example SQL output (click to expand)
WITH predictions AS ( SELECT id, CASE WHEN outlook = 'Overcast' THEN CASE WHEN cloud_cover <= 70 THEN 2.412220 ELSE CASE WHEN wind = 'Strong' THEN 1.454234 ELSE -0.204320 END END ELSE CASE WHEN temperature <= 32 THEN -1.353249 ELSE CASE WHEN wind = 'Strong' THEN 1.628352 ELSE -0.934034 END END END AS tree_1_score, -- tree_2_score ... tree_N_score FROM input_data ), logits AS ( SELECT id, -0.7537718 + 0.3 * (tree_1_score + tree_2_score + ... + tree_N_score) AS logit FROM predictions ) SELECT id, 1.0 / (1.0 + EXP(-logit)) AS probability, CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction FROM logits;
A custom-implemented AdaBoost classifier that generates SQL using weighted voting.
from Classifiers.BoostClassifier import AdaBoostClassifier from Classifiers.TreeClassifier import DecisionTreeClassifier from utils import generate_adaboost_sql, generate_synthetic_data from sklearn.metrics import accuracy_score if __name__ == "__main__": data = generate_synthetic_data(n=100, seed=42) feature_types = { "temperature": "numerical", "humidity": "categorical", "wind": "categorical", "outlook": "categorical", "cloud_cover": "numerical" } boost = AdaBoostClassifier( base_estimator_class=DecisionTreeClassifier, feature_types=feature_types, n_estimators=10, max_depth=3 ) boost.fit(data, target_key="play") boost.save("adaboost.json") model = AdaBoostClassifier.load("adaboost.json", DecisionTreeClassifier) test_df = generate_synthetic_data(n=100, seed=43) X_test = test_df.drop(columns=["play"]) y_test = test_df["play"] print("Accuracy:", model.score(X_test, y_test, metric="accuracy")[0]) print("Recall:", model.score(X_test, y_test, metric="recall", average="binary")[0]) print("F1 score:", model.score(X_test, y_test, metric="f1", average="binary")[0]) sql = generate_adaboost_sql(model) print(sql)
Example SQL output (click to expand)
WITH predictions AS ( SELECT id, CASE WHEN outlook = 'Rainy' THEN CASE WHEN temperature <= 33.5 THEN 'No' ELSE CASE WHEN wind = 'Strong' THEN 'Yes' ELSE 'No' END END WHEN outlook = 'Overcast' THEN CASE WHEN cloud_cover <= 59.0 THEN 'Yes' ELSE CASE WHEN wind = 'Strong' THEN 'Yes' ELSE 'No' END END ELSE 'No' END AS tree_1_pred, -- tree_2_pred ... tree_N_pred FROM input_data ), weighted_scores AS ( SELECT id, (CASE WHEN tree_1_pred = 'No' THEN 3.8918 ELSE 0 END) + ... AS score_No, (CASE WHEN tree_1_pred = 'Yes' THEN 3.8918 ELSE 0 END) + ... AS score_Yes FROM predictions ) SELECT id, CASE WHEN score_Yes >= score_No THEN 'Yes' ELSE 'No' END AS prediction FROM weighted_scores;
A custom-implemented Random Forest classifier with majority-vote SQL support.
from Classifiers.ForestClassifier import RandomForestClassifier from utils import generate_forest_sql, generate_synthetic_data from sklearn.metrics import accuracy_score import matplotlib.pyplot as plt if __name__ == "__main__": data = generate_synthetic_data(n=100, seed=42) feature_types = { "temperature": "numerical", "humidity": "categorical", "wind": "categorical", "outlook": "categorical", "cloud_cover": "numerical" } rf = RandomForestClassifier( feature_types=feature_types, n_estimators=5, max_depth=5, bootstrap=True, oob_score=True, max_features=None, criterion="gini", min_samples_split=2, min_samples_leaf=1, tie_break_random=True, random_state=42 ) rf.fit(data, target_key="play") rf.save("random_forest_model.json") rf_loaded = RandomForestClassifier.load("random_forest_model.json") test_df = generate_synthetic_data(n=100, seed=43) X_test = test_df.drop(columns=["play"]) y_test = test_df["play"] print("Accuracy:", rf.score(X_test, y_test, metric="accuracy")[0]) print("Recall:", rf.score(X_test, y_test, metric="recall", average="binary")[0]) print("F1 score:", rf.score(X_test, y_test, metric="f1", average="binary")[0]) print("OOB Score:", rf.oob_score_) # requires bootstrap=True, oob_score=True sql = generate_forest_sql(rf, input_table="input_data", id_col="id") print(sql) # Feature importance importances = rf.feature_importances_() items = sorted(importances.items(), key=lambda x: x[1], reverse=True) features, scores = zip(*items) plt.barh(features, scores) plt.xlabel("Importance") plt.title("Random Forest Feature Importances") plt.gca().invert_yaxis() plt.tight_layout() plt.show()
Example SQL output (click to expand)
WITH predictions AS ( SELECT id, CASE WHEN outlook = 'Rainy' THEN CASE WHEN temperature <= 34.0 THEN 'No' ELSE CASE WHEN cloud_cover <= 49.0 THEN 'Yes' ELSE 'No' END END WHEN outlook = 'Overcast' THEN CASE WHEN wind = 'Strong' THEN 'Yes' ELSE CASE WHEN cloud_cover <= 52.0 THEN 'Yes' ELSE CASE WHEN humidity = 'High' THEN 'No' ELSE 'Yes' END END END WHEN outlook = 'Sunny' THEN 'No' ELSE 'No' END AS tree_1_pred, -- tree_2_pred ... tree_N_pred FROM input_data ) SELECT id, CASE WHEN (tree_1_pred = 'Yes')::int + ... >= 3 THEN 'Yes' ELSE 'No' END AS prediction FROM predictions;