Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

LLMSystems/ML2SQL

Folders and files

NameName
Last commit message
Last commit date

Latest commit

History

32 Commits

Repository files navigation

Compile tree-based machine learning models into SQL inference queries, enabling model predictions to run directly inside the database.

English | 中文


1. Introduction

ML2SQL compiles trained machine learning models (DecisionTree, RandomForest, AdaBoost, XGBoost, LGBM) into pure SQL inference queries, allowing model predictions to run directly inside the database without deploying a separate inference service.

Core workflow:

Train model → Export model structure → Generate SQL → Run inference in database

2. Supported Models & Features

Feature DecisionTree RandomForest AdaBoost XGBoost LGBM (official)
Categorical feature support
Binary classification
Multi-class classification
Sample weight support
Early stopping support
Feature subsampling
Row subsampling
Pruning support
Export to SQL inference
Export to JSON
Visualization (dot)

3. Installation

pip install -e .

4. Usage

4.1 LGBMClassifier

Train with the official LightGBM, then use LightGBMManualPredictor to compile the model into SQL.

from Classifiers.BoostClassifier import LightGBMManualPredictor
from utils import generate_synthetic_data
import lightgbm as lgb
from sklearn.metrics import accuracy_score
if __name__ == "__main__":
 df = generate_synthetic_data(n=200000)
 test_df = generate_synthetic_data(n=100, seed=43)
 X = df.drop(columns=["play"])
 y = (df["play"] == "Yes").astype(int)
 X_test = test_df.drop(columns=["play"])
 y_test = (test_df["play"] == "Yes").astype(int)
 cat_cols = ["humidity", "wind", "outlook"]
 for col in cat_cols:
 X[col] = X[col].astype("category").cat.set_categories(sorted(X[col].unique()))
 X_test[col] = X_test[col].astype("category").cat.set_categories(sorted(X[col].unique()))
 feature_cat_mapping = {
 col: {i: cat for i, cat in enumerate(X[col].cat.categories)}
 for col in cat_cols
 }
 lgbm = lgb.LGBMClassifier(max_depth=3, n_estimators=3, learning_rate=0.3)
 lgbm.fit(X, y, categorical_feature=cat_cols)
 print(f"LightGBM Accuracy: {accuracy_score(y_test, lgbm.predict(X_test)):.4f}")
 lgbm.booster_.save_model("lgb_model.txt")
 model = lgb.Booster(model_file="lgb_model.txt")
 feature_names = X.columns.tolist()
 predictor = LightGBMManualPredictor(model.dump_model(), feature_names, feature_cat_mapping)
 sql = predictor.generate_sql(threshold=0.5)
 print(sql)
Example SQL output (click to expand)
WITH predictions AS (
 SELECT
 id,
 CASE
 WHEN outlook = 'Overcast' THEN
 CASE
 WHEN cloud_cover <= 60.5 THEN 0.2905411340700754
 ELSE
 CASE
 WHEN wind = 'Weak' THEN -0.3534347768742837
 ELSE 0.2905411340700754
 END
 END
 ELSE
 CASE
 WHEN temperature <= 32.5 THEN -0.9885879150965038
 ELSE
 CASE
 WHEN wind = 'Weak' THEN -0.9885879150965038
 ELSE 0.2905411340700754
 END
 END
 END AS tree_1_score,
 -- tree_2_score, tree_3_score ...
 FROM input_data
),
logits AS (
 SELECT
 id,
 0.0 + (tree_1_score + tree_2_score + tree_3_score) AS logit
 FROM predictions
)
SELECT
 id,
 1.0 / (1.0 + EXP(-logit)) AS probability,
 CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM logits;

4.2 LinearBoostingClassifier

A custom-implemented Linear Boosting classifier. The generated SQL is a linear formula with high readability.

from Classifiers.BoostClassifier import LinearBoostingClassifier
from utils import generate_linear_sql, generate_synthetic_data
from sklearn.metrics import accuracy_score
if __name__ == "__main__":
 data = generate_synthetic_data(n=200000, seed=42)
 X = data.drop(columns=["play"])
 y = (data["play"] == "Yes").astype(int)
 test_df = generate_synthetic_data(n=100, seed=43)
 X_test = test_df.drop(columns=["play"])
 y_test = (test_df["play"] == "Yes").astype(int)
 clf = LinearBoostingClassifier(
 n_estimators=200,
 learning_rate=0.3,
 early_stopping_rounds=20,
 colsample_bytree=1,
 subsample=0.8,
 )
 clf.fit(X, y, (X_test, y_test))
 print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))
 clf.save("linear_boost_model.json")
 model = LinearBoostingClassifier.load("linear_boost_model.json")
 # Generate full SQL (each tree expanded independently)
 sql = generate_linear_sql(model)
 print(sql)
 # Generate merged SQL (sum all tree coefficients, more concise)
 sql = generate_merged_linear_sql(model)
 print(sql)
 # Feature importance
 features = model.compute_feature_contributions(X_test, plot=True, normalize=True)
 # Example output:
 # temperature 0.492538
 # outlook_Sunny 0.162274
 # wind_Weak 0.096765
 # Per-sample feature contributions
 per_sample = model.per_sample_contributions(X_test)
 # per_sample['raw'], per_sample['normalized'], per_sample['signed']
Example SQL output — Full version (click to expand)
WITH linear_score AS (
 SELECT
 id,
 -0.508096 +
 0.3 * (0.315567 + 0.090054 * temperature + -0.005837 * cloud_cover + 0.321502 * humidity_Low + -0.979199 * wind_Weak + -3.287133 * outlook_Rainy + -3.289431 * outlook_Sunny) +
 0.3 * (-0.166722 + 0.077027 * temperature + -0.004919 * cloud_cover + 0.271886 * humidity_Low + -0.833306 * wind_Weak + -2.371971 * outlook_Rainy + -2.369790 * outlook_Sunny) +
 -- more trees...
 AS raw_score
 FROM input_data
)
SELECT *,
 1 / (1 + EXP(-raw_score)) AS prob,
 CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM linear_score;
Example SQL output — Merged version (click to expand)
WITH linear_score AS (
 SELECT
 id,
 -1.499403
 + -0.011357 * cloud_cover
 + 0.619758 * humidity_Low
 + -4.512565 * outlook_Rainy
 + -4.519462 * outlook_Sunny
 + 0.177894 * temperature
 + -1.979982 * wind_Weak
 AS raw_score
 FROM input_data
)
SELECT *,
 1 / (1 + EXP(-raw_score)) AS prob,
 CASE WHEN 1 / (1 + EXP(-raw_score)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM linear_score;

4.3 XGBoostClassifier

A custom-implemented XGBoost classifier.

from Classifiers.BoostClassifier import XGBoostClassifier
from utils import generate_synthetic_data, generate_xgboost_sql
if __name__ == "__main__":
 data = generate_synthetic_data(n=100, seed=42)
 X = data.drop(columns=["play"])
 y = (data["play"] == "Yes").astype(int)
 test_df = generate_synthetic_data(n=100, seed=43)
 X_test = test_df.drop(columns=["play"])
 y_test = (test_df["play"] == "Yes").astype(int)
 feature_types = {
 "temperature": "numerical",
 "humidity": "categorical",
 "wind": "categorical",
 "outlook": "categorical",
 "cloud_cover": "numerical"
 }
 clf = XGBoostClassifier(
 n_estimators=15,
 max_depth=3,
 learning_rate=0.3,
 reg_lambda=1.0,
 feature_types=feature_types,
 colsample_bytree=1.0,
 subsample=1.0,
 )
 clf.fit(X, y, (X_test, y_test))
 from sklearn.metrics import accuracy_score
 print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))
 clf.save("xgboost_model.json")
 model = XGBoostClassifier.load("xgboost_model.json")
 sql = generate_xgboost_sql(model)
 print(sql)
Example SQL output (click to expand)
WITH predictions AS (
 SELECT
 id,
 CASE
 WHEN outlook = 'Overcast' THEN
 CASE
 WHEN cloud_cover <= 70 THEN 2.412220
 ELSE
 CASE
 WHEN wind = 'Strong' THEN 1.454234
 ELSE -0.204320
 END
 END
 ELSE
 CASE
 WHEN temperature <= 32 THEN -1.353249
 ELSE
 CASE
 WHEN wind = 'Strong' THEN 1.628352
 ELSE -0.934034
 END
 END
 END AS tree_1_score,
 -- tree_2_score ... tree_N_score
 FROM input_data
),
logits AS (
 SELECT
 id,
 -0.7537718 + 0.3 * (tree_1_score + tree_2_score + ... + tree_N_score) AS logit
 FROM predictions
)
SELECT
 id,
 1.0 / (1.0 + EXP(-logit)) AS probability,
 CASE WHEN 1.0 / (1.0 + EXP(-logit)) >= 0.5 THEN 1 ELSE 0 END AS prediction
FROM logits;

4.4 AdaBoostClassifier

A custom-implemented AdaBoost classifier that generates SQL using weighted voting.

from Classifiers.BoostClassifier import AdaBoostClassifier
from Classifiers.TreeClassifier import DecisionTreeClassifier
from utils import generate_adaboost_sql, generate_synthetic_data
from sklearn.metrics import accuracy_score
if __name__ == "__main__":
 data = generate_synthetic_data(n=100, seed=42)
 feature_types = {
 "temperature": "numerical",
 "humidity": "categorical",
 "wind": "categorical",
 "outlook": "categorical",
 "cloud_cover": "numerical"
 }
 boost = AdaBoostClassifier(
 base_estimator_class=DecisionTreeClassifier,
 feature_types=feature_types,
 n_estimators=10,
 max_depth=3
 )
 boost.fit(data, target_key="play")
 boost.save("adaboost.json")
 model = AdaBoostClassifier.load("adaboost.json", DecisionTreeClassifier)
 test_df = generate_synthetic_data(n=100, seed=43)
 X_test = test_df.drop(columns=["play"])
 y_test = test_df["play"]
 print("Accuracy:", model.score(X_test, y_test, metric="accuracy")[0])
 print("Recall:", model.score(X_test, y_test, metric="recall", average="binary")[0])
 print("F1 score:", model.score(X_test, y_test, metric="f1", average="binary")[0])
 sql = generate_adaboost_sql(model)
 print(sql)
Example SQL output (click to expand)
WITH predictions AS (
 SELECT
 id,
 CASE
 WHEN outlook = 'Rainy' THEN
 CASE
 WHEN temperature <= 33.5 THEN 'No'
 ELSE
 CASE
 WHEN wind = 'Strong' THEN 'Yes'
 ELSE 'No'
 END
 END
 WHEN outlook = 'Overcast' THEN
 CASE
 WHEN cloud_cover <= 59.0 THEN 'Yes'
 ELSE
 CASE
 WHEN wind = 'Strong' THEN 'Yes'
 ELSE 'No'
 END
 END
 ELSE 'No'
 END AS tree_1_pred,
 -- tree_2_pred ... tree_N_pred
 FROM input_data
),
weighted_scores AS (
 SELECT
 id,
 (CASE WHEN tree_1_pred = 'No' THEN 3.8918 ELSE 0 END) + ... AS score_No,
 (CASE WHEN tree_1_pred = 'Yes' THEN 3.8918 ELSE 0 END) + ... AS score_Yes
 FROM predictions
)
SELECT
 id,
 CASE WHEN score_Yes >= score_No THEN 'Yes' ELSE 'No' END AS prediction
FROM weighted_scores;

4.5 RandomForestClassifier

A custom-implemented Random Forest classifier with majority-vote SQL support.

from Classifiers.ForestClassifier import RandomForestClassifier
from utils import generate_forest_sql, generate_synthetic_data
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
if __name__ == "__main__":
 data = generate_synthetic_data(n=100, seed=42)
 feature_types = {
 "temperature": "numerical",
 "humidity": "categorical",
 "wind": "categorical",
 "outlook": "categorical",
 "cloud_cover": "numerical"
 }
 rf = RandomForestClassifier(
 feature_types=feature_types,
 n_estimators=5,
 max_depth=5,
 bootstrap=True,
 oob_score=True,
 max_features=None,
 criterion="gini",
 min_samples_split=2,
 min_samples_leaf=1,
 tie_break_random=True,
 random_state=42
 )
 rf.fit(data, target_key="play")
 rf.save("random_forest_model.json")
 rf_loaded = RandomForestClassifier.load("random_forest_model.json")
 test_df = generate_synthetic_data(n=100, seed=43)
 X_test = test_df.drop(columns=["play"])
 y_test = test_df["play"]
 print("Accuracy:", rf.score(X_test, y_test, metric="accuracy")[0])
 print("Recall:", rf.score(X_test, y_test, metric="recall", average="binary")[0])
 print("F1 score:", rf.score(X_test, y_test, metric="f1", average="binary")[0])
 print("OOB Score:", rf.oob_score_) # requires bootstrap=True, oob_score=True
 sql = generate_forest_sql(rf, input_table="input_data", id_col="id")
 print(sql)
 # Feature importance
 importances = rf.feature_importances_()
 items = sorted(importances.items(), key=lambda x: x[1], reverse=True)
 features, scores = zip(*items)
 plt.barh(features, scores)
 plt.xlabel("Importance")
 plt.title("Random Forest Feature Importances")
 plt.gca().invert_yaxis()
 plt.tight_layout()
 plt.show()
Example SQL output (click to expand)
WITH predictions AS (
 SELECT
 id,
 CASE
 WHEN outlook = 'Rainy' THEN
 CASE
 WHEN temperature <= 34.0 THEN 'No'
 ELSE
 CASE
 WHEN cloud_cover <= 49.0 THEN 'Yes'
 ELSE 'No'
 END
 END
 WHEN outlook = 'Overcast' THEN
 CASE
 WHEN wind = 'Strong' THEN 'Yes'
 ELSE
 CASE
 WHEN cloud_cover <= 52.0 THEN 'Yes'
 ELSE
 CASE
 WHEN humidity = 'High' THEN 'No'
 ELSE 'Yes'
 END
 END
 END
 WHEN outlook = 'Sunny' THEN 'No'
 ELSE 'No'
 END AS tree_1_pred,
 -- tree_2_pred ... tree_N_pred
 FROM input_data
)
SELECT
 id,
 CASE
 WHEN (tree_1_pred = 'Yes')::int + ... >= 3 THEN 'Yes'
 ELSE 'No'
 END AS prediction
FROM predictions;

About

Compile tree-based machine learning models into SQL inference queries, enabling model predictions to run directly inside the database.

Topics

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

Contributors

Languages

AltStyle によって変換されたページ (->オリジナル) /