Commit 4370628

Add source code files to repo
1 parent 64764c6 commit 4370628

7 files changed: +473 −0

.github/workflows/cml.yaml

Lines changed: 63 additions & 0 deletions

name: CML Report
on: pull_request
jobs:
  run:
    runs-on: [ubuntu-latest]
    steps:
      - uses: iterative/setup-cml@v2
      - uses: iterative/setup-dvc@v1
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      # Needed for https://github.com/iterative/example-repos-dev/issues/225
      - name: Installs JSON5
        run: npm install -g json5
      - name: Generate metrics report
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml ci
          if [ $GITHUB_REF = refs/heads/main ]; then
            PREVIOUS_REF=HEAD~1
          else
            PREVIOUS_REF=main
            git fetch origin main:main
          fi

          dvc pull eval
          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets ROC | json5 > vega.json
          vl2svg vega.json roc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Precision-Recall | json5 > vega.json
          vl2svg vega.json prc.svg

          dvc plots diff $PREVIOUS_REF workspace \
            --show-vega --targets Confusion-Matrix | json5 > vega.json
          vl2svg vega.json confusion.svg

          cp eval/plots/images/importance.png importance_workspace.png

          git checkout $PREVIOUS_REF -- dvc.lock
          cp eval/plots/images/importance.png importance_previous.png

          dvc_report=$(dvc exp diff $PREVIOUS_REF --md)

          cat <<EOF > report.md
          # CML Report
          ## Plots
          ![ROC](./roc.svg)
          ![Precision-Recall](./prc.svg)
          ![Confusion Matrix](./confusion.svg)
          #### Feature Importance: ${PREVIOUS_REF}
          ![Feature Importance: ${PREVIOUS_REF}](./importance_previous.png)
          #### Feature Importance: workspace
          ![Feature Importance: workspace](./importance_workspace.png)

          ## Metrics and Params
          ### ${PREVIOUS_REF} → workspace
          ${dvc_report}
          EOF

          cml comment create --publish --pr=false report.md

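For reference: the report job compares the workspace against PREVIOUS_REF (HEAD~1 when running on main, otherwise main) and converts each "dvc plots diff --show-vega" spec to SVG with the json5 and vl2svg CLI tools. Below is a minimal local sketch of that conversion step in Python; it is not part of the commit and assumes the third-party json5 and vl-convert-python packages as stand-ins for those CLIs.

# Local sketch (an assumption, not part of this commit) of one plot-conversion
# step from the workflow above. Requires the third-party `json5` and
# `vl-convert-python` packages; the CI job uses the `json5` and `vl2svg` CLIs.
import subprocess

import json5              # tolerant parse of the spec printed by `dvc plots diff --show-vega`
import vl_convert as vlc  # renders a Vega-Lite spec to SVG, standing in for vl2svg


def plot_diff_to_svg(previous_ref: str, target: str, out_svg: str) -> None:
    """Render `dvc plots diff <ref> workspace --show-vega --targets <target>` to an SVG file."""
    raw = subprocess.run(
        ["dvc", "plots", "diff", previous_ref, "workspace",
         "--show-vega", "--targets", target],
        check=True, capture_output=True, text=True,
    ).stdout
    spec = json5.loads(raw)                  # mirrors the `| json5` pipe
    svg = vlc.vegalite_to_svg(vl_spec=spec)  # mirrors `vl2svg vega.json roc.svg`
    with open(out_svg, "w", encoding="utf-8") as fd:
        fd.write(svg)


# e.g. plot_diff_to_svg("main", "ROC", "roc.svg")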
params.yaml

Lines changed: 13 additions & 0 deletions

prepare:
  split: 0.20
  seed: 20170428

featurize:
  max_features: 100
  ngrams: 1

train:
  seed: 20170428
  n_est: 50
  min_split: 0.01

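params.yaml groups the knobs by pipeline stage (prepare, featurize, train), and each stage reads only its own block; featurization.py below does exactly that for featurize. As a rough sketch (not part of the commit), the train block might be consumed as follows, where mapping n_est, min_split, and seed onto RandomForestClassifier arguments is an assumption based on the key names:

# Sketch of how a stage consumes its block of params.yaml. featurization.py (below)
# uses this pattern for "featurize"; the RandomForestClassifier mapping for "train"
# is an assumed illustration, since the training script is not part of this section.
import yaml
from sklearn.ensemble import RandomForestClassifier

with open("params.yaml") as fd:
    params = yaml.safe_load(fd)

train_params = params["train"]
clf = RandomForestClassifier(
    n_estimators=train_params["n_est"],           # n_est: 50
    min_samples_split=train_params["min_split"],  # min_split: 0.01 (fraction of samples)
    random_state=train_params["seed"],            # seed: 20170428
)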
src/evaluate.py

Lines changed: 112 additions & 0 deletions

import json
import math
import os
import pickle
import sys

import pandas as pd
from sklearn import metrics
from sklearn import tree
from dvclive import Live
from matplotlib import pyplot as plt


def evaluate(model, matrix, split, live, save_path):
    """
    Dump all evaluation metrics and plots for given datasets.

    Args:
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        split (str): Dataset name.
        live (dvclive.Live): Dvclive instance.
        save_path (str): Path to save the metrics.
    """
    labels = matrix[:, 1].toarray().astype(int)
    x = matrix[:, 2:]

    predictions_by_class = model.predict_proba(x)
    predictions = predictions_by_class[:, 1]

    # Use dvclive to log a few simple metrics...
    avg_prec = metrics.average_precision_score(labels, predictions)
    roc_auc = metrics.roc_auc_score(labels, predictions)
    if not live.summary:
        live.summary = {"avg_prec": {}, "roc_auc": {}}
    live.summary["avg_prec"][split] = avg_prec
    live.summary["roc_auc"][split] = roc_auc

    # ... and plots...
    # ... like an roc plot...
    live.log_sklearn_plot("roc", labels, predictions, name=f"roc/{split}")
    # ... and precision recall plot...
    # ... which passes `drop_intermediate=True` to the sklearn method...
    live.log_sklearn_plot(
        "precision_recall",
        labels,
        predictions,
        name=f"prc/{split}",
        drop_intermediate=True,
    )
    # ... and confusion matrix plot
    live.log_sklearn_plot(
        "confusion_matrix",
        labels.squeeze(),
        predictions_by_class.argmax(-1),
        name=f"cm/{split}",
    )


def save_importance_plot(live, model, feature_names):
    """
    Save feature importance plot.

    Args:
        live (dvclive.Live): DVCLive instance.
        model (sklearn.ensemble.RandomForestClassifier): Trained classifier.
        feature_names (list): List of feature names.
    """
    fig, axes = plt.subplots(dpi=100)
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
    forest_importances.plot.bar(ax=axes)

    live.log_image("importance.png", fig)


def main():
    EVAL_PATH = "eval"

    if len(sys.argv) != 3:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython evaluate.py model features\n")
        sys.exit(1)

    model_file = sys.argv[1]
    train_file = os.path.join(sys.argv[2], "train.pkl")
    test_file = os.path.join(sys.argv[2], "test.pkl")

    # Load model and data.
    with open(model_file, "rb") as fd:
        model = pickle.load(fd)

    with open(train_file, "rb") as fd:
        train, feature_names = pickle.load(fd)

    with open(test_file, "rb") as fd:
        test, _ = pickle.load(fd)

    # Evaluate train and test datasets.
    with Live(EVAL_PATH) as live:
        evaluate(model, train, "train", live, save_path=EVAL_PATH)
        evaluate(model, test, "test", live, save_path=EVAL_PATH)

        # Dump feature importance plot.
        save_importance_plot(live, model, feature_names)


if __name__ == "__main__":
    main()

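evaluate.py relies on the column layout produced by save_matrix() in featurization.py below: each pickle holds a (matrix, feature_names) tuple whose matrix hstacks [id, label, tf-idf features], so column 0 is the document id, column 1 the label, and columns 2 onward the features. A minimal loading sketch (the file path is illustrative, not taken from the commit):

# Sketch of the feature-matrix layout evaluate() relies on. The path is an example;
# the actual location depends on how the pipeline wires featurization.py's output.
import pickle

with open("data/features/test.pkl", "rb") as fd:
    matrix, feature_names = pickle.load(fd)   # (scipy CSR matrix, feature names)

ids = matrix[:, 0].toarray().astype(int)      # column 0: document id
labels = matrix[:, 1].toarray().astype(int)   # column 1: label, as in evaluate()
features = matrix[:, 2:]                      # columns 2+: tf-idf features
print(matrix.shape, len(feature_names))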
src/featurization.py

Lines changed: 136 additions & 0 deletions

import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_df(data):
    """Read the input data file and return a data frame."""
    df = pd.read_csv(
        data,
        encoding="utf-8",
        header=None,
        delimiter="\t",
        names=["id", "label", "text"],
    )
    sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
    return df


def save_matrix(df, matrix, names, output):
    """
    Save the matrix to a pickle file.

    Args:
        df (pandas.DataFrame): Input data frame.
        matrix (scipy.sparse.csr_matrix): Input matrix.
        names (list): List of feature names.
        output (str): Output file name.
    """
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = "The output matrix {} size is {} and data type is {}\n"
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)
    pass


def generate_and_save_train_features(train_input, train_output, bag_of_words, tfidf):
    """
    Generate train feature matrix.

    Args:
        train_input (str): Train input file name.
        train_output (str): Train output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_train = get_df(train_input)
    train_words = np.array(df_train.text.str.lower().values)

    bag_of_words.fit(train_words)

    train_words_binary_matrix = bag_of_words.transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    tfidf.fit(train_words_binary_matrix)
    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)

    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)


def generate_and_save_test_features(test_input, test_output, bag_of_words, tfidf):
    """
    Generate test feature matrix.

    Args:
        test_input (str): Test input file name.
        test_output (str): Test output file name.
        bag_of_words (sklearn.feature_extraction.text.CountVectorizer): Bag of words.
        tfidf (sklearn.feature_extraction.text.TfidfTransformer): TF-IDF transformer.
    """
    df_test = get_df(test_input)
    test_words = np.array(df_test.text.str.lower().values)

    test_words_binary_matrix = bag_of_words.transform(test_words)
    test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
    feature_names = bag_of_words.get_feature_names_out()

    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)


def main():
    params = yaml.safe_load(open("params.yaml"))["featurize"]

    np.set_printoptions(suppress=True)

    if len(sys.argv) != 3 and len(sys.argv) != 5:
        sys.stderr.write("Arguments error. Usage:\n")
        sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
        sys.exit(1)

    in_path = sys.argv[1]
    out_path = sys.argv[2]

    train_input = os.path.join(in_path, "train.tsv")
    test_input = os.path.join(in_path, "test.tsv")
    train_output = os.path.join(out_path, "train.pkl")
    test_output = os.path.join(out_path, "test.pkl")

    max_features = params["max_features"]
    ngrams = params["ngrams"]

    os.makedirs(out_path, exist_ok=True)

    bag_of_words = CountVectorizer(
        stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
    )
    tfidf = TfidfTransformer(smooth_idf=False)

    generate_and_save_train_features(
        train_input=train_input,
        train_output=train_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )

    generate_and_save_test_features(
        test_input=test_input,
        test_output=test_output,
        bag_of_words=bag_of_words,
        tfidf=tfidf,
    )


if __name__ == "__main__":
    main()

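In short, featurization fits CountVectorizer and TfidfTransformer on the training text only, then reuses both fitted objects on the test text so the two splits share one feature space. A toy sketch with made-up documents (not part of the commit):

# Toy illustration of the featurization flow above, with made-up documents.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

train_words = np.array(["dvc tracks data", "dvc tracks pipelines and metrics"])
test_words = np.array(["metrics need plots"])

bag_of_words = CountVectorizer(stop_words="english", max_features=100, ngram_range=(1, 1))
tfidf = TfidfTransformer(smooth_idf=False)

# Fit on the training split only (featurization.py fits and then transforms in two steps).
train_counts = bag_of_words.fit_transform(train_words)
train_tfidf = tfidf.fit_transform(train_counts)

# Reuse the fitted vocabulary and idf weights on the test split.
test_tfidf = tfidf.transform(bag_of_words.transform(test_words))

print(bag_of_words.get_feature_names_out())  # shared feature names
print(train_tfidf.shape, test_tfidf.shape)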