Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c1cd726

Browse files
Merge pull request #318 from ChethanaPotukanam/os_analysis
Open Source Analysis #315
2 parents dcdcad6 + 583125a commit c1cd726

File tree

5 files changed

+1478
-0
lines changed

5 files changed

+1478
-0
lines changed

‎opensource_analysis/README‎

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Stack Overflow Analysis Project
2+
3+
## Setup Instructions
4+
5+
1. **Download and Extract the Project Folder**
6+
- Download the project folder and extract it to a desired location on your computer.
7+
8+
2. **Navigate to the Project Directory**
9+
```bash
10+
cd /path/to/extracted/project/folder/opensource_analysis
```
11+
12+
13+
## Install the Dependencies
14+
pip install -r requirements.txt
15+
16+
## Run the Streamlit App
17+
streamlit run app.py
18+
19+
## Access the App
20+
Open the URL http://localhost:8501 in your web browser to access the Streamlit app.

‎opensource_analysis/app.py‎

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
import os
2+
import streamlit as st
3+
import pandas as pd
4+
import numpy as np
5+
from sklearn.model_selection import train_test_split
6+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
7+
from sklearn.compose import ColumnTransformer
8+
from sklearn.pipeline import Pipeline
9+
from sklearn.ensemble import RandomForestClassifier
10+
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, auc
11+
import matplotlib.pyplot as plt
12+
import seaborn as sns
13+
14+
# Define the path to the data file
file_path = 'survey_results_sample_2018.csv'

# Define the necessary columns (six predictors plus the 'OpenSource' target)
columns = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age', 'OpenSource']

# Map age values to numerical values
age_mapping = {
    'Under 18 years old': 0,
    '18 - 24 years old': 1,
    '25 - 34 years old': 2,
    '35 - 44 years old': 3,
    '45 - 54 years old': 4,
    '55 - 64 years old': 5,
    '65 years or older': 6,
}

# Define target variable and feature columns
target_variable = 'OpenSource'
categorical_features = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age']
numerical_features = []  # not referenced by the pipeline below

# Widget label shown for each free-choice feature column, in display order.
form_labels = {
    'Employment': 'Employment',
    'FormalEducation': 'Formal Education',
    'CompanySize': 'Company Size',
    'DevType': 'Dev Type',
    'Exercise': 'Exercise',
}


@st.cache_resource
def _train_and_evaluate(path, mtime):
    """Load the survey data, fit the classifier and compute its test metrics.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is cached instead of re-reading the CSV and refitting the
    forest each time.

    Args:
        path: Location of the survey CSV.
        mtime: Modification time of ``path``. It is not used in the body; it
            only takes part in the cache key so that a changed file triggers
            retraining.

    Returns:
        A dict with the keys ``data`` (the column-filtered dataframe with
        ``Age`` mapped through ``age_mapping``), ``model`` (the fitted
        pipeline), ``classification_rep``, ``roc_auc``, ``cm``, ``fpr``,
        ``tpr`` and ``feature_importance_df``.
    """
    data = pd.read_csv(path)[columns].copy()
    data['Age'] = data['Age'].map(age_mapping)

    # Every feature, including the mapped age bracket, is one-hot encoded.
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ]
    )

    # Split the data
    X = data.drop(target_variable, axis=1)
    y = data[target_variable]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
    model.fit(X_train, y_train)

    # Evaluate the model; the probabilities are computed once and reused for
    # both the AUC score and the ROC curve.
    y_pred = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]
    y_test_binary = y_test.map({'No': 0, 'Yes': 1})
    fpr, tpr, _ = roc_curve(y_test_binary, positive_scores)

    # Get feature importance, labelled with the encoder's output names
    importances = model.named_steps['classifier'].feature_importances_
    feature_names = list(model.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out())
    feature_importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)

    return {
        'data': data,
        'model': model,
        'classification_rep': classification_report(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, positive_scores),
        # Pin the label order so it always matches the hard-coded tick labels.
        'cm': confusion_matrix(y_test, y_pred, labels=['No', 'Yes']),
        'fpr': fpr,
        'tpr': tpr,
        'feature_importance_df': feature_importance_df,
    }


def _render_figure(fig):
    """Draw ``fig`` on the page, then close it so reruns do not pile up figures."""
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Render the evaluation dashboard and the single-row prediction form."""
    # Check if the file exists
    if not os.path.exists(file_path):
        st.error(f"File not found: {file_path}. Please ensure the file is in the correct directory.")
        return

    report = _train_and_evaluate(file_path, os.path.getmtime(file_path))
    data = report['data']
    model = report['model']
    roc_auc = report['roc_auc']

    # Streamlit App
    st.title('Machine Learning Model Evaluation')

    # Show classification report
    st.header('Classification Report')
    st.text(report['classification_rep'])

    # Show ROC-AUC Score
    st.header('ROC-AUC Score')
    st.text(f"ROC-AUC Score: {roc_auc:.2f}")

    # Plot confusion matrix
    st.header('Confusion Matrix')
    fig, ax = plt.subplots()
    sns.heatmap(report['cm'], annot=True, fmt='d', cmap='Blues',
                xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    _render_figure(fig)

    # Plot ROC Curve
    st.header('ROC Curve')
    fig, ax = plt.subplots()
    ax.plot(report['fpr'], report['tpr'], color='darkorange', lw=2,
            label=f'ROC curve (area = {roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend(loc='lower right')
    _render_figure(fig)

    # Plot feature importance
    st.header('Feature Importance')
    fig, ax = plt.subplots()
    sns.barplot(x='Importance', y='Feature', data=report['feature_importance_df'].head(20),
                palette='viridis', ax=ax)
    ax.set_title('Top Feature Importances')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    _render_figure(fig)

    # Section for new data input and prediction
    st.header('Predict for New Data')

    # Input fields for new data: one selectbox per column, offering the
    # values observed in the dataset.
    answers = {}
    for column, label in form_labels.items():
        answers[column] = st.selectbox(label, data[column].unique())
    age = st.selectbox('Age', list(age_mapping.keys()))
    answers['Age'] = age_mapping[age]

    # Convert inputs to dataframe
    new_data = pd.DataFrame([answers])

    # Handle any NaN values.
    # NOTE(review): a NaN choice becomes '', which the encoder is configured
    # to ignore as an unknown category (handle_unknown='ignore') — presumably
    # not how missing answers were encoded during training; confirm.
    new_data = new_data.fillna('')

    # Predict the output for new data
    if st.button('Predict'):
        try:
            prediction = model.predict(new_data)
            prediction_prob = model.predict_proba(new_data)[:, 1]
            st.write(f'Prediction: {"Yes" if prediction[0] == "Yes" else "No"}')
            st.write(f'Prediction Probability: {prediction_prob[0]:.2f}')
        except Exception as e:
            # UI boundary: surface the failure to the user instead of crashing the app.
            st.error(f"An error occurred during prediction: {e}")


main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /