Commit c1cd726

authored

Merge pull request #318 from ChethanaPotukanam/os_analysis

Open Source Analysis #315

2 parents dcdcad6 + 583125a commit c1cd726Copy full SHA for c1cd726

File tree

5 files changed

+1478

-0

lines changed

opensource_analysis

5 files changed

+1478

-0

lines changed

`‎opensource_analysis/README‎`

Lines changed: 20 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,20 @@`
	`1`	`+# Stack Overflow Analysis Project`
	`2`	`+`
	`3`	`+## Setup Instructions`
	`4`	`+`
	`5`	`+1. Download and Extract the Project Folder`
	`6`	`+ - Download the project folder and extract it to a desired location on your computer.`
	`7`	`+`
	`8`	`+2. Navigate to the Project Directory`
	`9`	+ ```bash
	`10`	`+ cd /path/to/extracted/project/folder/opensource_analysis`
	`11`	+ ```
	`12`	`+`
	`13`	`+## Install the Dependencies`
	`14`	`+pip install -r requirements.txt`
	`15`	`+`
	`16`	`+## Run the Streamlit App`
	`17`	`+streamlit run app.py`
	`18`	`+`
	`19`	`+## Access the App`
	`20`	`+Open the URL http://localhost:8501 in your web browser to access the Streamlit app.`

`‎opensource_analysis/app.py‎`

Lines changed: 151 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,151 @@`
	`1`	`+import os`
	`2`	`+import streamlit as st`
	`3`	`+import pandas as pd`
	`4`	`+import numpy as np`
	`5`	`+from sklearn.model_selection import train_test_split`
	`6`	`+from sklearn.preprocessing import StandardScaler, OneHotEncoder`
	`7`	`+from sklearn.compose import ColumnTransformer`
	`8`	`+from sklearn.pipeline import Pipeline`
	`9`	`+from sklearn.ensemble import RandomForestClassifier`
	`10`	`+from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, roc_curve, auc`
	`11`	`+import matplotlib.pyplot as plt`
	`12`	`+import seaborn as sns`
	`13`	`+`
	`14`	`+# Define the path to the data file`
	`15`	`+file_path = 'survey_results_sample_2018.csv'`
	`16`	`+`
	`17`	`+# Check if the file exists`
	`18`	`+if not os.path.exists(file_path):`
	`19`	`+ st.error(f"File not found: {file_path}. Please ensure the file is in the correct directory.")`
	`20`	`+else:`
	`21`	`+ # Load the dataset`
	`22`	`+ data = pd.read_csv(file_path)`
	`23`	`+`
	`24`	`+ # Define the necessary columns`
	`25`	`+ columns = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age', 'OpenSource']`
	`26`	`+ data = data[columns].copy()`
	`27`	`+`
	`28`	`+ # Map age values to numerical values`
	`29`	`+ age_mapping = {`
	`30`	`+ 'Under 18 years old': 0,`
	`31`	`+ '18 - 24 years old': 1,`
	`32`	`+ '25 - 34 years old': 2,`
	`33`	`+ '35 - 44 years old': 3,`
	`34`	`+ '45 - 54 years old': 4,`
	`35`	`+ '55 - 64 years old': 5,`
	`36`	`+ '65 years or older': 6`
	`37`	`+ }`
	`38`	`+ data['Age'] = data['Age'].map(age_mapping)`
	`39`	`+`
	`40`	`+ # Define target variable and feature columns`
	`41`	`+ target_variable = 'OpenSource'`
	`42`	`+ categorical_features = ['Employment', 'FormalEducation', 'CompanySize', 'DevType', 'Exercise', 'Age']`
	`43`	`+ numerical_features = []`
	`44`	`+`
	`45`	`+ # Preprocessing for categorical data`
	`46`	`+ preprocessor = ColumnTransformer(`
	`47`	`+ transformers=[`
	`48`	`+ ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)`
	`49`	`+ ]`
	`50`	`+ )`
	`51`	`+`
	`52`	`+ # Split the data`
	`53`	`+ X = data.drop(target_variable, axis=1)`
	`54`	`+ y = data[target_variable]`
	`55`	`+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)`
	`56`	`+`
	`57`	`+ # Create and train the model`
	`58`	`+ model = Pipeline(steps=[`
	`59`	`+ ('preprocessor', preprocessor),`
	`60`	`+ ('classifier', RandomForestClassifier(random_state=42))`
	`61`	`+ ])`
	`62`	`+ model.fit(X_train, y_train)`
	`63`	`+`
	`64`	`+ # Evaluate the model`
	`65`	`+ y_pred = model.predict(X_test)`
	`66`	`+ classification_rep = classification_report(y_test, y_pred)`
	`67`	`+ roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])`
	`68`	`+`
	`69`	`+ # Get feature importance`
	`70`	`+ importances = model.named_steps['classifier'].feature_importances_`
	`71`	`+ feature_names = list(model.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out())`
	`72`	`+ feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)`
	`73`	`+`
	`74`	`+ # Streamlit App`
	`75`	`+ st.title('Machine Learning Model Evaluation')`
	`76`	`+`
	`77`	`+ # Show classification report`
	`78`	`+ st.header('Classification Report')`
	`79`	`+ st.text(classification_rep)`
	`80`	`+`
	`81`	`+ # Show ROC-AUC Score`
	`82`	`+ st.header('ROC-AUC Score')`
	`83`	`+ st.text(f"ROC-AUC Score: {roc_auc:.2f}")`
	`84`	`+`
	`85`	`+ # Plot confusion matrix`
	`86`	`+ st.header('Confusion Matrix')`
	`87`	`+ cm = confusion_matrix(y_test, y_pred)`
	`88`	`+ fig, ax = plt.subplots()`
	`89`	`+ sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'], ax=ax)`
	`90`	`+ plt.xlabel('Predicted')`
	`91`	`+ plt.ylabel('Actual')`
	`92`	`+ st.pyplot(fig)`
	`93`	`+`
	`94`	`+ # Plot ROC Curve`
	`95`	`+ st.header('ROC Curve')`
	`96`	`+ y_test_binary = y_test.map({'No': 0, 'Yes': 1})`
	`97`	`+ fpr, tpr, _ = roc_curve(y_test_binary, model.predict_proba(X_test)[:, 1])`
	`98`	`+ roc_auc = auc(fpr, tpr)`
	`99`	`+ fig, ax = plt.subplots()`
	`100`	`+ ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')`
	`101`	`+ ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')`
	`102`	`+ ax.set_xlim([0.0, 1.0])`
	`103`	`+ ax.set_ylim([0.0, 1.05])`
	`104`	`+ ax.set_xlabel('False Positive Rate')`
	`105`	`+ ax.set_ylabel('True Positive Rate')`
	`106`	`+ ax.set_title('ROC Curve')`
	`107`	`+ ax.legend(loc='lower right')`
	`108`	`+ st.pyplot(fig)`
	`109`	`+`
	`110`	`+ # Plot feature importance`
	`111`	`+ st.header('Feature Importance')`
	`112`	`+ fig, ax = plt.subplots()`
	`113`	`+ sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20), palette='viridis', ax=ax)`
	`114`	`+ ax.set_title('Top Feature Importances')`
	`115`	`+ ax.set_xlabel('Importance')`
	`116`	`+ ax.set_ylabel('Feature')`
	`117`	`+ st.pyplot(fig)`
	`118`	`+`
	`119`	`+ # Section for new data input and prediction`
	`120`	`+ st.header('Predict for New Data')`
	`121`	`+`
	`122`	`+ # Input fields for new data`
	`123`	`+ employment = st.selectbox('Employment', data['Employment'].unique())`
	`124`	`+ education = st.selectbox('Formal Education', data['FormalEducation'].unique())`
	`125`	`+ company_size = st.selectbox('Company Size', data['CompanySize'].unique())`
	`126`	`+ dev_type = st.selectbox('Dev Type', data['DevType'].unique())`
	`127`	`+ exercise = st.selectbox('Exercise', data['Exercise'].unique())`
	`128`	`+ age = st.selectbox('Age', list(age_mapping.keys()))`
	`129`	`+`
	`130`	`+ # Convert inputs to dataframe`
	`131`	`+ new_data = pd.DataFrame({`
	`132`	`+ 'Employment': [employment],`
	`133`	`+ 'FormalEducation': [education],`
	`134`	`+ 'CompanySize': [company_size],`
	`135`	`+ 'DevType': [dev_type],`
	`136`	`+ 'Exercise': [exercise],`
	`137`	`+ 'Age': [age_mapping[age]]`
	`138`	`+ })`
	`139`	`+`
	`140`	`+ # Handle any NaN values`
	`141`	`+ new_data = new_data.fillna('')`
	`142`	`+`
	`143`	`+ # Predict the output for new data`
	`144`	`+ if st.button('Predict'):`
	`145`	`+ try:`
	`146`	`+ prediction = model.predict(new_data)`
	`147`	`+ prediction_prob = model.predict_proba(new_data)[:, 1]`
	`148`	`+ st.write(f'Prediction: {"Yes" if prediction[0] == "Yes" else "No"}')`
	`149`	`+ st.write(f'Prediction Probability: {prediction_prob[0]:.2f}')`
	`150`	`+ except Exception as e:`
	`151`	`+ st.error(f"An error occurred during prediction: {e}")`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit c1cd726

File tree

5 files changed

5 files changed

`‎opensource_analysis/README‎`

`‎opensource_analysis/app.py‎`

0 commit comments