My code - statistical analysis

Time: 2021-01-27 16:53:13

# coding: utf-8

# In[1]:

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# In[3]:

# Test data features
app_test = pd.read_csv(r'D:\Users\sgg91044\Desktop\MEP_no_defect_data_pivot.csv')
print('Testing data shape: ', app_test.shape)
app_test.head(20000)

# In[4]:

print(app_test['Target'].value_counts())
app_test['Target'].astype(int).plot.hist();

# In[5]:

# Function to calculate missing values by column
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Sort the table by percentage of missing values, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

# In[6]:

# Missing values statistics
missing_values = missing_values_table(app_test)
missing_values.head(20)

# In[7]:

RR13_MAX_median = app_test['RR13_MAX.'].median()
ETCM_PHC4_median = app_test['ETCM_PHC4'].median()
HELK_MEAN_median = app_test['HELK_MEAN'].median()
PBK4_median = app_test['PBK4'].median()
ETCM_PHB4_median = app_test['ETCM_PHB4'].median()
ETCM_PHA4_median = app_test['ETCM_PHA4'].median()
THR3_MAX_median = app_test['THR3_MAX.'].median()
THR3_MEAN_median = app_test['THR3_MEAN'].median()
RR23_MEAN_median = app_test['RR23_MEAN'].median()
RR13_MEAN_median = app_test['RR13_MEAN'].median()
THR3_MEAN_DIFF_median = app_test['THR3_MEAN_DIFF'].median()
THR3_MEAN_SLOPE_median = app_test['THR3_MEAN_SLOPE'].median()
THR3_MAX_DIFF_median = app_test['THR3_MAX._DIFF'].median()
LOWERCHM_PRESS_median = app_test['LOWERCHM_PRESS'].median()
HELK_MAX_median = app_test['HELK_MAX.'].median()
#HELK_MIN_median = app_test['HELK_MIN.'].median()
HELK_SD_median = app_test['HELK_SD'].median()
THR3_SD_median = app_test['THR3_SD'].median()
RR23_MAX_median = app_test['RR23_MAX.'].median()

# In[8]:

app_test = app_test.fillna({'RR13_MAX.': RR13_MAX_median,
                            'ETCM_PHC4': ETCM_PHC4_median,
                            'HELK_MEAN': HELK_MEAN_median,
                            'PBK4': PBK4_median,
                            'ETCM_PHB4': ETCM_PHB4_median,
                            'ETCM_PHA4': ETCM_PHA4_median,
                            'THR3_MAX.': THR3_MAX_median,
                            'THR3_MEAN': THR3_MEAN_median,
                            'RR23_MEAN': RR23_MEAN_median,
                            'RR13_MEAN': RR13_MEAN_median,
                            'THR3_MEAN_DIFF': THR3_MEAN_DIFF_median,
                            'THR3_MEAN_SLOPE': THR3_MEAN_SLOPE_median,
                            'THR3_MAX._DIFF': THR3_MAX_DIFF_median,
                            'LOWERCHM_PRESS': LOWERCHM_PRESS_median,
                            'HELK_MAX.': HELK_MAX_median,
                            'HELK_SD': HELK_SD_median,
                            'THR3_SD': THR3_SD_median,
                            'RR23_MAX.': RR23_MAX_median})
app_test
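
# In[ ]:

# A more compact sketch of the same median imputation, assuming all of the
# listed columns are numeric: compute each column's median once and fill them
# all in a single call.
median_cols = ['RR13_MAX.', 'ETCM_PHC4', 'HELK_MEAN', 'PBK4', 'ETCM_PHB4',
               'ETCM_PHA4', 'THR3_MAX.', 'THR3_MEAN', 'RR23_MEAN', 'RR13_MEAN',
               'THR3_MEAN_DIFF', 'THR3_MEAN_SLOPE', 'THR3_MAX._DIFF',
               'LOWERCHM_PRESS', 'HELK_MAX.', 'HELK_SD', 'THR3_SD', 'RR23_MAX.']
app_test[median_cols] = app_test[median_cols].fillna(app_test[median_cols].median())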

# In[9]:

# Find correlations with the target and sort
correlations = app_test.corr()['Target'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))

# In[10]:

# Measure the correlation of parameter 'ETCM_PHC4' with the target

plt.figure(figsize = (20, 8))

# KDE plot of wafers that are not defective
sns.kdeplot(app_test.loc[app_test['Target'] == 0, 'ETCM_PHC4'], label = 'target == 0')
# KDE plot of wafers that are defective
sns.kdeplot(app_test.loc[app_test['Target'] == 1, 'ETCM_PHC4'], label = 'target == 1')

# Labeling of plot
plt.xlabel('ETCM_PHC4'); plt.ylabel('Density'); plt.title('Distribution of ETCM_PHC4');

# In[11]:

sns.boxplot(x='HELK_SD', y='Target', data=app_test)

plt.show()

# In[ ]:

# Scatter Plot
app_test.plot(kind='scatter', x='PBK4', y='Target',alpha = 0.5,color = 'red')
plt.xlabel('PBK4') # label = name of label
plt.ylabel('Target')
plt.title('PBK4 Target Scatter Plot')

# In[ ]:

# Measure the correlation of parameter 'HELK_MEAN' with the target

plt.figure(figsize = (20, 8))

# KDE plot of wafers that are not defective
sns.kdeplot(app_test.loc[app_test['Target'] == 0, 'HELK_MEAN'], label = 'target == 0')
# KDE plot of wafers that are defective
sns.kdeplot(app_test.loc[app_test['Target'] == 1, 'HELK_MEAN'], label = 'target == 1')

# Labeling of plot
plt.xlabel('HELK_MEAN'); plt.ylabel('Density'); plt.title('Distribution of HELK_MEAN');

# In[ ]:

plt.figure(figsize = (15, 50))
# iterate through the new features
for i, feature in enumerate(['LOWERCHM_PRESS',
                             'RR13_MEAN',
                             'RR13_MAX.',
                             'RR23_MEAN',
                             'THR3_MAX.',
                             'THR3_MEAN',
                             'RR23_MAX.',
                             'PBK4',
                             'THR3_MEAN_DIFF',
                             'HELK_MEAN',
                             'THR3_MEAN_SLOPE',
                             'THR3_MAX._DIFF']):

    # create a new subplot for each feature
    plt.subplot(12, 1, i + 1)
    # plot non-defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 0, feature], label = 'Target == 0')
    # plot defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 1, feature], label = 'Target == 1')

    # Label the plots
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature); plt.ylabel('Density');

plt.tight_layout(h_pad = 0.1)

# In[ ]:

# Bin the parameter 'ETCM_PHC4' data
ETCM_PHC4 = app_test[['Target', 'ETCM_PHC4']].copy()
ETCM_PHC4['VALUE_BINNED'] = pd.cut(ETCM_PHC4['ETCM_PHC4'], bins = np.linspace(200, 1000, num = 17))
ETCM_PHC4.head(20)

# In[ ]:

# Group by the bin and calculate averages
ETCM_PHC4_groups = ETCM_PHC4.groupby('VALUE_BINNED').mean()
ETCM_PHC4_groups

# In[ ]:

plt.figure(figsize = (8, 8))

# Graph the value bins and the average of the target as a bar plot
plt.bar(ETCM_PHC4_groups.index.astype(str), 100 * ETCM_PHC4_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('ETCM_PHC4 value groups'); plt.ylabel('Probability of defect (%)')
plt.title('Probability of defect by ETCM_PHC4');

# In[ ]:

# Bin the parameter 'HELK_MEAN' data
HELK_MEAN = app_test[['Target', 'HELK_MEAN']].copy()
HELK_MEAN['VALUE_BINNED'] = pd.cut(HELK_MEAN['HELK_MEAN'], bins = np.linspace(0, 17.5, num = 8))
plt.figure(figsize = (8, 8))

# Group by the bin and calculate averages
HELK_MEAN_groups = HELK_MEAN.groupby('VALUE_BINNED').mean()
# Graph the value bins and the average of the target as a bar plot
plt.bar(HELK_MEAN_groups.index.astype(str), 100 * HELK_MEAN_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('HELK_MEAN value groups'); plt.ylabel('Probability of defect (%)')
plt.title('Probability of defect by HELK_MEAN');

# In[13]:

# Extract the parameters variables and show correlations
ext_data = app_test[['Target', 'RR13_MAX.',
'ETCM_PHC4' ,
'HELK_MEAN',
'PBK4',
'ETCM_PHB4' ,
'ETCM_PHA4',
'HELK_MAX.',
'HELK_SD',
'THR3_SD' ]]
ext_data_corrs = ext_data.corr()
ext_data_corrs

# In[14]:

plt.figure(figsize = (10, 10))

# Heatmap of correlations
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin =0, annot = True, vmax = 1)
plt.title('Correlation Heatmap');

# In[ ]:

plt.figure(figsize = (10, 12))

# iterate through the sources
for i, source in enumerate(['ETCM_PHA4', 'ETCM_PHB4', 'ETCM_PHC4']):

    # create a new subplot for each source
    plt.subplot(3, 1, i + 1)
    # plot non-defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 0, source], label = 'target == 0', shade=True)
    # plot defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 1, source], label = 'target == 1', shade=True)

    # Label the plots
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source); plt.ylabel('Density');

plt.tight_layout(h_pad = 2.5)

# In[ ]:

# Copy the data for plotting
plot_data = ext_data.drop(columns = ['RR13_MAX.',
'HELK_MAX.',
'HELK_SD',
'THR3_SD' ]).copy()

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size = 20)

# Create the pairgrid object
grid = sns.PairGrid(data = plot_data, height = 3, diag_sharey=False,
                    hue = 'Target',
                    vars = [x for x in list(plot_data.columns) if x != 'Target'])

# Upper triangle is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a KDE plot
grid.map_diag(sns.kdeplot)

# Lower triangle is a density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Ext Source Features Pairs Plot', size = 32, y = 1.05);

# In[12]:

# Create the default pairplot
sns.pairplot(app_test)

# In[224]:

app_test.to_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index=True, header=True)

# In[243]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

CTM_data = pd.read_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index_col=0)
# Median imputation of missing values
#RR23_MAX_median = CTM_data['RR23_MAX.'].median()
#CTM_data=CTM_data.fillna({'RR23_MAX.': RR23_MAX_median})

# Drop the target from the training data
#CTM_data['Target_C']=CTM_data['Target'].astype('category')
#CTM_data['Target_C'].cat.categories=['noMEP','MEP']
#le = LabelEncoder()
#le_count = 0
#le.fit(CTM_data['Target_C'])
#CTM_data['Target_C'] = le.transform(CTM_data['Target_C'])
CTM_data['Target']=CTM_data['Target'].astype('float')
CTM_data_Target= CTM_data[['Target']]
CTM_data_Columns = CTM_data.drop(columns = ['Target'])
X_train, X_test, y_train, y_test = train_test_split(CTM_data_Columns,CTM_data_Target,test_size=0.3, random_state=0)

# In[244]:

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

# In[245]:

print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

# In[228]:

# Feature names (used later for the feature importance table)
features = list(CTM_data_Columns.columns)

# Scale each feature to the 0-1 range
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit the scaler on the training data only, then apply the same transform to
# both training and testing data so no information from the test set leaks in
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print('Training data shape: ', X_train.shape)
print('Testing data shape: ', X_test.shape)

# In[229]:

from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
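
# In[ ]:

# A minimal alternative sketch: the scaler and the classifier can also be combined
# in a scikit-learn Pipeline, which fits the scaler on the training data only and
# reuses it for any later data, so the leakage-free scaling above happens automatically.
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=50, n_jobs=-1))
])
# rf_pipeline.fit(...) would then be called on the original, unscaled train split.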

# In[203]:

y_test

# In[234]:

# Train on the training data
random_forest.fit(X_train, y_train.values.ravel())

# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data
predictions = random_forest.predict(X_test)

# In[232]:

predictions
predictions.shape
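
# In[ ]:

# A short evaluation sketch with standard scikit-learn metrics; the class names
# 'no defect' and 'defect' are only illustrative labels for targets 0 and 1.
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test['Target'], predictions))
print(classification_report(y_test['Target'], predictions, target_names=['no defect', 'defect']))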

# In[233]:

# The first argument gives the row index, the second gives the column index
print(pd.crosstab(y_test['Target'], predictions, rownames=['Target'], colnames=['preds']))

# In[165]:

from sklearn.model_selection import cross_val_score

# In[190]:

# Cross-validated accuracy of the random forest on the held-out test split
print(cross_val_score(random_forest, X_test, y_test.values.ravel()))

# In[235]:

def plot_feature_importances(df):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better.

    Args:
        df (dataframe): feature importances. Must have the features in a column
        called `feature` and the importances in a column called `importance`

    Returns:
        shows a plot of the 20 most important features

        df (dataframe): feature importances sorted by importance (highest to lowest)
        with a column for normalized importance
    """

    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()

    # Need to reverse the index to plot most important on top
    ax.barh(list(reversed(list(df.index[:20]))),
            df['importance_normalized'].head(20),
            align = 'center', edgecolor = 'k')

    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:20]))))
    ax.set_yticklabels(df['feature'].head(20))

    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()

    return df

# In[236]:

feature_importances_sorted = plot_feature_importances(feature_importances)