集成学习模型对比优化—银行业务-1.Data Understanding

时间:2024-06-11 11:52:22
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
# Load banking.csv from the local course directory into a DataFrame.
df = pd.read_csv(
    "D:\\课程学习\\机器学习\\银行客户开设定期存款账户情况预测\\banking.csv"
)
# Step 1: report the table dimensions as a (rows, columns) tuple.
print("1.the shape of the DataFrame", df.shape, sep="\n")

在这里插入图片描述

# Step 2: preview the first rows of the table (DataFrame.head() default count).
print("2.the head of the DataFrame", df.head(), sep="\n")

在这里插入图片描述

# Step 3: show the DataFrame summary produced by DataFrame.info().
print("3.the info of the DataFrame")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

在这里插入图片描述

# Step 4: print the descriptive statistics returned by DataFrame.describe().
print("4.the statistical description of the DataFrame", df.describe(), sep="\n")

在这里插入图片描述

# Step 5: flag, per column, whether at least one value is missing.
# The result is kept in `datacheck` (a boolean Series indexed by column name).
datacheck = df.isna().any()
print("5.Check for any null values in the DataFrame", datacheck, sep="\n")

在这里插入图片描述

# Step 6: mark every row that repeats an earlier row; the first occurrence
# of each repeated row stays unmarked (keep="first" is the pandas default).
duplicates = df.duplicated(keep="first")
print("6.Check for duplicates")
# Summing the boolean mask counts the rows flagged as duplicates.
print(f"Number of duplicated rows: {duplicates.sum()}")

在这里插入图片描述

print("7.See the duplicated rows")
# List the flagged rows, but only when the mask contains at least one True.
if duplicates.any():
    print("\nDuplicated Rows:", df.loc[duplicates], sep="\n")

在这里插入图片描述

# Split the column labels by dtype: numeric columns vs. everything else.
# `numeric_columns` stays a pandas Index; `non_numeric_columns` is a plain list.
numeric_columns = df.select_dtypes(include="number").columns
non_numeric_columns = df.select_dtypes(exclude="number").columns.tolist()
print(non_numeric_columns)