# 2012年国赛高教杯数学建模C题脑卒中发病环境因素分析及干预解题全过程文档及程序
import pandas as pd
from pyecharts.charts import Page #页面
from pyecharts.charts import Bar #柱状图
from pyecharts.charts import Pie #饼状图
from pyecharts.charts import Line #折线图
from pyecharts.charts import Scatter #散点图
from pyecharts import options as opts
# Date normalisation helper
def convert_date(date_str):
    """Normalise a single date value to a ``YYYY-MM-DD`` string.

    The value is parsed with ``pd.to_datetime(..., errors='coerce')``.

    Args:
        date_str: A scalar date-like value (string, timestamp, number, ...).

    Returns:
        The date formatted as ``'%Y-%m-%d'``, or an empty string when the
        value is missing, cannot be parsed, or is of a type pandas refuses
        to convert.
    """
    try:
        parsed = pd.to_datetime(date_str, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        # Some inputs are rejected outright even with errors='coerce';
        # treat them like any other unparseable value.
        return ''
    # Unparseable scalars come back as NaT and None is passed through as
    # None; neither can be formatted, so map both to the empty marker.
    if parsed is None or parsed is pd.NaT:
        return ''
    return parsed.strftime('%Y-%m-%d')
def analyze():
    """Merge the four raw workbooks, clean them and write ``clean_data.xlsx``.

    Steps, in order:
      1. Read ``Appendix-C1/data1.xls`` .. ``data4.xls`` and stack them.
      2. Keep only rows whose ``Sex`` is 1 or 2.
      3. Coerce ``Age`` to numeric and keep rows with 0 <= Age <= 110.
      4. Replace missing ``Occupation`` values with 9.
      5. Normalise ``Time of incidence`` / ``Report time`` via
         ``convert_date`` and keep incidence dates in the years 2007-2010.
      6. Write the result to ``Appendix-C1/clean_data.xlsx``.

    Row counts after the main cleaning steps are printed. Returns ``None``.
    """
    source_files = (
        'Appendix-C1/data1.xls',
        'Appendix-C1/data2.xls',
        'Appendix-C1/data3.xls',
        'Appendix-C1/data4.xls',
    )
    frames = [
        pd.read_excel(path, index_col=0).reset_index()
        for path in source_files
    ]
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat(frames, ignore_index=True)
    print("去除脏数据前总数据量:" + str(df.shape[0]))

    # --- Sex: drop missing values, then keep only the codes 1 and 2.
    df = df.dropna(subset=["Sex"])
    print("去除Sex数据为空的数据量:" + str(df.shape[0]))
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the previous one.
    df = df[df["Sex"].isin([1.0, 2.0])].copy()
    print("去除Sex脏数据的数据量:" + str(df.shape[0]))

    # --- Age: non-numeric values become NaN and are dropped together with
    # real gaps; the remaining values must lie in [0, 110].
    df["Age"] = pd.to_numeric(df["Age"], errors='coerce')
    df = df.dropna(subset=["Age"])
    df = df[(df["Age"] >= 0) & (df["Age"] <= 110)].copy()
    print("去除Age数据为空的数据量:" + str(df.shape[0]))

    # --- Occupation: missing values are recoded as 9.
    df["Occupation"] = df["Occupation"].fillna(9)

    # --- Dates: normalise both columns to 'YYYY-MM-DD' strings
    # (unparseable values become ''), then parse the incidence date.
    df['Time of incidence'] = df['Time of incidence'].apply(convert_date)
    df['Report time'] = df['Report time'].apply(convert_date)
    df['Time of incidence'] = pd.to_datetime(df['Time of incidence'], format='%Y-%m-%d')

    # Keep incidence dates from 2007 through 2010 inclusive. Rows whose date
    # is NaT fail both comparisons and are dropped here as well, so no
    # separate month/day range check is needed on a datetime column.
    incidence_year = df['Time of incidence'].dt.year
    df_filtered = df[(incidence_year > 2006) & (incidence_year < 2011)].copy()

    # Store the date as a plain 'YYYY-MM-DD' string in the output file.
    df_filtered['Time of incidence'] = df_filtered['Time of incidence'].dt.strftime('%Y-%m-%d')
    df_filtered.to_excel('Appendix-C1/clean_data.xlsx', engine='openpyxl')
# Sex vs. number of cases: pie chart
def viewSexPie(df):
    """Build a pie chart of the percentage of cases per sex.

    Reads ``df['Sex']`` (codes 1 / 2), prints the labels, the raw counts
    and their total, and returns a pyecharts ``Pie`` whose slice values
    are percentages rounded to two decimals.
    """
    sex_labels = {1: '男', 2: '女'}
    tally = df['Sex'].map(sex_labels).value_counts()
    gender = tally.index.tolist()
    counts = tally.values.tolist()
    print(gender)
    print(counts)

    # Total number of cases, accumulated as floats from an integer 0 start.
    total = sum(map(float, counts), 0)
    print(total)

    # Share of each sex in percent.
    percentages = [round(float(n) / total * 100, 2) for n in counts]
    slices = [[label, share] for label, share in zip(gender, percentages)]

    # Title centred at the bottom; the unit is drawn as a free-floating
    # text element in the top-right area.
    title = opts.TitleOpts(
        title="脑卒中患者按性别统计人数分布比例",
        pos_left="center",
        pos_bottom="bottom"
    )
    unit_note = opts.GraphicText(
        graphic_item=opts.GraphicItem(left="82%", top="10%", z=100),
        graphic_textstyle_opts=opts.GraphicTextStyleOpts(
            text="单位:%",
            font="13px SimSun"
        ),
    )

    chart = Pie()
    chart.add("", slices, label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    chart.set_global_opts(title_opts=title, graphic_opts=[unit_note])
    return chart
# Sex x year vs. number of cases: bar chart
def viewSexYearZhu(df):
    """Build a bar chart of yearly case counts: female, male and total.

    Side effects: adds / overwrites the columns ``Gender`` (label mapped
    from ``Sex``) and ``Year`` (year of ``Time of incidence``) on ``df``,
    and prints the x-axis years and the two per-sex series.

    Returns:
        A pyecharts ``Bar`` with three series over the years present in
        the data.
    """
    gender_mapping = {1: '男', 2: '女'}
    df['Gender'] = df['Sex'].map(gender_mapping)
    df['Year'] = pd.to_datetime(df['Time of incidence'], format='%Y-%m-%d').dt.year

    # Total cases per year, in chronological order.
    year_counts = df['Year'].value_counts().sort_index()
    year = year_counts.index.tolist()
    counts = year_counts.values.tolist()

    # Cases per (year, sex). fill_value=0 keeps the counts integral when a
    # combination is absent, and the reindex guarantees that both sex
    # columns exist and that rows line up with the ``year`` axis.
    year_gender_counts = (
        df.groupby(['Year', 'Gender'])
        .size()
        .unstack(fill_value=0)
        .reindex(index=year_counts.index, columns=['女', '男'], fill_value=0)
    )
    female_counts = year_gender_counts['女'].tolist()
    male_counts = year_gender_counts['男'].tolist()

    print(year)
    print(female_counts)
    print(male_counts)

    c = (
        Bar()
        .add_xaxis(year)
        .add_yaxis("女性患病人数", female_counts)
        .add_yaxis("男性患病总人数", male_counts)
        .add_yaxis("患病总人数", counts)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="脑卒中患者按年份统计人数分布",
                pos_left="center",
                pos_bottom="bottom"
            ),
            xaxis_opts=opts.AxisOpts(name="年份/年"),
            yaxis_opts=opts.AxisOpts(name="患病人数/人")
        )
    )
    return c
# Occupation vs. number of cases: bar chart (default) or pie chart
def viewOccupationPie(df, chart="bar"):
    """Chart the distribution of cases by occupation.

    The original body built a pie chart and a bar chart but could only
    ever return the bar chart (the second ``return`` was unreachable).
    The default therefore still returns the bar chart; pass
    ``chart="pie"`` to get the percentage pie instead.

    Side effects: ``df['Occupation']`` is rewritten from numeric codes to
    occupation names, and the names and counts are printed. The rewrite is
    idempotent: values that are already one of the mapped names are kept,
    so calling this function twice on the same frame no longer blanks the
    column.

    Args:
        df: Frame with an ``Occupation`` column holding codes 1-9.
        chart: ``"bar"`` (default, absolute counts) or ``"pie"``
            (percentages rounded to two decimals).

    Returns:
        A pyecharts ``Bar`` or ``Pie``.

    Raises:
        ValueError: If ``chart`` is neither ``"bar"`` nor ``"pie"``.
    """
    if chart not in ("bar", "pie"):
        raise ValueError("chart must be 'bar' or 'pie', got %r" % (chart,))

    # Occupation code -> display name.
    occupation_mapping = {
        1: '农民',
        2: '工人',
        3: '退休人员',
        4: '教师',
        5: '渔民',
        6: '医务人员',
        7: '职工',
        8: '离退人员',
        9: '其他或缺失',
    }
    raw = df['Occupation']
    # Rows that already hold a display name (from an earlier call) must not
    # be mapped again, otherwise they would turn into NaN.
    already_named = raw.isin(list(occupation_mapping.values()))
    df['Occupation'] = raw.map(occupation_mapping).where(~already_named, raw)

    occupation_counts = df['Occupation'].value_counts()
    occupation = occupation_counts.index.tolist()
    counts = occupation_counts.values.tolist()
    print(occupation)
    print(counts)

    if chart == "pie":
        total = float(sum(counts))
        # Share of each occupation in percent.
        counts_list = [round(float(n) / total * 100, 2) for n in counts]
        pie = Pie()
        pie.add(
            "",
            [list(z) for z in zip(occupation, counts_list)],
            label_opts=opts.LabelOpts(formatter="{b}: {c}")
        )
        pie.set_global_opts(
            title_opts=opts.TitleOpts(
                title="脑卒中患者按职业统计人数分布比例",
                pos_left="center",
                pos_bottom="bottom"
            ),
            graphic_opts=[
                opts.GraphicText(
                    graphic_item=opts.GraphicItem(
                        left="82%",
                        top="10%",
                        z=100
                    ),
                    graphic_textstyle_opts=opts.GraphicTextStyleOpts(
                        text="单位:%",
                        font="13px SimSun"
                    ),
                )]
        )
        return pie

    bar = (
        Bar()
        .add_xaxis(occupation)
        .add_yaxis("患病人数", counts)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="脑卒中患者按职业统计人数分布",
                pos_left="center",
                pos_bottom="bottom"
            ),
            xaxis_opts=opts.AxisOpts(name="职业"),
            yaxis_opts=opts.AxisOpts(name="患病人数/人")
        )
    )
    return bar
# Age band vs. number of cases: bar chart
def viewAgeZhu(df):
    """Build a bar chart of case counts per ten-year age band.

    Side effects: casts ``df['Age']`` to ``int``, adds / overwrites
    ``df['Age Category']``, and prints the band labels and counts.

    Returns:
        A pyecharts ``Bar`` with one bar per age band (empty bands are
        kept with a count of 0).
    """
    df['Age'] = df['Age'].astype(int)

    # Band edges and their display labels.
    age_bins = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
    labels = [
        '1–10', '11–20', '21–30', '31–40', '41–50', '51–60',
        '61–70', '71–80', '81–90', '91–100', '101–110'
    ]
    # pd.cut builds right-closed intervals, so without include_lowest the
    # first band would be (1, 10] and age 1 would fall outside the band
    # labelled '1–10'. Ages below 1 remain unbinned (NaN).
    df['Age Category'] = pd.cut(
        df['Age'], bins=age_bins, labels=labels, include_lowest=True
    )

    # Count per band, ordered by band rather than by frequency.
    age_counts = df['Age Category'].value_counts().sort_index()
    age = age_counts.index.tolist()
    counts = age_counts.values.tolist()
    print(age)
    print(counts)

    c = (
        Bar()
        .add_xaxis(age)
        .add_yaxis("患病人数", counts)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="脑卒中患者按年龄阶段统计人数分布",
                pos_left="center",
                pos_bottom="bottom"
            ),
            xaxis_opts=opts.AxisOpts(name="年龄阶段"),
            yaxis_opts=opts.AxisOpts(name="患病人数/人")
        )
    )
    return c
# Age band x sex vs. number of cases: line chart
def viewSexAgeLine(df):
    """Build a line chart of case counts per age band: female, male, total.

    Side effects: adds / overwrites ``df['Gender']`` and
    ``df['Age Category']``, casts ``df['Age']`` to ``int``, and prints the
    band labels and the two per-sex series.

    Returns:
        A pyecharts ``Line`` with three series over the eleven age bands.
    """
    gender_mapping = {1: '男', 2: '女'}
    df['Gender'] = df['Sex'].map(gender_mapping)
    df['Age'] = df['Age'].astype(int)

    # Band edges and their display labels.
    age_bins = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
    labels = [
        '1–10', '11–20', '21–30', '31–40', '41–50', '51–60',
        '61–70', '71–80', '81–90', '91–100', '101–110'
    ]
    # include_lowest=True so that age 1 lands in the band labelled '1–10'
    # (pd.cut intervals are right-closed, i.e. (1, 10] by default).
    df['Age Category'] = pd.cut(
        df['Age'], bins=age_bins, labels=labels, include_lowest=True
    )

    # Total cases per band, in band order.
    age_counts = df['Age Category'].value_counts().sort_index()
    age_category = age_counts.index.tolist()
    # Plain Python ints: pyecharts expects native types, not a Series.
    total_counts = age_counts.values.tolist()

    # Cases per (band, sex). observed=False is spelled out so that empty
    # bands are kept regardless of the pandas default; the reindex makes
    # sure both sex columns exist and rows line up with ``age_category``.
    age_gender_counts = (
        df.groupby(['Age Category', 'Gender'], observed=False)
        .size()
        .unstack(fill_value=0)
        .reindex(index=age_counts.index, columns=['女', '男'], fill_value=0)
    )
    female_counts = age_gender_counts['女'].tolist()
    male_counts = age_gender_counts['男'].tolist()

    print(age_category)
    print(female_counts)
    print(male_counts)

    c = (
        Line()
        .add_xaxis(age_category)
        .add_yaxis("女性患病人数", female_counts)
        .add_yaxis("男性患病总人数", male_counts)
        .add_yaxis("患病总人数", total_counts)
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(name="年龄阶段"),
            yaxis_opts=opts.AxisOpts(name="患病人数/人"),
            title_opts=opts.TitleOpts(
                title='脑卒中患者按年龄阶段统计人数分布趋势',
                pos_left="center",
                pos_bottom="bottom"
            )
        )
    )
    return c
#年份-患病 柱状图
def viewYearZhu(df):
# 提取年份
df['Year'] = pd.to_datetime(df['Time of incidence'], format='%Y-%m-%d').dt.year
# 按照年份统计人数
year_counts = df['Year'].value_counts().sort_index()
# 打印结果
print(year_counts)
# 获取类型和数量的列表
year= year_counts.index.tolist()
counts = year_counts.values.tolist()
# 打印最终结果
print(year)
print(counts)
# 可视化
# 创建柱状图对象
c = (
Bar()
.add_xaxis(year)
.