2012年国赛高教杯数学建模C题脑卒中发病环境因素分析及干预解题全过程文档及程序

时间:2024-10-15 11:12:04
import pandas as pd from pyecharts.charts import Page #页面 from pyecharts.charts import Bar #柱状图 from pyecharts.charts import Pie #饼状图 from pyecharts.charts import Line #折线图 from pyecharts.charts import Scatter #散点图 from pyecharts import options as opts # 定义日期转换函数 def convert_date(date_str): try: return pd.to_datetime(date_str, errors='coerce').strftime('%Y-%m-%d') except: return '' def analyze(): df1 = pd.read_excel('Appendix-C1/data1.xls', index_col=0) df1 = df1.reset_index() df2 = pd.read_excel('Appendix-C1/data2.xls', index_col=0) df2 = df2.reset_index() df3 = pd.read_excel('Appendix-C1/data3.xls', index_col=0) df3 = df3.reset_index() df4 = pd.read_excel('Appendix-C1/data4.xls', index_col=0) df4 = df4.reset_index() df = df1.append(df2, ignore_index=True) df = df.append(df3, ignore_index=True) df = df.append(df4, ignore_index=True) print("去除脏数据前总数据量:" + str(df.shape[0])) target_column = "Sex" df_cleanNullSex = df.dropna(subset=[target_column]) print("去除Sex数据为空的数据量:" + str(df_cleanNullSex.shape[0])) df_Sex = df_cleanNullSex[df_cleanNullSex[target_column].isin([1.0, 2.0])] # 筛选值为 1 或 2 的行 print("去除Sex脏数据的数据量:" + str(df_Sex.shape[0])) target_column = "Age" # 将列转换为数值类型,非数字值会被置为 NaN df_cleanNullSex = df_Sex df_cleanNullSex[target_column] = pd.to_numeric(df_cleanNullSex[target_column], errors='coerce') # 筛选值在 0 到 110 范围内的行,并删除空值和乱码 df_cleanSex = df_cleanNullSex.dropna(subset=[target_column]) # 删除空值 df_cleanSex = df_cleanSex[ (df_cleanSex[target_column] >= 0) & (df_cleanSex[target_column] <= 110)] # 筛选值在 0 到 110 范围内的行 print("去除Age数据为空的数据量:" + str(df_cleanSex.shape[0])) df = df_cleanSex # 将Occupation列的空值替换为 9 target_column = "Occupation" # 替换为目标列的名称 df[target_column] = df[target_column].fillna(9) # 将日期转换为日期时间类型并统一格式Report time df['Time of incidence'] = df['Time of incidence'].apply(convert_date) df['Report time'] = df['Report time'].apply(convert_date) # 将日期列转换为 datetime 类型 df['Time of incidence'] = pd.to_datetime(df['Time of incidence'], format='%Y-%m-%d') # 筛选出2007-2010年之前的数据 df = df[(df['Time of incidence'].dt.year > 2006) & (df['Time of incidence'].dt.year < 2011)].copy() df = df[df['Time of incidence'].dt.month < 13].copy() df = df[df['Time of incidence'].dt.month > 0].copy() df = df[df['Time of incidence'].dt.day < 32].copy() df_filtered = df[df['Time of incidence'].dt.day > 0].copy() # 提取日期的年月日部分 df_filtered['Time of incidence'] = df_filtered['Time of incidence'].dt.strftime('%Y-%m-%d') #print(df_filtered) df_filtered.to_excel('Appendix-C1/clean_data.xlsx', engine='openpyxl') #print(df_filtered.shape[0]) #性别-患病 饼状图 def viewSexPie(df): # 定义性别映射字典 gender_mapping = {1: '男', 2: '女'} # 统计患病人数并添加到结果列表 gender_counts = df['Sex'].map(gender_mapping).value_counts() # 获取类型和数量的列表 gender = gender_counts.index.tolist() counts = gender_counts.values.tolist() # 打印最终结果 print(gender) print(counts) # 可视化 # 创建柱状图对象 sum = 0 for sl in counts: data = float(sl) sum += data print(sum) counts_list = [] for sl in counts: data1 = float(sl) / sum reslut = round(data1*100,2) counts_list.append(reslut) # print(counts_list) c = Pie() # 饼状图 # 设置圆环的粗细和大小,同时将年份和数量作为标签添加到饼状图中 c.add( "", [list(z) for z in zip(gender, counts_list)], label_opts=opts.LabelOpts(formatter="{b}: {c}") ) # 设置标题和单位 c.set_global_opts( title_opts=opts.TitleOpts( title="脑卒中患者按性别统计人数分布比例", pos_left="center", # 标题居中 pos_bottom="bottom" ), graphic_opts=[ opts.GraphicText( graphic_item=opts.GraphicItem( left="82%", top="10%", z=100 ), graphic_textstyle_opts=opts.GraphicTextStyleOpts( text="单位:%", # 添加额外的文本 font="13px SimSun" ), )] ) return c #性别-患病-年份 柱状图 def viewSexYearZhu(df): # 定义性别映射字典 gender_mapping = {1: '男', 2: '女'} df['Gender'] = df['Sex'].map(gender_mapping) # 提取年份 df['Year'] = pd.to_datetime(df['Time of incidence'], format='%Y-%m-%d').dt.year # 按照年份统计人数 year_counts = df['Year'].value_counts().sort_index() counts = year_counts.values.tolist() # 分组并统计每个年龄段的男性和女性患病人数 year_gender_counts = df.groupby(['Year', 'Gender']).size().unstack() # 获取类型和数量的列表 # 生成year数据 year = year_counts.index.tolist() # 提取female_counts数据 female_counts = year_gender_counts['女'].tolist() # 生成male_counts数据 male_counts = year_gender_counts['男'].tolist() # 打印最终结果 print(year) print(female_counts) print(male_counts) # 可视化 # 创建柱状图对象 c = ( Bar() .add_xaxis(year) .add_yaxis("女性患病人数", female_counts) .add_yaxis("男性患病总人数", male_counts) .add_yaxis("患病总人数", counts) .set_global_opts( title_opts=opts.TitleOpts(title="脑卒中患者按年份统计人数分布", pos_left="center", pos_bottom="bottom" ), xaxis_opts=opts.AxisOpts(name="年份/年"), yaxis_opts=opts.AxisOpts(name="患病人数/人") ) ) return c #职业-患病 饼状图 def viewOccupationPie(df): # 定义职业代码和职业名称的映射字典 occupation_mapping = { 1: '农民', 2: '工人', 3: '退休人员', 4: '教师', 5: '渔民', 6: '医务人员', 7: '职工', 8: '离退人员', 9: '其他或缺失', } # 将职业代码映射为实际的职业名称 df['Occupation'] = df['Occupation'].map(occupation_mapping) # 统计各个职业的人数 occupation_counts = df['Occupation'].value_counts() #print(occupation_counts) # 获取类型和数量的列表 occupation = occupation_counts.index.tolist() counts = occupation_counts.values.tolist() # 打印最终结果 print(occupation) print(counts) # 可视化 sum = 0 for sl in counts: data = float(sl) sum += data # print(sum) counts_list = [] for sl in counts: data1 = float(sl) / sum reslut = round(data1 * 100, 2) counts_list.append(reslut) # print(counts_list) c = Pie() # 饼状图 # 设置圆环的粗细和大小,同时将年份和数量作为标签添加到饼状图中 c.add( "", [list(z) for z in zip(occupation, counts_list)], label_opts=opts.LabelOpts(formatter="{b}: {c}") ) # 设置标题和单位 c.set_global_opts( title_opts=opts.TitleOpts( title="脑卒中患者按职业统计人数分布比例", pos_left="center", # 标题居中 pos_bottom="bottom" ), graphic_opts=[ opts.GraphicText( graphic_item=opts.GraphicItem( left="82%", top="10%", z=100 ), graphic_textstyle_opts=opts.GraphicTextStyleOpts( text="单位:%", # 添加额外的文本 font="13px SimSun" ), )] ) # 可视化 # 创建柱状图对象 c1 = ( Bar() .add_xaxis(occupation) .add_yaxis("患病人数", counts) .set_global_opts( title_opts=opts.TitleOpts(title="脑卒中患者按职业统计人数分布", pos_left="center", pos_bottom="bottom" ), xaxis_opts=opts.AxisOpts(name="职业"), yaxis_opts=opts.AxisOpts(name="患病人数/人") ) ) return c1 return c #年龄-患病 柱状图 def viewAgeZhu(df): # 将年龄列转换为整数类型 df['Age'] = df['Age'].astype(int) # 打印结果 #print(df['Age']) # 定义年龄段区间 age_bins = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110] labels = [ '1–10', '11–20', '21–30', '31–40', '41–50', '51–60', '61–70', '71–80', '81–90', '91–100', '101–110' ] # 将年龄数据分配到对应的年龄段 df['Age Category'] = pd.cut(df['Age'], bins=age_bins, labels=labels) #print(df['Age Category']) # 统计每个年龄段的患病人数 age_counts = df['Age Category'].value_counts().sort_index() #print(age_counts) # 获取类型和数量的列表 age = age_counts.index.tolist() counts = age_counts.values.tolist() # 打印最终结果 print(age) print(counts) # 可视化 # 创建柱状图对象 c = ( Bar() .add_xaxis(age) .add_yaxis("患病人数", counts) .set_global_opts( title_opts=opts.TitleOpts(title="脑卒中患者按年龄阶段统计人数分布", pos_left="center", pos_bottom="bottom" ), xaxis_opts=opts.AxisOpts(name="年龄阶段"), yaxis_opts=opts.AxisOpts(name="患病人数/人") ) ) return c #年龄-性别-患病 折线图 def viewSexAgeLine(df): # 定义性别映射字典 gender_mapping = {1: '男', 2: '女'} df['Gender'] = df['Sex'].map(gender_mapping) # 将年龄列转换为整数类型 df['Age'] = df['Age'].astype(int) # 定义年龄段区间 age_bins = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110] labels = [ '1–10', '11–20', '21–30', '31–40', '41–50', '51–60', '61–70', '71–80', '81–90', '91–100', '101–110' ] # 将年龄数据分配到对应的年龄段 df['Age Category'] = pd.cut(df['Age'], bins=age_bins, labels=labels) # 统计每个年龄段的患病人数 age_counts = df['Age Category'].value_counts().sort_index() # 分组并统计每个年龄段的男性和女性患病人数 age_gender_counts = df.groupby(['Age Category', 'Gender']).size().unstack() # 获取类型和数量的列表 # 生成age_category数据 age_category = age_counts.index.tolist() # # 提取female_counts数据 female_counts = age_gender_counts['女'].tolist() # # 生成male_counts数据 male_counts = age_gender_counts['男'].tolist() # 打印最终结果 print(age_category) print(female_counts) print(male_counts) # 可视化 c = ( Line() .add_xaxis(age_category) .add_yaxis("女性患病人数", female_counts) .add_yaxis("男性患病总人数", male_counts) .add_yaxis("患病总人数", age_counts) .set_global_opts(xaxis_opts=opts.AxisOpts(name="年龄阶段"), yaxis_opts=opts.AxisOpts(name="患病人数/人"), title_opts=opts.TitleOpts( title='脑卒中患者按年龄阶段统计人数分布趋势', pos_left="center", pos_bottom="bottom" )) ) return c #年份-患病 柱状图 def viewYearZhu(df): # 提取年份 df['Year'] = pd.to_datetime(df['Time of incidence'], format='%Y-%m-%d').dt.year # 按照年份统计人数 year_counts = df['Year'].value_counts().sort_index() # 打印结果 print(year_counts) # 获取类型和数量的列表 year= year_counts.index.tolist() counts = year_counts.values.tolist() # 打印最终结果 print(year) print(counts) # 可视化 # 创建柱状图对象 c = ( Bar() .add_xaxis(year) .