这是微博深度和广度预测的原始代码,写了大约半个月,第一个版本不是这样的,但是这个版本包含所有需要的功能。
模块化的程度也更高。找工作前一直想用python完美解决这个问题,后来发现自己的方法和硬件都有很大的局限。
算是我的第一次正儿八经的尝试在分布式计算的框架下,计算海量的数据。
意识到很多问题,影响我面试时候很多的代码风格。
def get_basic_info():
win_path = "E:/spark/weibo_predict/"
linux_path = "/home/jason/spark/weibo_predict/"
path = linux_path train_path = path + 'train/'
test_path = path + 'test/'
code_path = path + 'source_code/' print('\n训练准备文件保存路径px:%s' % train_path)
print('\n测试准备文件保存路径py:%s' % test_path)
print('\n代码准备文件保存路径pz:%s' % code_path) train_weibo_raw_path = path + "train_weibo_raw.txt"
train_weibo_repost_path = path + "train_weibo_repost_back.txt" test_weibo_raw_path = path + "test_weibo_raw.txt"
test_weibo_repost_path = path + "test_weibo_repost.txt" user_relations_path = path + "user_relations_back.txt" print("\n训练原始微博地址p1:%s" % train_weibo_raw_path)
print("训练转发微博地址p2:%s" % train_weibo_repost_path)
print("\n测试原始微博地址p3:%s" % test_weibo_raw_path)
print("测试转发微博地址p4:%s" % test_weibo_repost_path)
print("\n 用户关系地址p5:%s" % user_relations_path)
return train_path,test_path,code_path,train_weibo_raw_path,train_weibo_repost_path,test_weibo_raw_path,test_weibo_repost_path,user_relations_path
#传递 训练(原始微博,转发微博) 或者 测试(原始微博,转发微博)
#返回化简后的对应关系repost_id_line_time_reduce
#返回微博id对应的用户idwid_uid_rdd
from pyspark import SparkContext
def get_prime_rdd(train_or_test,sc, p1,p2,p3,p4):
if train_or_test == 'train':
inside_path_a = p1
inside_path_b = p2
elif train_or_test == 'test':
inside_path_a = p3
inside_path_b = p4
else:
print("only input train or test")
return 0,0 sc = sc
train_weibo_raw_data = sc.textFile(inside_path_a)
train_weibo_raw_data_count = train_weibo_raw_data.count()
train_weibo_raw_data_rdd = train_weibo_raw_data.map(lambda x: x.split("\001"))
w_id=train_weibo_raw_data_rdd.map(lambda x:x[0])
u_id=train_weibo_raw_data_rdd.map(lambda x:x[1])
wid_uid_rdd = w_id.zip(u_id) train_weibo_repost_data = sc.textFile(inside_path_b)
train_weibo_repost_data_count = train_weibo_repost_data.count()
train_weibo_repost_data_rdd = train_weibo_repost_data.map(lambda x: x.split("\001"))
repost_id = train_weibo_repost_data_rdd.map(lambda x: x[0])
repost_line_time = train_weibo_repost_data_rdd.map(lambda x: x[1:-1])
repost_id_line_time = repost_id.zip(repost_line_time)
repost_id_line_time_reduce = repost_id_line_time.groupByKey().mapValues(list) repost_id_line_time_reduce = repost_id_line_time_reduce.subtractByKey(repost_id_line_time_reduce.subtractByKey(wid_uid_rdd))
wid_uid_rdd = wid_uid_rdd.subtractByKey(wid_uid_rdd.subtractByKey(repost_id_line_time_reduce)) return repost_id_line_time_reduce,wid_uid_rdd
def get_uid_fnum_rdd(sc,p5):
sc = sc
user_relations_data = sc.textFile(p5)
user_relations_data_count = user_relations_data.count()
user_relations_data_rdd_1 = user_relations_data.map(lambda x: x.split("\t")[0])
user_relations_data_rdd_2 = user_relations_data.map(lambda x: x.split("\t")[1])
user_relations_data_rdd_user = user_relations_data_rdd_1
user_relations_data_rdd_fans = user_relations_data_rdd_2.map(lambda x: x.split("\x01"))
user_fans = user_relations_data_rdd_user.zip(user_relations_data_rdd_fans)
fans_nums = user_relations_data_rdd_fans.map(lambda s:len(s))
uid_fnum_rdd = user_fans.keys().zip(fans_nums)
return uid_fnum_rdd
##版本 2 分时间段计算指定时间段的转发量
def cal_times_j(list,j):
ct = 0
for i in range(len(list)):
#if int(list[i][-1]) >= j*900 and int(list[i][-1]) <= (j+1)*900:
#这里可以切换求累计的转发量还是区间的转发量
if int(list[i][-1]) <= (j)*900:
ct += 1
return ct
def cal_id_times_j(rdd,j):
times = rdd.values().map(lambda x: cal_times_j(x,j))
rdd = rdd.keys().zip(times)
return rdd def generate_times_file(rdd,k,path):
for j in range(k-1,k+1):
import csv
a_path = str(path) + 'wid_times/wid_times_'+str(j)+'.csv'
#print(path)
out_file_train_times_j = open(a_path,'w')
writer = csv.writer(out_file_train_times_j);
zhuanfa = cal_id_times_j(rdd,j+1)
for lists in zhuanfa.collect():
writer.writerow(lists)
out_file_train_times_j.close()
#计算深度
#定义函数,计算出指定阶段的,发生过的转发关系
def cal_during(list,j):
new_list=[]
for i in range(len(list)):
if int(list[i][-1]) <= j*900:
new_list.append(list[i])
return new_list #定义函数,计算一个rdd中,指定阶段,发生过的转发关系
def cal_rdd_during(rdd,j):
return rdd.map(lambda x: cal_during(x,j)) #定义函数,如果一个转发关系的尾部,是另外一个转发关系的头,那么久把这个头的尾部,加到这个转发关系的尾部
def add_deep(list):
kkk = len(list)
if kkk<=1:
return list
else:
for i in range(kkk):
for j in range(kkk):
if list[i][-1] == list[j][0]:
list[i].append(list[j][-1])
return list #定义函数返回序列中的数组的最长的值,作为最大的深度
def max_deep(list):
max=2
if len(list)==0:
return 0
else:
for i in range(len(list)):
max = (len(list[i]) if len(list[i])> max else max)
return max-1 #定义函数,取出其中的两列
def ti_qu(list):
for i in range(len(list)):
list[i] = list[i][:-1]
return list def cal_cal(all_in_one_rdd, j):
id_rdd = all_in_one_rdd.keys() #获取ID的RDD
line_time_rdd = all_in_one_rdd.values() #获取转发关系和转发时间对应的RDD
line_time_rdd_j = cal_rdd_during(line_time_rdd,j) #指定时间段,获取这个时间段发生过的转发和时间组成的RDD
line_rdd_j = line_time_rdd_j.map(lambda x : ti_qu(x))#提取转发关系
line_rdd_j_extend = line_rdd_j.map(lambda x: add_deep(x))#延长转发关系
line_rdd_j_extend_maxdeep = line_rdd_j_extend.map(lambda x:max_deep(x))#计算最大深度
id_deep_rdd_j = id_rdd.zip(line_rdd_j_extend_maxdeep)#组合微博ID与深度
return id_deep_rdd_j def generate_deeps_file(rdd,k,path):
import csv
for j in range(k-1,k+1):
b_path = str(path) + 'wid_deeps/wid_deeps_'+str(j)+'.csv'
#print(path)
out_file_train_deeps_j = open(b_path,'w')
writer = csv.writer(out_file_train_deeps_j);
shendu = cal_cal(rdd,j+1)
for lists in shendu.collect():
writer.writerow(lists)
out_file_train_deeps_j.close()
def get_wid_fnum_rdd(uid_fnum_rdd,wid_uid_rdd,path):
#print("用户和粉丝个数的对应关系,取出来一个看看:")
#print(uid_fnum_rdd.take(3))
#print(uid_fnum_rdd.count())
#print("\n训练原始约减微博的id和发送微博的人的id的对应rdd:")
#print(wid_uid_rdd.take(3))
#print(wid_uid_rdd.count())
uid_wid_rdd = wid_uid_rdd.values().zip(wid_uid_rdd.keys())
uid__wid_fnum = uid_wid_rdd.leftOuterJoin(uid_fnum_rdd)
wid_fnum_rdd = uid__wid_fnum.values().map(lambda x: x[0]).zip(uid__wid_fnum.values().map(lambda x: x[1]))
#print(wid_fnum_rdd.take(2))
#print(wid_fnum_rdd.count())
import csv
c_path = str(path) + 'wid_fnum_file.csv'
wid_fnum_file = open(c_path,"w")
writer = csv.writer(wid_fnum_file);
for lists in wid_fnum_rdd.collect():
writer.writerow(lists);
wid_fnum_file.close() return wid_fnum_rdd
#定义函数,将列表数组扁平化
def add_flat(list):
if list==None:
return 0
else:
kkk = len(list)
list0 = list[0]
for i in range(kkk):
if i==0:
pass
else:
list0 = list0.append(list[i])
return list0 #定义函数,计算覆盖用户数目
def clac_cover(list):
total_cover=0
for i in range(len(list)):
total_cover += cover_value(list[i])
return total_cover #定义函数,计算某个用户的粉丝数:
def cover_value(user):
'''
try:
return uid_fnum_dict[user]
except:
return 0
'''
for i in range(len(list_uid_fnum)):
if user == list_uid_fnum[i][0]:
return list_uid_fnum[i][1]
else:
return 0
def flatmapvalues(x):
return x def cal_sum(x):
sum = 0
if x==None and len(x)==0:
return sum
else:
for i in range(len(x)):
if x[i]== None:
pass
else:
sum += int(x[i])
return sum def fans_cover_till_j(all_in_one_rdd,j):
id_rdd = all_in_one_rdd.keys() #获取微博ID的RDD
line_time_rdd = all_in_one_rdd.values() #获取转发关系和转发时间对应的RDD
line_time_rdd_j = cal_rdd_during(line_time_rdd,j) #指定时间段,获取这个时间段发生过的转发和时间组成的RDD
#print("\n指定时间段,获取这个时间段发生过的转发和时间组成的RDD");print(line_time_rdd_j.first())
line_rdd_j = line_time_rdd_j.map(lambda x : ti_qu(x))#提取转发关系
#print("\n提取转发关系");print(line_rdd_j.first()) #line_rdd_j.flatMap(lambda x: re.sub(r'\D'," ",x).split())
#line_rdd_j_flat = line_rdd_j.map(lambda x: add_flat(x))#扁平化转发关系,不行
import re
line_rdd_j_flat = line_rdd_j.map(lambda x: re.sub(r'\D'," ",str(x)).split())#扁平化转发关系
#print("\n提取扁平化的转发关系");print(line_rdd_j_flat.first()) line_rdd_j_flat_disc = line_rdd_j_flat.map(lambda x:list(set(list(x)))) #扁平化之后约减重复的用户ID
#print("\n看看去重之后的转发用户");print(line_rdd_j_flat_disc.first()) fans_cover_rdd_j = id_rdd.zip(line_rdd_j_flat_disc)
#print("\n看看去重之后的微博ID和转发用户");print(fans_cover_rdd_j.first()) fans_cover_rdd_j = fans_cover_rdd_j.flatMapValues(flatmapvalues)
#print("\n看看去重之后的微博ID和转发用户,一对一flatmap之后");print(fans_cover_rdd_j.first()) fans_cover_rdd_j = fans_cover_rdd_j.values().zip(fans_cover_rdd_j.keys())
#print("\n翻转id和用户");print(fans_cover_rdd_j.first()) fans_cover_rdd_j = fans_cover_rdd_j.leftOuterJoin(uid_fnum_rdd).values()
#print("\n得到用户id_(微博ID,粉丝)");print(fans_cover_rdd_j.first())
#print(fans_cover_rdd_j.count()) fans_cover_rdd_j = fans_cover_rdd_j.map(lambda x: x[0]).zip(fans_cover_rdd_j.map(lambda x:x[1]))
#print("\n得微博id_粉丝");print(fans_cover_rdd_j.first())
#print(fans_cover_rdd_j.count()) fans_cover_rdd_j = fans_cover_rdd_j.groupByKey().mapValues(list)
#print("\n组合,");print(fans_cover_rdd_j.first())
#print(fans_cover_rdd_j.count()) fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
#print("\nmap求和");print(fans_cover_rdd_j.first()) #cover_rdd = line_rdd_j_flat_disc.map(lambda x: clac_cover(x))
#fans_cover_rdd_j = id_rdd.zip(cover_rdd)#组合微博ID与覆盖数目
#print(id_deep_rdd_j.first())
#return line_rdd_j_extend_maxdeep
temp_key_0 = all_in_one_rdd.keys().zip(all_in_one_rdd.values().map(lambda x: 0)) fans_cover_rdd_j = temp_key_0.leftOuterJoin(fans_cover_rdd_j)
fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x))) return fans_cover_rdd_j def generate_covers_file(rdd,k,path):
#按理说没问题
import csv
for j in range(k-1,k+1):
c_path = str(path) + 'wid_covers/wid_covers_'+str(j)+'.csv'
#print(c_path)
out_file_train_covers_j = open(c_path,'w')
writer = csv.writer(out_file_train_covers_j)
covers = fans_cover_till_j(rdd,j+1)
for lists in covers.collect():
writer.writerow(lists)
out_file_train_covers_j.close()
px,py,pz,p1,p2,p3,p4,p5 = get_basic_info()
uid_fnum_rdd = get_uid_fnum_rdd(sc,p5)
train_repost_id_line_time_reduce, train_wid_uid_rdd = get_prime_rdd('train',sc,p1,p2,p3,p4)
#wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd,train_wid_uid_rdd,px)
#generate_times_file(train_repost_id_line_time_reduce,292,px)
#generate_deeps_file(train_repost_id_line_time_reduce,292,px)
#generate_covers_file(train_repost_id_line_time_reduce,292,px) test_repost_id_line_time_reduce, test_wid_uid_rdd = get_prime_rdd('test',sc,p1,p2,p3,p4)
#test_wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd,test_wid_uid_rdd,py)
#generate_times_file(test_repost_id_line_time_reduce,16,py)
#generate_deeps_file(test_repost_id_line_time_reduce,16,py)
#generate_covers_file(test_repost_id_line_time_reduce,16,py)
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import SparseVector,DenseVector #获取用户ID和粉丝数的对比
def get_wid_fnum_rdd(path):
path = path+ 'wid_fnum_file'+'.csv'
wid_fnum_rdd = sc.textFile(path)
wid_fnum_rdd = wid_fnum_rdd.map(lambda x:x.split(","))
wid_fnum_rdd = wid_fnum_rdd.map(lambda x:x[0]).zip(wid_fnum_rdd.map(lambda x:x[1]))
wid_fnum_rdd = wid_fnum_rdd.sortByKey()
return wid_fnum_rdd def add_keys(rdd1):
rdd1 = rdd1
#path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(15)+'.txt'
#rdd1 = sc.textFile(path)
rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
rdd2 = rdd2.map(lambda x:x.split(',')[0]).zip(rdd2.map(lambda x:x.split(',')[1]))
rdd2 = rdd2.sortByKey()
rdd1 = rdd1.zipWithIndex()
rdd1 = rdd1.values().zip(rdd1.keys())
rdd2 = rdd2.keys().zipWithIndex()
rdd2 = rdd2.values().zip(rdd2.keys())
rdd = rdd2.join(rdd1)
rdd = rdd.values()
rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
return rdd #获取其他三个需要的参数
def get_wid_x(j,path,times_or_deeps_or_covers):
if times_or_deeps_or_covers == 'times':
if path == px:
path = str(path) + 'wid_times/wid_times_'+str(j)+'.csv'
elif path ==py:
if j>=0 and j<15:
path = str(path) + 'wid_times/wid_times_'+str(j)+'.csv'
elif j>=15 and j<=291:
path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(j)+'.txt'
rdd1 = sc.textFile(path)
rdd = add_keys(rdd1)
return rdd
elif times_or_deeps_or_covers == 'deeps':
if path == px:
path = str(path) + 'wid_deeps/wid_deeps_'+str(j)+'.csv'
elif path ==py:
if j>=0 and j<15:
path = str(path) + 'wid_deeps/wid_deeps_'+str(j)+'.csv'
elif j>=15 and j<=291:
path = '/home/jason/spark/weibo_predict/predicts/deeps_time_data_'+str(j)+'.txt'
rdd1 = sc.textFile(path)
rdd = add_keys(rdd1)
return rdd
elif times_or_deeps_or_covers == 'covers':
if path == px:
path = str(path) + 'wid_covers/wid_covers_'+str(j)+'.csv'
elif path ==py:
if j>=0 and j<15:
path = str(path) + 'wid_covers/wid_covers_'+str(j)+'.csv'
elif j>=15 and j<=291:
path = '/home/jason/spark/weibo_predict/predicts/covers_time_data_'+str(j)+'.txt'
rdd1 = sc.textFile(path)
rdd = add_keys(rdd1)
return rdd
else:
print('wrong input about times_or_deeps_or_covers')
return 0
rdd = sc.textFile(path)
rdd = rdd.map(lambda x:x.split(","))
rdd = rdd.map(lambda x:x[0]).zip(rdd.map(lambda x:x[1]))
rdd = rdd.sortByKey()
return rdd #将两个RDDjoin返回一个rdd的函数
def my_join(rdd1,rdd2):
import re
rdd = rdd1.join(rdd2).keys().zip(rdd1.join(rdd2).values().map(lambda x:re.sub(r'\D'," ",str(x)).split()))
return rdd #根据rdd的元素制作lib_svm格式文件
def lib_svm(x):
str1 = str(x[0] + ' ')
for i in range(len(x)):
if i == 0:
pass
else:
str1 += str(str(i) + ":" +str(x[i])+ ' ')
return str1 #生成测试或者训练需要的数据
def generate_train_or_test_data(path,j,times_or_deeps):
if times_or_deeps == 'times':
if path == px:
data_path = str(px) + 'train_data/times_train_data_'+str(j)+'.txt'
wid_times_rdd = get_wid_x(j+1,path,'times')
elif path == py:
data_path = str(py) + 'test_data/times_test_data_'+str(j)+'.txt'
wid_times_rdd = get_wid_x(j,path,'times')
#print(wid_times_rdd.count())
else:
return 0
wid_fnum_rdd = get_wid_fnum_rdd(path)
wid_deeps_rdd = get_wid_x(j,path,'deeps')
wid_covers_rdd = get_wid_x(j,path,'covers')
#wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
records = my_join(wid_times_rdd,wid_fnum_rdd)
records = my_join(records,wid_deeps_rdd)
records = my_join(records,wid_covers_rdd)
records = records.sortByKey()
#print('看看训练集合中的keys()的顺序-------------------------------------------')
#print(records.keys().take(10))
records = records.values()
data = records.map(lambda x:lib_svm(x))
open_data_path = open(data_path,'w')
for lines in data.collect():
open_data_path.write(lines)
open_data_path.write('\n')
elif times_or_deeps == 'deeps':
if path == px:
data_path = str(px) + 'train_data/deeps_train_data_'+str(j)+'.txt'
elif path == py:
data_path = str(py) + 'test_data/deeps_test_data_'+str(j)+'.txt'
else:
return 0
wid_fnum_rdd = get_wid_fnum_rdd(path)
if path == py:
wid_deeps_rdd = get_wid_x(j,path,'deeps')
else:
wid_deeps_rdd = get_wid_x(j+1,path,'deeps')
wid_times_rdd = get_wid_x(j,path,'times')
wid_deeps_rdd = get_wid_x(j,path,'deeps')
wid_covers_rdd = get_wid_x(j,path,'covers')
#wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
records = my_join(wid_deeps_rdd,wid_fnum_rdd)
records = my_join(records,wid_times_rdd)
records = my_join(records,wid_covers_rdd)
records = records.values()
data = records.map(lambda x:lib_svm(x))
open_data_path = open(data_path,'w')
for lines in data.collect():
open_data_path.write(lines)
open_data_path.write('\n')
open_data_path.close()
elif times_or_deeps == 'covers':
if path == px:
data_path = str(px) + 'train_data/covers_train_data_'+str(j)+'.txt'
elif path == py:
data_path = str(py) + 'test_data/covers_test_data_'+str(j)+'.txt'
else:
return 0
wid_fnum_rdd = get_wid_fnum_rdd(path)
if path == py:
wid_covers_rdd = get_wid_x(j,path,'covers')
else:
wid_covers_rdd = get_wid_x(j+1,path,'covers')
#wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
wid_times_rdd = get_wid_x(j,path,'times')
wid_deeps_rdd = get_wid_x(j,path,'deeps') records = my_join(wid_covers_rdd,wid_fnum_rdd)
records = my_join(records,wid_times_rdd)
records = my_join(records,wid_deeps_rdd)
records = records.values()
data = records.map(lambda x:lib_svm(x))
open_data_path = open(data_path,'w')
for lines in data.collect():
open_data_path.write(lines)
open_data_path.write('\n')
open_data_path.close()
else:
return 0 #生成指定时段的预测结果
def generate_test_predict(j,times_or_deeps):
if times_or_deeps == 'times':
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
tr_path = '/home/jason/spark/weibo_predict/train/train_data/'+'times_train_data_'+str(j)+'.txt'
te_path = '/home/jason/spark/weibo_predict/test/test_data/'+'times_test_data_'+str(j)+'.txt'
train_data = MLUtils.loadLibSVMFile(sc,tr_path)
test_data = MLUtils.loadLibSVMFile(sc,te_path)
model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='variance', maxDepth=4, maxBins=32,seed=42)
predictions = model.predict(test_data.map(lambda x: x.features))
pre_path = '/home/jason/spark/weibo_predict/predicts/'+'times_time_data_'+str(j+1)+'.txt'
times_predict = open(pre_path,'w')
for lines in predictions.collect():
times_predict.write(str(int(lines)))
times_predict.write('\n')
times_predict.close()
elif times_or_deeps == 'deeps':
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
tr_path = '/home/jason/spark/weibo_predict/train/train_data/'+'deeps_train_data_'+str(j)+'.txt'
te_path = '/home/jason/spark/weibo_predict/test/test_data/'+'deeps_test_data_'+str(j)+'.txt'
train_data = MLUtils.loadLibSVMFile(sc,tr_path)
test_data = MLUtils.loadLibSVMFile(sc,te_path)
model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='variance', maxDepth=4, maxBins=32,seed=42)
predictions = model.predict(test_data.map(lambda x: x.features))
pre_path = '/home/jason/spark/weibo_predict/predicts/'+'deeps_time_data_'+str(j+1)+'.txt'
times_predict = open(pre_path,'w')
for lines in predictions.collect():
times_predict.write(str(int(lines)))
times_predict.write('\n')
times_predict.close()
elif times_or_deeps == 'covers':
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
tr_path = '/home/jason/spark/weibo_predict/train/train_data/'+'covers_train_data_'+str(j)+'.txt'
te_path = '/home/jason/spark/weibo_predict/test/test_data/'+'covers_test_data_'+str(j)+'.txt'
train_data = MLUtils.loadLibSVMFile(sc,tr_path)
test_data = MLUtils.loadLibSVMFile(sc,te_path)
model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='variance', maxDepth=4, maxBins=32,seed=42)
predictions = model.predict(test_data.map(lambda x: x.features))
pre_path = '/home/jason/spark/weibo_predict/predicts/'+'covers_time_data_'+str(j+1)+'.txt'
times_predict = open(pre_path,'w')
for lines in predictions.collect():
times_predict.write(str(int(lines)))
times_predict.write('\n')
times_predict.close() def generate_test_data_beyond15(j):
path = '/home/jason/spark/weibo_predict/predicts/'+'time_data_'+str(j)+'.txt'
rdd2 = sc.textFile(path)
rdd1 = get_wid_fnum_rdd(py).keys()
rdd = rdd1.zip(rdd2)
return rdd
def add_keys(rdd1):
rdd1 = rdd1
#path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(15)+'.txt'
#rdd1 = sc.textFile(path)
rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
rdd2 = rdd2.map(lambda x:x.split(',')[0]).zip(rdd2.map(lambda x:x.split(',')[1]))
rdd2 = rdd2.sortByKey()
rdd1 = rdd1.zipWithIndex()
rdd1 = rdd1.values().zip(rdd1.keys())
rdd2 = rdd2.keys().zipWithIndex()
rdd2 = rdd2.values().zip(rdd2.keys())
rdd = rdd2.join(rdd1)
rdd = rdd.values()
rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
return rdd
for i in range(15):
generate_train_or_test_data(px,i,'times')
generate_train_or_test_data(py,i,'times')
generate_test_predict(i,'times')
generate_train_or_test_data(px,i,'deeps')
generate_train_or_test_data(py,i,'deeps')
generate_test_predict(i,'deeps')
generate_train_or_test_data(px,i,'covers')
generate_train_or_test_data(py,i,'covers')
generate_test_predict(i,'covers')
for i in range(15,292):
print(i)
generate_train_or_test_data(px,i,'times')
generate_train_or_test_data(py,i,'times')
generate_test_predict(i,'times')
generate_train_or_test_data(px,i,'deeps')
generate_train_or_test_data(py,i,'deeps')
generate_test_predict(i,'deeps')
generate_train_or_test_data(px,i,'covers')
generate_train_or_test_data(py,i,'covers')
generate_test_predict(i,'covers')
generate_train_or_test_data(px,291,'times')
generate_train_or_test_data(py,291,'times')
generate_test_predict(291,'times')
generate_train_or_test_data(px,291,'deeps')
generate_train_or_test_data(py,291,'deeps')
generate_test_predict(291,'deeps')
generate_train_or_test_data(px,291,'covers')
generate_train_or_test_data(py,291,'covers')
generate_test_predict(291,'covers')
#组团搞出来最后的文件 rdd1 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(1)+'.txt')
rdd1 = add_keys(rdd1)
for j in range(4,292):
j = j+1
if j==1:
pass
else:
rdd2 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(j)+'.txt')
rdd2 = add_keys(rdd2)
rdd1 = my_join(rdd1,rdd2) for j in range(4,292):
j=j+1
rdd3 = sc.textFile('/home/jason/spark/weibo_predict/predicts/deeps_time_data_'+str(j)+'.txt')
rdd3 = add_keys(rdd3)
rdd1 = my_join(rdd1,rdd3) def add_head(x):
str1 = 'testWeibo'
str1 = str1+str(x)
return str1 import re
rdd1 = rdd1.map(lambda x: re.sub(r'\D'," ",str(x)).split())
rdd1 = rdd1.sortBy(lambda x: int(x[0]))
rdd1 = rdd1.map(lambda x:x[0]).zip(rdd1.map(lambda x:x[1:]))
rdd1_key = rdd1.keys().map(lambda x:add_head(x))
rdd1 = rdd1_key.zip(rdd1.values())
rdd1 = rdd1.map(lambda x: re.sub(r'\D'," ",str(x)).split()) import csv
path = '/home/jason/spark/weibo_predict/'
end_path = str(path) + 'end_of_end.csv'
end_f = open(end_path,'w')
writer = csv.writer(end_f)
for lists in rdd1.collect():
writer.writerow(lists)
end_f.close()
a=','
s1 = ['scaleT'+str((i+1)*15) for i in range(4,292)]
s1 = a.join(s1)
s2 = ['depthT'+str((i+1)*15) for i in range(4,292)]
s2 = a.join(s2)
s3 = 'WeiboID (Time Unit: Minutes)'+a+s1+s2
#print(s3)
end_path_2 = '/home/jason/spark/weibo_predict/end_of_end.csv'
end_path_1 = '/home/jason/spark/weibo_predict/end_of_end_.csv'
rdd = sc.textFile(end_path_2)
rdd = rdd.map(lambda x:add_head(x))
end_ff = open(end_path_1,'w')
end_ff.write(s3)
end_ff.write('\n')
for lists in rdd.collect():
end_ff.write(lists)
end_ff.write('\n')
end_ff.close()