The basic idea is simple: ordinary least squares does the job.
Suppose the two models' predictions are blended with weights a and b. Fitting a and b by least squares on the held-out second portion of the data gives the weights that minimize the squared error. Concretely, if the two models' predictions on that split form the columns of a matrix A and the true ratings form a vector Y, the optimal weights are alpha = (A^T A)^(-1) A^T Y, which is just the generalized (pseudo-) inverse of A applied to Y. The RMSE of the blend improves on either single model, dropping to roughly 0.86.
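As a quick illustration, here is a minimal, self-contained sketch of the same blend on made-up toy ratings (the numbers are invented purely for illustration, not taken from the project's data). It uses np.linalg.lstsq, which applies the pseudo-inverse internally and is numerically more stable than explicitly inverting A^T A, as the project code further below does via the normal equations.

import numpy as np

# Toy predictions from two hypothetical models on five held-out ratings
pred_lfm = np.array([3.2, 4.1, 2.8, 3.9, 4.5])
pred_svd = np.array([3.0, 4.4, 2.5, 4.2, 4.8])
truth = np.array([3.0, 4.0, 3.0, 4.0, 5.0])

# Stack the two prediction vectors as the columns of A, so the blend is A @ [a, b]
A = np.column_stack([pred_lfm, pred_svd])

# Solve min ||A @ alpha - truth||^2 via the pseudo-inverse
alpha, *_ = np.linalg.lstsq(A, truth, rcond=None)
a, b = alpha
print('weights:', a, b)

# RMSE of the blended predictions on the toy data
blended = A @ alpha
rmse = np.sqrt(np.mean((blended - truth) ** 2))
print('toy RMSE:', rmse)

The actual merging code follows.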
import numpy as np
from code import read_file
from SVD import construct_matrix, get_svd_predict, svd_predict, get_mean
from PCA import get_train, get_pq, predict
def merge_models(index, mean=0):
    """Fit the blend weights (a, b) by least squares on the held-out split."""
    train = get_train(path=r'smaller_test.txt')
    p, q, bu, bi = get_pq(index=index)
    svd_predictions = get_svd_predict(index)
    A = np.zeros((len(train), 2))   # one row per user: [LFM prediction, SVD prediction]
    Y = np.zeros(len(train))        # the corresponding true ratings
    for user_id, iid, r in read_file(r'smaller_test.txt'):
        pr_lfm = predict(user_id, iid, p, q, bu, bi)
        pr_svd = svd_predict(user_id, iid, svd_predictions)
        pr_lfm = pr_lfm if pr_lfm else 3   # fall back to a neutral rating when the LFM cannot score the pair
        A[user_id - 1] = np.array([pr_lfm, pr_svd + mean])
        Y[user_id - 1] = r
    # Least squares via the normal equations: alpha = (A^T A)^{-1} A^T Y
    alpha_mat = np.linalg.inv(A.T @ A) @ A.T @ Y
    return alpha_mat
def REMS(index, mean=0):
    """Yield the blended predictions on test.txt, clipped to the valid rating range [1, 5]."""
    svd_predictions = get_svd_predict(index)
    p, q, bu, bi = get_pq(index=index)
    alpha1, alpha2 = merge_models(index, mean=mean)
    for uid, iid in read_file(r'test.txt', num=2):
        pr_lfm = predict(uid, iid, p, q, bu, bi)
        pr_svd = svd_predict(uid, iid, svd_predictions)
        if pr_lfm is None or pr_svd is None:
            continue   # skip pairs that either model cannot score
        pr = alpha1 * pr_lfm + alpha2 * (pr_svd + mean)
        # Clamp the blended prediction to the valid rating range
        if pr < 1:
            pr = 1.0
        if pr > 5:
            pr = 5.0
        yield pr
def write_ans(w_path, data):
    """Write one prediction per line, formatted to three decimal places."""
    with open(w_path, 'w') as file:
        for r in data:
            file.write('{0:.3f}\n'.format(r))
if __name__ == '__main__':
    train = get_train(path=r'smaller_test.txt')
    a, b, mean = get_mean(train)
    # Write the blended predictions for each latent-factor dimensionality
    write_ans('scores-5.txt', REMS(5, mean))
    write_ans('scores-20.txt', REMS(20, mean))
    write_ans('scores-50.txt', REMS(50, mean))