I am trying to build a recommender system to recommend electives to the new students based on their core courses and historical students' data (data contain both core courses and electives).
我正在尝试建立一个推荐系统,根据他们的核心课程和历史学生的数据(数据包含核心课程和选修课程)向新生推荐选修课程。
- I have the data as shown in this table:
我有如下表所示的数据:
- I generated a cross-table as shown in Table2 (with no order of Term_Code)
我生成了一个交叉表,如表2所示(没有Term_Code的顺序)
- I want to generate sequence data as shown in Table3 (the Course_Num:Grade combinations should be ordered by Term_Code).
我想生成如表3所示的序列数据(Course_Num:Grade的组合应按Term_Code的顺序排列)。
Any help is greatly appreciated. Thanks in advance!
任何帮助是极大的赞赏。提前致谢!
3 个解决方案
#1
1
It would probably be easier to start from Table 1 (df1 in the example below).
从表1开始可能更容易(下面的例子中的df1)
# Attach dplyr with library(): it stops with an error if the package is
# missing, whereas require() only warns and returns FALSE, deferring the
# failure to the first dplyr call.
library(dplyr)
# Toy version of Table 1: seven enrolment records, each field drawn at random
# (with replacement) from 3 terms, 3 students, 4 courses and grades A-D.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
# NOTE(review): the rows drawn for this seed may differ across R versions --
# confirm before comparing against any printed output.
set.seed(46)
df1 <- data.frame(
  Term_Code = sample(2001:2003, size = 7, replace = TRUE),
  Student_Num = sample(1:3, size = 7, replace = TRUE),
  Course_Num = sample(1000:1003, size = 7, replace = TRUE),
  Grade = sample(LETTERS[1:4], size = 7, replace = TRUE),
  stringsAsFactors = FALSE
)
# A tibble: 7 x 5
# Groups: Student_Num [3]
# Term_Code Student_Num Course_Num Grade Sequence
# <int> <int> <int> <chr> <chr>
#1 2001 2 1003 A 1003:A
#2 2001 3 1002 D 1002:D
#3 2002 3 1003 A 1003:A
#4 2002 1 1000 A 1000:A
#5 2001 1 1002 B 1002:B
#6 2002 2 1002 B 1002:B
#7 2003 1 1003 A 1003:A
# Build one "Course_Num:Grade, ..." string per student.
# The rows are sorted by Term_Code first: summarise() pastes the rows of each
# group in their current order, so without the sort the sequence follows the
# row order of df1 rather than the order in which the courses were taken.
df1 %>%
  arrange(Term_Code) %>%
  group_by(Student_Num) %>%
  summarise(Sequence = paste(Course_Num, Grade, sep = ":", collapse = ", "))
# Result for the seven rows listed above:
# A tibble: 3 x 2
# Student_Num Sequence
# <int> <chr>
#1 1 1002:B, 1000:A, 1003:A
#2 2 1003:A, 1002:B
#3 3 1002:D, 1003:A
#2
1
Using the tidyverse suite of packages:
使用tidyverse套件包:
library(tidyverse)
# The pipe operator (%>%) makes df1 the first argument of the next function.
# It lets us look at this "in order" not nested
# Long-format enrolment records: one row per (student, course) with the term
# in which the course was taken and the grade obtained.
# tibble() is used instead of data_frame(), which is deprecated in current
# tibble releases.
df1 <- tibble(
  term_code = c(200701, 200701, 200707, 200701, 200801, 200807, 200707, 200701),
  student_number = rep(1:3, c(4, 2, 2)),
  course_number = c(1000, 2200, 1100, 4200, 2000, 1100, 2000, 4100),
  grade = c("A", "B", "B-", "C", "A", "B", "C", "E")
)
# Collapse each student's records into one "course:grade, ..." string.
# The rows are sorted by term_code first so the sequence reflects the order in
# which the courses were taken; summarize() pastes rows in their current order.
df1 %>%
  arrange(term_code) %>%
  unite(Sequence, c(course_number, grade), sep = ":") %>%
  group_by(student_number) %>%
  summarize(Sequence = paste(Sequence, collapse = ", "))
If you aren't familiar with the pipe operator or the other functions I'm using, I would call this one piece at a time so you can see what it's doing (and it's all documented at https://www.tidyverse.org/). For example,
如果您不熟悉管道运算符或我使用的其他函数,建议一次只运行其中一步,这样就能看清每一步在做什么(所有内容均记录在 https://www.tidyverse.org/)。例如,
# First step of the pipeline on its own: merge course_number and grade into a
# single "course:grade" column named Sequence.
df1 %>%
  unite(col = "Sequence", course_number, grade, sep = ":")
#3
0
Using reshape2 and %>% operator from dplyr
使用reshape2以及dplyr中的%>%运算符
# Wide cross-table (Table 2): one row per student, one column per course,
# each cell holding the grade or an empty string.
# This is read.csv() with its defaults spelled out. Because the column names
# start with a digit, R turns them into X1000, X1100, ...
df <- read.table(
  text = "
Student_Num,1000,1100,2000,2200,4100,4200
1,A,B-,,B,,C
2,,B,A,,,
3,,,C,,E,
",
  header = TRUE,
  sep = ",",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = "",
  stringsAsFactors = FALSE
)
library(reshape2)
library(dplyr)
# Reshape the wide cross-table to long form and build one sequence per student.
# Courses appear in cross-table column order; this table has no Term_Code
# column, so the sequence cannot be ordered by term here.
melt(df, id.vars = "Student_Num", value.name = "Grade") %>%
  # read.csv() prefixed the numeric column names with "X". Strip only that
  # prefix rather than taking characters 2-5, so course numbers of any length
  # are kept intact.
  mutate(variable = sub("^X", "", variable)) %>%
  # Drop empty cells (no grade recorded for that student/course pair).
  filter(Grade != "") %>%
  group_by(Student_Num) %>%
  summarize(Sequence = paste0(variable, ":", Grade, collapse = ","))
# Student_Num Sequence
# <int> <chr>
# 1 1 1000:A,1100:B-,2200:B,4200:C
# 2 2 1100:B,2000:A
# 3 3 2000:C,4100:E
#1
1
It would probably be easier to start from Table 1 (df1 in the example below).
从表1开始可能更容易(下面的例子中的df1)
# Attach dplyr with library(): it stops with an error if the package is
# missing, whereas require() only warns and returns FALSE, deferring the
# failure to the first dplyr call.
library(dplyr)
# Toy version of Table 1: seven enrolment records, each field drawn at random
# (with replacement) from 3 terms, 3 students, 4 courses and grades A-D.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
# NOTE(review): the rows drawn for this seed may differ across R versions --
# confirm before comparing against any printed output.
set.seed(46)
df1 <- data.frame(
  Term_Code = sample(2001:2003, size = 7, replace = TRUE),
  Student_Num = sample(1:3, size = 7, replace = TRUE),
  Course_Num = sample(1000:1003, size = 7, replace = TRUE),
  Grade = sample(LETTERS[1:4], size = 7, replace = TRUE),
  stringsAsFactors = FALSE
)
# A tibble: 7 x 5
# Groups: Student_Num [3]
# Term_Code Student_Num Course_Num Grade Sequence
# <int> <int> <int> <chr> <chr>
#1 2001 2 1003 A 1003:A
#2 2001 3 1002 D 1002:D
#3 2002 3 1003 A 1003:A
#4 2002 1 1000 A 1000:A
#5 2001 1 1002 B 1002:B
#6 2002 2 1002 B 1002:B
#7 2003 1 1003 A 1003:A
# Build one "Course_Num:Grade, ..." string per student.
# The rows are sorted by Term_Code first: summarise() pastes the rows of each
# group in their current order, so without the sort the sequence follows the
# row order of df1 rather than the order in which the courses were taken.
df1 %>%
  arrange(Term_Code) %>%
  group_by(Student_Num) %>%
  summarise(Sequence = paste(Course_Num, Grade, sep = ":", collapse = ", "))
# Result for the seven rows listed above:
# A tibble: 3 x 2
# Student_Num Sequence
# <int> <chr>
#1 1 1002:B, 1000:A, 1003:A
#2 2 1003:A, 1002:B
#3 3 1002:D, 1003:A
#2
1
Using the tidyverse suite of packages:
使用tidyverse套件包:
library(tidyverse)
# The pipe operator (%>%) makes df1 the first argument of the next function.
# It lets us look at this "in order" not nested
# Long-format enrolment records: one row per (student, course) with the term
# in which the course was taken and the grade obtained.
# tibble() is used instead of data_frame(), which is deprecated in current
# tibble releases.
df1 <- tibble(
  term_code = c(200701, 200701, 200707, 200701, 200801, 200807, 200707, 200701),
  student_number = rep(1:3, c(4, 2, 2)),
  course_number = c(1000, 2200, 1100, 4200, 2000, 1100, 2000, 4100),
  grade = c("A", "B", "B-", "C", "A", "B", "C", "E")
)
# Collapse each student's records into one "course:grade, ..." string.
# The rows are sorted by term_code first so the sequence reflects the order in
# which the courses were taken; summarize() pastes rows in their current order.
df1 %>%
  arrange(term_code) %>%
  unite(Sequence, c(course_number, grade), sep = ":") %>%
  group_by(student_number) %>%
  summarize(Sequence = paste(Sequence, collapse = ", "))
If you aren't familiar with the pipe operator or the other functions I'm using, I would call this one piece at a time so you can see what it's doing (and it's all documented at https://www.tidyverse.org/). For example,
如果您不熟悉管道运算符或我使用的其他函数,建议一次只运行其中一步,这样就能看清每一步在做什么(所有内容均记录在 https://www.tidyverse.org/)。例如,
# First step of the pipeline on its own: merge course_number and grade into a
# single "course:grade" column named Sequence.
df1 %>%
  unite(col = "Sequence", course_number, grade, sep = ":")
#3
0
Using reshape2 and %>% operator from dplyr
使用reshape2以及dplyr中的%>%运算符
# Wide cross-table (Table 2): one row per student, one column per course,
# each cell holding the grade or an empty string.
# This is read.csv() with its defaults spelled out. Because the column names
# start with a digit, R turns them into X1000, X1100, ...
df <- read.table(
  text = "
Student_Num,1000,1100,2000,2200,4100,4200
1,A,B-,,B,,C
2,,B,A,,,
3,,,C,,E,
",
  header = TRUE,
  sep = ",",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = "",
  stringsAsFactors = FALSE
)
library(reshape2)
library(dplyr)
# Reshape the wide cross-table to long form and build one sequence per student.
# Courses appear in cross-table column order; this table has no Term_Code
# column, so the sequence cannot be ordered by term here.
melt(df, id.vars = "Student_Num", value.name = "Grade") %>%
  # read.csv() prefixed the numeric column names with "X". Strip only that
  # prefix rather than taking characters 2-5, so course numbers of any length
  # are kept intact.
  mutate(variable = sub("^X", "", variable)) %>%
  # Drop empty cells (no grade recorded for that student/course pair).
  filter(Grade != "") %>%
  group_by(Student_Num) %>%
  summarize(Sequence = paste0(variable, ":", Grade, collapse = ","))
# Student_Num Sequence
# <int> <chr>
# 1 1 1000:A,1100:B-,2200:B,4200:C
# 2 2 1100:B,2000:A
# 3 3 2000:C,4100:E