I have a dataframe with around 1.5 million rows and 5 cols. One variable (VARIABLE) is of this type NATIONALITY_YEAR (e.g. SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. I have tried with concat.split which should be the easiest way:
我有一个大约150万行的dataframe和5个cols。一个变量(变量)是这种类型的NATIONALITY_YEAR(例如SPAIN_1998),我想将它分为两列,一列包含国籍,这是下划线前名称的左边,一列包含年份,下划线的右边。我已经试过了。最简单的方法是:
aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = F)
but after 2 hours running it did not produce any output. I am not sure if I should leave it running for a longer period of time or if there is a non time consuming way to do this.
但在运行2小时后,它没有产生任何输出。我不确定我是否应该让它运行更长一段时间,或者是否有一种非耗时的方法来做这件事。
Any help on the issue would be very much appreciated!
如果在这个问题上有任何帮助,我们将非常感激!
Here is a reproducible (subset!) sample:
这是一个可复制的(子集!)样本:
mydata<- structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L,
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L,
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L,
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L,
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L,
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L,
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L,
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L,
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L,
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L,
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L,
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L,
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L,
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L,
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L,
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L,
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L,
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L,
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L,
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L,
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L,
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L,
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L,
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L,
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L,
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L,
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L,
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L,
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24",
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"),
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L,
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L,
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L,
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L,
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L,
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L,
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L,
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L,
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L,
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L,
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L,
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L,
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L,
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L,
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L,
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L,
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L,
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L,
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L,
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L,
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L,
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L,
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L,
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L,
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998",
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998",
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999",
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999",
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000",
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000",
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001",
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001",
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002",
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003",
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003",
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004",
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004",
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005",
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005",
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006",
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006",
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007",
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008",
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008",
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009",
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009",
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010",
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010",
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011",
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011",
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012",
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998",
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002",
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006",
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010",
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999",
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003",
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007",
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011",
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001",
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006",
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011",
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943,
59, 201, 188, 10859, 93,
1494, 60, 1001, 1000, 689, 675, 934, 51,
1240, 165, 13, 0, 14, 2, 2,
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0,
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0,
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5,
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839,
4303, 100, 1727, 663, 694, 1210, 623,
1261, 772, 697, 490, 1031, 490, 956, 704,
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528,
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3,
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0,
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK",
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L,
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L,
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L,
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L,
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L,
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L,
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L,
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L,
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L,
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L,
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L,
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L,
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L,
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L,
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L,
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L,
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L,
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L,
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L,
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L,
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L,
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L,
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L,
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L,
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L,
99001L, 99501L), class = "data.frame")
3 个解决方案
#1
7
Try this instead:
试试这个:
library(data.table)
dt = data.table(mydata)
dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
YEAR = sub('(.*)_(.*)', '\\2', VARIABLE))]
#2
6
It seems like I need to look into updating my concat.split
functions!
看来我需要更新我的计算机了。分割函数!
The version of the function that you tried to use makes use of read.table
, which does tend to struggle with large datasets. I had used read.table
because it has a convenient text
argument that lets you specify a column in a data.frame
as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)
您尝试使用的函数的版本使用了read。表,它倾向于处理大型数据集。我有阅读使用。表,因为它有一个方便的文本参数,可以让您在data.frame中指定一个列作为输入。当处理小型数据集时,这确实很方便,但对于较大的数据集显然不方便:)
As far as I can tell, fread
from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split
with fread
instead of read.table
.
据我所知,fread来自“数据”。表“包没有类似的特性,但是由于R倾向于很快地编写文件,我认为应该尝试类似于concat中的方法。使用fread而不是read.table。
Here's the concept:
这里的概念:
- Write the variable that needs to be split to a new file.
- 将需要分割的变量写入一个新文件。
- Use the blazing fast
fread
to read it back in. - 用快速的广告把它读回来。
- Wait for
fread
to get atext
argument somewhere down the line? - 等fread得到一个文本参数?
Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):
这是作为函数的概念(根据@eddi在评论中的建议进行编辑):
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]
if (!is.data.table(dataset)) dataset <- data.table(dataset)
if (sep == ".") {
dataset[, (splitcol) := gsub(".", "|", get(splitcol), fixed = TRUE)]
sep <- "|"
}
if (!is.character(dataset[[splitcol]])) {
dataset[, (splitcol) := as.character(get(splitcol))]
}
x <- tempfile()
writeLines(dataset[[splitcol]], x)
Split <- fread(x, sep=sep, header = FALSE)
setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))
if (isTRUE(drop)) dataset[, (splitcol) := NULL]
cbind(dataset, Split)
}
Here's the function in action:
这里是函数的作用:
## Expand your sample data to 1.5 million rows to test
out <- mydata[rep(rownames(mydata), 1500000/nrow(mydata)), ]
csDataTable(out, "VARIABLE", "_")
# PROVINCE AGE5 ZONA91OK VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
# 1: 1 10-14 101 SPAIN_1998 614 SPAIN 1998
# 2: 4 30-34 4079 SPAIN_1998 1943 SPAIN 1998
# 3: 7 50-54 712 SPAIN_1998 59 SPAIN 1998
# 4: 8 40-44 8205 SPAIN_1998 201 SPAIN 1998
# 5: 11 35-39 11022 SPAIN_1998 188 SPAIN 1998
# ---
# 1499996: 44 35-39 4401 ROE_1999 0 ROE 1999
# 1499997: 46 35-39 4621 ROE_1999 0 ROE 1999
# 1499998: 49 10-14 490499 ROE_1999 0 ROE 1999
# 1499999: 3 30-34 3059 MAGREB_1999 5 MAGREB 1999
# 1500000: 6 40-44 6153 MAGREB_1999 2 MAGREB 1999
In this test, at least, the solution fares much better than I expected:
至少在这个测试中,解决方案比我预期的要好得多:
subFun <- function() {
dt = data.table(out)
dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
YEAR = sub('(.*)_(.*)', '\\2', VARIABLE))]
}
freadFun <- function() {
csDataTable(out, "VARIABLE", "_")
}
library(microbenchmark)
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# subFun() 3.814174 4.244820 4.273834 4.345358 4.480520 20
# freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886 20
#3
3
Here is some solution with splitting factor labels
这里有一些分解因子标签的解决方案
VARIABLE_LEVELS <- cbind("VARIABLE"=levels(mydata$VARIABLE),
as.data.frame(do.call("rbind",
strsplit(levels(mydata$VARIABLE), split="_")))
mydata <- merge(mydata, VARIABLE_LEVELS)
#
# Insted of merege you can use VARIABLE (in mydata) as index
#
mydata <- cbind(mydata, VARIABLE_LEVELS[as.integer(mydata$VARIABLE),c("V1","V2")])
#1
7
Try this instead:
试试这个:
library(data.table)
dt = data.table(mydata)
dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
YEAR = sub('(.*)_(.*)', '\\2', VARIABLE))]
#2
6
It seems like I need to look into updating my concat.split
functions!
看来我需要更新我的计算机了。分割函数!
The version of the function that you tried to use makes use of read.table
, which does tend to struggle with large datasets. I had used read.table
because it has a convenient text
argument that lets you specify a column in a data.frame
as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)
您尝试使用的函数的版本使用了read。表,它倾向于处理大型数据集。我有阅读使用。表,因为它有一个方便的文本参数,可以让您在data.frame中指定一个列作为输入。当处理小型数据集时,这确实很方便,但对于较大的数据集显然不方便:)
As far as I can tell, fread
from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split
with fread
instead of read.table
.
据我所知,fread来自“数据”。表“包没有类似的特性,但是由于R倾向于很快地编写文件,我认为应该尝试类似于concat中的方法。使用fread而不是read.table。
Here's the concept:
这里的概念:
- Write the variable that needs to be split to a new file.
- 将需要分割的变量写入一个新文件。
- Use the blazing fast
fread
to read it back in. - 用快速的广告把它读回来。
- Wait for
fread
to get atext
argument somewhere down the line? - 等fread得到一个文本参数?
Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):
这是作为函数的概念(根据@eddi在评论中的建议进行编辑):
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]
if (!is.data.table(dataset)) dataset <- data.table(dataset)
if (sep == ".") {
dataset[, (splitcol) := gsub(".", "|", get(splitcol), fixed = TRUE)]
sep <- "|"
}
if (!is.character(dataset[[splitcol]])) {
dataset[, (splitcol) := as.character(get(splitcol))]
}
x <- tempfile()
writeLines(dataset[[splitcol]], x)
Split <- fread(x, sep=sep, header = FALSE)
setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))
if (isTRUE(drop)) dataset[, (splitcol) := NULL]
cbind(dataset, Split)
}
Here's the function in action:
这里是函数的作用:
## Expand your sample data to 1.5 million rows to test
out <- mydata[rep(rownames(mydata), 1500000/nrow(mydata)), ]
csDataTable(out, "VARIABLE", "_")
# PROVINCE AGE5 ZONA91OK VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
# 1: 1 10-14 101 SPAIN_1998 614 SPAIN 1998
# 2: 4 30-34 4079 SPAIN_1998 1943 SPAIN 1998
# 3: 7 50-54 712 SPAIN_1998 59 SPAIN 1998
# 4: 8 40-44 8205 SPAIN_1998 201 SPAIN 1998
# 5: 11 35-39 11022 SPAIN_1998 188 SPAIN 1998
# ---
# 1499996: 44 35-39 4401 ROE_1999 0 ROE 1999
# 1499997: 46 35-39 4621 ROE_1999 0 ROE 1999
# 1499998: 49 10-14 490499 ROE_1999 0 ROE 1999
# 1499999: 3 30-34 3059 MAGREB_1999 5 MAGREB 1999
# 1500000: 6 40-44 6153 MAGREB_1999 2 MAGREB 1999
In this test, at least, the solution fares much better than I expected:
至少在这个测试中,解决方案比我预期的要好得多:
subFun <- function() {
dt = data.table(out)
dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
YEAR = sub('(.*)_(.*)', '\\2', VARIABLE))]
}
freadFun <- function() {
csDataTable(out, "VARIABLE", "_")
}
library(microbenchmark)
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
# expr min lq median uq max neval
# subFun() 3.814174 4.244820 4.273834 4.345358 4.480520 20
# freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886 20
#3
3
Here is some solution with splitting factor labels
这里有一些分解因子标签的解决方案
VARIABLE_LEVELS <- cbind("VARIABLE"=levels(mydata$VARIABLE),
as.data.frame(do.call("rbind",
strsplit(levels(mydata$VARIABLE), split="_")))
mydata <- merge(mydata, VARIABLE_LEVELS)
#
# Insted of merege you can use VARIABLE (in mydata) as index
#
mydata <- cbind(mydata, VARIABLE_LEVELS[as.integer(mydata$VARIABLE),c("V1","V2")])