frame创建新变量

I have a dataframe with around 1.5 million rows and 5 cols. One variable (VARIABLE) is of this type NATIONALITY_YEAR (e.g. SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. I have tried with concat.split which should be the easiest way:

我有一个大约150万行、5列的数据框。其中一个变量(VARIABLE)的形式为NATIONALITY_YEAR(例如SPAIN_1998)，我想将它拆分为两列：一列包含国籍，即下划线左边的部分；另一列包含年份，即下划线右边的部分。我已经尝试过concat.split，这应该是最简单的方法:

# Spell out FALSE: `F` is an ordinary variable that can be reassigned.
aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = FALSE)

but after 2 hours running it did not produce any output. I am not sure if I should leave it running for a longer period of time or if there is a non time consuming way to do this.

但在运行2小时后，它没有产生任何输出。我不确定我是否应该让它运行更长一段时间，或者是否有一种非耗时的方法来做这件事。

Any help on the issue would be very much appreciated!

如果在这个问题上有任何帮助，我们将非常感激!

Here is a reproducible (subset!) sample:

这是一个可复制的(子集!)样本:

mydata<-  structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L, 
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L, 
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L, 
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L, 
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L, 
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L, 
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L, 
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L, 
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L, 
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L, 
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L, 
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L, 
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L, 
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L, 
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L, 
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L, 
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L, 
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L, 
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L, 
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L, 
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L, 
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L, 
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L, 
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L, 
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L, 
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L, 
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L, 
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L, 
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24", 
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"), 
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L, 
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L, 
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L, 
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L, 
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L, 
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L, 
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L, 
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L, 
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L, 
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L, 
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L, 
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L, 
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L, 
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L, 
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L, 
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L, 
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L, 
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L, 
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L, 
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L, 
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L, 
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L, 
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L, 
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L, 
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998", 
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998", 
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999", 
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999", 
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000", 
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000", 
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001", 
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001", 
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002", 
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003", 
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003", 
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004", 
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004", 
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005", 
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005", 
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006", 
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006", 
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007", 
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008", 
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008", 
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009", 
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009", 
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010", 
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010", 
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011", 
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011", 
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012", 
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998", 
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002", 
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006", 
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010", 
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999", 
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003", 
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007", 
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011", 
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001", 
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006", 
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011", 
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943, 
59, 201, 188, 10859, 93, 
1494, 60, 1001, 1000, 689, 675, 934, 51, 
1240, 165, 13, 0, 14, 2, 2, 
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0, 
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5, 
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0, 
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839, 
4303, 100, 1727, 663, 694, 1210, 623, 
1261, 772, 697, 490, 1031, 490, 956, 704, 
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528, 
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3, 
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0, 
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK", 
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L, 
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L, 
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L, 
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L, 
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L, 
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L, 
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L, 
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L, 
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L, 
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L, 
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L, 
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L, 
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L, 
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L, 
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L, 
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L, 
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L, 
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L, 
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L, 
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L, 
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L, 
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L, 
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L, 
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L, 
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L, 
99001L, 99501L), class = "data.frame")

3 个解决方案

#1

Try this instead:

试试这个:

library(data.table)
# Build a data.table from the data.frame, then add both pieces of VARIABLE
# as new columns in a single by-reference assignment.
dt <- data.table(mydata)

dt[, c("NATIONALITY", "YEAR") := list(
  sub('(.*)_(.*)', '\\1', VARIABLE),  # text before the last underscore
  sub('(.*)_(.*)', '\\2', VARIABLE)   # text after the last underscore
)]

#2

It seems like I need to look into updating my concat.split functions!

看来我需要更新我的concat.split函数了!

The version of the function that you tried to use makes use of read.table, which does tend to struggle with large datasets. I had used read.table because it has a convenient text argument that lets you specify a column in a data.frame as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)

您尝试使用的那个版本的函数使用了read.table，而它在处理大型数据集时往往比较吃力。我当初使用read.table，是因为它有一个方便的text参数，可以让您将data.frame中的某一列指定为输入。处理较小的数据集时这确实很方便，但对于较大的数据集显然就不行了 :)

As far as I can tell, fread from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split with fread instead of read.table.

据我所知，“data.table”包中的fread没有类似的功能，但由于R写文件通常相当快，我认为值得尝试一种与concat.split中类似的方法，只是用fread代替read.table。

Here's the concept:

这里的概念:

Write the variable that needs to be split to a new file.
将需要分割的变量写入一个新文件。
Use the blazing fast fread to read it back in.
用极快的fread把它读回来。
Wait for fread to get a text argument somewhere down the line?
等fread得到一个文本参数?

Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):

这是作为函数的概念(根据@eddi在评论中的建议进行编辑):

#' Split a delimited column into new columns using fread
#'
#' Writes the values of `splitcol` to a temporary file, reads that file back
#' with `fread()` using `sep` as the field separator, and binds the resulting
#' columns (named `<splitcol>_1`, `<splitcol>_2`, ...) onto the data.
#'
#' @param dataset A data.frame or data.table. The caller's object is not
#'   modified: a data.table input is copied before any by-reference update.
#' @param splitcol Name (character) or position (numeric) of the single
#'   column to split.
#' @param sep A single separator string. When it is ".", every "." in the
#'   column is first replaced by "|" and "|" is used as the separator, so
#'   the returned `splitcol` values contain "|" in place of ".".
#' @param drop If `TRUE`, the original `splitcol` is removed from the result.
#' @return A data.table holding the input columns (with `splitcol` coerced to
#'   character, or removed when `drop = TRUE`) followed by the split columns.
csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
  if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]
  if (length(splitcol) != 1L || is.na(splitcol) ||
      !splitcol %in% names(dataset)) {
    stop("`splitcol` must identify exactly one column of `dataset`",
         call. = FALSE)
  }
  if (!is.character(sep) || length(sep) != 1L || is.na(sep)) {
    stop("`sep` must be a single character string", call. = FALSE)
  }

  # The `:=` calls below update by reference. Work on a private copy so a
  # data.table handed in by the caller is left untouched; data.table() already
  # builds a new object when given a data.frame.
  dataset <- if (is.data.table(dataset)) copy(dataset) else data.table(dataset)

  if (sep == ".") {
    dataset[, (splitcol) := gsub(".", "|", get(splitcol), fixed = TRUE)]
    sep <- "|"
  }
  # writeLines() needs character input (the column may be a factor).
  if (!is.character(dataset[[splitcol]])) {
    dataset[, (splitcol) := as.character(get(splitcol))]
  }

  x <- tempfile()
  # Remove the scratch file on exit, including when fread() fails.
  on.exit(unlink(x), add = TRUE)
  writeLines(dataset[[splitcol]], x)
  Split <- fread(x, sep = sep, header = FALSE)
  setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))

  if (isTRUE(drop)) dataset[, (splitcol) := NULL]
  cbind(dataset, Split)
}

Here's the function in action:

这里是函数的作用:

## Repeat the sample rows until the test set has 1.5 million rows
n_copies <- 1500000 / nrow(mydata)
out <- mydata[rep(rownames(mydata), n_copies), ]

csDataTable(dataset = out, splitcol = "VARIABLE", sep = "_")
#          PROVINCE  AGE5 ZONA91OK    VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
#       1:        1 10-14      101  SPAIN_1998       614      SPAIN       1998
#       2:        4 30-34     4079  SPAIN_1998      1943      SPAIN       1998
#       3:        7 50-54      712  SPAIN_1998        59      SPAIN       1998
#       4:        8 40-44     8205  SPAIN_1998       201      SPAIN       1998
#       5:       11 35-39    11022  SPAIN_1998       188      SPAIN       1998
#      ---                                                                    
# 1499996:       44 35-39     4401    ROE_1999         0        ROE       1999
# 1499997:       46 35-39     4621    ROE_1999         0        ROE       1999
# 1499998:       49 10-14   490499    ROE_1999         0        ROE       1999
# 1499999:        3 30-34     3059 MAGREB_1999         5     MAGREB       1999
# 1500000:        6 40-44     6153 MAGREB_1999         2     MAGREB       1999

In this test, at least, the solution fares much better than I expected:

至少在这个测试中，解决方案比我预期的要好得多:

# Benchmark candidate: regex-based split. Builds a data.table from `out` and
# adds NATIONALITY / YEAR by reference from the two capture groups.
subFun <- function() {
  dt <- data.table(out)
  dt[, c("NATIONALITY", "YEAR") := list(
    sub('(.*)_(.*)', '\\1', VARIABLE),
    sub('(.*)_(.*)', '\\2', VARIABLE)
  )]
}
# Benchmark candidate: split VARIABLE in `out` on "_" via csDataTable().
freadFun <- function() {
  csDataTable(dataset = out, splitcol = "VARIABLE", sep = "_")
}

library(microbenchmark)
# Time both approaches 20 times each. The calls are kept verbatim because
# they appear as the `expr` labels in the results table below.
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
#        expr      min       lq   median       uq      max neval
#    subFun() 3.814174 4.244820 4.273834 4.345358 4.480520    20
#  freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886    20

#3

Here is a solution that splits the factor labels instead of every row:

这里有一些分解因子标签的解决方案

# Build a lookup table by splitting each distinct factor level once, rather
# than splitting the value on every row. Columns: VARIABLE, V1, V2.
VARIABLE_LEVELS <- cbind(
  "VARIABLE" = levels(mydata$VARIABLE),
  as.data.frame(do.call("rbind",
                        strsplit(levels(mydata$VARIABLE), split = "_")))
)

# Option 1: join the lookup table onto the data via the shared VARIABLE column.
mydata <- merge(mydata, VARIABLE_LEVELS)

# Option 2 (instead of merge): use the integer codes of the VARIABLE factor
# in mydata as a row index into the lookup table. Run only ONE of the two
# options; running both would append the V1/V2 columns twice.
# mydata <- cbind(mydata,
#                 VARIABLE_LEVELS[as.integer(mydata$VARIABLE), c("V1", "V2")])

#1