在r中使用paste嵌套for循环

时间:2022-05-01 21:29:31

I have a dataset with a number of yes/no columns indicating whether a particular record is associated with a given store number in three different regions (i.e., a record will have a single yes value for one of the two stores in each of the three regions, where the variable names are formatted as 'region'_'storenumber'):

我有一个带有多个是/否列的数据集,指示特定记录是否与三个不同区域中的给定商店编号相关联(即,对于三个不同区域中的每个商店中的一个,记录将具有单个是值区域,变量名称格式为'region'_'storenumber'):

  var1_1   var1_2   var2_1   var2_2   var3_1   var3_2
1 Yes      No       No       Yes      Yes      No
2 No       Yes      Yes      No       No       Yes
3 No       Yes      Yes      No       No       Yes
4 No       Yes      No       Yes      Yes      No
5 No       Yes      No       Yes      No       Yes
6 Yes      No       No       Yes      No       Yes
7 Yes      No       Yes      No       Yes      No
8 No       Yes      Yes      No       No       Yes

I'd like to create a variable for each region called 'region_1', 'region_2', and 'region_3' that are equal to the "Yes" store number in that region for that record (1 or 2):

我想为每个名为'region_1','region_2'和'region_3'的区域创建一个变量,该变量等于该区域中该记录的“是”商店编号(1或2):

  region_1   region_2   region_3
1 1          2          1
2 2          1          2
3 2          1          2
4 2          2          1
5 2          2          2
6 1          2          2
7 1          1          1
8 2          1          2

I am able to create each region variable separate using for loops as follows:

我可以使用for循环分别创建每个区域变量,如下所示:

for(i in 1:3) {
  df[paste("region_", toString(i), sep = "")] <- ""
}

for(i in 1:2) {
    df$region_1 <- ifelse(df[paste("var1_", toString(i), sep = "")] == "Yes" & df$region_1 == "", toString(i), df$region_1)
  }

for(i in 1:2) {
  df_1$region_2 <- ifelse(df_1[paste("var2_", toString(i), sep = "")] == "Yes" & df_1$region_2 == "", toString(i), df_1$region_2)
}

for(i in 1:2) {
  df_1$region_3 <- ifelse(df_1[paste("var3_", toString(i), sep = "")] == "Yes" & df_1$region_3 == "", toString(i), df_1$region_3)
}

My actual data has many more than 3 regions (and more than 2 stores per region), so rather than writing a separate loop for each region, I'd like to nest this loop to loop over all regions. I've attempted the following:

我的实际数据有超过3个区域(每个区域超过2个存储区),因此我不想为每个区域编写一个单独的循环,而是希望将此循环嵌套在所有区域上。我尝试过以下方法:

for(j in 1:3) {
  for(i in 1:2) {
    df[paste("region_", toString(j), sep = "")] <- ifelse(df[paste("var", toString(j), "_", toString(i), sep = "")] == "Yes" & df[paste("region_", toString(j), sep = "")] == "", toString(i), df[paste("region_", toString(j), sep = "")])
  }
}

but recieve the warning "provided #### variables to replace 1 variables" and end up with each region variable populated with the same single value for every record.

但收到警告“提供####变量来替换1个变量”,最后每个区域变量用每个记录的相同单个值填充。

Any thoughts on where I'm going wrong with my nested loop?

关于我的嵌套循环出错的问题?

3 个解决方案

#1


0  

If we convert Yes and No to logical values, we can simply do

如果我们将Yes和No转换为逻辑值,我们就可以做到

regions = as.data.frame( sapply(seq(1, NCOL(stores), by=2),function(j) ifelse(stores[,j],1,2)))
names(regions) = c("region_1", "region_2",  "region_3")

regions
#  region_1 region_2 region_3
#1        1        2        1
#2        2        1        2
#3        2        1        2
#4        2        2        1
#5        2        2        2
#6        1        2        2
#7        1        1        1
#8        2        1        2

The data:

library(data.table)  
stores = setDF(fread(gsub("No", "FALSE", gsub("Yes", "TRUE",
"var1_1   var1_2   var2_1   var2_2   var3_1   var3_2
 Yes      No       No       Yes      Yes      No
 No       Yes      Yes      No       No       Yes
 No       Yes      Yes      No       No       Yes
 No       Yes      No       Yes      Yes      No
 No       Yes      No       Yes      No       Yes
 Yes      No       No       Yes      No       Yes
 Yes      No       Yes      No       Yes      No
 No       Yes      Yes      No       No       Yes"))))

#2


0  

For this, you may be better served converting your data into a "long" format, instead of the current "wide" format. Here are examples using dplyr and tidyr. I have tried to comment each line, but the basic idea is to generate one row per store-variable measure, and just have the presence/absence show. Then, you can group the rows by region, and count up the number of "Yes" entries.

为此,您可以更好地将数据转换为“长”格式,而不是当前的“宽”格式。以下是使用dplyr和tidyr的示例。我试图评论每一行,但基本的想法是每个商店变量测量产生一行,并且只有存在/缺席显示。然后,您可以按区域对行进行分组,并计算“是”条目的数量。

# Data entry from @dww, without conversion to logical (though that would make it easier)

library(data.table)  
stores = setDF(fread("var1_1   var1_2   var2_1   var2_2   var3_1   var3_2
 Yes      No       No       Yes      Yes      No
 No       Yes      Yes      No       No       Yes
 No       Yes      Yes      No       No       Yes
 No       Yes      No       Yes      Yes      No
 No       Yes      No       Yes      No       Yes
 Yes      No       No       Yes      No       Yes
 Yes      No       Yes      No       Yes      No
 No       Yes      Yes      No       No       Yes"))

Change to long format, store as new variable

更改为长格式,存储为新变量

longStores <-
  stores %>%
  # tag for printing
  tbl_df() %>%
  # Store the variable of interest as a column, instead of row.names
  mutate(variableInterest = rownames(.)) %>%
  # Convert the data to long format
  gather(StoreID, present, -variableInterest) %>%
  # Split the store_region format
  separate(StoreID, c("Store", "Region"), sep = "_") %>%
  # Eliminate the leading "var" from store names, just for display
  mutate(Store = gsub("var", "", Store))

Summarise by region, still in long format

按地区汇总,仍然是长格式

longRegional <-
  longStores %>%
  # Set grouping
  group_by(variableInterest, Region) %>%
  # Count the number of correct values in the region
  summarise(nStoresWithVariable = sum(present == "Yes"))

Finally, reformat to your original request

最后,重新格式化为您的原始请求

longRegional %>%
  spread(Region, nStoresWithVariable)

#   variableInterest   `1`   `2`
# *            <chr> <int> <int>
# 1                1     2     1
# 2                2     1     2
# 3                3     1     2
# 4                4     1     2
# 5                5     0     3
# 6                6     1     2
# 7                7     3     0
# 8                8     1     2

#3


0  

Here is my messy attempt with a simulation of your data as True and False:

这是我的混乱尝试,模拟您的数据为真和假:

Simulated Data - Generate data table using random Booleans

模拟数据 - 使用随机布尔值生成数据表

rb <- function()
{
  sample(c(T,F), size=10, replace=TRUE, prob=c(0.5, 0.5) )
}

var1_1 = rb()
var2_1 = rb()
var3_1 = rb()
df <- data.frame( var1_1, !var1_1,
                  var2_1, !var2_1,
                  var3_1, !var3_1)

colnames(df) = c('var1_1', 'var1_2', 'var2_1', 'var2_2', 'var3_1', 'var3_2')

df

    var1_1 var1_2 var2_1 var2_2 var3_1 var3_2
 1   FALSE   TRUE   TRUE  FALSE  FALSE   TRUE
 2   FALSE   TRUE  FALSE   TRUE  FALSE   TRUE
 3   FALSE   TRUE   TRUE  FALSE   TRUE  FALSE
 4   FALSE   TRUE   TRUE  FALSE  FALSE   TRUE
 5   FALSE   TRUE  FALSE   TRUE   TRUE  FALSE
 6   FALSE   TRUE  FALSE   TRUE   TRUE  FALSE
 7    TRUE  FALSE   TRUE  FALSE   TRUE  FALSE
 8    TRUE  FALSE  FALSE   TRUE   TRUE  FALSE
 9    TRUE  FALSE  FALSE   TRUE   TRUE  FALSE
 10  FALSE   TRUE  FALSE   TRUE   TRUE  FALSE

Solution

cn <- names(df)
cnprefixes <- gsub("_.*?$","",cn)
cnsuffixes <- gsub("^.*?_","",cn)

newblock<-data.frame()
bFirstTime<-T
for (prefix in unique(cnprefixes))
{
  block<-df[ , grepl( prefix , names( df ) ) ]

  theseSuffixes <- cnsuffixes[startsWith(cn, prefix)]

  j <- 1
  for(suffix in theseSuffixes)
  {
    block[,j][block[,j]==T]=as.numeric(suffix)
    j<-j+1
  }
  tempblock=data.frame(rowSums(block))
  colnames(tempblock)<- prefix
  if (bFirstTime){
    newblock <- tempblock
    bFirstTime <- F
  }
  else{
    newblock<-cbind(newblock, tempblock)
  }
}

newblock

   var1 var2 var3
1     2    1    2
2     2    2    2
3     2    1    1
4     2    1    2
5     2    2    1
6     2    2    1
7     1    1    1
8     1    2    1
9     1    2    1
10    2    2    1

#1


0  

If we convert Yes and No to logical values, we can simply do

如果我们将Yes和No转换为逻辑值,我们就可以做到

regions = as.data.frame( sapply(seq(1, NCOL(stores), by=2),function(j) ifelse(stores[,j],1,2)))
names(regions) = c("region_1", "region_2",  "region_3")

regions
#  region_1 region_2 region_3
#1        1        2        1
#2        2        1        2
#3        2        1        2
#4        2        2        1
#5        2        2        2
#6        1        2        2
#7        1        1        1
#8        2        1        2

The data:

library(data.table)  
stores = setDF(fread(gsub("No", "FALSE", gsub("Yes", "TRUE",
"var1_1   var1_2   var2_1   var2_2   var3_1   var3_2
 Yes      No       No       Yes      Yes      No
 No       Yes      Yes      No       No       Yes
 No       Yes      Yes      No       No       Yes
 No       Yes      No       Yes      Yes      No
 No       Yes      No       Yes      No       Yes
 Yes      No       No       Yes      No       Yes
 Yes      No       Yes      No       Yes      No
 No       Yes      Yes      No       No       Yes"))))

#2


0  

For this, you may be better served converting your data into a "long" format, instead of the current "wide" format. Here are examples using dplyr and tidyr. I have tried to comment each line, but the basic idea is to generate one row per store-variable measure, and just have the presence/absence show. Then, you can group the rows by region, and count up the number of "Yes" entries.

为此,您可以更好地将数据转换为“长”格式,而不是当前的“宽”格式。以下是使用dplyr和tidyr的示例。我试图评论每一行,但基本的想法是每个商店变量测量产生一行,并且只有存在/缺席显示。然后,您可以按区域对行进行分组,并计算“是”条目的数量。

# Data entry from @dww, without conversion to logical (though that would make it easier)

library(data.table)  
stores = setDF(fread("var1_1   var1_2   var2_1   var2_2   var3_1   var3_2
 Yes      No       No       Yes      Yes      No
 No       Yes      Yes      No       No       Yes
 No       Yes      Yes      No       No       Yes
 No       Yes      No       Yes      Yes      No
 No       Yes      No       Yes      No       Yes
 Yes      No       No       Yes      No       Yes
 Yes      No       Yes      No       Yes      No
 No       Yes      Yes      No       No       Yes"))

Change to long format, store as new variable

更改为长格式,存储为新变量

longStores <-
  stores %>%
  # tag for printing
  tbl_df() %>%
  # Store the variable of interest as a column, instead of row.names
  mutate(variableInterest = rownames(.)) %>%
  # Convert the data to long format
  gather(StoreID, present, -variableInterest) %>%
  # Split the store_region format
  separate(StoreID, c("Store", "Region"), sep = "_") %>%
  # Eliminate the leading "var" from store names, just for display
  mutate(Store = gsub("var", "", Store))

Summarise by region, still in long format

按地区汇总,仍然是长格式

longRegional <-
  longStores %>%
  # Set grouping
  group_by(variableInterest, Region) %>%
  # Count the number of correct values in the region
  summarise(nStoresWithVariable = sum(present == "Yes"))

Finally, reformat to your original request

最后,重新格式化为您的原始请求

longRegional %>%
  spread(Region, nStoresWithVariable)

#   variableInterest   `1`   `2`
# *            <chr> <int> <int>
# 1                1     2     1
# 2                2     1     2
# 3                3     1     2
# 4                4     1     2
# 5                5     0     3
# 6                6     1     2
# 7                7     3     0
# 8                8     1     2

#3


0  

Here is my messy attempt with a simulation of your data as True and False:

这是我的混乱尝试,模拟您的数据为真和假:

Simulated Data - Generate data table using random Booleans

模拟数据 - 使用随机布尔值生成数据表

rb <- function()
{
  sample(c(T,F), size=10, replace=TRUE, prob=c(0.5, 0.5) )
}

var1_1 = rb()
var2_1 = rb()
var3_1 = rb()
df <- data.frame( var1_1, !var1_1,
                  var2_1, !var2_1,
                  var3_1, !var3_1)

colnames(df) = c('var1_1', 'var1_2', 'var2_1', 'var2_2', 'var3_1', 'var3_2')

df

    var1_1 var1_2 var2_1 var2_2 var3_1 var3_2
 1   FALSE   TRUE   TRUE  FALSE  FALSE   TRUE
 2   FALSE   TRUE  FALSE   TRUE  FALSE   TRUE
 3   FALSE   TRUE   TRUE  FALSE   TRUE  FALSE
 4   FALSE   TRUE   TRUE  FALSE  FALSE   TRUE
 5   FALSE   TRUE  FALSE   TRUE   TRUE  FALSE
 6   FALSE   TRUE  FALSE   TRUE   TRUE  FALSE
 7    TRUE  FALSE   TRUE  FALSE   TRUE  FALSE
 8    TRUE  FALSE  FALSE   TRUE   TRUE  FALSE
 9    TRUE  FALSE  FALSE   TRUE   TRUE  FALSE
 10  FALSE   TRUE  FALSE   TRUE   TRUE  FALSE

Solution

cn <- names(df)
cnprefixes <- gsub("_.*?$","",cn)
cnsuffixes <- gsub("^.*?_","",cn)

newblock<-data.frame()
bFirstTime<-T
for (prefix in unique(cnprefixes))
{
  block<-df[ , grepl( prefix , names( df ) ) ]

  theseSuffixes <- cnsuffixes[startsWith(cn, prefix)]

  j <- 1
  for(suffix in theseSuffixes)
  {
    block[,j][block[,j]==T]=as.numeric(suffix)
    j<-j+1
  }
  tempblock=data.frame(rowSums(block))
  colnames(tempblock)<- prefix
  if (bFirstTime){
    newblock <- tempblock
    bFirstTime <- F
  }
  else{
    newblock<-cbind(newblock, tempblock)
  }
}

newblock

   var1 var2 var3
1     2    1    2
2     2    2    2
3     2    1    1
4     2    1    2
5     2    2    1
6     2    2    1
7     1    1    1
8     1    2    1
9     1    2    1
10    2    2    1