R - 通过组合列中的值来匹配一个值

时间:2021-03-16 13:08:56

I have a data frame with two columns as below:

我有一个包含两列的数据框,如下所示:

    Col1       Col2
1   7197.36    14.00
2        NA  5173.94
3        NA 13333.06
4   7004.38   473.32
5        NA  4980.61
6  26355.52   110.05
7        NA  1307.32
8        NA  6531.06
9        NA  3777.65
10       NA  7827.44
11  8753.22    85.00
12       NA     1.86
13       NA  2009.42
14       NA   502.89
15       NA  3182.86
16       NA       NA

I wanted to find matching rows in column 'Col2' corresponding to the single value in 'Col1'. For example, 7197.36 = 14.00 + 5173.94 + 2009.42 (rows 1,2,13 in 'Col2')

我想在“Col2”列中找到与“Col1”中的单个值对应的匹配行。例如,7197.36 = 14.00 + 5173.94 + 2009.42('Col2'中的第1,2,13行)

Here, sum of 'Col1' = sum of 'Col2'

这里,'Col1'的总和='Col2'的总和

The final data frame should look like this:

最终数据框应如下所示:

   Col1    Col2
 1   7197.36    14.00
 2        NA  5173.94
 3        NA  2009.42
 4   7004.38   473.32
 5        NA  6531.06
 6  26355.52   110.05
 7        NA  1307.32
 8        NA 13333.06
 9        NA  3777.65
10       NA  7827.44
11  8753.22    85.00
12       NA     1.86
13       NA  4980.61
14       NA   502.89
15       NA  3182.86
16       NA       NA

Can anybody help me?

有谁能够帮助我?

2 个解决方案

#1


7  

We solve it via integer linear programming solving the problem of finding the minimum objective value greater than or equal to the target and if it is found to within numerical precision then return it; otherwise, return NULL.

我们通过整数线性规划求解它,解决了找到大于或等于目标的最小目标值的问题,如果发现它在数值精度范围内,则返回它;否则,返回NULL。

library(lpSolve)

obj <- na.omit(DF$Col2)
targets <- na.omit(DF$Col1)
L <- lapply(targets, function(value) {
    iobj <- 100 * obj
    ivalue <- 100 * value
    res <- lp("min", iobj, t(iobj), ">=", ivalue, all.bin = TRUE)
    ok <- isTRUE(all.equal(ivalue, res$objval))
    if (ok) obj[res$solution == 1]
})
names(L) <- targets

giving:

> L

$`7197.36`
[1]   14.00 5173.94 2009.42

$`7004.38`
[1]  473.32 6531.06

$`26355.52`
[1] 13333.06   110.05  1307.32  3777.65  7827.44

$`8753.22`
[1] 4980.61   85.00    1.86  502.89 3182.86

Note 1: Later the question was modified to request this form of output:

注1:后来问题被修改为请求这种形式的输出:

transform(stack(L), Col1 = ifelse(duplicated(ind), NA, as.numeric(paste(ind))), 
                    Col2 = values)[3:4]

Note 2: We used this as DF

注2:我们用它作为DF

Lines <- "    Col1       Col2
1   7197.36    14.00
2        NA  5173.94
3        NA 13333.06
4   7004.38   473.32
5        NA  4980.61
6  26355.52   110.05
7        NA  1307.32
8        NA  6531.06
9        NA  3777.65
10       NA  7827.44
11  8753.22    85.00
12       NA     1.86
13       NA  2009.42
14       NA   502.89
15       NA  3182.86
16       NA       NA"

DF <- read.table(text = Lines, header = TRUE)

#2


2  

Here is a way using combinations from gtools (Won't be very efficient for huge data sets)

这是一种使用gtools组合的方法(对于大型数据集不会非常有效)

library(gtools)
library(zoo)
library(splitstackshape)

data$Col1_mod = na.locf(data$Col1)

df = stack(
     lapply(split(data, f = data$Col1_mod), 
     function(x){ 
         tmp1 = data.frame(
                combinations(
                    length(data$Col2[!is.na(data$Col2)]),
                    length(x$Col2[!is.na(x$Col2)]),
                    data$Col2[!is.na(data$Col2)]));
         tmp1$rowsums = rowSums(tmp1); 
         tmp2 = tmp1[tmp1$rowsums == unique(x$Col1_mod),];   
         toString(tmp2[,!colnames(tmp2) %in% 'rowsums'])
     }))

this will give

这会给

#> df
#                                       values      ind
#1                             473.32, 6531.06  7004.38
#2                        14, 2009.42, 5173.94  7197.36
#3          1.86, 85, 502.89, 3182.86, 4980.61  8753.22
#4 110.05, 1307.32, 3777.65, 7827.44, 13333.06 26355.52

you can reshape it using cSplit from splitstackshape

你可以使用splitstackshape中的cSplit重塑它

out = cSplit(setDT(df), 'values', ',', 'long')

 #>out
 #     values      ind
 #1:   473.32  7004.38
 #2:  6531.06  7004.38
 #3:    14.00  7197.36
 #4:  2009.42  7197.36
 #5:  5173.94  7197.36
 #6:     1.86  8753.22
 #7:    85.00  8753.22
 #8:   502.89  8753.22
 #9:  3182.86  8753.22
#10:  4980.61  8753.22
#11:   110.05 26355.52
#12:  1307.32 26355.52
#13:  3777.65 26355.52
#14:  7827.44 26355.52
#15: 13333.06 26355.52

#1


7  

We solve it via integer linear programming solving the problem of finding the minimum objective value greater than or equal to the target and if it is found to within numerical precision then return it; otherwise, return NULL.

我们通过整数线性规划求解它,解决了找到大于或等于目标的最小目标值的问题,如果发现它在数值精度范围内,则返回它;否则,返回NULL。

library(lpSolve)

obj <- na.omit(DF$Col2)
targets <- na.omit(DF$Col1)
L <- lapply(targets, function(value) {
    iobj <- 100 * obj
    ivalue <- 100 * value
    res <- lp("min", iobj, t(iobj), ">=", ivalue, all.bin = TRUE)
    ok <- isTRUE(all.equal(ivalue, res$objval))
    if (ok) obj[res$solution == 1]
})
names(L) <- targets

giving:

> L

$`7197.36`
[1]   14.00 5173.94 2009.42

$`7004.38`
[1]  473.32 6531.06

$`26355.52`
[1] 13333.06   110.05  1307.32  3777.65  7827.44

$`8753.22`
[1] 4980.61   85.00    1.86  502.89 3182.86

Note 1: Later the question was modified to request this form of output:

注1:后来问题被修改为请求这种形式的输出:

transform(stack(L), Col1 = ifelse(duplicated(ind), NA, as.numeric(paste(ind))), 
                    Col2 = values)[3:4]

Note 2: We used this as DF

注2:我们用它作为DF

Lines <- "    Col1       Col2
1   7197.36    14.00
2        NA  5173.94
3        NA 13333.06
4   7004.38   473.32
5        NA  4980.61
6  26355.52   110.05
7        NA  1307.32
8        NA  6531.06
9        NA  3777.65
10       NA  7827.44
11  8753.22    85.00
12       NA     1.86
13       NA  2009.42
14       NA   502.89
15       NA  3182.86
16       NA       NA"

DF <- read.table(text = Lines, header = TRUE)

#2


2  

Here is a way using combinations from gtools (Won't be very efficient for huge data sets)

这是一种使用gtools组合的方法(对于大型数据集不会非常有效)

library(gtools)
library(zoo)
library(splitstackshape)

data$Col1_mod = na.locf(data$Col1)

df = stack(
     lapply(split(data, f = data$Col1_mod), 
     function(x){ 
         tmp1 = data.frame(
                combinations(
                    length(data$Col2[!is.na(data$Col2)]),
                    length(x$Col2[!is.na(x$Col2)]),
                    data$Col2[!is.na(data$Col2)]));
         tmp1$rowsums = rowSums(tmp1); 
         tmp2 = tmp1[tmp1$rowsums == unique(x$Col1_mod),];   
         toString(tmp2[,!colnames(tmp2) %in% 'rowsums'])
     }))

this will give

这会给

#> df
#                                       values      ind
#1                             473.32, 6531.06  7004.38
#2                        14, 2009.42, 5173.94  7197.36
#3          1.86, 85, 502.89, 3182.86, 4980.61  8753.22
#4 110.05, 1307.32, 3777.65, 7827.44, 13333.06 26355.52

you can reshape it using cSplit from splitstackshape

你可以使用splitstackshape中的cSplit重塑它

out = cSplit(setDT(df), 'values', ',', 'long')

 #>out
 #     values      ind
 #1:   473.32  7004.38
 #2:  6531.06  7004.38
 #3:    14.00  7197.36
 #4:  2009.42  7197.36
 #5:  5173.94  7197.36
 #6:     1.86  8753.22
 #7:    85.00  8753.22
 #8:   502.89  8753.22
 #9:  3182.86  8753.22
#10:  4980.61  8753.22
#11:   110.05 26355.52
#12:  1307.32 26355.52
#13:  3777.65 26355.52
#14:  7827.44 26355.52
#15: 13333.06 26355.52