I have a data frame with two columns as below:
我有一个包含两列的数据框,如下所示:
Col1 Col2
1 7197.36 14.00
2 NA 5173.94
3 NA 13333.06
4 7004.38 473.32
5 NA 4980.61
6 26355.52 110.05
7 NA 1307.32
8 NA 6531.06
9 NA 3777.65
10 NA 7827.44
11 8753.22 85.00
12 NA 1.86
13 NA 2009.42
14 NA 502.89
15 NA 3182.86
16 NA NA
I wanted to find matching rows in column 'Col2' corresponding to the single value in 'Col1'. For example, 7197.36 = 14.00 + 5173.94 + 2009.42 (rows 1,2,13 in 'Col2')
我想在“Col2”列中找到与“Col1”中的单个值对应的匹配行。例如,7197.36 = 14.00 + 5173.94 + 2009.42('Col2'中的第1,2,13行)
Here, sum of 'Col1' = sum of 'Col2'
这里,'Col1'的总和='Col2'的总和
The final data frame should look like this:
最终数据框应如下所示:
Col1 Col2
1 7197.36 14.00
2 NA 5173.94
3 NA 2009.42
4 7004.38 473.32
5 NA 6531.06
6 26355.52 110.05
7 NA 1307.32
8 NA 13333.06
9 NA 3777.65
10 NA 7827.44
11 8753.22 85.00
12 NA 1.86
13 NA 4980.61
14 NA 502.89
15 NA 3182.86
16 NA NA
Can anybody help me?
有谁能够帮助我?
2 个解决方案
#1
7
We solve it via integer linear programming solving the problem of finding the minimum objective value greater than or equal to the target and if it is found to within numerical precision then return it; otherwise, return NULL.
我们通过整数线性规划求解它,解决了找到大于或等于目标的最小目标值的问题,如果发现它在数值精度范围内,则返回它;否则,返回NULL。
library(lpSolve)
obj <- na.omit(DF$Col2)
targets <- na.omit(DF$Col1)
L <- lapply(targets, function(value) {
iobj <- 100 * obj
ivalue <- 100 * value
res <- lp("min", iobj, t(iobj), ">=", ivalue, all.bin = TRUE)
ok <- isTRUE(all.equal(ivalue, res$objval))
if (ok) obj[res$solution == 1]
})
names(L) <- targets
giving:
> L
$`7197.36`
[1] 14.00 5173.94 2009.42
$`7004.38`
[1] 473.32 6531.06
$`26355.52`
[1] 13333.06 110.05 1307.32 3777.65 7827.44
$`8753.22`
[1] 4980.61 85.00 1.86 502.89 3182.86
Note 1: Later the question was modified to request this form of output:
注1:后来问题被修改为请求这种形式的输出:
transform(stack(L), Col1 = ifelse(duplicated(ind), NA, as.numeric(paste(ind))),
Col2 = values)[3:4]
Note 2: We used this as DF
注2:我们用它作为DF
Lines <- " Col1 Col2
1 7197.36 14.00
2 NA 5173.94
3 NA 13333.06
4 7004.38 473.32
5 NA 4980.61
6 26355.52 110.05
7 NA 1307.32
8 NA 6531.06
9 NA 3777.65
10 NA 7827.44
11 8753.22 85.00
12 NA 1.86
13 NA 2009.42
14 NA 502.89
15 NA 3182.86
16 NA NA"
DF <- read.table(text = Lines, header = TRUE)
#2
2
Here is a way using combinations
from gtools
(Won't be very efficient for huge data sets)
这是一种使用gtools组合的方法(对于大型数据集不会非常有效)
library(gtools)
library(zoo)
library(splitstackshape)
data$Col1_mod = na.locf(data$Col1)
df = stack(
lapply(split(data, f = data$Col1_mod),
function(x){
tmp1 = data.frame(
combinations(
length(data$Col2[!is.na(data$Col2)]),
length(x$Col2[!is.na(x$Col2)]),
data$Col2[!is.na(data$Col2)]));
tmp1$rowsums = rowSums(tmp1);
tmp2 = tmp1[tmp1$rowsums == unique(x$Col1_mod),];
toString(tmp2[,!colnames(tmp2) %in% 'rowsums'])
}))
this will give
这会给
#> df
# values ind
#1 473.32, 6531.06 7004.38
#2 14, 2009.42, 5173.94 7197.36
#3 1.86, 85, 502.89, 3182.86, 4980.61 8753.22
#4 110.05, 1307.32, 3777.65, 7827.44, 13333.06 26355.52
you can reshape it using cSplit
from splitstackshape
你可以使用splitstackshape中的cSplit重塑它
out = cSplit(setDT(df), 'values', ',', 'long')
#>out
# values ind
#1: 473.32 7004.38
#2: 6531.06 7004.38
#3: 14.00 7197.36
#4: 2009.42 7197.36
#5: 5173.94 7197.36
#6: 1.86 8753.22
#7: 85.00 8753.22
#8: 502.89 8753.22
#9: 3182.86 8753.22
#10: 4980.61 8753.22
#11: 110.05 26355.52
#12: 1307.32 26355.52
#13: 3777.65 26355.52
#14: 7827.44 26355.52
#15: 13333.06 26355.52
#1
7
We solve it via integer linear programming solving the problem of finding the minimum objective value greater than or equal to the target and if it is found to within numerical precision then return it; otherwise, return NULL.
我们通过整数线性规划求解它,解决了找到大于或等于目标的最小目标值的问题,如果发现它在数值精度范围内,则返回它;否则,返回NULL。
library(lpSolve)
obj <- na.omit(DF$Col2)
targets <- na.omit(DF$Col1)
L <- lapply(targets, function(value) {
iobj <- 100 * obj
ivalue <- 100 * value
res <- lp("min", iobj, t(iobj), ">=", ivalue, all.bin = TRUE)
ok <- isTRUE(all.equal(ivalue, res$objval))
if (ok) obj[res$solution == 1]
})
names(L) <- targets
giving:
> L
$`7197.36`
[1] 14.00 5173.94 2009.42
$`7004.38`
[1] 473.32 6531.06
$`26355.52`
[1] 13333.06 110.05 1307.32 3777.65 7827.44
$`8753.22`
[1] 4980.61 85.00 1.86 502.89 3182.86
Note 1: Later the question was modified to request this form of output:
注1:后来问题被修改为请求这种形式的输出:
transform(stack(L), Col1 = ifelse(duplicated(ind), NA, as.numeric(paste(ind))),
Col2 = values)[3:4]
Note 2: We used this as DF
注2:我们用它作为DF
Lines <- " Col1 Col2
1 7197.36 14.00
2 NA 5173.94
3 NA 13333.06
4 7004.38 473.32
5 NA 4980.61
6 26355.52 110.05
7 NA 1307.32
8 NA 6531.06
9 NA 3777.65
10 NA 7827.44
11 8753.22 85.00
12 NA 1.86
13 NA 2009.42
14 NA 502.89
15 NA 3182.86
16 NA NA"
DF <- read.table(text = Lines, header = TRUE)
#2
2
Here is a way using combinations
from gtools
(Won't be very efficient for huge data sets)
这是一种使用gtools组合的方法(对于大型数据集不会非常有效)
library(gtools)
library(zoo)
library(splitstackshape)
data$Col1_mod = na.locf(data$Col1)
df = stack(
lapply(split(data, f = data$Col1_mod),
function(x){
tmp1 = data.frame(
combinations(
length(data$Col2[!is.na(data$Col2)]),
length(x$Col2[!is.na(x$Col2)]),
data$Col2[!is.na(data$Col2)]));
tmp1$rowsums = rowSums(tmp1);
tmp2 = tmp1[tmp1$rowsums == unique(x$Col1_mod),];
toString(tmp2[,!colnames(tmp2) %in% 'rowsums'])
}))
this will give
这会给
#> df
# values ind
#1 473.32, 6531.06 7004.38
#2 14, 2009.42, 5173.94 7197.36
#3 1.86, 85, 502.89, 3182.86, 4980.61 8753.22
#4 110.05, 1307.32, 3777.65, 7827.44, 13333.06 26355.52
you can reshape it using cSplit
from splitstackshape
你可以使用splitstackshape中的cSplit重塑它
out = cSplit(setDT(df), 'values', ',', 'long')
#>out
# values ind
#1: 473.32 7004.38
#2: 6531.06 7004.38
#3: 14.00 7197.36
#4: 2009.42 7197.36
#5: 5173.94 7197.36
#6: 1.86 8753.22
#7: 85.00 8753.22
#8: 502.89 8753.22
#9: 3182.86 8753.22
#10: 4980.61 8753.22
#11: 110.05 26355.52
#12: 1307.32 26355.52
#13: 3777.65 26355.52
#14: 7827.44 26355.52
#15: 13333.06 26355.52