使用顺序编号和NA填充data.frame

时间:2022-12-05 23:41:29

Here my example data.frame:

这是我的示例data.frame:

df = read.table(text = 'colA  colB colC colD
74001   9520    2   56
74006   9520    2   56
33021   9518    3   99
33024   9518    3   99
37001   9518    3   99
33014   9517    19  143
33023   9517    19  143
33050   9517    19  143
34005   9517    19  143
34006   9517    19  143
37006   9517    19  143
38001   9517    19  143
38020   9517    19  143
38021   9517    19  143
39005   9517    19  143
39093   9517    19  143
40004   9517    19  143
40012   9517    19  143
41005   9517    19  143
41006   9517    19  143
41012   9517    19  143
41014   9517    19  143
41020   9517    19  143
41022   9517    19  143
38022   9516    5   177
39003   9516    5   177
39056   9516    5   177
40016   9516    5   177
47011   9516    5   177
46006   9514    16  176
47007   9514    16  176
47009   9514    16  176
47011   9514    16  176
58008   9514    16  176
59001   9514    16  176
59002   9514    16  176
60004   9514    16  176
60006   9514    16  176
61001   9514    16  176
61002   9514    16  176
61003   9514    16  176
65005   9514    16  176
81002   9514    16  176
81003   9514    16  176
82003   9514    16  176
41006   9512    1   163
65005   9510    1   164
40003   9509    9   165
40011   9509    9   165
40012   9509    9   165
47004   9509    9   165
47009   9509    9   165
48010   9509    9   165
60004   9509    9   165
62001   9509    9   165
66006   9509    9   165', header = TRUE)

I need to fill df with only the missing colB observations (one per each missing one) and NAs in the remaining columns.

我需要填充df,只有缺少的colB观察值(每个缺失一个)和其余列中的NA。

In this case my colB col ranges from colB = 9509 to colB = 9520 and the missing observations between this range are colB = 9519, 9515, 9513 and 9511.

在这种情况下,我的colB col范围从colB = 9509到colB = 9520,并且该范围之间缺失的观察值是colB = 9519,9515,9513和9511。

Here my expected output:

这是我的预期输出:

colA    colB  colC  colD
74001   9520    2   56
74006   9520    2   56
NA      9519    NA  NA
33021   9518    3   99
33024   9518    3   99
37001   9518    3   99
33014   9517    19  143
33023   9517    19  143
33050   9517    19  143
34005   9517    19  143
34006   9517    19  143
37006   9517    19  143
38001   9517    19  143
38020   9517    19  143
38021   9517    19  143
39005   9517    19  143
39093   9517    19  143
40004   9517    19  143
40012   9517    19  143
41005   9517    19  143
41006   9517    19  143
41012   9517    19  143
41014   9517    19  143
41020   9517    19  143
41022   9517    19  143
38022   9516    5   177
39003   9516    5   177
39056   9516    5   177
40016   9516    5   177
47011   9516    5   177
NA      9515    NA  NA
46006   9514    16  176
47007   9514    16  176
47009   9514    16  176
47011   9514    16  176
58008   9514    16  176
59001   9514    16  176
59002   9514    16  176
60004   9514    16  176
60006   9514    16  176
61001   9514    16  176
61002   9514    16  176
61003   9514    16  176
65005   9514    16  176
81002   9514    16  176
81003   9514    16  176
82003   9514    16  176
NA      9513    NA  NA
41006   9512    1   163
NA      9511    NA  NA
65005   9510    1   164
40003   9509    9   165
40011   9509    9   165
40012   9509    9   165
47004   9509    9   165
47009   9509    9   165
48010   9509    9   165
60004   9509    9   165
62001   9509    9   165
66006   9509    9   165

Any help would really appreciated.

任何帮助都会非常感激。

Thanks

2 个解决方案

#1


3  

You can simply do :

你可以简单地做:

# create a vector with all the missing days
missing <- setdiff(max(df$Day):min(df$Day),df$Day)
# append the missing rows to the end of df data.frame (creating a new one)
df2 <- rbind(df,data.frame(colA=NA,Day=missing,colC=NA,colD=NA))
# sort the rows by Day
df2 <- df2[order(df2$Day),]

Result :

> df2
    colA  Day colC colD
48 40003 9509    9  165
49 40011 9509    9  165
50 40012 9509    9  165
51 47004 9509    9  165
52 47009 9509    9  165
53 48010 9509    9  165
54 60004 9509    9  165
55 62001 9509    9  165
56 66006 9509    9  165
47 65005 9510    1  164
60    NA 9511   NA   NA
46 41006 9512    1  163
59    NA 9513   NA   NA
30 46006 9514   16  176
31 47007 9514   16  176
32 47009 9514   16  176
33 47011 9514   16  176
34 58008 9514   16  176
35 59001 9514   16  176
36 59002 9514   16  176
37 60004 9514   16  176
38 60006 9514   16  176
39 61001 9514   16  176
40 61002 9514   16  176
41 61003 9514   16  176
42 65005 9514   16  176
43 81002 9514   16  176
44 81003 9514   16  176
45 82003 9514   16  176
58    NA 9515   NA   NA
25 38022 9516    5  177
26 39003 9516    5  177
27 39056 9516    5  177
28 40016 9516    5  177
29 47011 9516    5  177
6  33014 9517   19  143
7  33023 9517   19  143
8  33050 9517   19  143
9  34005 9517   19  143
10 34006 9517   19  143
11 37006 9517   19  143
12 38001 9517   19  143
13 38020 9517   19  143
14 38021 9517   19  143
15 39005 9517   19  143
16 39093 9517   19  143
17 40004 9517   19  143
18 40012 9517   19  143
19 41005 9517   19  143
20 41006 9517   19  143
21 41012 9517   19  143
22 41014 9517   19  143
23 41020 9517   19  143
24 41022 9517   19  143
3  33021 9518    3   99
4  33024 9518    3   99
5  37001 9518    3   99
57    NA 9519   NA   NA
1  74001 9520    2   56
2  74006 9520    2   56

#2


1  

This should work for your;

这应该对你有用;

library(zoo)

df$Day<-as.POSIXct(df$Day,format="%m/%d/%y", origin = "1960-01-01")
df.z<-zoo(df[,-1],df$Day) #Day as Index 

df.final <- merge(df.z,zoo(,seq(start(df.z),end(df.z),by="min")), all=TRUE)

if you want to stick to the numeric version then the following will be more useful as it does not change the class/type of the data;

如果你想坚持数字版本,那么以下将更有用,因为它不会改变数据的类/类型;

#find the missing days:
allDays <- seq(min(df$Day), max(df$Day), 1)
Day0 <- allDays[!(allDays %in% df$Day)]


#create a dataframe with the same size (columns) as original dataset with missing days:
missed <- data.frame(colA = NA_real_, Day = Day0, colC = NA_real_,colD = NA_real_ )

#append to the original dataset
df.filled <- rbind(df, missed)

#sort based on days to have the missing value in the right place:
df.filled <- df.filled[order(df.filled$Day),] 

#1


3  

You can simply do :

你可以简单地做:

# create a vector with all the missing days
missing <- setdiff(max(df$Day):min(df$Day),df$Day)
# append the missing rows to the end of df data.frame (creating a new one)
df2 <- rbind(df,data.frame(colA=NA,Day=missing,colC=NA,colD=NA))
# sort the rows by Day
df2 <- df2[order(df2$Day),]

Result :

> df2
    colA  Day colC colD
48 40003 9509    9  165
49 40011 9509    9  165
50 40012 9509    9  165
51 47004 9509    9  165
52 47009 9509    9  165
53 48010 9509    9  165
54 60004 9509    9  165
55 62001 9509    9  165
56 66006 9509    9  165
47 65005 9510    1  164
60    NA 9511   NA   NA
46 41006 9512    1  163
59    NA 9513   NA   NA
30 46006 9514   16  176
31 47007 9514   16  176
32 47009 9514   16  176
33 47011 9514   16  176
34 58008 9514   16  176
35 59001 9514   16  176
36 59002 9514   16  176
37 60004 9514   16  176
38 60006 9514   16  176
39 61001 9514   16  176
40 61002 9514   16  176
41 61003 9514   16  176
42 65005 9514   16  176
43 81002 9514   16  176
44 81003 9514   16  176
45 82003 9514   16  176
58    NA 9515   NA   NA
25 38022 9516    5  177
26 39003 9516    5  177
27 39056 9516    5  177
28 40016 9516    5  177
29 47011 9516    5  177
6  33014 9517   19  143
7  33023 9517   19  143
8  33050 9517   19  143
9  34005 9517   19  143
10 34006 9517   19  143
11 37006 9517   19  143
12 38001 9517   19  143
13 38020 9517   19  143
14 38021 9517   19  143
15 39005 9517   19  143
16 39093 9517   19  143
17 40004 9517   19  143
18 40012 9517   19  143
19 41005 9517   19  143
20 41006 9517   19  143
21 41012 9517   19  143
22 41014 9517   19  143
23 41020 9517   19  143
24 41022 9517   19  143
3  33021 9518    3   99
4  33024 9518    3   99
5  37001 9518    3   99
57    NA 9519   NA   NA
1  74001 9520    2   56
2  74006 9520    2   56

#2


1  

This should work for your;

这应该对你有用;

library(zoo)

df$Day<-as.POSIXct(df$Day,format="%m/%d/%y", origin = "1960-01-01")
df.z<-zoo(df[,-1],df$Day) #Day as Index 

df.final <- merge(df.z,zoo(,seq(start(df.z),end(df.z),by="min")), all=TRUE)

if you want to stick to the numeric version then the following will be more useful as it does not change the class/type of the data;

如果你想坚持数字版本,那么以下将更有用,因为它不会改变数据的类/类型;

#find the missing days:
allDays <- seq(min(df$Day), max(df$Day), 1)
Day0 <- allDays[!(allDays %in% df$Day)]


#create a dataframe with the same size (columns) as original dataset with missing days:
missed <- data.frame(colA = NA_real_, Day = Day0, colC = NA_real_,colD = NA_real_ )

#append to the original dataset
df.filled <- rbind(df, missed)

#sort based on days to have the missing value in the right place:
df.filled <- df.filled[order(df.filled$Day),]