Merge two data.tables
merge.Rd
Fast merge of two data.table
s. The data.table
method behaves
similarly to data.frame
except that row order is specified, and by
default the columns to merge on are chosen:
at first based on the shared key columns, and if there are none,
then based on key columns of the first argument
x
, and if there are none,then based on the common columns between the two
data.table
s.
Use the by
, by.x
and by.y
arguments explicitly to override this default.
Arguments
- x, y
data table
s.y
is coerced to adata.table
if it isn't one already.- by
A vector of shared column names in
x
andy
to merge on. This defaults to the shared key columns between the two tables. Ify
has no key columns, this defaults to the key ofx
.- by.x, by.y
Vectors of column names in
x
andy
to merge on.- all
logical;
all = TRUE
is shorthand to save setting bothall.x = TRUE
andall.y = TRUE
.- all.x
logical; if
TRUE
, rows fromx
which have no matching row iny
are included. These rows will have 'NA's in the columns that are usually filled with values fromy
. The default isFALSE
so that only rows with data from bothx
andy
are included in the output.- all.y
logical; analogous to
all.x
above.- sort
logical. If
TRUE
(default), the rows of the mergeddata.table
are sorted by setting the key to theby / by.x
columns. IfFALSE
, unlike base R'smerge
for which row order is unspecified, the row order inx
is retained (including retaining the position of missing entries whenall.x=TRUE
), followed byy
rows that don't matchx
(whenall.y=TRUE
) retaining the order those appear iny
.- suffixes
A
character(2)
specifying the suffixes to be used for making non-by
column names unique. The suffix behaviour works in a similar fashion as themerge.data.frame
method does.- no.dups
logical indicating that
suffixes
are also appended to non-by.y
column names iny
when they have the same column name as anyby.x
.- allow.cartesian
See
allow.cartesian
in[.data.table
.- incomparables
values which cannot be matched and therefore are excluded from by columns.
- ...
Not used at this time.
Details
merge
is a generic function in base R. It dispatches to either the
merge.data.frame
method or merge.data.table
method depending on
the class of its first argument. Note that, unlike SQL
join, NA
is
matched against NA
(and NaN
against NaN
) while merging.
For a more data.table
-centric way of merging two data.table
s, see
[.data.table
; e.g., x[y, ...]
. See FAQ 1.11 for a detailed
comparison of merge
and x[y, ...]
.
Value
A new data.table
based on the merged data table
s, and sorted by the
columns set (or inferred for) the by
argument if argument sort
is
set to TRUE
.
Examples
(dt1 <- data.table(A = letters[1:10], X = 1:10, key = "A"))
#> Key: <A>
#> A X
#> <char> <int>
#> 1: a 1
#> 2: b 2
#> 3: c 3
#> 4: d 4
#> 5: e 5
#> 6: f 6
#> 7: g 7
#> 8: h 8
#> 9: i 9
#> 10: j 10
(dt2 <- data.table(A = letters[5:14], Y = 1:10, key = "A"))
#> Key: <A>
#> A Y
#> <char> <int>
#> 1: e 1
#> 2: f 2
#> 3: g 3
#> 4: h 4
#> 5: i 5
#> 6: j 6
#> 7: k 7
#> 8: l 8
#> 9: m 9
#> 10: n 10
merge(dt1, dt2)
#> Key: <A>
#> A X Y
#> <char> <int> <int>
#> 1: e 5 1
#> 2: f 6 2
#> 3: g 7 3
#> 4: h 8 4
#> 5: i 9 5
#> 6: j 10 6
merge(dt1, dt2, all = TRUE)
#> Key: <A>
#> A X Y
#> <char> <int> <int>
#> 1: a 1 NA
#> 2: b 2 NA
#> 3: c 3 NA
#> 4: d 4 NA
#> 5: e 5 1
#> 6: f 6 2
#> 7: g 7 3
#> 8: h 8 4
#> 9: i 9 5
#> 10: j 10 6
#> 11: k NA 7
#> 12: l NA 8
#> 13: m NA 9
#> 14: n NA 10
(dt1 <- data.table(A = letters[rep(1:3, 2)], X = 1:6, key = "A"))
#> Key: <A>
#> A X
#> <char> <int>
#> 1: a 1
#> 2: a 4
#> 3: b 2
#> 4: b 5
#> 5: c 3
#> 6: c 6
(dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A"))
#> Key: <A>
#> A Y
#> <char> <int>
#> 1: b 6
#> 2: b 3
#> 3: c 5
#> 4: c 2
#> 5: d 4
#> 6: d 1
merge(dt1, dt2, allow.cartesian=TRUE)
#> Key: <A>
#> A X Y
#> <char> <int> <int>
#> 1: b 2 6
#> 2: b 2 3
#> 3: b 5 6
#> 4: b 5 3
#> 5: c 3 5
#> 6: c 3 2
#> 7: c 6 5
#> 8: c 6 2
(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = c("A", "B")))
#> Key: <A, B>
#> A B X
#> <int> <char> <int>
#> 1: 1 a 1
#> 2: 1 a 4
#> 3: 1 b 2
#> 4: 1 b 5
#> 5: 1 c 3
#> 6: 2 c 6
(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = c("A", "B")))
#> Key: <A, B>
#> A B Y
#> <int> <char> <int>
#> 1: 1 b 6
#> 2: 1 b 3
#> 3: 1 c 5
#> 4: 1 c 2
#> 5: 1 d 4
#> 6: 2 d 1
merge(dt1, dt2)
#> Key: <A, B>
#> A B X Y
#> <int> <char> <int> <int>
#> 1: 1 b 2 6
#> 2: 1 b 2 3
#> 3: 1 b 5 6
#> 4: 1 b 5 3
#> 5: 1 c 3 5
#> 6: 1 c 3 2
merge(dt1, dt2, by="B", allow.cartesian=TRUE)
#> Key: <B>
#> B A.x X A.y Y
#> <char> <int> <int> <int> <int>
#> 1: b 1 2 1 6
#> 2: b 1 2 1 3
#> 3: b 1 5 1 6
#> 4: b 1 5 1 3
#> 5: c 1 3 1 5
#> 6: c 1 3 1 2
#> 7: c 2 6 1 5
#> 8: c 2 6 1 2
# test it more:
d1 <- data.table(a=rep(1:2,each=3), b=1:6, key=c("a", "b"))
d2 <- data.table(a=0:1, bb=10:11, key="a")
d3 <- data.table(a=0:1, key="a")
d4 <- data.table(a=0:1, b=0:1, key=c("a", "b"))
merge(d1, d2)
#> Key: <a>
#> a b bb
#> <int> <int> <int>
#> 1: 1 1 11
#> 2: 1 2 11
#> 3: 1 3 11
merge(d2, d1)
#> Key: <a>
#> a bb b
#> <int> <int> <int>
#> 1: 1 11 1
#> 2: 1 11 2
#> 3: 1 11 3
merge(d1, d2, all=TRUE)
#> Key: <a>
#> a b bb
#> <int> <int> <int>
#> 1: 0 NA 10
#> 2: 1 1 11
#> 3: 1 2 11
#> 4: 1 3 11
#> 5: 2 4 NA
#> 6: 2 5 NA
#> 7: 2 6 NA
merge(d2, d1, all=TRUE)
#> Key: <a>
#> a bb b
#> <int> <int> <int>
#> 1: 0 10 NA
#> 2: 1 11 1
#> 3: 1 11 2
#> 4: 1 11 3
#> 5: 2 NA 4
#> 6: 2 NA 5
#> 7: 2 NA 6
merge(d3, d1)
#> Key: <a>
#> a b
#> <int> <int>
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
merge(d1, d3)
#> Key: <a>
#> a b
#> <int> <int>
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
merge(d1, d3, all=TRUE)
#> Key: <a>
#> a b
#> <int> <int>
#> 1: 0 NA
#> 2: 1 1
#> 3: 1 2
#> 4: 1 3
#> 5: 2 4
#> 6: 2 5
#> 7: 2 6
merge(d3, d1, all=TRUE)
#> Key: <a>
#> a b
#> <int> <int>
#> 1: 0 NA
#> 2: 1 1
#> 3: 1 2
#> 4: 1 3
#> 5: 2 4
#> 6: 2 5
#> 7: 2 6
merge(d1, d4)
#> Key: <a, b>
#> a b
#> <int> <int>
#> 1: 1 1
merge(d1, d4, by="a", suffixes=c(".d1", ".d4"))
#> Key: <a>
#> a b.d1 b.d4
#> <int> <int> <int>
#> 1: 1 1 1
#> 2: 1 2 1
#> 3: 1 3 1
merge(d4, d1)
#> Key: <a, b>
#> a b
#> <int> <int>
#> 1: 1 1
merge(d1, d4, all=TRUE)
#> Key: <a, b>
#> a b
#> <int> <int>
#> 1: 0 0
#> 2: 1 1
#> 3: 1 2
#> 4: 1 3
#> 5: 2 4
#> 6: 2 5
#> 7: 2 6
merge(d4, d1, all=TRUE)
#> Key: <a, b>
#> a b
#> <int> <int>
#> 1: 0 0
#> 2: 1 1
#> 3: 1 2
#> 4: 1 3
#> 5: 2 4
#> 6: 2 5
#> 7: 2 6
# setkey is automatic by default
set.seed(1L)
d1 <- data.table(a=sample(rep(1:3,each=2)), z=1:6)
d2 <- data.table(a=2:0, z=10:12)
merge(d1, d2, by="a")
#> Key: <a>
#> a z.x z.y
#> <int> <int> <int>
#> 1: 1 1 11
#> 2: 1 5 11
#> 3: 2 2 10
#> 4: 2 3 10
merge(d1, d2, by="a", all=TRUE)
#> Key: <a>
#> a z.x z.y
#> <int> <int> <int>
#> 1: 0 NA 12
#> 2: 1 1 11
#> 3: 1 5 11
#> 4: 2 2 10
#> 5: 2 3 10
#> 6: 3 4 NA
#> 7: 3 6 NA
# using by.x and by.y
setnames(d2, "a", "b")
merge(d1, d2, by.x="a", by.y="b")
#> Key: <a>
#> a z.x z.y
#> <int> <int> <int>
#> 1: 1 1 11
#> 2: 1 5 11
#> 3: 2 2 10
#> 4: 2 3 10
merge(d1, d2, by.x="a", by.y="b", all=TRUE)
#> Key: <a>
#> a z.x z.y
#> <int> <int> <int>
#> 1: 0 NA 12
#> 2: 1 1 11
#> 3: 1 5 11
#> 4: 2 2 10
#> 5: 2 3 10
#> 6: 3 4 NA
#> 7: 3 6 NA
merge(d2, d1, by.x="b", by.y="a")
#> Key: <b>
#> b z.x z.y
#> <int> <int> <int>
#> 1: 1 11 1
#> 2: 1 11 5
#> 3: 2 10 2
#> 4: 2 10 3
# using incomparables values
d1 <- data.table(a=c(1,2,NA,NA,3,1), z=1:6)
d2 <- data.table(a=c(1,2,NA), z=10:12)
merge(d1, d2, by="a")
#> Key: <a>
#> a z.x z.y
#> <num> <int> <int>
#> 1: NA 3 12
#> 2: NA 4 12
#> 3: 1 1 10
#> 4: 1 6 10
#> 5: 2 2 11
merge(d1, d2, by="a", incomparables=NA)
#> Key: <a>
#> a z.x z.y
#> <num> <int> <int>
#> 1: 1 1 10
#> 2: 1 6 10
#> 3: 2 2 11