Consider the following example:
考虑下面的例子:
// Baseline pipeline: a single $group stage keyed on the author field.
// For each distinct author it counts the documents (docsPerAuthor adds 1 per
// document) and adds up their pageViews values (viewsPerAuthor).
db.article.aggregate(
{ $group : {
_id : "$author",
docsPerAuthor : { $sum : 1 },
viewsPerAuthor : { $sum : "$pageViews" }
}}
);
This groups by the author field and computes two fields.
这会按author字段进行分组,并计算出两个字段。
I have values for $author = FirstName_LastName. Now instead of grouping by $author, I want to group by all authors who share the same LastName.
我有$author = FirstName_LastName的值。现在,我想将所有具有相同姓的作者分组,而不是按$author进行分组。
I tried $regex to group by all matching strings after the '_'
我尝试用$regex按'_'之后所有匹配的字符串进行分组
$author.match(/_[a-zA-Z0-9]+$/)
// Attempt 1 (does not group by last name): "$author".match(...) is ordinary
// JavaScript applied to the literal string "$author" while the argument object
// is being built, i.e. before aggregate() is called. That literal contains no
// '_', so match() returns null and _id becomes the constant null instead of a
// per-document value.
db.article.aggregate(
{ $group : {
_id : "$author".match(/_[a-zA-Z0-9]+$/),
docsPerAuthor : { $sum : 1 },
viewsPerAuthor : { $sum : "$pageViews" }
}}
);
also tried the following:
// Attempt 2: passes a { $regex: ... } document under a "$author" key as the
// group _id.
// NOTE(review): $regex is documented as a query (matching) operator; presumably
// it is not accepted as an expression inside $group - confirm against the
// server version in use.
db.article.aggregate(
{ $group : {
_id : {$author: {$regex: /_[a-zA-Z0-9]+$/}},
docsPerAuthor : { $sum : 1 },
viewsPerAuthor : { $sum : "$pageViews" }
}}
);
3 个解决方案
#1
6
Actually, there is no method that provides this kind of functionality, or I could not find the version that contains it. I think this will not work with $regex: http://docs.mongodb.org/manual/reference/operator/regex/ - it is just for pattern matching.
实际上没有提供这种功能的方法,或者我找不到包含它的适当版本。我认为用$regex是行不通的:http://docs.mongodb.org/manual/reference/operator/regex/ ,它只是用于模式匹配。
There is an improvement request in the jira : https://jira.mongodb.org/browse/SERVER-6773
jira中有一个改进请求:https://jira.mongodb.org/browse/SERVER-6773
It is in open unresolved state. BUT
它处于开放的未解决状态。但
On GitHub I found this discussion: https://github.com/mongodb/mongo/pull/336
在github上,我发现了这个讨论:https://github.com/mongodb/mongo/pull/336
And if you check this commit: https://github.com/nleite/mongo/commit/2dd175a5acda86aaad61f5eb9dab83ee19915709
如果您检查这个提交:https://github.com/nleite/mongo/commit/2dd175a5acda86aaad61f5eb9dab83ee19915709
It contains more or less exactly the method you would like to have. I do not really understand the status of this improvement: in 2.2.3 it is not working.
它或多或少正是您想要的方法。我不太清楚这项改进目前的状态:在2.2.3中它无法使用。
#2
4
Use mapReduce: it is the general form of aggregation. This is how to proceed in mongo shell: Define the map function
使用mapReduce:它是聚合的一般形式。这是如何在mongo shell中进行的:定义映射函数
/**
 * Map step: emits one partial aggregate per article document, keyed by the
 * trailing "_LastName" part of `author` (underscore included).
 *
 * Must be invoked with `this` bound to the current document and with an
 * `emit(key, value)` function in scope, as mapReduce does.
 *
 * - A document whose `author` is not a string is skipped (nothing to group on).
 * - An author without a "_LastName" suffix is grouped under the full author
 *   value instead of throwing.
 * - `pageViews` may be a single value or an array of values; a missing
 *   `pageViews` counts as 0.
 */
var mapFunction = function() {
    var author = this.author;
    if (typeof author !== 'string') {
        return;
    }

    // match() returns null when there is no suffix; indexing null would throw.
    var suffix = author.match(/_[a-zA-Z0-9]+$/);
    var key = suffix ? suffix[0] : author;

    var views = this.pageViews;
    var viewsPerAuthor = 0;
    if (Array.isArray(views)) {
        for (var i = 0; i < views.length; i++) {
            viewsPerAuthor += views[i];
        }
    } else if (views != null) {
        viewsPerAuthor = views;
    }

    emit(key, {
        docsPerAuthor: 1,
        viewsPerAuthor: viewsPerAuthor
    });
};
and the reduce function
和reduce函数
/**
 * Reduce step: folds the partial aggregates emitted for one key into a single
 * total.
 *
 * The result deliberately has the same shape as the values emitted by the map
 * step ({docsPerAuthor, viewsPerAuthor}, no `_id`): mapReduce may pass a
 * previous reduce result back in as one of `values`, and keys with a single
 * emitted value are stored without being reduced, so a differing shape would
 * give inconsistent output documents. The key itself is already stored as the
 * output document's _id.
 *
 * @param key    the group key shared by all `values` (unused in the totals)
 * @param values array of {docsPerAuthor, viewsPerAuthor} partial aggregates
 * @returns      {docsPerAuthor, viewsPerAuthor} summed over `values`
 */
var reduceFunction = function(key, values) {
    var reducedObject = {
        docsPerAuthor: 0,
        viewsPerAuthor: 0
    };
    values.forEach(function(value) {
        reducedObject.docsPerAuthor += value.docsPerAuthor;
        reducedObject.viewsPerAuthor += value.viewsPerAuthor;
    });
    return reducedObject;
};
run mapReduce and save the result in map_reduce_result
运行mapReduce并将结果保存到map_reduce_result中
>db.article.mapReduce(mapFunction, reduceFunction, {out:'map_reduce_result'})
query map_reduce_result to have the result
查询map_reduce_result以获得结果
>db.map_reduce_result.find()
#3
3
A possible workaround with the aggregation framework consists in using $project to compute the author name. However, it is dirty as you need to manually loop through the different first name sizes:
使用聚合框架的一个可能的解决方案是使用$project来计算作者名。但是,它是脏的,因为您需要手动循环通过不同的名字大小:
Here, we compute the field name as the substring after the '_' character, trying each of its possible positions (this is why there is a chain of $cond), and falling back to returning the whole $author if the first name is too long:
这里,我们计算字段名作为“_”字符后面的子字符串,尝试每个可能的位置(这就是为什么有一个$cond链),如果第一个名称太长,则返回整个$author:
http://mongotry.herokuapp.com/#?bookmarkId=52fb5f24a0378802003b4c68
http://mongotry.herokuapp.com/#?bookmarkId=52fb5f24a0378802003b4c68
[
{
"$project": {
"author": 1,
"pageViews": 1,
"name": {
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
0,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
1,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
1,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
2,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
2,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
3,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
3,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
4,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
4,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
5,
999
]
},
"$author"
]
}
]
}
]
}
]
}
]
}
}
},
{
"$group": {
"_id": "$name",
"viewsPerAuthor": {
"$sum": "$pageViews"
}
}
}
]
#1
6
Actually, there is no method that provides this kind of functionality, or I could not find the version that contains it. I think this will not work with $regex: http://docs.mongodb.org/manual/reference/operator/regex/ - it is just for pattern matching.
实际上没有提供这种功能的方法,或者我找不到包含它的适当版本。我认为用$regex是行不通的:http://docs.mongodb.org/manual/reference/operator/regex/ ,它只是用于模式匹配。
There is an improvement request in the jira : https://jira.mongodb.org/browse/SERVER-6773
jira中有一个改进请求:https://jira.mongodb.org/browse/SERVER-6773
It is in open unresolved state. BUT
它处于开放的未解决状态。但
On GitHub I found this discussion: https://github.com/mongodb/mongo/pull/336
在github上,我发现了这个讨论:https://github.com/mongodb/mongo/pull/336
And if you check this commit: https://github.com/nleite/mongo/commit/2dd175a5acda86aaad61f5eb9dab83ee19915709
如果您检查这个提交:https://github.com/nleite/mongo/commit/2dd175a5acda86aaad61f5eb9dab83ee19915709
It contains more or less exactly the method you would like to have. I do not really understand the status of this improvement: in 2.2.3 it is not working.
它或多或少正是您想要的方法。我不太清楚这项改进目前的状态:在2.2.3中它无法使用。
#2
4
Use mapReduce: it is the general form of aggregation. This is how to proceed in mongo shell: Define the map function
使用mapReduce:它是聚合的一般形式。这是如何在mongo shell中进行的:定义映射函数
/**
 * Map step: emits one partial aggregate per article document, keyed by the
 * trailing "_LastName" part of `author` (underscore included).
 *
 * Must be invoked with `this` bound to the current document and with an
 * `emit(key, value)` function in scope, as mapReduce does.
 *
 * - A document whose `author` is not a string is skipped (nothing to group on).
 * - An author without a "_LastName" suffix is grouped under the full author
 *   value instead of throwing.
 * - `pageViews` may be a single value or an array of values; a missing
 *   `pageViews` counts as 0.
 */
var mapFunction = function() {
    var author = this.author;
    if (typeof author !== 'string') {
        return;
    }

    // match() returns null when there is no suffix; indexing null would throw.
    var suffix = author.match(/_[a-zA-Z0-9]+$/);
    var key = suffix ? suffix[0] : author;

    var views = this.pageViews;
    var viewsPerAuthor = 0;
    if (Array.isArray(views)) {
        for (var i = 0; i < views.length; i++) {
            viewsPerAuthor += views[i];
        }
    } else if (views != null) {
        viewsPerAuthor = views;
    }

    emit(key, {
        docsPerAuthor: 1,
        viewsPerAuthor: viewsPerAuthor
    });
};
and the reduce function
和reduce函数
/**
 * Reduce step: folds the partial aggregates emitted for one key into a single
 * total.
 *
 * The result deliberately has the same shape as the values emitted by the map
 * step ({docsPerAuthor, viewsPerAuthor}, no `_id`): mapReduce may pass a
 * previous reduce result back in as one of `values`, and keys with a single
 * emitted value are stored without being reduced, so a differing shape would
 * give inconsistent output documents. The key itself is already stored as the
 * output document's _id.
 *
 * @param key    the group key shared by all `values` (unused in the totals)
 * @param values array of {docsPerAuthor, viewsPerAuthor} partial aggregates
 * @returns      {docsPerAuthor, viewsPerAuthor} summed over `values`
 */
var reduceFunction = function(key, values) {
    var reducedObject = {
        docsPerAuthor: 0,
        viewsPerAuthor: 0
    };
    values.forEach(function(value) {
        reducedObject.docsPerAuthor += value.docsPerAuthor;
        reducedObject.viewsPerAuthor += value.viewsPerAuthor;
    });
    return reducedObject;
};
run mapReduce and save the result in map_reduce_result
运行mapReduce并将结果保存到map_reduce_result中
>db.article.mapReduce(mapFunction, reduceFunction, {out:'map_reduce_result'})
query map_reduce_result to have the result
查询map_reduce_result以获得结果
>db.map_reduce_result.find()
#3
3
A possible workaround with the aggregation framework consists in using $project to compute the author name. However, it is dirty as you need to manually loop through the different first name sizes:
使用聚合框架的一个可能的解决方案是使用$project来计算作者名。但是,它是脏的,因为您需要手动循环通过不同的名字大小:
Here, we compute the field name as the substring after the '_' character, trying each of its possible positions (this is why there is a chain of $cond), and falling back to returning the whole $author if the first name is too long:
这里,我们计算字段名作为“_”字符后面的子字符串,尝试每个可能的位置(这就是为什么有一个$cond链),如果第一个名称太长,则返回整个$author:
http://mongotry.herokuapp.com/#?bookmarkId=52fb5f24a0378802003b4c68
http://mongotry.herokuapp.com/#?bookmarkId=52fb5f24a0378802003b4c68
[
{
"$project": {
"author": 1,
"pageViews": 1,
"name": {
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
0,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
1,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
1,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
2,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
2,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
3,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
3,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
4,
999
]
},
{
"$cond": [
{
"$eq": [
{
"$substr": [
"$author",
4,
1
]
},
"_"
]
},
{
"$substr": [
"$author",
5,
999
]
},
"$author"
]
}
]
}
]
}
]
}
]
}
}
},
{
"$group": {
"_id": "$name",
"viewsPerAuthor": {
"$sum": "$pageViews"
}
}
}
]