Elasticsearch: sum after deduplication

Date: 2024-02-23 10:52:30

The requirement in the title can be expressed with the following SQL:

select sum(column2) from (select distinct(column1), column2 from table) t

How can this be done with the Query DSL? First, let's prepare an index and some test data. With the data below, the deduplicated sum should be 2 + 1 = 3, while a plain sum over all four documents gives 6.

PUT test_index
{
  "mappings": {
    "properties": {
      "column1": {
        "type": "keyword"
      },
      "column2": {
        "type": "long"
      }
    }
  }
}
PUT test_index/_doc/1
{
  "column1": "1",
  "column2": 2
}

PUT test_index/_doc/2
{
  "column1": "1",
  "column2": 2
}


PUT test_index/_doc/3
{
  "column1": "2",
  "column2": 1
}

PUT test_index/_doc/4
{
  "column1": "2",
  "column2": 1
}

My first thought was to combine collapse with a cardinality aggregation, plus a sum. How does that work out?

GET test_index/_search
{
  "collapse": {
    "field": "column1"
  },
  "aggs": {
    "distinct_column": {
      "cardinality": {
        "field": "column1"
      }
    },
    "distinct_sum":{
      "sum": {
        "field": "column2"
      }
    },
    "all_sum":{
      "sum": {
        "field": "column2"
      }
    }
  }
}

Looking at the result: the count is deduplicated, but the sum is not. collapse only removes duplicates from the hits list; aggregations are still computed over every document matching the query, so distinct_sum and all_sum both return 6.0 instead of the expected 3.

{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 4,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [
      {
        "_index" : "test_index",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "column1" : "1",
          "column2" : 2
        },
        "fields" : {
          "column1" : [
            "1"
          ]
        }
      },
      {
        "_index" : "test_index",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 1.0,
        "_source" : {
          "column1" : "2",
          "column2" : 1
        },
        "fields" : {
          "column1" : [
            "2"
          ]
        }
      }
    ]
  },
  "aggregations" : {
    "distinct_sum" : {
      "value" : 6.0
    },
    "all_sum" : {
      "value" : 6.0
    },
    "distinct_column" : {
      "value" : 2
    }
  }
}
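
To confirm that collapse never feeds into the aggregation phase, here is a minimal check (a sketch of my own, reusing the same test_index): dropping collapse entirely and asking only for the aggregations returns exactly the same values, distinct_column = 2 and all_sum = 6.0.

GET test_index/_search
{
  "size": 0,
  "aggs": {
    "distinct_column": {
      "cardinality": {
        "field": "column1"
      }
    },
    "all_sum": {
      "sum": {
        "field": "column2"
      }
    }
  }
}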

After trying a few more variations without success, I found this:

Sum aggregation on Unique Data in ElasticSearch - Stack Overflow

So let's give it a try:

GET test_index/_search
{
  "size": 0, 
  "aggs": {
    "column1_count": {
      "terms": {
        "field": "column1",
        "size": 100
      },
      "aggs": {
        "column2_avg": {
          "avg": {
            "field": "column2"
          }
        }
      }
    },
    "unique_count": {
      "cardinality": {
        "field": "column1"
      }
    },
    "unique_sum_column2":{
      "sum_bucket": {
        "buckets_path": "column1_count>column2_avg"
      }
    },
    "sum_column2":{
      "sum": {
        "field": "column2"
      }
    }
  }
}

Excellent, this matches the expectation: unique_sum_column2 comes back as 3.0. The trick is that every document inside a column1 bucket carries the same column2, so the avg sub-aggregation simply yields that value, and the sum_bucket pipeline aggregation then adds up those per-bucket averages. I'm impressed by the idea.

{
  "took" : 5,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 4,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "sum_column2" : {
      "value" : 6.0
    },
    "column1_count" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "1",
          "doc_count" : 2,
          "column2_avg" : {
            "value" : 2.0
          }
        },
        {
          "key" : "2",
          "doc_count" : 2,
          "column2_avg" : {
            "value" : 1.0
          }
        }
      ]
    },
    "unique_count" : {
      "value" : 2
    },
    "unique_sum_column2" : {
      "value" : 3.0
    }
  }
}
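
Since deduplicating on column1 only makes sense when all documents sharing a column1 value carry the same column2, the avg sub-aggregation can just as well be replaced with min or max. A minimal sketch of the min variant, reusing the same index (the names column1_buckets and column2_value are only illustrative):

GET test_index/_search
{
  "size": 0,
  "aggs": {
    "column1_buckets": {
      "terms": {
        "field": "column1",
        "size": 100
      },
      "aggs": {
        "column2_value": {
          "min": {
            "field": "column2"
          }
        }
      }
    },
    "unique_sum_column2": {
      "sum_bucket": {
        "buckets_path": "column1_buckets>column2_value"
      }
    }
  }
}

Either way, note that the terms aggregation only returns its top "size" buckets (100 here), and sum_bucket sums only the buckets it receives; if column1 has high cardinality, size must be large enough to cover all distinct values (the cardinality aggregation, unique_count above, is a handy way to check), or the result will be under-counted.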