// GroupDocumentSource.js
  1. "use strict";
  2. var DocumentSource = require("./DocumentSource"),
  3. Accumulators = require("../accumulators/"),
  4. Document = require("../Document"),
  5. Expression = require("../expressions/Expression"),
  6. ConstantExpression = require("../expressions/ConstantExpression"),
  7. FieldPathExpression = require("../expressions/FieldPathExpression"),
  8. Variables = require("../expressions/Variables"),
  9. VariablesIdGenerator = require("../expressions/VariablesIdGenerator"),
  10. VariablesParseState = require("../expressions/VariablesParseState"),
  11. async = require("async");
  12. /**
  13. * A class for grouping documents together
  14. *
  15. * @class GroupDocumentSource
  16. * @namespace mungedb-aggregate.pipeline.documentSources
  17. * @module mungedb-aggregate
  18. * @constructor
  19. * @param [expCtx] {ExpressionContext}
  20. **/
  21. var GroupDocumentSource = module.exports = function GroupDocumentSource(expCtx) {
  22. if (arguments.length > 1) throw new Error("up to one arg expected");
  23. base.call(this, expCtx);
  24. expCtx = !expCtx ? {} : expCtx;
  25. this.populated = false;
  26. this._doingMerge = false;
  27. this._spilled = false;
  28. this.extSortAllowed = expCtx.extSortAllowed && !expCtx.inRouter;
  29. this._maxMemoryUsageBytes = 100*1024*1024; // NOTE: This came from mongo
  30. this.accumulatorFactories = [];
  31. this.currentAccumulators = [];
  32. this.groups = {}; // GroupsType Value -> Accumulators[]
  33. this.groupsKeys = []; // This is to faciliate easier look up of groups
  34. this._variables = null;
  35. this.fieldNames = [];
  36. this.idFieldNames = [];
  37. this.expressions = [];
  38. this.idExpressions = [];
  39. this.currentGroupsKeysIndex = 0;
  40. }, klass = GroupDocumentSource, base = DocumentSource, proto = klass.prototype = Object.create(base.prototype, {constructor:{value:klass}});
  41. // TODO: Do we need this?
  42. klass.groupOps = {
  43. "$addToSet": Accumulators.AddToSet,
  44. "$avg": Accumulators.Avg,
  45. "$first": Accumulators.First,
  46. "$last": Accumulators.Last,
  47. "$max": Accumulators.MinMax.createMax, // $min and $max have special constructors because they share base features
  48. "$min": Accumulators.MinMax.createMin,
  49. "$push": Accumulators.Push,
  50. "$sum": Accumulators.Sum
  51. };
  52. klass.groupName = "$group";
  53. /**
  54. * Factory for making GroupDocumentSources
  55. *
  56. * @method create
  57. * @static
  58. * @param [expCtx] {ExpressionContext}
  59. **/
  60. klass.create = function create(expCtx) {
  61. return new GroupDocumentSource(expCtx);
  62. };
  63. /**
  64. * Factory for making GroupDocumentSources
  65. *
  66. * @method getSourceName
  67. * @return {GroupDocumentSource}
  68. **/
  69. proto.getSourceName = function getSourceName() {
  70. return klass.groupName;
  71. };
  72. /**
  73. * Gets the next document or DocumentSource.EOF if none
  74. *
  75. * @method getNext
  76. * @return {Object}
  77. **/
  78. proto.getNext = function getNext(callback) {
  79. if (!callback) throw new Error(this.getSourceName() + ' #getNext() requires callback.');
  80. if (this.expCtx.checkForInterrupt && this.expCtx.checkForInterrupt() === false)
  81. return callback(new Error("Interrupted"));
  82. var self = this;
  83. async.series([
  84. function(next) {
  85. if (!self.populated)
  86. self.populate(function(err) {
  87. return next(err);
  88. });
  89. else
  90. return next();
  91. },
  92. function(next) {
  93. if (self._spilled) {
  94. // NOTE: this got skipped before, work on it
  95. throw new error("Spilled isn't finished.")
  96. if (!self._sortIterator)
  97. return next(null, DocumentSource.EOF); // self is a boost::none in mongo
  98. numAccumulators = self.accumulatorFactories.length; // TODO: whaaat?
  99. for (var i = 0; i < numAccumulators; i++) {
  100. self.currentAccumulators[i].reset(); // prep accumulatorts for new group
  101. }
  102. self._currentId = self.firstPartOfNextGroup.first;
  103. while (self._currentId === _firstPartOfNextGroup.first) {
  104. // Inside of self loop, _firstPartOfNextGroup is the current data being processed.
  105. // At loop exit, it is the first value to be processed in the next group.
  106. switch (numAccumulators) { // mirrors switch in spill()
  107. case 0: // No Accumulators so no Values
  108. break;
  109. case 1: // single accumulators serialize as a single value
  110. self.currentAccumulators[0].process(_firstPartOfNextGroup.second, /*merging=*/true);
  111. break;
  112. default:
  113. var accumulatorStates = _firstPartOfNextGroup.second.getArray();
  114. for (var j = 0; j < numAccumulators; j++) {
  115. self.currentAccumulators[j].process(accumulatorStates[j], /*merging=*/true);
  116. }
  117. break;
  118. }
  119. if (!self._sorterIterator.more()) {
  120. dispose(); // what?
  121. break;
  122. }
  123. _firstPartOfNextGroup = _sorterIterator.next();
  124. }
  125. return next(null, makeDocument(_currentId, _currentAccumulators, self.expCtx.inShard));
  126. } else {
  127. if(self.currentGroupsKeysIndex === self.groupsKeys.length) {
  128. return next(null, DocumentSource.EOF);
  129. }
  130. var out = makeDocument(groupsIterator[0], groupsIterator[1], expCtx.inShard);
  131. if (++groupsIterator === groups.end)
  132. dispose();
  133. return next(null, out);
  134. if(self.currentGroupsKeysIndex === self.groupsKeys.length) {
  135. return next(null, DocumentSource.EOF);
  136. }
  137. var id = self.groupsKeys[self.currentGroupsKeysIndex],
  138. accumulators = self.groups[id],
  139. out = self.makeDocument(id, accumulators, expCtx.inShard);
  140. if(++self.currentGroupsKeysIndex === self.groupsKeys.length) {
  141. self.dispose();
  142. }
  143. return next(null, out);
  144. }
  145. }
  146. ], function(err, results) {
  147. callback(err, results[1]);
  148. });
  149. };
  150. /**
  151. * Sets this source as apparently empty
  152. *
  153. * @method dispose
  154. **/
  155. proto.dispose = function dispose() {
  156. //NOTE: Skipped 'freeing' our resources; at best we could remove some references to things, but our parent will probably forget us anyways!
  157. // make us look done
  158. this.currentGroupsKeysIndex = this.groupsKeys.length;
  159. // free our source's resources
  160. this.source.dispose();
  161. };
  162. /**
  163. * Optimizes the expressions in the group
  164. * @method optimize
  165. **/
  166. proto.optimize = function optimize() {
  167. // TODO if all _idExpressions are ExpressionConstants after optimization, then we know there
  168. // will only be one group. We should take advantage of that to avoid going through the hash
  169. // table.
  170. var self = this;
  171. self.idExpressions.forEach(function(expression, i) {
  172. self.idExpressions[i] = expression.optimize();
  173. });
  174. self.expressions.forEach(function(expression, i) {
  175. self.expressions[i] = expression.optimize();
  176. });
  177. };
  178. /**
  179. * Create an object that represents the document source. The object
  180. * will have a single field whose name is the source's name.
  181. *
  182. * @method serialize
  183. * @param explain {Boolean} Create explain output
  184. **/
  185. proto.serialize = function serialize(explain) {
  186. var insides = {};
  187. // add the _id
  188. if (this.idFieldNames.length === 0) {
  189. if (this.idExpressions.length !== 1) throw new error("Should only have one _id field");
  190. insides._id = this.idExpressions[0].serialize(explain);
  191. }
  192. //add the remaining fields
  193. var aFacs = this.accumulatorFactories,
  194. aFacLen = aFacs.length;
  195. for(var i=0; i < aFacLen; i++) {
  196. var aFac = aFacs[i](),
  197. serialExpression = this.expressions[i].serialize(explain), //Get the accumulator's expression
  198. serialAccumulator = {}; //Where we'll put the expression
  199. serialAccumulator[aFac.getOpName()] = serialExpression;
  200. insides[this.fieldNames[i]] = serialAccumulator;
  201. }
  202. var serialSource = {};
  203. serialSource[this.getSourceName()] = insides;
  204. return serialSource;
  205. };
  206. /**
  207. * Creates a GroupDocumentSource from the given elem
  208. *
  209. * @method createFromJson
  210. * @param elem {Object} The group specification object; the right hand side of the $group
  211. **/
  212. klass.createFromJson = function createFromJson(elem, expCtx) {
  213. if (!(elem instanceof Object && elem.constructor === Object)) throw new Error("a group's fields must be specified in an object");
  214. var group = GroupDocumentSource.create(expCtx),
  215. idSet = false;
  216. var groupObj = elem,
  217. idGenerator = new VariablesIdGenerator(),
  218. vps = new VariablesParseState(idGenerator);
  219. for (var groupFieldName in groupObj) {
  220. if (groupObj.hasOwnProperty(groupFieldName)) {
  221. var groupField = groupObj[groupFieldName];
  222. if (groupFieldName === "_id") {
  223. if(idSet) throw new Error("15948 a group's _id may only be specified once");
  224. group.parseIdExpression(groupField, vps);
  225. idSet = true;
  226. } else if (groupFieldName === '$doingMerge' && groupField) {
  227. throw new Error("17030 $doingMerge should be true if present");
  228. } else {
  229. /*
  230. Treat as a projection field with the additional ability to
  231. add aggregation operators.
  232. */
  233. if (groupFieldName.indexOf(".") !== -1) throw new Error("16414 the group aggregate field name '" + groupFieldName + "' cannot contain '.'");
  234. if (groupFieldName[0] === "$") throw new Error("15950 the group aggregate field name '" + groupFieldName + "' cannot be an operator name");
  235. if (group._getTypeStr(groupFieldName) === "Object") throw new Error("15951 the group aggregate field '" + groupFieldName + "' must be defined as an expression inside an object");
  236. var subElementCount = 0;
  237. for (var subElementName in groupField) {
  238. if (groupField.hasOwnProperty(subElementName)) {
  239. var subElement = groupField[subElementName],
  240. op = klass.groupOps[subElementName];
  241. if (!op) throw new Error("15952 unknown group operator '" + subElementName + "'");
  242. var groupExpression,
  243. subElementTypeStr = group._getTypeStr(subElement);
  244. if (subElementTypeStr === "Object") {
  245. var subElementObjCtx = new Expression.ObjectCtx({isDocumentOk:true});
  246. groupExpression = Expression.parseObject(subElement, subElementObjCtx, vps);
  247. } else if (subElementTypeStr === "Array") {
  248. throw new Error("15953 aggregating group operators are unary (" + subElementName + ")");
  249. } else { /* assume its an atomic single operand */
  250. groupExpression = Expression.parseOperand(subElement, vps);
  251. }
  252. group.addAccumulator(groupFieldName, op, groupExpression);
  253. ++subElementCount;
  254. }
  255. }
  256. if (subElementCount !== 1) throw new Error("15954 the computed aggregate '" + groupFieldName + "' must specify exactly one operator");
  257. }
  258. }
  259. }
  260. if (!idSet) throw new Error("15955 a group specification must include an _id");
  261. group._variables = new Variables(idGenerator.getIdCount());
  262. return group;
  263. };
  264. /**
  265. * Populates the GroupDocumentSource by grouping all of the input documents at once.
  266. *
  267. * @method populate
  268. * @param callback {Function} Required. callback(err) when done populating.
  269. * @async
  270. **/
  271. proto.populate = function populate(callback) {
  272. var numAccumulators = this.accumulatorFactories.length;
  273. // NOTE: this is not in mongo, does it belong here?
  274. if(numAccumulators !== this.expressions.length) {
  275. callback(new Error("Must have equal number of accumulators and expressions"));
  276. }
  277. var input,
  278. self = this;
  279. async.whilst(
  280. function() {
  281. return input !== DocumentSource.EOF;
  282. },
  283. function(cb) {
  284. self.source.getNext(function(err, doc) {
  285. if(err) return cb(err);
  286. if(doc === DocumentSource.EOF) {
  287. input = doc;
  288. return cb(); //Need to stop now, no new input
  289. }
  290. input = doc;
  291. self._variables.setRoot(input);
  292. /* get the _id value */
  293. var id = self.idExpression.evaluate(self._variables);
  294. if(undefined === id) id = null;
  295. var groupKey = JSON.stringify(id),
  296. group = self.groups[JSON.stringify(id)];
  297. if(!group) {
  298. self.groupsKeys.push(groupKey);
  299. group = [];
  300. self.groups[groupKey] = group;
  301. // Add the accumulators
  302. for(var afi = 0; afi<self.accumulatorFactories.length; afi++) {
  303. group.push(self.accumulatorFactories[afi]());
  304. }
  305. }
  306. //NOTE: Skipped memory usage stuff for case when group already existed
  307. if(numAccumulators !== group.length) {
  308. throw new Error('Group must have one of each accumulator');
  309. }
  310. //NOTE: passing the input to each accumulator
  311. for(var gi=0; gi<group.length; gi++) {
  312. group[gi].process(self.expressions[gi].evaluate(self._variables, self._doingMerge));
  313. }
  314. // We are done with the ROOT document so release it.
  315. self._variables.clearRoot();
  316. //NOTE: Skipped the part about sorted files
  317. return cb();
  318. });
  319. },
  320. function(err) {
  321. if(err) return callback(err);
  322. self.populated = true;
  323. return callback();
  324. }
  325. );
  326. };
  327. /**
  328. * Get the dependencies of the group
  329. *
  330. * @method getDependencies
  331. * @param deps {Object} The
  332. * @return {DocumentSource.getDepsReturn} An enum value specifying that these dependencies are exhaustive
  333. * @async
  334. **/
  335. proto.getDependencies = function getDependencies(deps) {
  336. var self = this;
  337. // add _id
  338. this.idExpressions.forEach(function(expression, i) {
  339. expression.addDependencies(deps);
  340. });
  341. // add the rest
  342. this.fieldNames.forEach(function (field, i) {
  343. self.expressions[i].addDependencies(deps);
  344. });
  345. return DocumentSource.GetDepsReturn.EXHAUSTIVE;
  346. };
  347. /**
  348. * Called internally only. Adds an accumulator for each matching group.
  349. *
  350. * @method addAccumulator
  351. * @param fieldName {String} The name of the field where the accumulated value will be placed
  352. * @param accumulatorFactory {Accumulator} The constructor for creating accumulators
  353. * @param epxression {Expression} The expression to be evaluated on incoming documents before they are accumulated
  354. **/
  355. proto.addAccumulator = function addAccumulator(fieldName, accumulatorFactory, expression) {
  356. this.fieldNames.push(fieldName);
  357. this.accumulatorFactories.push(accumulatorFactory);
  358. this.expressions.push(expression);
  359. };
  360. /**
  361. * Makes a document with the given id and accumulators
  362. *
  363. * @method makeDocument
  364. * @param fieldName {String} The name of the field where the accumulated value will be placed
  365. * @param accums {Array} An array of accumulators
  366. * @param epxression {Expression} The expression to be evaluated on incoming documents before they are accumulated
  367. **/
  368. proto.makeDocument = function makeDocument(id, accums, mergeableOutput) {
  369. var out = {};
  370. /* add the _id field */
  371. out._id = this.expandId(id);
  372. /* add the rest of the fields */
  373. this.fieldNames.forEach(function(fieldName, i) {
  374. var val = accums[i].getValue(mergeableOutput);
  375. if (!val) {
  376. out[fieldName] = null;
  377. } else {
  378. out[fieldName] = val;
  379. }
  380. });
  381. return out;
  382. };
  383. /**
  384. * Computes the internal representation of the group key.
  385. */
  386. proto.computeId = function computeId(vars) {
  387. var self = this;
  388. // If only one expression return result directly
  389. if (self.idExpressions.length === 1)
  390. return self.idExpressions[0].evaluate(vars); // NOTE: self will probably need to be async soon
  391. // Multiple expressions get results wrapped in an array
  392. var vals = [];
  393. self.idExpressions.forEach(function(expression, i) {
  394. vals.push(expression.evaluate(vars));
  395. });
  396. return vals;
  397. };
  398. /**
  399. * Converts the internal representation of the group key to the _id shape specified by the
  400. * user.
  401. */
  402. proto.expandId = function expandId(val) {
  403. var self = this;
  404. // _id doesn't get wrapped in a document
  405. if (self.idFieldNames.length === 0)
  406. return val;
  407. var doc = {};
  408. // _id is a single-field document containing val
  409. if (self.idFieldNames.length === 1) {
  410. doc[self.idFieldNames[0]] = val;
  411. return doc;
  412. }
  413. // _id is a multi-field document containing the elements of val
  414. vals.forEach(function(val, i) {
  415. doc[self.idFieldNames[i]] = val;
  416. });
  417. return doc;
  418. };
  419. proto.parseIdExpression = function parseIdExpression(groupField, vps) {
  420. var self = this;
  421. if (self._getTypeStr(groupField) === 'Object' && groupField !== {}) {
  422. // {_id: {}} is treated as grouping on a constant, not an expression
  423. var idKeyObj = groupField;
  424. if (Object.keys(idKeyObj)[0] == '$') {
  425. self.idExpressions.push(Expression.parseObject(idKeyObj, {}, vps));
  426. } else {
  427. Object.keys(idKeyObj).forEach(function(key, i) {
  428. var field = {}; //idKeyObj[key];
  429. field[key] = idKeyObj[key];
  430. self.idFieldNames.push(key);
  431. self.idExpressions.push(Expression.parseOperand(field, vps));
  432. });
  433. }
  434. } else if (self._getTypeStr(groupField) === 'string' && groupField[0] === '$') {
  435. self.idExpressions.push(FieldPathExpression.parse(groupField, vps));
  436. } else {
  437. self.idExpressions.push(ConstantExpression.create(groupField));
  438. }
  439. };
  440. /**
  441. * Get the type of something. Handles objects specially to return their true type; i.e. their constructor
  442. *
  443. * @method populate
  444. * @param obj {Object} The object to get the type of
  445. * @return {String} The type of the object as a string
  446. * @async
  447. **/
  448. proto._getTypeStr = function _getTypeStr(obj) {
  449. var typeofStr = typeof obj,
  450. typeStr = (typeofStr == "object" && obj !== null) ? obj.constructor.name : typeofStr;
  451. return typeStr;
  452. };