-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremoveDuplicates.js
105 lines (94 loc) · 1.91 KB
/
removeDuplicates.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
var _ = require('underscore');
var Promise = require('bluebird');
var mongoose = require('mongoose');
// Ask bluebird to promisify mongoose; the *Async methods used further down
// (aggregateAsync, execAsync) rely on this call having run first.
Promise.promisifyAll(mongoose);
// Configuration: conf.db is handed to mongoose.connect() below as the connection target.
var conf = require('./configuration/conf.json');
// Open the default mongoose connection.
console.log('Connecting to mongo');
mongoose.connect(conf.db);
// Handle to the default connection.
// NOTE(review): `db` is not referenced again in the visible code.
var db = mongoose.connection;
// Mongoose schema describing one stored image document.
// NOTE(review): the field names (farm, server, secret, url_o, url_z, ...) look like
// Flickr photo metadata - confirm against whatever populates this collection.
var ImageSchema = new mongoose.Schema({
// Grouping key used by findDuplicates: documents sharing this value are treated as duplicates.
id: String,
owner: String,
secret: String,
server: String,
farm: Number,
title: String,
ispublic: Number,
isfriend: Number,
isfamily: Number,
license: String,
// Declared with the string type name 'mixed'; presumably resolves to mongoose's
// Mixed schema type - verify against the installed mongoose version.
description: {
type: 'mixed'
},
dateupload: String,
lastupdate: String,
datetaken: Date,
datetakengranularity: String,
ownername: String,
iconserver: String,
iconfarm: Number,
views: String,
tags: String,
machine_tags: String,
latitude: Number,
longitude: Number,
accuracy: Number,
context: Number,
media: String,
media_status: String,
url_o: String,
url_z: String
});
console.log('Loading the schema');
// Register the schema under the model name 'image'; `Image` is the model the
// duplicate finder and remover below query through.
var Image = mongoose.model('image', ImageSchema);
/**
 * Finds groups of image documents that share the same `id` field value.
 *
 * Runs an aggregation on the image collection that groups documents by `id`,
 * keeps only groups containing two or more documents, orders them by group
 * size (largest first) and returns at most `limit` groups.
 *
 * @param {number} [limit=50] - Maximum number of duplicate groups to return.
 *     Must be a positive integer when given.
 * @returns {Promise} Resolves with the aggregation result: one entry per
 *     duplicate group, shaped `{ _id: { name }, uniqueIds, count }`, where
 *     `uniqueIds` holds the `_id` of every document in the group. Rejects
 *     with a TypeError when `limit` is not a positive integer.
 */
var findDuplicates = function(limit) {
  // Omitting the argument keeps the historical cap of 50 groups per run.
  var maxGroups = limit === undefined ? 50 : limit;
  var isPositiveInteger = typeof maxGroups === 'number' &&
    isFinite(maxGroups) &&
    Math.floor(maxGroups) === maxGroups &&
    maxGroups > 0;
  if (!isPositiveInteger) {
    return Promise.reject(new TypeError('limit must be a positive integer'));
  }

  return Image.collection.aggregateAsync([{
    $group: {
      // Group key: the value of each document's `id` field.
      _id: {
        name: "$id"
      },
      // Collect the document _ids that belong to this group.
      uniqueIds: {
        $addToSet: "$_id"
      },
      count: {
        $sum: 1
      }
    }
  }, {
    // Only groups with at least two documents are duplicates.
    $match: {
      count: {
        $gte: 2
      }
    }
  }, {
    // Largest groups first.
    $sort: {
      count: -1
    }
  }, {
    $limit: maxGroups
  }]);
};
/**
 * Deletes the redundant documents of each duplicate group, keeping one per group.
 *
 * For every group, all `_id`s listed in `uniqueIds` except the last one are
 * removed with a single query; the document with the last `_id` is kept.
 * The remove queries for all groups are started together. The `result`
 * array and its group objects are left unmodified.
 *
 * @param {Array<{uniqueIds: Array}>} result - Duplicate groups, as produced
 *     by findDuplicates.
 * @returns {Promise} Resolves when every remove query has completed; rejects
 *     as soon as any one of them fails.
 */
var removeDuplicates = function(result) {
  console.log('Removing duplicates');
  var tasks = result.map(function(duplicate) {
    // Copy all but the last id instead of pop()-ing, so the caller's
    // aggregation result is not mutated.
    var staleIds = duplicate.uniqueIds.slice(0, -1);
    return Image.find()
      .where('_id')
      .in(staleIds)
      .remove()
      .execAsync();
  });
  return Promise.all(tasks);
};
// Entry point: find the duplicate groups, delete the redundant documents,
// then always close the mongoose connection. Without the disconnect the open
// connection keeps the event loop alive and the script never exits.
findDuplicates()
  .then(removeDuplicates)
  .then(function() {
    console.log('Duplicates removed');
  })
  .catch(function(err) {
    // Report on stderr and signal failure to the calling shell.
    console.error(err);
    process.exitCode = 1;
  })
  .finally(function() {
    return mongoose.disconnect();
  });