Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simplify deduplication logic #1607

Merged
merged 1 commit into from
Mar 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 16 additions & 68 deletions middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@ const formatLog = (hit) => {
return [name, zip, hit._id].filter(Boolean).join(' ');
};

/**
* Deduplication workflow:
*
* 1. iterate over results starting at position 0
* 2. on each iteration search for duplicate candidates:
* 2.1 at higher positions in array
* 2.2 not contained in the skip-list
* 3. from the list of candidates, select a preferred master record
* 4. push master record on to return array
* 5. add non-master candidates to a skip-list
* 6. continue down list until end
*/

function dedupeResults(req, res, next) {

// do nothing if request data is invalid
Expand All @@ -33,72 +20,33 @@ function dedupeResults(req, res, next) {
// do nothing if no result data is invalid
if( _.isUndefined(res) || !_.isArray(res.data) || _.isEmpty(res.data) ){ return next(); }

// loop through data items and only copy unique items to unique
const unique = [];

// maintain a skip-list
const skip = [];

// use the user agent language to improve deduplication
const lang = _.get(req, 'clean.lang.iso6393');

// 1. iterate over res.data
res.data.forEach((place, ppos) => {

// skip records in the skip-list
if (skip.includes(place)){ return; }

// 2. search for duplicate candidates
const candidates = res.data.filter((candidate, cpos) => {

// 2.1 at higher positions in array
if (cpos <= ppos) { return false; }

// 2.2 not contained in the skip-list
if (skip.includes(candidate)) { return false; }
// maintain a set of inferior records (by their array offsets)
const inferior = new Set();
for (var i = 0; i < (res.data.length-1); i++) {
for (var j = (i+1); j < res.data.length; j++) {

// true if the two records are considered duplicates
return !isDifferent(place, candidate, lang);
});
// ensure these two records are considered duplicates
if (isDifferent(res.data[i], res.data[j], lang)) { continue; }

// 3. select a preferred master record
// decide which of the two records was 'inferior'
// note: $preference equals true when $j is preferred and vice versa
const preference = isPreferred(res.data[i], res.data[j]);
inferior.add(preference ? i : j);

// simple case where no candidates were found
if (candidates.length === 0){
unique.push(place);
return;
}

// by default we consider the candidate with the lowest index as master
let master = place;

// iterate over candidates looking for one which is preferred to
// the currently selected master
candidates.forEach(candidate => {
if (isPreferred(master, candidate)){
master = candidate;
}
});

// logging
if (master !== place) {
// logging
logger.debug('[dupe][replacing]', {
query: req.clean.text,
previous: formatLog(place),
hit: formatLog(master)
superior: formatLog(res.data[preference ? j : i]),
inferior: formatLog(res.data[preference ? i : j]),
});
}
}

// 4. push master record on to return array
unique.push(master);

// 5. add non-master candidates to a skip-list
candidates.forEach(candidate => {
skip.push(candidate);
});
});

// replace the original data with only the unique hits
// remove inferior records, return the remaining results
const unique = res.data.filter((v, o) => !inferior.has(o));
const maxElements = _.get(req, 'clean.size', undefined);
res.data = unique.slice(0, maxElements);

Expand Down
35 changes: 35 additions & 0 deletions test/unit/middleware/dedupe.js
Original file line number Diff line number Diff line change
Expand Up @@ -820,6 +820,41 @@ module.exports.tests.priority = function(test, common) {
t.end();
});
});

test('A->B B->C dependency graph', function (t) {
var req = {
clean: {
text: 'A B C',
size: 10
}
};
var res = {
data: [
{
'source': 'example',
'source_id': 'A',
'layer': 'test',
'name': { 'default': ['name1'] }
}, {
'source': 'example',
'source_id': 'B',
'layer': 'test',
'name': { 'default': ['name1', 'name2'] }
}, {
'source': 'example',
'source_id': 'C',
'layer': 'test',
'name': { 'default': ['name2'] }
}
]
};

dedupe(req, res, () => {
t.equal(res.data.length, 1, 'results are deduped');
t.equal(res.data[0].source_id, 'A');
t.end();
});
});
};

module.exports.all = function (tape, common) {
Expand Down