source: duplicate_check.py @ 26:4af842589412

Revision 26:4af842589412, 1.3 KB checked in by hagenbruch@phoibe.ub.rub.de, 7 years ago (diff)

Added auto-suggest and linked data mashup; leafing through single hits is now independent of position in result list (i.e. an entry can be bookmarked as is); further improvements...

Line 
1#!/usr/bin/env python
2# encoding: utf-8
3"""
4duplicate_check.py
5
6Created by Andre Hagenbruch on 2010-01-02.
7Copyright (c) 2010 University Library Bochum. All rights reserved.
8"""
9
10import simhash
11from urllib import urlopen
12
def get_data(url='http://134.147.247.36:8983/solr/select/?q=*:*&fl=title+person+date+entryID&wt=python&rows=100'):
    """Fetch documents from Solr and flatten each one into a single string.

    The response is expected to be a Python literal (Solr ``wt=python``)
    shaped like ``{'response': {'docs': [{...}, ...]}}``.

    Args:
        url: Select URL to query. Defaults to the Solr instance this script
            was written against, so existing ``get_data()`` calls behave as
            before.

    Returns:
        A dict mapping each document's ``entryID`` to a text built from its
        title, persons, subjects, abstracts and date, every value prefixed
        with one space. If an ``entryID`` occurs more than once, the first
        document wins.

    Raises:
        ValueError, SyntaxError: if the response is not a plain Python
            literal.
        KeyError: if the response lacks ``response`` / ``docs``.
    """
    # Function-scope imports keep this block self-contained.
    import ast
    from contextlib import closing

    # closing() releases the HTTP connection even if read() raises.
    with closing(urlopen(url)) as response:
        payload = response.read()
    # Python 3 hands back bytes; Python 2 already returns str.
    if not isinstance(payload, str):
        payload = payload.decode('utf-8')

    # SECURITY: this used to be eval(), which executes whatever the server
    # (or anyone between us and it) sends. literal_eval only accepts
    # literals: dicts, lists, strings, numbers, True/False/None.
    result = ast.literal_eval(payload.strip())

    records = {}
    for doc in result['response']['docs']:
        parts = [doc.get('title', '')]
        parts.extend(doc.get('person', ()))
        # NOTE(review): 'subject' and 'abstract' are not listed in the
        # default URL's fl= parameter, so presumably they are never returned
        # and contribute nothing -- confirm whether fl= should include them.
        parts.extend(doc.get('subject', ()))
        parts.extend(doc.get('abstract', ()))
        parts.append(doc.get('date', ''))

        text = ''.join(' %s' % part for part in parts)
        # setdefault keeps the first text seen for a given entryID.
        records.setdefault(doc.get('entryID'), text)
    return records
31
def main():
    """Print every pair of records whose simhash similarity exceeds 0.999.

    Records come from ``get_data()``. Each record's text is split on
    whitespace and hashed once; every ordered pair of distinct record ids is
    then compared, so a matching pair is reported in both directions
    (``a <==> b`` and ``b <==> a``).
    """
    records = get_data()

    # Build each hash once instead of twice per compared pair. A list (not a
    # dict) keeps the iteration order of ``records``.
    # NOTE(review): assumes Simhash objects can be reused across several
    # similarity() calls -- confirm against the simhash library in use.
    hashed = [(key, simhash.Simhash(records[key].split())) for key in records]

    for r1, h1 in hashed:
        for r2, h2 in hashed:
            # Compare ids by value; identity ("is not") only worked because
            # both loops happened to yield the very same key objects.
            if r1 == r2:
                continue
            score = h1.similarity(h2)
            if score > 0.999:
                # Single-argument print() emits the same text on Python 2
                # and Python 3.
                print('%s <==> %s' % (r1, r2))
                # NOTE(review): the label says "percent" but the threshold
                # suggests a 0..1 ratio; wording left unchanged.
                print('%s percent similar\n' % score)
42
43
# Entry point: run the duplicate check only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
46
Note: See TracBrowser for help on using the repository browser.