source: extract_mesh.py @ 37:cfb19c3c4214

Revision 37:cfb19c3c4214, 4.6 KB checked in by andre.hagenbruch@rub.de, 7 years ago (diff)

Further improvements on bibliographic templates; integration of DBpedia information based on MESH

Line 
1#! /usr/bin/env python
2
3#  The MIT License
4#
5#  Copyright 2010 hagenbruch.
6#
7#  Permission is hereby granted, free of charge, to any person obtaining a copy
8#  of this software and associated documentation files (the "Software"), to deal
9#  in the Software without restriction, including without limitation the rights
10#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11#  copies of the Software, and to permit persons to whom the Software is
12#  furnished to do so, subject to the following conditions:
13#
14#  The above copyright notice and this permission notice shall be included in
15#  all copies or substantial portions of the Software.
16#
17#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23#  THE SOFTWARE.
24
# Module metadata: author name and creation timestamp.
__author__="hagenbruch"
__date__ ="$28.07.2010 19:01:57$"
27
28from lxml import etree
29from SPARQLWrapper import SPARQLWrapper, JSON
30import cPickle as pickle
31
# Namespace URI of the MODS XML vocabulary.
MODS_NAMESPACE = 'http://www.loc.gov/mods/v3'
# The same namespace wrapped in braces ('{uri}'), ready to be prepended to a
# tag name when building element paths (see its use in extract()).
MODS = '{%s}' % MODS_NAMESPACE
34
def extract(filename):
    """Collect the distinct MeSH identifiers used in a MODS XML file.

    Every ``subject`` element in the MODS namespace whose ``authority``
    attribute equals ``mesh`` contributes the text of its first child
    element.  Subjects that have no child element, or whose first child has
    no text, are skipped rather than raising ``IndexError`` or putting
    ``None`` into the result.

    :param filename: source handed unchanged to ``etree.parse``.
    :returns: sorted list of the unique identifier strings found.
    """
    tree = etree.parse(filename)
    subjects = tree.findall('.//%ssubject[@authority="mesh"]' % MODS)

    ids = set()
    for subject in subjects:
        # len() of an element is the number of its children; an empty
        # subject has nothing to index into.
        if len(subject) == 0:
            continue
        text = subject[0].text
        if text:
            ids.add(text)

    return sorted(ids)
45
def get_dbpedia_uri(myid):
    """Look up the DBpedia resource that carries the given MeSH id.

    Sends a SPARQL ``SELECT`` to ``http://dbpedia.org/sparql`` asking for
    subjects whose ``dbpedia-owl:meshId`` is ``myid`` (as an ``@en`` tagged
    literal) and uses the first binding that comes back.

    :param myid: MeSH identifier string.
    :returns: ``{myid: {'URI': uri}}`` for the first match, or an empty dict
        when the endpoint returns no binding.  A dict is always returned, so
        callers never have to cope with ``None``.
    """
    sparql = SPARQLWrapper('http://dbpedia.org/sparql')

    # NOTE: myid is interpolated verbatim into the query text; an id that
    # contains a double quote would produce a malformed query.
    sparql.setQuery('PREFIX dbpedia-owl: <http://dbpedia.org/ontology/> SELECT DISTINCT ?s WHERE {?s dbpedia-owl:meshId "%s"@en}' % myid)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    bindings = results.get('results', {}).get('bindings', [])
    for binding in bindings:
        uri = binding.get('s', {}).get('value')
        if uri:
            # Only the first usable match is kept.
            return {myid: {'URI': uri}}

    return {}
59
def _run_select(query):
    """Run one SELECT query against the DBpedia endpoint; return its bindings."""
    sparql = SPARQLWrapper('http://dbpedia.org/sparql')
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    return results.get('results', {}).get('bindings', [])


def _literals(bindings, variable):
    """Yield UTF-8 encoded (language, value) pairs for one result variable."""
    for binding in bindings:
        node = binding.get(variable)
        if not node or node.get('value') is None:
            continue
        # A literal without a language tag has no 'xml:lang' key; file it
        # under the empty language instead of failing on None.encode().
        lang = (node.get('xml:lang') or '').encode('utf8')
        yield lang, node.get('value').encode('utf8')


def get_dbpedia_content(mymap):
    """Add DBpedia labels, comments and redirect labels to every map entry.

    ``mymap`` maps an id to a dict holding the resource address under
    ``'URI'`` (the shape produced by ``get_dbpedia_uri``).  For each entry
    three queries are sent to the DBpedia SPARQL endpoint and the results
    are stored in the entry, keyed by language:

    * ``'labels'``     -- ``rdfs:label`` of the resource, first value per
      language wins;
    * ``'comments'``   -- ``rdfs:comment`` of the resource, first value per
      language wins;
    * ``'alt_labels'`` -- ``rdfs:label`` of every resource that redirects to
      it, collected in a list per language.

    Entries without a ``'URI'`` are left untouched and trigger no query.
    All entries are processed (not just the first one) and the map is
    returned even when it is empty.

    :param mymap: dict ``{id: {'URI': uri, ...}}``; modified in place.
    :returns: the same ``mymap`` object.
    """
    label_query = ('PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> '
                   'SELECT DISTINCT ?labels WHERE {<%s> rdfs:label ?labels}')
    comment_query = ('PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> '
                     'SELECT DISTINCT ?c WHERE {<%s> rdfs:comment ?c}')
    redirect_query = ('PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> '
                      'PREFIX dbpprop: <http://dbpedia.org/property/> '
                      'SELECT DISTINCT ?labels WHERE '
                      '{?r dbpprop:redirect <%s> . ?r rdfs:label ?labels}')

    for myid in mymap:
        entry = mymap[myid]
        myuri = entry.get('URI') if entry else None
        if not myuri:
            # Nothing to ask DBpedia about.
            continue

        for lang, label in _literals(_run_select(label_query % myuri), 'labels'):
            entry.setdefault('labels', {}).setdefault(lang, label)

        for lang, comment in _literals(_run_select(comment_query % myuri), 'c'):
            entry.setdefault('comments', {}).setdefault(lang, comment)

        for lang, alt_label in _literals(_run_select(redirect_query % myuri), 'labels'):
            entry.setdefault('alt_labels', {}).setdefault(lang, []).append(alt_label)

    # Progress output kept from the original script; the parenthesised form
    # behaves the same under Python 2 and Python 3.
    print(mymap)
    return mymap
99
def main(source='/home/hagenbruch/dev/bibliographie-index/mods-xml/medizin.xml',
         target='dbpedia_mesh_map.pkl'):
    """Build the MeSH-to-DBpedia map for a MODS file and pickle it.

    Extracts the MeSH ids from ``source``, resolves each one to a DBpedia
    resource, enriches the hits with labels and comments, and finally writes
    the list of per-id maps to ``target``.

    The output file is opened only after all data has been gathered, so a
    failure during extraction or querying does not leave behind a truncated
    pickle, and the ``with`` block guarantees the handle is closed.

    :param source: MODS XML file to read the ids from.
    :param target: path of the pickle file to write.
    """
    dbpedia_info = []
    for myid in extract(source):
        dbpedia_map = get_dbpedia_uri(myid)
        # No match is reported as an empty (or missing) map; skip those ids.
        if not dbpedia_map:
            continue
        entry = dbpedia_map.get(myid)
        if not isinstance(entry, dict) or not entry.get('URI'):
            continue
        dbpedia_info.append(get_dbpedia_content(dbpedia_map))

    with open(target, 'wb') as pckl:
        # -1 selects the highest pickle protocol available.
        pickle.dump(dbpedia_info, pckl, -1)


if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.