source: orga_csv2rdf.py @ 14:0e0f06219896

Revision 14:0e0f06219896, 6.2 KB checked in by hagenbruch@phoibe.ub.rub.de, 7 years ago (diff)

First version of organization RDF

Line 
1#!/usr/bin/env python
2# encoding: utf-8
3
4#  The MIT License
5#
6#  Copyright 2010 Andre Hagenbruch <andre.hagenbruch@ruhr-uni-bochum.de>.
7#
8#  Permission is hereby granted, free of charge, to any person obtaining a copy
9#  of this software and associated documentation files (the "Software"), to deal
10#  in the Software without restriction, including without limitation the rights
11#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12#  copies of the Software, and to permit persons to whom the Software is
13#  furnished to do so, subject to the following conditions:
14#
15#  The above copyright notice and this permission notice shall be included in
16#  all copies or substantial portions of the Software.
17#
18#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24#  THE SOFTWARE.
25
26__author__="Andre Hagenbruch <andre.hagenbruch@ruhr-uni-bochum.de>"
27__date__ ="$04.02.2010 20:15:22$"
28
29import logging
30import re
31import urllib
32from rdflib import ConjunctiveGraph, Namespace, Literal, URIRef, RDF
33
# Teaching-unit ("Lehreinheit") labels carry prefixes that are useless for our
# purposes and are stripped with LE_RE. Examples:
#   (LM) LE Ev. Theologie
#   (LM) VKST Dekanat Philosophie und Erziehungswissenschaft
# Raw string: the pattern contains backslash escapes meant for the regex
# engine, not for the Python string parser.
LE_RE = re.compile(r'\([LDS][MG]\)\s[LV][EK](?:ST)?\s(?:ZWE\s)?')

# Faculty IDs carry numeric prefixes; substituting with FAK_RE removes every
# digit from the ID.
FAK_RE = re.compile(r'\d')
41
# RDF vocabularies used for the generated triples.
AIISO = Namespace('http://purl.org/vocab/aiiso/schema#')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
UB = Namespace('http://ub.rub.de/vocab/orga/')

# Prefix under which every generated resource URI is minted.
BASE = 'http://ub.rub.de/resource/'

# The serialized graphs are emitted through the logging module, so logging is
# configured as soon as the module is loaded.
logging.basicConfig(level=logging.DEBUG)
49
# Zero-based column positions in the '|'-separated export.
_COL_ORGA_ID = 0
_COL_ORGA_LABEL = 1
_COL_FAK_ID = 6
_COL_FAK_LABELS = 8
_COL_LE_ID = 13
_COL_LE_LABEL = 15
_COL_ORD_ID = 26
_COL_ORD_LABEL = 28
# Highest column index that is read, plus one.
_MIN_FIELDS = _COL_ORD_LABEL + 1


def _new_graph(with_ub=False):
    """Return an empty graph with the aiiso/foaf (and optionally ub) prefixes bound."""
    graph = ConjunctiveGraph()
    graph.bind('aiiso', AIISO)
    graph.bind('foaf', FOAF)
    if with_ub:
        graph.bind('ub', UB)
    return graph


def _resource(label):
    """Return the resource URI for *label*: BASE plus the URL-quoted label."""
    return URIRef(BASE + urllib.quote_plus(label))


def _describe(graph, label, rdf_type, code):
    """Add the rdf:type, foaf:name and aiiso:code triples for *label* to *graph*."""
    subject = _resource(label)
    graph.add((subject, RDF.type, rdf_type))
    graph.add((subject, FOAF['name'], Literal(label)))
    graph.add((subject, AIISO['code'], Literal(code)))


def parse_csv(filename):
    """Convert a '|'-separated organization export into RDF and log the result.

    Five graphs are built: one for the institution itself and one each for
    organizations, faculties (aiiso:Department), teaching units
    (aiiso:Institute) and chairs (ub:Chair). Each is serialized and emitted
    through the logging module at WARNING level. Nothing is returned.

    Per input line the following columns are read (zero-based):
    0/1 organization id/label, 6 faculty id (digits removed), 8 faculty
    labels separated by '; ' (the second entry is used), 13/15 teaching-unit
    id/label (prefix stripped with LE_RE), 26/28 chair id/label.

    Lines without a chair id (column 26) are skipped. Lines with fewer than
    29 fields are skipped with a warning instead of raising IndexError.

    filename -- path of the export file to read.
    """
    institution = URIRef(BASE + 'Ruhr_Universität')

    rub = _new_graph()
    rub.add((institution, RDF.type, AIISO['Institution']))
    rub.add((institution, FOAF['name'], Literal('Ruhr-Universität Bochum')))

    orga = _new_graph()
    fak = _new_graph()
    le = _new_graph()
    ordnr = _new_graph(with_ub=True)

    # The context manager guarantees the file handle is closed.
    with open(filename, 'r') as source:
        for line_no, line in enumerate(source, 1):
            # Drop the line terminator so it cannot leak into the last field.
            fields = line.rstrip('\r\n').split('|')
            if len(fields) < _MIN_FIELDS:
                logging.warning('Skipping line %d: expected at least %d fields, got %d',
                                line_no, _MIN_FIELDS, len(fields))
                continue

            ord_id = fields[_COL_ORD_ID]
            if not ord_id:  # Skip entries without a chair id.
                continue
            ord_label = fields[_COL_ORD_LABEL]

            orga_id = ''
            orga_label = ''
            if fields[_COL_ORGA_LABEL].strip() != 'Keiner Organisationseinheit zugeordnet':
                orga_id = fields[_COL_ORGA_ID]
                orga_label = fields[_COL_ORGA_LABEL]

            fak_id = FAK_RE.sub('', fields[_COL_FAK_ID])
            fak_label = ''
            if fak_id:
                fak_labels = fields[_COL_FAK_LABELS].split('; ')
                if len(fak_labels) > 1:
                    fak_label = fak_labels[1]
                else:
                    # No second label available: emit no faculty for this
                    # line rather than guessing a name.
                    logging.warning('Line %d: faculty label %r has no second entry',
                                    line_no, fields[_COL_FAK_LABELS])

            le_id = fields[_COL_LE_ID]
            le_label = LE_RE.sub('', fields[_COL_LE_LABEL])

            # Or not? Marker for a central unit: reuse the teaching-unit label.
            if orga_label == 'ZWEs wie LE':
                orga_label = le_label

            if orga_label:
                _describe(orga, orga_label, AIISO['Organization'], orga_id)
                # Use the same URI the organization is described under, so
                # the part_of triple attaches to that very resource.
                orga.add((_resource(orga_label), AIISO['part_of'], institution))

            if fak_label:
                _describe(fak, fak_label, AIISO['Department'], fak_id)

            if le_label:
                _describe(le, le_label, AIISO['Institute'], le_id)

            if ord_label:
                _describe(ordnr, ord_label, UB['Chair'], ord_id)

                # Link the hierarchy only where both ends are named; an empty
                # label would otherwise yield a triple pointing at bare BASE.
                if le_label:
                    ordnr.add((_resource(ord_label), AIISO['part_of'], _resource(le_label)))
                if le_label and fak_label:
                    le.add((_resource(le_label), AIISO['part_of'], _resource(fak_label)))
                if fak_label and orga_label:
                    fak.add((_resource(fak_label), AIISO['part_of'], _resource(orga_label)))

    for graph in (rub, orga, fak, le, ordnr):
        logging.warning(graph.serialize())
145
def main(filename='UB_Kostenstellenexport.csv'):
    """Run the CSV-to-RDF conversion on *filename*.

    filename -- path of the export file handed to parse_csv; defaults to
                'UB_Kostenstellenexport.csv' in the current directory.
    """
    parse_csv(filename)


if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.