source: url_checker.py @ 26:4af842589412

Revision 26:4af842589412, 2.7 KB, checked in by hagenbruch@phoibe.ub.rub.de, 7 years ago

Added auto-suggest and a linked data mashup; leafing through single hits is now independent of the position in the result list (i.e. an entry can be bookmarked as is); further improvements...

#!/usr/bin/env python
# encoding: utf-8

#  The MIT License
#
#  Copyright 2010 Andre Hagenbruch <andre.hagenbruch@ruhr-uni-bochum.de>.
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in
#  all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.

__author__ = "Andre Hagenbruch <andre.hagenbruch@ruhr-uni-bochum.de>"
__date__ = "$20.06.2010 16:46:32$"

import logging
import urllib
import urllib2


class HeadRequest(urllib2.Request):
    """Request subclass that issues an HTTP HEAD instead of GET, since
    urllib2 offers no built-in way to choose the request method."""
    def get_method(self):
        return "HEAD"


logging.basicConfig(level=logging.DEBUG)

def _fetch_result(params):
    # Send the query to the local Solr instance; wt=python makes Solr return
    # a Python literal that eval() turns into a dict.
    return eval(urllib.urlopen('http://134.147.247.36:8983/solr/select?', params).read())


def _fetch_info(url, e=None):
    # Look up the offending URL in the index and log the record's title plus
    # its subject ('fach') or, failing that, its faculty ('fakultaet').
    params = 'q=url:"%s"&fl=title+fach+fakultaet&wt=python' % url
    res = _fetch_result(params)
    docs = res['response']['docs']
    for doc in docs:
        if doc.get('fach') is not None:
            logging.error('%s: %s => %s (%s)' % (e, url, doc.get('title'), doc.get('fach').capitalize()))
        else:
            logging.error('%s: %s => %s (%s)' % (e, url, doc.get('title'), doc.get('fakultaet')))

def check_url(url):
    # Issue a HEAD request and report anything that does not answer with 200.
    try:
        res = urllib2.urlopen(HeadRequest(url))
        if res.getcode() != 200:
            _fetch_info(url, e='NOT 200')
    except ValueError, e:
        # Malformed URL stored in the index
        _fetch_info(url, e)
    except urllib2.HTTPError, e:
        logging.error('%s => %s' % (url, e))
    except urllib2.URLError, e:
        logging.error('%s => %s' % (url, e))


def retrieve_urls():
    # Facet on the url field to list every distinct URL in the index without
    # fetching any documents (rows=0).
    params = 'q=*:*&facet=true&facet.field=url&facet.limit=15000&rows=0&wt=python'
    res = _fetch_result(params)
    return res['facet_counts']['facet_fields']['url']


def main():
    # Solr returns facet values as a flat list [value, count, value, count, ...],
    # so consume the list two items at a time.
    urls = retrieve_urls()
    while urls:
        url, count = urls[0:2]
        del urls[0:2]
        check_url(url)


if __name__ == '__main__':
    main()
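
For context (not part of this revision): the script parses Solr's wt=python response writer output with eval(). Assuming the same Solr instance also serves wt=json, a minimal sketch of the equivalent lookup using the json module could look as follows; fetch_result_json is a hypothetical helper name, and the endpoint is the one hard-coded in _fetch_result above.

    # Minimal sketch, not part of this revision: parse the Solr response with
    # json/wt=json instead of eval()/wt=python. Same endpoint as _fetch_result;
    # fetch_result_json is a hypothetical name used here for illustration.
    import json
    import urllib
    import urllib2

    def fetch_result_json(params):
        query = urllib.urlencode(params)
        return json.load(urllib2.urlopen('http://134.147.247.36:8983/solr/select?' + query))

    docs = fetch_result_json({'q': 'url:"http://example.org/"',
                              'fl': 'title fach fakultaet',
                              'wt': 'json'})['response']['docs']

Unlike _fetch_result, which passes the parameters as POST data, this sketch appends them as a GET query string; Solr accepts both forms for /select.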