lxml_parser/simple_parser.py at master · GABBAR1947/lxml_parser · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2016 gabbar1947 <gabbar1947@Rathore1947>
#
# Distributed under terms of the MIT license.

import urllib
import re
from lxml import etree
import random

class Downloader():
    '''
    Class to retrieve HTML
    code and binary files
    from a specific website

    url      -- address fetched by download()
    contents -- body of the last response that came back with
                status 200; an empty string until then
    '''

    def __init__(self,url):
        self.url = url
        self.contents = ''

    def download(self, image_name='', is_image=False):
        '''
        Fetch self.url and keep the response body in self.contents.

        image_name -- path of the file to write when is_image is true
        is_image   -- when true, also save the downloaded bytes
                      to image_name (opened in binary mode)

        self.contents is only replaced when the status code is 200;
        on any other status it keeps its previous value.  The file is
        only written after a 200 response, so leftovers of an earlier
        download are never saved under image_name.
        '''
        browser = urllib.urlopen(self.url)
        try:
            succeeded = browser.getcode() == 200
            if succeeded:
                self.contents = browser.read()
        finally:
            # release the connection even if reading fails
            browser.close()

        if is_image and succeeded:
            with open(image_name, 'wb') as image_file: #write binary
                image_file.write(self.contents)

class xkcdParser(Downloader):
    '''
    class for parsing xkcd.com

    last_comic_nr -- comic number found in the downloaded page as an
                     int, or None when no such number was found
    title         -- text of the div with id 'ctitle'
    caption       -- title attribute of the img inside the div
                     with id 'comic'
    '''

    def __init__(self,url):
        Downloader.__init__(self,url)
        self.last_comic_nr = None
        self.title = ''
        self.caption = ''

    def _xpath_string(self, expression):
        '''
        Parse self.contents as HTML and evaluate the XPath expression.
        '''
        return etree.HTML(self.contents).xpath(expression)

    def get_last_comic_nr(self):
        '''
        Store the comic number of the first xkcd.com comic link found
        in self.contents in self.last_comic_nr (None when there is no
        such link).
        '''
        # raw string with an escaped dot so only the literal host matches
        match = re.search(r"https?://xkcd\.com/(\d+)", self.contents)
        self.last_comic_nr = int(match.group(1)) if match else None

    def get_current_comic(self):
        '''
        Download the page at self.url, then extract the last comic
        number, title and caption from it and fetch the comic image.
        '''
        # forget the previous download so that a failed request is not
        # mistaken for this page by the methods below
        self.contents = ''
        self.download()
        self.get_last_comic_nr()
        self.get_title()
        self.get_caption()
        self.get_comic()

    def get_random_comic(self):
        '''
        Pick a random comic between 1 and self.last_comic_nr, download
        its page and extract title, caption and image from it.

        Does nothing while self.last_comic_nr is unset, so
        get_current_comic() (or get_last_comic_nr()) has to succeed
        first.
        '''
        if not self.last_comic_nr:
            return

        comic_nr = random.randint(1, self.last_comic_nr)
        self.url = "http://xkcd.com/"+str(comic_nr)
        # see get_current_comic: do not parse leftovers on failure
        self.contents = ''
        self.download()
        self.get_title()
        self.get_caption()
        self.get_comic()

    def get_title(self):
        '''
        Set self.title from the div with id 'ctitle' of the loaded page.
        '''
        if self.contents:
            self.title = self._xpath_string("string(//div[@id='ctitle'])")

    def get_caption(self):
        '''
        Set self.caption from the title attribute of the comic image.
        '''
        if self.contents:
            self.caption = self._xpath_string("string(//div[@id='comic']/img/@title)")

    def get_comic(self):
        '''
        Download the comic image referenced by the loaded page.

        Reads the src of the img inside the div with id 'comic', points
        self.url at it and saves the image to a file named after
        self.title.  Afterwards self.url is the image address and, if
        the request succeeded, self.contents holds the image data.
        Nothing happens when no page is loaded or no src is found.
        '''
        if not self.contents:
            return

        image_url = self._xpath_string("string(//div[@id = 'comic']/img/@src)")
        if not image_url:
            # no image on this page: nothing to fetch
            return
        if image_url.startswith('//'):
            # a scheme-less '//host/path' reference needs an explicit
            # scheme before it can be opened as a URL
            image_url = 'http:' + image_url

        self.url = image_url
        self.download(self.title, is_image = True)


if __name__ == '__main__':
    # get_current_comic() runs first because get_random_comic() relies on
    # the last_comic_nr that it sets.
    comic = xkcdParser("http://xkcd.com/")
    comic.get_current_comic()
    comic.get_random_comic()

    # show what was extracted for the last comic that was parsed
    for text in (comic.title, comic.caption):
        print(text)