-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimple_parser.py
More file actions
96 lines (78 loc) · 2.51 KB
/
simple_parser.py
File metadata and controls
96 lines (78 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2016 gabbar1947 <gabbar1947@Rathore1947>
#
# Distributed under terms of the MIT license.
import urllib
import re
from lxml import etree
import random
class Downloader(object):
    '''
    Retrieve the resource behind a URL and optionally save it to disk.

    Attributes:
        url: address fetched by download().
        contents: body of the most recent response that came back with
            status 200; an empty string until a download succeeds.
    '''

    def __init__(self, url):
        self.url = url
        self.contents = ''

    def download(self, image_name='', is_image=False):
        '''
        Fetch self.url and keep the response body in self.contents.

        Args:
            image_name: path of the file to write when is_image is true.
            is_image: when true, also write the downloaded bytes to
                image_name (opened in binary mode).

        Returns:
            True if the response status was 200, otherwise False. On
            False, self.contents is left untouched and no file is written.

        Errors raised while opening the URL or writing the file are not
        caught here; they propagate to the caller.
        '''
        # Imported here so the method works with both the Python 3 and the
        # Python 2 layout of urllib.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen

        browser = urlopen(self.url)
        try:
            if browser.getcode() != 200:
                # Bail out before touching the disk: otherwise the body of
                # an earlier download would be saved under image_name.
                return False
            self.contents = browser.read()
        finally:
            # Release the connection even if reading fails.
            browser.close()

        if is_image:
            with open(image_name, 'wb') as image_file:  # write binary
                image_file.write(self.contents)
        return True
class xkcdParser(Downloader):
    '''
    Scrape a comic page from xkcd.com.

    Attributes:
        last_comic_nr: comic number found by get_last_comic_nr() as an
            int, or None while it is unknown.
        title: text of the page's div with id "ctitle".
        caption: "title" attribute of the image inside the div with id
            "comic".
    '''

    def __init__(self, url):
        # Explicit base-class call: works for old- and new-style classes
        # alike on Python 2.
        Downloader.__init__(self, url)
        self.last_comic_nr = None
        self.title = ''
        self.caption = ''

    def _page_text(self):
        '''Return self.contents as text, decoding it first if it is bytes.'''
        page = self.contents
        if isinstance(page, bytes):
            page = page.decode('utf-8', 'replace')
        return page

    def _xpath_string(self, expression):
        '''Evaluate an XPath expression against the downloaded page.'''
        tree = etree.HTML(self.contents)
        if tree is None:
            # The parser may yield no tree for unusable input; report
            # "nothing found" instead of failing on tree.xpath.
            return ''
        return tree.xpath(expression)

    def _image_file_name(self, image_url):
        '''Derive a file name for the comic image from the comic title.'''
        # Path separators in a title would make open() look for a
        # directory, so neutralise them; other titles are used unchanged.
        name = re.sub(r'[\\/]', '_', self.title)
        # Without a title, fall back to the last component of the URL.
        return name or image_url.rsplit('/', 1)[-1]

    def get_last_comic_nr(self):
        '''
        Set last_comic_nr from the first "xkcd.com/<number>" link found in
        the downloaded page, or to None when there is no such link.
        '''
        # Raw string with an escaped dot; both URL schemes are accepted.
        match = re.search(r"https?://xkcd\.com/(\d+)", self._page_text())
        self.last_comic_nr = int(match.group(1)) if match else None

    def get_current_comic(self):
        '''Download self.url and extract comic number, title, caption, image.'''
        self.download()
        self.get_last_comic_nr()
        self.get_title()
        self.get_caption()
        self.get_comic()

    def get_random_comic(self):
        '''
        Fetch a randomly chosen comic numbered 1..last_comic_nr.

        Does nothing while last_comic_nr is unknown (None).
        '''
        if self.last_comic_nr:
            comic_nr = random.randint(1, self.last_comic_nr)
            self.url = "http://xkcd.com/" + str(comic_nr)
            # Drop whatever an earlier call left behind so that a failed
            # download is not parsed as if it were this comic's page.
            self.contents = ''
            self.download()
            self.get_title()
            self.get_caption()
            self.get_comic()

    def get_title(self):
        '''Store the comic title from the downloaded page in self.title.'''
        if self.contents:
            self.title = self._xpath_string("string(//div[@id='ctitle'])")

    def get_caption(self):
        '''Store the comic image's title attribute in self.caption.'''
        if self.contents:
            self.caption = self._xpath_string(
                "string(//div[@id='comic']/img/@title)")

    def get_comic(self):
        '''
        Download the comic image of the current page into a file named
        after the comic title.

        Note: this replaces self.url with the image URL, and a successful
        download replaces self.contents with the image data.
        '''
        if not self.contents:
            return
        image_url = self._xpath_string("string(//div[@id = 'comic']/img/@src)")
        if not image_url:
            # No comic image on this page: nothing to download.
            return
        if image_url.startswith('//'):
            # A scheme-less "//host/path" src cannot be opened as-is.
            # Assumes the image host answers over plain http -- TODO confirm.
            image_url = 'http:' + image_url
        self.url = image_url
        self.download(self._image_file_name(image_url), is_image=True)
if __name__ == '__main__':
    # Script entry point: process the front page first, then ask for a random
    # comic. get_random_comic() only acts when the first call managed to
    # record the latest comic number.
    xkcd_parser = xkcdParser("http://xkcd.com/")
    xkcd_parser.get_current_comic()
    xkcd_parser.get_random_comic()
    # Report the title and caption held by the parser after both calls.
    for field in (xkcd_parser.title, xkcd_parser.caption):
        print(field)