-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgrap_image.py
More file actions
99 lines (75 loc) · 1.89 KB
/
grap_image.py
File metadata and controls
99 lines (75 loc) · 1.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import sys
from random import Random
import requests
import json;
import urllib.request
import re
from bs4 import BeautifulSoup
import os
import hashlib
# Module-level counter that queryTicket increments for each page it processes.
URL_COUNT = 0
def prend(str):
    """Print *str* and then terminate the interpreter with exit status 0.

    The parameter keeps its original name (which shadows the ``str`` builtin
    inside this function) so existing keyword callers keep working.
    """
    print(str)
    # Equivalent to sys.exit(0): both raise SystemExit with code 0.
    raise SystemExit(0)
def pre(str):
    """Write *str* to standard output followed by a newline.

    Debug helper; unlike ``prend`` it returns normally after printing.
    """
    print(str)
# Captures the value of a data-original="..." attribute ending in .jpg/.JPG.
# The capture is limited to non-quote characters so it can never run past the
# attribute's closing quote into a neighbouring attribute.
_IMG_URL_PATTERN = re.compile(r'data-original="([^"\n]+?\.(?:jpg|JPG))"')


def obtainImgList(content):
    """Return the JPEG URLs found in ``data-original`` attributes of *content*.

    Args:
        content: HTML text to scan.

    Returns:
        A list of attribute values (in document order, duplicates kept) that
        end in ``.jpg`` or ``.JPG``. Empty when nothing matches.
    """
    return _IMG_URL_PATTERN.findall(content)
def obtainHref(content):
    """Collect the ``href`` of every ``<a rel="nofollow">`` element in *content*.

    Args:
        content: HTML text to parse.

    Returns:
        A list of non-empty href strings in document order. Anchors that
        have no ``href`` attribute (or an empty one) are reported on stdout
        and left out, so callers never receive ``None``.
    """
    soup = BeautifulSoup(content, "html.parser")
    hrefs = []
    # Select the anchor tags marked rel="nofollow".
    for anchor in soup.find_all("a", rel="nofollow"):
        href = anchor.get('href')
        if not href:
            # Tag.get returns None when the attribute is absent; such a value
            # cannot be fetched later, so skip it here.
            print("获取失败,跳过")
            continue
        hrefs.append(href)
    return hrefs
def downUrlImage(imgList, url, rootPath='F://luolaifa'):
    """Download every image in *imgList* into a directory derived from *url*.

    Files are written to ``<rootPath>/<md5 hex digest of url>/`` and named
    ``0.jpg``, ``1.jpg``, ... in download order. The directory is created
    when missing.

    Args:
        imgList: Iterable of image URLs to fetch.
        url: URL of the page the images came from; only its MD5 digest is
            used, to name the output directory.
        rootPath: Base output directory. Defaults to the location that was
            previously hard-coded.

    Raises:
        Errors from ``urllib.request.urlopen`` or from writing the file are
        not caught and propagate to the caller.
    """
    page_dir = os.path.join(rootPath, hashlib.md5(url.encode('utf-8')).hexdigest())
    os.makedirs(page_dir, exist_ok=True)

    index = 0
    for imgurl in imgList:
        # Over-long URLs are skipped; presumably they are malformed regex
        # captures rather than real image addresses -- TODO confirm.
        if len(imgurl) > 200:
            continue
        # Fetch first so a failed download does not leave an empty file.
        with urllib.request.urlopen(imgurl) as response:
            binary_data = response.read()
        # os.path.join keeps the file inside page_dir on every platform.
        with open(os.path.join(page_dir, str(index) + '.jpg'), 'wb') as image_file:
            image_file.write(binary_data)
        index += 1
def queryTicket(url):
    """Crawl starting at *url*: save each page's images, then follow its links.

    Pages are visited depth-first in the order their links appear. Each URL
    is processed at most once per call, so pages that link to each other do
    not cause an endless crawl, and an explicit stack is used instead of
    recursion so deep link chains cannot exhaust the call stack.

    Args:
        url: Address of the first page to fetch.

    Side effects:
        Increments the module-level ``URL_COUNT`` once per page processed and
        writes image files via ``downUrlImage``.
    """
    global URL_COUNT
    visited = set()
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        URL_COUNT = URL_COUNT + 1

        content = getHtml(current)
        imgList = obtainImgList(content)
        hrefList = obtainHref(content)
        downUrlImage(imgList, current)

        # Push in reverse so the first link on the page is popped (visited)
        # first; missing/empty hrefs cannot be fetched and are dropped.
        pending.extend(href for href in reversed(hrefList) if href)
def getHtml(url):
    """Fetch *url* and return the response body as text.

    The body is decoded with the charset declared in the response's
    Content-Type header, falling back to UTF-8 when none is declared or the
    declared name is not a known codec. Undecodable bytes are replaced with
    U+FFFD rather than raising, so one badly encoded page does not abort the
    caller.

    Args:
        url: Address to fetch; passed to ``urllib.request.urlopen``.

    Returns:
        The decoded body as ``str``.

    Raises:
        Errors from ``urllib.request.urlopen`` propagate to the caller.
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        charset = page.headers.get_content_charset() or 'UTF-8'
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset Python has no codec for.
        return raw.decode('UTF-8', errors='replace')
def main():
    """Script entry point: start the crawl at the hard-coded site root."""
    queryTicket('https://www.rosegal.com')
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()