-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstring_clean.py
More file actions
56 lines (47 loc) · 1.3 KB
/
string_clean.py
File metadata and controls
56 lines (47 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python
# encoding=utf8
import re
# Pattern used by valid_url(). It is not anchored, so it may match anywhere
# in the input. It requires a literal "//" (group 1), then any number of
# dot-terminated labels, then one label of 1-63 word/hyphen characters
# (group 2), then a dot followed by three or two word characters.
# Written as a raw string so the backslashes reach the regex engine without
# relying on Python passing unknown escape sequences through unchanged.
domain_regex = r'(\/\/)(?:[\w-]+\.)*([\w-]{1,63})(?:\.(?:\w{3}|\w{2}))'
def rm_first_slash(url):
    """Return ``url`` with a single leading "/" removed.

    Only the very first character is considered; any further slashes
    (including a second leading one) are left in place.
    """
    leading_slash = re.compile(r"^\/")
    return leading_slash.sub('', url)
def rm_last_slash(url):
    """Return ``url`` with a single trailing "/" removed.

    The pattern uses "$", which also matches just before a final newline,
    so a "/" immediately followed by a trailing newline is removed too.
    """
    trailing_slash = re.compile(r"\/$")
    return trailing_slash.sub('', url)
def rm_protocol(url):
    """Return ``url`` without its leading "http://", "https://" or "//".

    The pattern is anchored to the start of the string so that only the
    URL's own scheme prefix is stripped. An unanchored pattern would also
    delete every "//" (or embedded "http://") found later in the path or
    query string, corrupting the remainder of the URL.

    The scheme is matched case-insensitively ("HTTP://" is accepted).
    Strings that do not start with one of these prefixes are returned
    unchanged.
    """
    return re.sub(r"^(?:https?:)?\/\/", '', url, flags=re.IGNORECASE)
def rm_all_after_domain(url):
    """Return the scheme prefix of ``url`` followed by its cleaned domain.

    The prefix is the "http://", "https://" or "//" found at the very start
    of ``url`` (matched case-insensitively); when ``url`` does not start
    with one of those, "http://" is used. The remainder comes from
    clean_domain(url).
    """
    # re.match only looks at the start of the string, so a scheme that
    # merely appears later (e.g. inside a query parameter) is not mistaken
    # for this URL's own scheme.
    scheme = re.match(r"(?:https?:)?\/\/", url, re.IGNORECASE)
    prefix = scheme.group(0) if scheme is not None else 'http://'
    # clean_domain() already drops every "/" together with what follows it,
    # so no further trimming is needed here.
    return prefix + clean_domain(url)
def clean_domain(url):
    """Return what is left of ``url`` after stripping prefix and path.

    Applies rm_protocol, rm_first_slash and rm_last_slash in that order,
    then removes each remaining "/" together with the rest of its line.
    """
    remainder = url
    for strip_step in (rm_protocol, rm_first_slash, rm_last_slash):
        remainder = strip_step(remainder)
    # "." does not match a newline, so text after a line break survives.
    return re.sub(r"\/(.*)", '', remainder)
def valid_url(url):
    """Return True if ``domain_regex`` matches anywhere in ``url``, else False."""
    # re.search yields a (always truthy) match object or None.
    return bool(re.search(domain_regex, url))
def does_not_match(url):
    """Return True when ``url`` contains none of the excluded fragments.

    Returns False as soon as any fragment below occurs anywhere in ``url``.
    The fragments are compared as plain substrings rather than regular
    expressions: used as patterns, the "." in entries such as "twitter.com"
    would match any character and wrongly exclude unrelated URLs
    (e.g. "twitterxcom.net").
    """
    excluded_fragments = (
        'facebook.com',
        'twitter.com',
        'google.com',
        'tel:',
        'mailto:',
        'goo.gl',
        'wikipedia.org',
        'instagram.com',
    )
    return not any(fragment in url for fragment in excluded_fragments)