-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_dataset.py
More file actions
48 lines (38 loc) · 1.79 KB
/
create_dataset.py
File metadata and controls
48 lines (38 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
from bs4 import BeautifulSoup
import requests
import zipfile
import io
import os
import time
URL = 'https://firstratedata.com/it/stock/'
EXTRACT_PATH = 'data/frd_sample_stock'  # Destination directory to extract content

# Fetch the ticker index page and parse it (runs at import time).
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')

# Walk every anchor on the page; keep hrefs that point at a stock page,
# strip the path prefix to get the bare ticker, and skip delisted entries.
avail_tickers = []
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    if href and href.startswith('/i/stock/'):
        ticker = href.removeprefix('/i/stock/')
        if not ticker.endswith('-DELISTED'):
            avail_tickers.append(ticker)
def download_and_unzip(url, extract_to):
    """Download a ZIP archive from *url* and extract its .csv members.

    Parameters
    ----------
    url : str
        Direct link to a ZIP archive.
    extract_to : str
        Destination directory for the extracted files; created
        (including intermediate directories) if it does not exist.

    Raises
    ------
    requests.HTTPError
        If the download request returns an error status.
    zipfile.BadZipFile
        If the response body is not a valid ZIP archive.
    """
    # exist_ok=True avoids the race between an existence check and makedirs.
    os.makedirs(extract_to, exist_ok=True)
    # The whole archive ends up in memory via .content anyway, so the
    # original stream=True bought nothing; a plain GET is clearer.
    response = requests.get(url)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        # Extract only CSV members; any other files in the archive are skipped.
        for member in archive.namelist():
            if member.endswith('.csv'):
                archive.extract(member, extract_to)
    print(f"Downloaded and extracted {url} to {extract_to}")
if __name__ == '__main__':
    # Pull the sample archive for every ticker discovered above.
    for ticker in avail_tickers:
        URL = ('https://frd001.s3.us-east-2.amazonaws.com/'
               f'frd_sample_stock_{ticker}.zip')
        download_and_unzip(URL, EXTRACT_PATH)
        # Brief pause between downloads to avoid throttling by AWS.
        time.sleep(1)