15 changes: 12 additions & 3 deletions README.md
@@ -10,7 +10,8 @@ Download and normalize internet data from various sources. This package is norma
### Ruby

#### Ubuntu 16.04 LTS
* sudo apt-get install ruby
* sudo apt-get install ruby ruby-dev
* sudo gem install typhoeus
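
A quick way to confirm the gem installed correctly (an illustrative one-off check, not part of the toolchain):

```ruby
# Verify that Typhoeus (used by the download scripts) loads.
require 'typhoeus'
puts "typhoeus #{Typhoeus::VERSION} OK"
```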

#### Other Distributions
* gpg --keyserver hkp://keys.gnupg.net --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3
@@ -93,9 +94,17 @@ Normalize jobs can be run manually through ``bin/normalize.sh``. To select which

Project Sonar is a community project sponsored by Rapid7. The latest data can be found at [https://scans.io/](https://scans.io/). More information can be found on the official [website](https://sonar.labs.rapid7.com/).

The download script pulls down the sonar.fdns and sonar.rdns datasets, which are updated weekly. In addition, this project pulls down the sonar.ssl and sonar.moressl "names" files (but not the rest of the certificate data). The normalization process converts the sonar.fdns and sonar.rdns files into a set of
CSVs and MTBLs. These include both a forward and reverse lookup. These normalized files can be queried using standard unix utilities or MTBL front-ends such as mtbl_dump, rmtbl_dump, and mq.
The download script pulls down the sonar.fdns and sonar.rdns datasets, which are updated monthly. In addition, this project pulls down the sonar.ssl and sonar.moressl "names" files (but not the rest of the certificate data). The normalization process converts the sonar.fdns and sonar.rdns files into a set of CSVs and MTBLs. These include both a forward and reverse lookup. These normalized files can be queried using standard unix utilities or MTBL front-ends such as mtbl_dump, rmtbl_dump, and mq.
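
These lookups can also be scripted. The sketch below is a minimal example assuming the ruby-mtbl gem (`gem install mtbl`) and its `MTBL::Reader#get_prefix` API, with a hypothetical output path; adjust the filename to match your normalized data directory:

```ruby
require 'mtbl'

# Hypothetical path -- normalize.rb writes its MTBL output under the
# configured data directory.
reader = MTBL::Reader.new("data/sonar/normal/fdns.mtbl")

# Prefix scan: yields [key, value] pairs for each matching record.
reader.get_prefix("example.com").each do |key, value|
  puts "#{key} => #{value}"
end
```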

Users with [free API access](https://opendata.rapid7.com/apihelp/) can retrieve more frequently updated datasets. Add your API key to the `sonar_api_key` entry in `conf/inetdata.json`.

```json
{
  "sonar_base_url": "https://opendata.rapid7.com",
  "sonar_api_base_url": "https://us.api.insight.rapid7.com/opendata/studies",
  "sonar_api_key": "<API Key>"
}
```
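
For reference, the updated `download_index` logic below sends this key in an `X-Api-Key` header: it lists a study's archives from the `studies` endpoint, then asks the `/download/` endpoint for a signed URL for each file. A condensed sketch of that flow (the dataset path shown is illustrative):

```ruby
require 'net/http'
require 'json'
require 'uri'

api_base = "https://us.api.insight.rapid7.com/opendata/studies"
api_key  = "<API Key>"
dset     = "/sonar.rdns_v2/" # illustrative dataset path

# 1. List the archives available for the dataset.
uri = URI.parse(api_base + dset)
req = Net::HTTP::Get.new(uri.request_uri)
req['X-Api-Key'] = api_key
res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) { |h| h.request(req) }
files = JSON.parse(res.body)['sonarfile_set']

# 2. Request a time-limited signed URL for one archive.
dl  = URI.parse("#{api_base}#{dset}#{files.first}/download/")
req = Net::HTTP::Get.new(dl.request_uri)
req['X-Api-Key'] = api_key
res = Net::HTTP.start(dl.host, dl.port, use_ssl: true) { |h| h.request(req) }
puts JSON.parse(res.body)['url'] # pass this URL to the downloader
```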

### Censys

Expand Down
2 changes: 1 addition & 1 deletion bin/download.rb
@@ -12,7 +12,7 @@
opts.on("-l", "--list-sources", "List available sources") do |opt|
options[:list_sources] = true
end
opts.on("-s", "--sources [sources]", "Comma-separated list of sources to download") do |opt|
opts.on("-s", "--sources [sources]", "Comma-separated list of sources to download; e.g. \"sonar, gov\"") do |opt|
options[:selected_sources] = opt.split(/,\s*/).uniq.map{|x| x.downcase}
end
end.parse!
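
The handler above splits, de-duplicates, and lower-cases the source list; an illustrative check of that expression:

```ruby
"Sonar, gov".split(/,\s*/).uniq.map { |x| x.downcase }
# => ["sonar", "gov"]
```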
2 changes: 1 addition & 1 deletion bin/normalize.rb
@@ -32,7 +32,7 @@
opts.on("-l", "--list-sources", "List available sources") do |opt|
options[:list_sources] = true
end
opts.on("-s", "--sources [sources]", "Comma-separated list of sources to normalize") do |opt|
opts.on("-s", "--sources [sources]", "Comma-separated list of sources to normalize; e.g. \"sonar, gov\"") do |opt|
options[:selected_sources] = opt.split(/,\s*/).uniq.map{|x| x.downcase}
end
end.parse!
2 changes: 2 additions & 0 deletions conf/inetdata.json.sample
@@ -9,6 +9,8 @@
"DISABLED_max_cores" : "4",

"sonar_base_url": "https://opendata.rapid7.com",
"sonar_api_base_url": "https://us.api.insight.rapid7.com/opendata/studies",
"sonar_api_key": "",

"censys_base_url": "https://www.censys.io/api/v1",
"censys_api_id": "",
2 changes: 1 addition & 1 deletion lib/inetdata.rb
@@ -13,7 +13,7 @@
require 'inetdata/source'

module InetData
VERSION = "1.2.2"
VERSION = "1.3.0"
end


151 changes: 76 additions & 75 deletions lib/inetdata/source/sonar.rb
@@ -1,102 +1,109 @@
require 'typhoeus'

module InetData
module Source
class Sonar < Base

def download_file(src, dst,redirect_count=0)
tmp = dst + ".tmp"
target = URI.parse(src)
size = 0
ims = false
http = Net::HTTP.new(target.host, target.port)

if src.index("https") == 0
http.use_ssl = true
end

req = Net::HTTP::Get.new(target.request_uri)

if File.exists?(dst)
req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822
ims = true
end

http.request(req) do |res|

if ims && res.code.to_i == 304
log(" > Skipped downloading of #{dst} due to not modified response")
return true
end

if ims && res['Content-Length']
if res['Content-Length'].to_i == File.size(dst)
log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes")
return true
end
end

if [301, 302].include?(res.code.to_i)

if redirect_count > 3
log(" > Skipped downloading of #{dst} due to rediret count being over limit: #{redirect_count}")
return true
end

new_src = res['Location'].to_s
def download_files(queue)
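# Queue one streaming request per URL on the shared Typhoeus Hydra;
# the whole batch is executed through libcurl by hydra.run below.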
hydra = Typhoeus::Hydra.hydra
dir = storage_path
FileUtils.mkdir_p(dir)

if new_src.length == 0
log(" > Skipped downloading of #{dst} due to server redirect with no location")
return true
end
queue.each do |url|
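# Derive the local filename from the URL, dropping any signed-URL query string.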
filename = File.join(dir, url.split("/").last.split("?").first)
dst = File.open(filename, 'wb')
req = Typhoeus::Request.new(url, followlocation: true)

log(" > Download of #{src} moved to #{new_src}...")
return download_file(new_src, dst, redirect_count + 1)
req.on_headers do |res|
raise "Request failed: #{url}" unless res.code == 200
end

if res.code.to_i != 200
log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message} #{res['Location']}")
return true
req.on_body do |chunk|
dst.write(chunk)
end

outp = File.open(tmp, "wb")

res.read_body do |chunk|
outp.write(chunk)
size += chunk.length
req.on_complete do |res|
dst.close
size = File.size(filename)
log(" > Downloading of #{filename} completed with #{size} bytes")
end

outp.close
hydra.queue req
end

File.rename(tmp, dst)

log(" > Downloading of #{dst} completed with #{size} bytes")
hydra.run
end

def download_index(dset)
target = URI.parse(config['sonar_base_url'] + dset)
unless config['sonar_api_key'].to_s.strip.empty?
based_url = config['sonar_api_base_url'] + dset
target = URI.parse(based_url)
else
target = URI.parse(config['sonar_base_url'] + dset)
end

tries = 0
begin

#
# Acquire a listing of the dataset archives
#
tries += 1
http = Net::HTTP.new(target.host, target.port)
http.use_ssl = true

req = Net::HTTP::Get.new(target.request_uri)
req['X-Api-Key'] = config['sonar_api_key'] unless config['sonar_api_key'].to_s.strip.empty?
res = http.request(req)

unless (res and res.code.to_i == 200 and res.body.to_s.index('SHA1-Fingerprint'))
if res
raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
links = []
if !config['sonar_api_key'].to_s.strip.empty?
unless (res and res.code.to_i == 200 and res.body)
raise RuntimeError.new("Unexpected 'studies' API reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
end

#
# Find the newest archives (the single newest file for rdns; the newest
# file per record type for fdns)
#
archives = {}
if dset.include? 'rdns'
archives['rdns'] = JSON.parse(res.body)['sonarfile_set'].shift
else
JSON.parse(res.body)['sonarfile_set'].each do |archive|
next unless archive.include? '_'
record = (archive.split /_|\.json\.gz/).last
archives[record] = archive unless archives[record]
end
end

#
# Generate a download URL for a file (https://opendata.rapid7.com/apihelp/)
#
archives.values.each do |filename|
target = URI.parse("#{based_url}#{filename}/download/")
http = Net::HTTP.new(target.host, target.port)
http.use_ssl = true

req = Net::HTTP::Get.new(target.request_uri)
req['X-Api-Key'] = config['sonar_api_key']
res = http.request(req)

unless (res and res.code.to_i == 200 and res.body)
raise RuntimeError.new("Unexpected 'download' API reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
end

links << ( JSON.parse(res.body)['url'] )
end
else
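# No API key configured: fall back to scraping archive links from the public HTML listing.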
unless (res and res.code.to_i == 200 and res.body.to_s.index('SHA1-Fingerprint'))
raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
end
end

links = []
res.body.scan(/href=\"(#{dset}\d+\-\d+\-\d+\-\d+\-[^\"]+)\"/).each do |link|
link = link.first
if link =~ /\.json.gz/
links << ( config['sonar_base_url'] + link )
res.body.scan(/href=\"(#{dset}\d+\-\d+\-\d+\-\d+\-[^\"]+)\"/).each do |link|
link = link.first
if link =~ /\.json.gz/
links << ( config['sonar_base_url'] + link )
end
end
end

@@ -124,20 +131,14 @@ def download_rdns_index
end

def download
dir = storage_path
FileUtils.mkdir_p(dir)

fdns_links = download_fdns_index
rdns_links = download_rdns_index

queue = []
queue += rdns_links
queue += fdns_links

queue.each do |url|
dst = File.join(dir, url.split("/").last)
download_file(url, dst)
end
download_files(queue)
end

def normalize