diff --git a/README.md b/README.md
index 64d34b3..63b428b 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,8 @@ Download and normalize internet data from various sources. This package is norma
 ### Ruby
 
 #### Ubuntu 16.04 LTS
- * sudo apt-get install ruby
+ * sudo apt-get install ruby ruby-dev
+ * sudo gem install typhoeus
 
 #### Other Distributions
  * gpg --keyserver hkp://keys.gnupg.net --recv-keys 409B6B1796C275462A1703113804BB82D39DC0E3
@@ -93,9 +94,17 @@ Normalize jobs can be run manually through ``bin/normalize.sh``. To select which
 Project Sonar is a community project sponsored by Rapid7. The latest data can be found at [https://scans.io/](https://scans.io/). More information about Project Sonar can be found on the offical [website](https://sonar.labs.rapid7.com/).
 
-The download script pulls down the sonar.fdns and sonar.rdns datasets, which are updated weekly. In addition, this project pulls down the sonar.ssl and sonar.moressl "names" files (but not the rest of the certificate data). The normalization process converts the sonar.fdns and sonar.rdns files into a set of
-CSVs and MTBLs. These include both a forward and reverse lookup. These normalized files can be queried using standard unix utilities or MTBL front-ends such as mtbl_dump, rmtbl_dump, and mq.
+The download script pulls down the sonar.fdns and sonar.rdns datasets, which are updated monthly. In addition, this project pulls down the sonar.ssl and sonar.moressl "names" files (but not the rest of the certificate data). The normalization process converts the sonar.fdns and sonar.rdns files into a set of CSVs and MTBLs. These include both a forward and reverse lookup. These normalized files can be queried using standard unix utilities or MTBL front-ends such as mtbl_dump, rmtbl_dump, and mq.
+
+Users with [free API access](https://opendata.rapid7.com/apihelp/) can retrieve more frequently updated datasets. Add your API key to the `sonar_api_key` entry in `conf/inetdata.json`:
+
+```json
+{
+  "sonar_base_url": "https://opendata.rapid7.com",
+  "sonar_api_base_url": "https://us.api.insight.rapid7.com/opendata/studies",
+  "sonar_api_key": ""
+}
+```
 
 ### Censys
diff --git a/bin/download.rb b/bin/download.rb
index 80c9b2b..b77b6f1 100755
--- a/bin/download.rb
+++ b/bin/download.rb
@@ -12,7 +12,7 @@
   opts.on("-l", "--list-sources", "List available sources") do |opt|
     options[:list_sources] = true
   end
-  opts.on("-s", "--sources [sources]", "Comma-separated list of sources to download") do |opt|
+  opts.on("-s", "--sources [sources]", "Comma-separated list of sources to download; e.g. \"sonar, gov\"") do |opt|
     options[:selected_sources] = opt.split(/,\s+/).uniq.map{|x| x.downcase}
   end
 end.parse!
diff --git a/bin/normalize.rb b/bin/normalize.rb
index 4629796..008f152 100755
--- a/bin/normalize.rb
+++ b/bin/normalize.rb
@@ -32,7 +32,7 @@
   opts.on("-l", "--list-sources", "List available sources") do |opt|
     options[:list_sources] = true
   end
-  opts.on("-s", "--sources [sources]", "Comma-separated list of sources to normalize") do |opt|
+  opts.on("-s", "--sources [sources]", "Comma-separated list of sources to normalize; e.g. \"sonar, gov\"") do |opt|
     options[:selected_sources] = opt.split(/,\s+/).uniq.map{|x| x.downcase}
   end
 end.parse!
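The new `"sonar, gov"` example in the help text above matters because the option value is split on `/,\s+/`, a comma followed by whitespace. A minimal standalone sketch of that parsing (the `parse_sources` helper name is illustrative, not part of the patch):

```ruby
# Sketch of the --sources parsing shared by bin/download.rb and bin/normalize.rb,
# using the same opt.split(/,\s+/).uniq.map { ... } chain as the hunks above.
def parse_sources(opt)
  opt.split(/,\s+/).uniq.map { |x| x.downcase }
end

p parse_sources("sonar, gov")   # => ["sonar", "gov"]
p parse_sources("sonar,gov")    # => ["sonar,gov"]     -- no whitespace after the comma, so no split
p parse_sources("Sonar, sonar") # => ["sonar", "sonar"] -- uniq runs before downcase
```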
diff --git a/conf/inetdata.json.sample b/conf/inetdata.json.sample
index 5c34cc5..e567dc1 100644
--- a/conf/inetdata.json.sample
+++ b/conf/inetdata.json.sample
@@ -9,6 +9,8 @@
   "DISABLED_max_cores" : "4",
 
   "sonar_base_url": "https://opendata.rapid7.com",
+  "sonar_api_base_url": "https://us.api.insight.rapid7.com/opendata/studies",
+  "sonar_api_key": "",
 
   "censys_base_url": "https://www.censys.io/api/v1",
   "censys_api_id": "",
diff --git a/lib/inetdata.rb b/lib/inetdata.rb
index cbee45f..15ea390 100644
--- a/lib/inetdata.rb
+++ b/lib/inetdata.rb
@@ -13,7 +13,7 @@
 require 'inetdata/source'
 
 module InetData
-  VERSION = "1.2.2"
+  VERSION = "1.3.0"
 end
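The sonar.rb patch below swaps the hand-rolled Net::HTTP downloader for Typhoeus, queuing every archive on a Hydra and streaming each response body straight to disk. A minimal standalone sketch of that pattern, assuming the typhoeus gem is installed (the URL list and output directory are placeholders, not part of the patch):

```ruby
require 'typhoeus'
require 'fileutils'

# Sketch of the Typhoeus::Hydra streaming-download pattern adopted by the new
# Sonar#download_files below.
def fetch_all(urls, dir)
  FileUtils.mkdir_p(dir)
  hydra = Typhoeus::Hydra.hydra

  urls.each do |url|
    filename = File.join(dir, url.split("/").last.split("?").first)
    dst = File.open(filename, 'wb')
    req = Typhoeus::Request.new(url, followlocation: true)

    # Abort early on a non-200 response, before any body bytes arrive
    req.on_headers do |res|
      raise "Request failed: #{url}" unless res.code == 200
    end

    # Stream each chunk straight to disk rather than buffering in memory
    req.on_body { |chunk| dst.write(chunk) }

    req.on_complete do |_res|
      dst.close
      puts " > #{filename}: #{File.size(filename)} bytes"
    end

    hydra.queue(req)
  end

  # Run all queued requests; Hydra multiplexes them concurrently
  hydra.run
end

fetch_all(["https://example.com/a.json.gz"], "/tmp/sonar-test")
```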
diff --git a/lib/inetdata/source/sonar.rb b/lib/inetdata/source/sonar.rb
index fcba853..b2d2154 100644
--- a/lib/inetdata/source/sonar.rb
+++ b/lib/inetdata/source/sonar.rb
@@ -1,102 +1,109 @@
+require 'typhoeus'
+
 module InetData
   module Source
     class Sonar < Base
-      def download_file(src, dst,redirect_count=0)
-        tmp = dst + ".tmp"
-        target = URI.parse(src)
-        size = 0
-        ims = false
-        http = Net::HTTP.new(target.host, target.port)
-
-        if src.index("https") == 0
-          http.use_ssl = true
-        end
-
-        req = Net::HTTP::Get.new(target.request_uri)
-
-        if File.exists?(dst)
-          req['If-Modified-Since'] = File.stat(dst).mtime.rfc2822
-          ims = true
-        end
-
-        http.request(req) do |res|
-
-          if ims && res.code.to_i == 304
-            log(" > Skipped downloading of #{dst} due to not modified response")
-            return true
-          end
-
-          if ims && res['Content-Length']
-            if res['Content-Length'].to_i == File.size(dst)
-              log(" > Skipped downloading of #{dst} with same size of #{res['Content-Length']} bytes")
-              return true
-            end
-          end
-
-          if [301, 302].include?(res.code.to_i)
-
-            if redirect_count > 3
-              log(" > Skipped downloading of #{dst} due to rediret count being over limit: #{redirect_count}")
-              return true
-            end
-
-            new_src = res['Location'].to_s
+      def download_files(queue)
+        hydra = Typhoeus::Hydra.hydra
+        dir = storage_path
+        FileUtils.mkdir_p(dir)
 
-            if new_src.length == 0
-              log(" > Skipped downloading of #{dst} due to server redirect with no location")
-              return true
-            end
+        queue.each do |url|
+          filename = File.join(dir, url.split("/").last.split("?").first)
+          dst = File.open(filename, 'wb')
+          req = Typhoeus::Request.new(url, followlocation: true)
 
-            log(" > Download of #{src} moved to #{new_src}...")
-            return download_file(new_src, dst, redirect_count + 1)
+          req.on_headers do |res|
+            raise "Request failed: #{url}" unless res.code == 200
           end
 
-          if res.code.to_i != 200
-            log(" > Skipped downloading of #{dst} due to server response of #{res.code} #{res.message} #{res['Location']}")
-            return true
+          req.on_body do |chunk|
+            dst.write(chunk)
           end
 
-          outp = File.open(tmp, "wb")
-
-          res.read_body do |chunk|
-            outp.write(chunk)
-            size += chunk.length
+          req.on_complete do |res|
+            dst.close
+            size = File.size(filename)
+            log(" > Downloading of #{filename} completed with #{size} bytes")
           end
 
-          outp.close
+          hydra.queue req
         end
 
-        File.rename(tmp, dst)
-
-        log(" > Downloading of #{dst} completed with #{size} bytes")
+        hydra.run
      end
 
      def download_index(dset)
-        target = URI.parse(config['sonar_base_url'] + dset)
+        unless config['sonar_api_key'].strip.empty?
+          based_url = config['sonar_api_base_url'] + dset
+          target = URI.parse(based_url)
+        else
+          target = URI.parse(config['sonar_base_url'] + dset)
+        end
+
         tries = 0
         begin
+          #
+          # Acquire a listing of the dataset archives
+          #
           tries += 1
           http = Net::HTTP.new(target.host, target.port)
           http.use_ssl = true
 
           req = Net::HTTP::Get.new(target.request_uri)
+          req['X-Api-Key'] = config['sonar_api_key'] unless config['sonar_api_key'].strip.empty?
           res = http.request(req)
 
-          unless (res and res.code.to_i == 200 and res.body.to_s.index('SHA1-Fingerprint'))
-            if res
-              raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
+          links = []
+          if !config['sonar_api_key'].strip.empty?
+            unless (res and res.code.to_i == 200 and res.body)
+              raise RuntimeError.new("Unexpected 'studies' API reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
+            end
+
+            #
+            # Find the newest archives
+            #
+            archives = {}
+            if dset.include? 'rdns'
+              archives['rdns'] = JSON.parse(res.body)['sonarfile_set'].shift
             else
+              JSON.parse(res.body)['sonarfile_set'].each do |archive|
+                next unless archive.include? '_'
+                record = (archive.split /_|\.json\.gz/).last
+                archives[record] = archive unless archives[record]
+              end
+            end
+
+            #
+            # Generate a download URL for a file (https://opendata.rapid7.com/apihelp/)
+            #
+            archives.values.each do |filename|
+              target = URI.parse("#{based_url}#{filename}/download/")
+              http = Net::HTTP.new(target.host, target.port)
+              http.use_ssl = true
+
+              req = Net::HTTP::Get.new(target.request_uri)
+              req['X-Api-Key'] = config['sonar_api_key']
+              res = http.request(req)
+
+              unless (res and res.code.to_i == 200 and res.body)
+                raise RuntimeError.new("Unexpected 'download' API reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
+              end
+
+              links << ( JSON.parse(res.body)['url'] )
+            end
+          else
+            unless (res and res.code.to_i == 200 and res.body.to_s.index('SHA1-Fingerprint'))
              raise RuntimeError.new("Unexpected reply: #{res.code} - #{res['Content-Type']} - #{res.body.inspect}")
             end
-          end
 
-          links = []
-          res.body.scan(/href=\"(#{dset}\d+\-\d+\-\d+\-\d+\-[^\"]+)\"/).each do |link|
-            link = link.first
-            if link =~ /\.json.gz/
-              links << ( config['sonar_base_url'] + link )
+            res.body.scan(/href=\"(#{dset}\d+\-\d+\-\d+\-\d+\-[^\"]+)\"/).each do |link|
+              link = link.first
+              if link =~ /\.json.gz/
+                links << ( config['sonar_base_url'] + link )
+              end
             end
           end
@@ -124,9 +131,6 @@ def download_rdns_index
       end
 
       def download
-        dir = storage_path
-        FileUtils.mkdir_p(dir)
-
         fdns_links = download_fdns_index
         rdns_links = download_rdns_index
 
@@ -134,10 +138,7 @@ def download
         queue += rdns_links
         queue += fdns_links
 
-        queue.each do |url|
-          dst = File.join(dir, url.split("/").last)
-          download_file(url, dst)
-        end
+        download_files(queue)
       end
 
       def normalize
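For reference, the API branch added to `download_index` above is a two-step exchange: list a study's `sonarfile_set` with an `X-Api-Key` header, then request `<study>/<filename>/download/` to obtain a short-lived signed URL. A condensed sketch of just that flow (the `sonar.rdns_v2` study path and the `SONAR_API_KEY` environment variable are illustrative; inetdata itself reads the key from `conf/inetdata.json`):

```ruby
require 'net/http'
require 'json'
require 'uri'

API_BASE = 'https://us.api.insight.rapid7.com/opendata/studies'
API_KEY  = ENV.fetch('SONAR_API_KEY') # placeholder for the sonar_api_key config entry

# Perform an authenticated GET against the Open Data API and parse the JSON reply
def api_get(url)
  target = URI.parse(url)
  http = Net::HTTP.new(target.host, target.port)
  http.use_ssl = true
  req = Net::HTTP::Get.new(target.request_uri)
  req['X-Api-Key'] = API_KEY
  res = http.request(req)
  raise "Unexpected API reply: #{res.code}" unless res.code.to_i == 200
  JSON.parse(res.body)
end

# Step 1: list the archives published for a study
files = api_get("#{API_BASE}/sonar.rdns_v2/")['sonarfile_set']

# Step 2: exchange an archive filename for a temporary signed download URL
puts api_get("#{API_BASE}/sonar.rdns_v2/#{files.first}/download/")['url']
```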