diff --git a/httpretrieve.r2py b/httpretrieve.r2py index 4bf3c4d..d13cc2f 100644 --- a/httpretrieve.r2py +++ b/httpretrieve.r2py @@ -16,14 +16,13 @@ """ - +librepysocket = dy_import_module("librepysocket.r2py") urlparse = dy_import_module('urlparse.r2py') -sockettimeout = dy_import_module('sockettimeout.r2py') urllib = dy_import_module('urllib.r2py') -class HttpConnectionError(Exception): +class HttpConnectionError(RepyException): """ Error indicating that the web server has unexpectedly dropped the connection. @@ -32,7 +31,7 @@ class HttpConnectionError(Exception): -class HttpBrokenServerError(Exception): +class HttpBrokenServerError(RepyException): """ Error indicating that the web server has sent us complete garbage instead of something resembling HTTP. @@ -63,19 +62,22 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ both postdata and querydata are omitted, there is no query string sent in the request. - For both querydata and postdata, strings are sent *unmodified*. - This means you probably should encode them first, with - urllib_quote(). + By default, we use + `Content-Type: application/x-www-form-urlencoded` + for querydata and postdata. However, we do not MIME-encode the + data; the caller must take care of that (using `urllib_quote`). + If you encode querydata/postdata differently, also supply a + `httpheaders` dict, with the key `Content-Type` mapping to + the MIME type you use. httpheaders (optional): A dictionary of supplemental HTTP request headers to add to the request. proxy (optional): A proxy server 2-tuple to bind to: ('host', port). timeout (optional): - A timeout for establishing a connection to the web server, - sending headers, and reading the response headers. - - If excluded or None, never times out. + A timeout for establishing a connection to the web server. + Sending headers and reading the response headers are not + subject to timing out at the moment. 
ValueError if given an invalid URL, or malformed limit or timeout @@ -127,21 +129,16 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ try: if proxy is not None: # if there is a proxy, open a connection with the proxy instead of the actual server - # use the timeout we are given (or none) - sockobj = sockettimeout.timeout_openconnection(proxy[0], proxy[1], timeout=timeout) + sockobj = librepysocket.openconn(proxy[0], proxy[1], timeout=timeout) else: # if there is no proxy open a connection with server directly - # use the timeout we are given (or none) - sockobj = sockettimeout.timeout_openconnection(gethostbyname(hoststr), portint, timeout=timeout) + sockobj = librepysocket.openconn(gethostbyname(hoststr), portint, timeout=timeout) - except Exception, e: + except TimeoutError: # If a socket object was created, we want to clean in up. if sockobj: sockobj.close() - - if repr(e).startswith("timeout("): - raise HttpConnectionError("Socket timed out connecting to host/port.") - raise + raise HttpConnectionError("Socket timed out connecting to host/port.") try: # Builds the HTTP request: @@ -154,17 +151,6 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ # Now, we're done with the HTTP request part of the session, and we need # to get the HTTP response. - # Check if we've timed out (if the user requested a timeout); update the - # socket timeout to reflect the time taken sending the request. - # XXX I doubt it makes sense to do that. sockettimeout provides - # XXX timeouts for us. Review this, simplify if possible. - if timeout is None: - sockobj.settimeout(0) - elif getruntime() - starttimefloat >= timeout: - raise TimeoutError("Timed out") - else: - sockobj.settimeout(timeout - (getruntime() - starttimefloat)) - # Receive the header lines from the web server (a series of CRLF-terminated # lines, terminated by an empty line, or by the server closing the # connection. 
@@ -174,11 +160,10 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ # This should probably be replaced with page-sized reads in the future, # but for now, the behavior is at least correct. headersstr += sockobj.recv(1) - except Exception, e: - if str(e) == "Socket closed": - break - else: - raise + except SocketWouldBlockError: + sleep(0.01) + except SocketClosedRemote: + break httpheaderlist = headersstr.split("\r\n") # Ignore (a) trailing blank line(s) (for example, the response header- @@ -205,7 +190,7 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ friendlystatusstr = statuslinelist[2] try: statusint = int(statuslinelist[1]) - except ValueError, e: + except ValueError: raise HttpBrokenServerError("Server returned garbage for HTTP " + \ "response (status code isn't integer).") @@ -219,7 +204,7 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ sockobj.close() try: redirecturlstr = httpheaderdict["Location"][0] - except (KeyError, IndexError), ke: + except (KeyError, IndexError): # When a server returns a redirect status code (3xx) but no Location # header, some clients, e.g. Firefox, just show the response body # as they would normally for a 2xx or 4xx response. So, I think we @@ -228,7 +213,11 @@ def httpretrieve_open(url, querydata=None, postdata=None,\ pass else: # If the server did send a redirect location, let's go there. - return httpretrieve_open(redirecturlstr) + try: + return httpretrieve_open(redirecturlstr) + except Exception, e: + # XXX Clean this mess up. Let's define an HttpRedirectionError or some such. + raise Exception("Error following redirect: " + repr(e)) # If we weren't requested to redirect, and we didn't, return a read-only # file-like object (representing the response body) to the caller. 
@@ -407,12 +396,12 @@ class _httpretrieve_filelikeobject: while True: try: contentchunkstr = self._sockobj.recv(lefttoread or 4096) - except Exception, e: - if str(e) == "The socket has been closed remotely!": - self._totalcontentisreceived = True - break - else: - raise + except SocketWouldBlockError: + sleep(0.01) + continue + except SocketClosedRemote: + self._totalcontentisreceived = True + break httpcontentstr += contentchunkstr self._totalread += len(contentchunkstr) @@ -520,21 +508,27 @@ def _httpretrieve_build_request(host, port, path, querydata, postdata, \ # Sanity checks: if path == "": raise ValueError("Invalid path -- empty string.") - if postdata is not None and type(postdata) not in (str, dict): - raise TypeError("Postdata should be a dict of form-data or a string") - if querydata is not None and type(querydata) not in (str, dict): - raise TypeError("Querydata should be a dict of form-data or a string") - if httpheaders is not None and type(httpheaders) is not dict: - raise TypeError("Expected HTTP headers as a dictionary.") - - # Type-conversions: - if type(querydata) is dict: - querydata = urllib.urllib_quote_parameters(querydata) - elif querydata is None: - querydata = "" - - if type(postdata) is dict: - postdata = urllib.urllib_quote_parameters(postdata) + if postdata is not None and type(postdata) is not str: + raise TypeError("postdata should be a MIME-encoded string") + + # Initialize querydata if not given, ... + querydata = querydata or "" + + # ... and ensure it is a str + if type(querydata) is not str: + raise TypeError("querydata should be a MIME-encoded string") + + # Initialize the HTTP headers dict if not given, ... + httpheaders = httpheaders or {} + + # ... 
and ensure it is a dict + if type(httpheaders) is not dict: + raise TypeError("httpheaders must be a dict") + + # If we have query or post data, set a default encoding if not given + if (querydata or postdata) and "Content-Type" not in httpheaders: + httpheaders['Content-Type'] = 'application/x-www-form-urlencoded' + # Default to GET, unless the caller specifies a message body to send. methodstr = "GET" @@ -554,24 +548,23 @@ def _httpretrieve_build_request(host, port, path, querydata, postdata, \ # there is no proxy; send normal http request requeststr = methodstr + ' ' + path + resourcestr + ' HTTP/1.0\r\n' - if httpheaders is not None: - # Most servers require a 'Host' header for normal functionality - # (especially in the case of multiple domains being hosted on a - # single server). - if "Host" not in httpheaders: - requeststr += "Host: " + host + ':' + str(port) + "\r\n" + # Most servers require a 'Host' header for normal functionality + # (especially in the case of multiple domains being hosted on a + # single server). + if "Host" not in httpheaders: + requeststr += "Host: " + host + ':' + str(port) + "\r\n" - for key, val in httpheaders.items(): - requeststr += key + ": " + val + '\r\n' + for key, val in httpheaders.items(): + requeststr += key + ": " + val + '\r\n' - # Affix post-data related headers and content: + # Add post-data related headers and content: if methodstr == "POST": requeststr += 'Content-Length: ' + str(len(postdata)) + '\r\n' # The empty line terminates HTTP headers. requeststr += '\r\n' - # If we're a POST request, affix any requested data to the message body. + # If we're a POST request, add any requested data to the message body. 
if methodstr == "POST": requeststr += postdata diff --git a/tests/httpretrieve_test/ut_httpretrieve_content.py b/tests/ut_seattlelib_httpretrieve_content.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_content.py rename to tests/ut_seattlelib_httpretrieve_content.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_content_timeout.py b/tests/ut_seattlelib_httpretrieve_content_timeout.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_content_timeout.py rename to tests/ut_seattlelib_httpretrieve_content_timeout.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_filelikeobj_closed.py b/tests/ut_seattlelib_httpretrieve_filelikeobj_closed.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_filelikeobj_closed.py rename to tests/ut_seattlelib_httpretrieve_filelikeobj_closed.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_header_timeout.py b/tests/ut_seattlelib_httpretrieve_header_timeout.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_header_timeout.py rename to tests/ut_seattlelib_httpretrieve_header_timeout.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_post.py b/tests/ut_seattlelib_httpretrieve_post.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_post.py rename to tests/ut_seattlelib_httpretrieve_post.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_read_done.py b/tests/ut_seattlelib_httpretrieve_read_done.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_read_done.py rename to tests/ut_seattlelib_httpretrieve_read_done.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_read_limit.py b/tests/ut_seattlelib_httpretrieve_read_limit.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_read_limit.py rename to tests/ut_seattlelib_httpretrieve_read_limit.py diff --git a/tests/httpretrieve_test/ut_httpretrieve_redirect_notgiven.py 
b/tests/ut_seattlelib_httpretrieve_redirect_notgiven.py similarity index 100% rename from tests/httpretrieve_test/ut_httpretrieve_redirect_notgiven.py rename to tests/ut_seattlelib_httpretrieve_redirect_notgiven.py