A fork of pappy proxy
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1999 lines
64 KiB

9 years ago
import base64
import collections
import crochet
import datetime
import gzip
import json
import pappyproxy
9 years ago
import re
import StringIO
import urlparse
import zlib
from twisted.internet import defer, reactor
from pappyproxy.util import PappyException
import bs4
9 years ago
dbpool = None


def init(pool):
    """
    Initialize the http module.

    The pool is only stored if no pool has been set yet; calling this again
    while a pool is already in place leaves the existing pool untouched.

    :param pool: The ConnectionPool to use to store the request/response objects
    :type pool: SQLite ConnectionPool
    """
    global dbpool
    if dbpool is None:
        dbpool = pool
def destruct():
def _decode_encoded(data, encoding):
    """
    Decode a message body according to its content encoding.

    :param data: The encoded body
    :type data: string
    :param encoding: One of the module's ``ENCODE_*`` constants
    :returns: The decoded body. ``ENCODE_NONE`` returns ``data`` unchanged,
              ``ENCODE_DEFLATE`` inflates a raw deflate stream and any other
              value is decoded as gzip.
    """
    if encoding == ENCODE_NONE:
        return data

    if encoding == ENCODE_DEFLATE:
        # Negative window size: raw deflate stream with no zlib header
        return zlib.decompress(data, -15)

    # The two decoders are mutually exclusive; gzip must not run on data that
    # was already inflated above.
    gz_file = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(data))
    try:
        return gz_file.read()
    finally:
        gz_file.close()
9 years ago
def _strip_leading_newlines(string):
    """
    Remove every leading ``\\r\\n`` or ``\\n`` from ``string``.

    A lone ``\\r`` that is not followed by ``\\n`` is left in place and stops
    the stripping.

    :param string: The text to strip
    :type string: string
    :returns: ``string`` without its leading line terminators
    """
    while True:
        if string.startswith('\r\n'):
            string = string[2:]
        elif string.startswith('\n'):
            string = string[1:]
        else:
            return string
def _consume_line(instr):
    """
    Split ``instr`` at its first newline.

    :param instr: The text to consume a line from
    :type instr: string
    :returns: A ``(line, rest)`` tuple. ``line`` is everything before the first
              ``\\n`` with one trailing ``\\r`` removed, ``rest`` is everything
              after that ``\\n``. If there is no newline the whole input is the
              line and ``rest`` is empty, so callers can always unpack two
              values.
    """
    newline_pos = instr.find('\n')
    if newline_pos == -1:
        return (instr, '')

    line = instr[:newline_pos]
    if line.endswith('\r'):
        line = line[:-1]
    return (line, instr[newline_pos + 1:])
## Functions to use
def get_request(url='', url_params=None):
    """
    get_request(url='', url_params=None)

    Create a request object that makes a GET request to the given url with the
    given url params.

    :param url: The url to request
    :type url: string
    :param url_params: Optional mapping of url parameters to set on the request
    :type url_params: dict
    :rtype: Request
    """
    r = Request()
    r.status_line = 'GET / HTTP/1.1'
    r.url = url
    r.headers['Host'] = r.host
    if url_params:
        # Replaces whatever parameters were parsed out of ``url``
        r.url_params.from_dict(url_params)
    return r
def post_request(url, post_params=None, url_params=None):
    """
    post_request(url, post_params=None, url_params=None)

    Create a request object that makes a POST request to the given url with the
    given post and url params.

    :param url: The url to request
    :type url: string
    :param post_params: Optional mapping of post parameters to set on the request
    :type post_params: dict
    :param url_params: Optional mapping of url parameters to set on the request
    :type url_params: dict
    :rtype: Request
    """
    r = Request()
    r.status_line = 'POST / HTTP/1.1'
    r.url = url
    r.headers['Host'] = r.host
    if url_params:
        # Replaces whatever parameters were parsed out of ``url``
        r.url_params.from_dict(url_params)
    if post_params:
        r.post_params.from_dict(post_params)
    return r
def repeatable_parse_qs(s):
    """
    Parse a query string into a RepeatableDict, keeping order and duplicates.

    Each ``&``-separated piece becomes exactly one entry: ``key=value`` pieces
    are split on the first ``=``, pieces without ``=`` are stored with a value
    of ``None``.

    :param s: The query string, without the leading ``?``
    :type s: string
    :rtype: RepeatableDict
    """
    ret_dict = RepeatableDict()
    for pair in s.split('&'):
        key, has_value, val = pair.partition('=')
        if has_value:
            ret_dict.append(key, val)
        else:
            ret_dict.append(pair, None)
    return ret_dict
## Classes
9 years ago
class RepeatableDict:
    """
    A dict that retains the order of items inserted and keeps track of
    duplicate values. Can optionally treat keys as case insensitive.
    Custom made for the proxy, so it has strange features
    """

    def __init__(self, from_pairs=None, case_insensitive=False):
        # If efficiency becomes a problem, add a dict that keeps a list by key
        # and use that for getting data. But until then, this stays.
        self._pairs = []
        self._keys = set()
        self._modify_callback = None
        self.case_insensitive = case_insensitive

        if from_pairs:
            for k, v in from_pairs:
                self.append(k, v)

    def _ef_key(self, key):
        # "effective key", returns key.lower() if we're case insensitive,
        # otherwise it returns the same key
        if self.case_insensitive:
            return key.lower()
        return key

    def _mod_callback(self):
        # Calls the modify callback if we have one
        if self._modify_callback:
            self._modify_callback()

    def _rebuild_keys(self):
        # Recompute the effective-key set from the stored pairs. Used after
        # the pair list has been replaced wholesale.
        self._keys = set(self._ef_key(k) for k, _ in self._pairs)

    def __contains__(self, val):
        return self._ef_key(val) in self._keys

    def __getitem__(self, key):
        # The most recently added value wins when a key is repeated
        wanted = self._ef_key(key)
        for k, v in reversed(self._pairs):
            if self._ef_key(k) == wanted:
                return v
        raise KeyError(key)

    def __setitem__(self, key, val):
        # Replaces first instance of `key` and deletes the rest
        self.set_val(key, val)

    def __delitem__(self, key):
        # Removes every entry for `key`; raises KeyError if there is none
        self._remove_key(key)
        wanted = self._ef_key(key)
        self._pairs = [p for p in self._pairs if self._ef_key(p[0]) != wanted]
        self._mod_callback()

    def __nonzero__(self):
        return bool(self._pairs)

    # Python 3 spelling of the truthiness hook
    __bool__ = __nonzero__

    def _add_key(self, key):
        self._keys.add(self._ef_key(key))

    def _remove_key(self, key):
        ef_key = self._ef_key(key)
        if ef_key not in self._keys:
            raise KeyError(key)
        self._keys.remove(ef_key)

    def all_pairs(self):
        # Returns a copy so callers can't modify the internal list
        return self._pairs[:]

    def append(self, key, val, do_callback=True):
        # Add a duplicate entry for key
        self._add_key(key)
        self._pairs.append((key, val))
        if do_callback:
            self._mod_callback()

    def set_val(self, key, val, do_callback=True):
        # Put (key, val) where the first entry for key was and drop any other
        # entries for key. If key is new, add it to the end.
        wanted = self._ef_key(key)
        new_pairs = []
        added = False
        self._add_key(key)
        for p in self._pairs:
            if self._ef_key(p[0]) == wanted:
                if not added:
                    # only add the first instance
                    new_pairs.append((key, val))
                    added = True
            else:
                new_pairs.append(p)
        if not added:
            new_pairs.append((key, val))
        self._pairs = new_pairs

        if do_callback:
            self._mod_callback()

    def update(self, key, val, do_callback=True):
        # If key is already in the dict, replace that value with the new value
        # while keeping the spelling of the stored key. Otherwise add it.
        if key in self:
            wanted = self._ef_key(key)
            for k, v in self.all_pairs():
                if self._ef_key(k) == wanted:
                    self.set_val(k, val, do_callback=do_callback)
                    break
        else:
            self.set_val(key, val, do_callback=do_callback)

    def clear(self, do_callback=True):
        self._pairs = []
        # Forget the keys too, otherwise `in` keeps reporting removed keys
        self._keys = set()
        if do_callback:
            self._mod_callback()

    def all_vals(self, key):
        wanted = self._ef_key(key)
        return [p[1] for p in self._pairs if self._ef_key(p[0]) == wanted]

    def add_pairs(self, pairs, do_callback=True):
        # Materialize first so that generators can be passed in as well
        pairs = list(pairs)
        for pair in pairs:
            self._add_key(pair[0])
        self._pairs += pairs
        if do_callback:
            self._mod_callback()

    def from_dict(self, d):
        # Replace the contents with the items of a plain dict
        self._pairs = list(d.items())
        self._rebuild_keys()
        self._mod_callback()

    def sort(self):
        # Sorts pairs by key alphabetically. The sort is stable so duplicate
        # keys keep their relative order.
        self._pairs = sorted(self._pairs, key=lambda x: x[0])

    def set_modify_callback(self, callback):
        # Add a function to be called whenever an element is added, changed, or
        # deleted. Set to None to remove
        self._modify_callback = callback
class LengthData:
    """
    Collects a message body whose size is known up front (ie from a
    Content-Length header).

    ``raw_data`` holds what has been received so far and ``complete`` becomes
    True once ``length`` characters have been collected. A length of 0 or None
    is complete immediately.
    """

    def __init__(self, length=None):
        self.raw_data = ''
        self.complete = False
        self.length = length or 0

        if self.length == 0:
            self.complete = True

    def add_data(self, data):
        """
        Add received data. Anything past the expected length is discarded.

        :param data: The data to add
        :type data: string
        :raises PappyException: if the body is already complete
        """
        if self.complete:
            raise PappyException("Data already complete!")

        remaining_length = self.length - len(self.raw_data)
        if len(data) >= remaining_length:
            # Enough to finish the body, only take what is still missing
            self.raw_data += data[:remaining_length]
            assert(len(self.raw_data) == self.length)
            self.complete = True
        else:
            self.raw_data += data
class ChunkedData:
    """
    Incremental decoder for a body sent with chunked transfer encoding.

    Feed data with ``add_data``. Once the zero length chunk has been seen,
    ``complete`` is True and ``raw_data`` holds the decoded body. Chunk
    extensions and trailers are not supported.
    """

    def __init__(self):
        self.raw_data = ''
        self._pos = 0
        self._state = 0 # 0=reading length, 1=reading data, 2=going over known string
        self._len_str = ''
        self._chunk_remaining = 0
        self._known_str = ''
        self._known_str_pos = 0
        self._next_state = 0
        self._raw_data = ''
        self.complete = False
        self.unchunked_data = ''

    def add_data(self, data):
        """
        Add received data and decode as much of it as possible.

        :param data: The data to add
        :type data: string
        :raises Exception: if the data is not valid chunked encoding
        """
        self._raw_data += data
        self.scan_forward()

    def scan_forward(self):
        """
        Run the decoder over all data that has not been looked at yet.
        """
        # Don't add more data if we're already done
        if self.complete:
            return

        while self._pos < len(self._raw_data):
            curchar = self._raw_data[self._pos]
            if self._state == 0:
                if curchar.lower() in '0123456789abcdef':
                    # Read the next char of the length
                    self._len_str += curchar

                    # Move to the next char
                    self._pos += 1
                elif curchar == '\r':
                    # A chunk header needs at least one length digit
                    if not self._len_str:
                        raise Exception("Malformed chunked encoding!")

                    # Save how much chunk to read
                    self._chunk_remaining = int(self._len_str, 16)

                    # If the length is 0, chunked encoding is done!
                    if self._chunk_remaining == 0:
                        self.complete = True
                        # I should probably just rename raw_data since it's what
                        # you use to look at unchunked data, but you're not
                        # supposed to look at it until after it's complete
                        # anyways
                        self._raw_data = self.unchunked_data
                        self.raw_data = self._raw_data # Expose raw_data
                        return

                    # There should be a newline after the \r
                    self._known_str = '\n'
                    self._state = 2
                    self._next_state = 1

                    # Reset the length str
                    self._len_str = ''

                    # Move to the next char
                    self._pos += 1
                else:
                    raise Exception("Malformed chunked encoding!")

            elif self._state == 1:
                if self._chunk_remaining > 0:
                    # Take as much of the chunk as is available in one go
                    available = len(self._raw_data) - self._pos
                    take = min(self._chunk_remaining, available)
                    self.unchunked_data += self._raw_data[self._pos:self._pos + take]
                    self._chunk_remaining -= take
                    self._pos += take
                else:
                    # Read newline then read a new chunk
                    self._known_str = '\r\n'
                    self._next_state = 0 # Read len after newlines
                    self._state = 2 # Read newlines
                    # Don't move to the next char because we didn't do anything

            elif self._state == 2:
                # Read a char of an expected string

                # If the expected char doesn't match, throw an error
                if self._known_str[self._known_str_pos] != curchar:
                    raise Exception("Unexpected data")

                # Move to the next char in the raw data and in our known string
                self._known_str_pos += 1
                self._pos += 1

                # If we've reached the end of the known string, go to the next state
                if self._known_str_pos == len(self._known_str):
                    self._known_str_pos = 0
                    self._state = self._next_state
class ResponseCookie(object):
    """
    A cookie representing a cookie set by a response

    The parsed parts of the cookie are available as the attributes ``key``,
    ``val``, ``expires``, ``max_age``, ``domain``, ``path``, ``secure`` and
    ``http_only``.
    """

    def __init__(self, set_cookie_string=None):
        self._reset()
        if set_cookie_string:
            self.cookie_str = set_cookie_string

    def _reset(self):
        # Put every field back to its "not set" value
        self.key = None
        self.val = None
        self.expires = None
        self.max_age = None
        self.domain = None
        self.path = None
        self.secure = False
        self.http_only = False

    @property
    def cookie_str(self):
        """
        Returns the full string of the cookie. ie ``foo=bar; secure; path=/``

        :getter: Returns the full string of the cookie.
        :setter: Set the metadata from a cookie string. ie from a ``Set-Cookie`` header
        """
        # NOTE(review): the attribute spellings below were reconstructed (the
        # lines were missing from the reviewed text). Attribute names are
        # case-insensitive for clients, but confirm against the canonical file.
        av = '%s=%s' % (self.key, self.val)
        to_add = [av]
        if self.expires:
            to_add.append('expires=%s' % self.expires)
        if self.max_age is not None:
            # Max-Age=0 is meaningful (expire now), so only skip "not set"
            to_add.append('Max-Age=%d' % self.max_age)
        if self.domain:
            to_add.append('Domain=%s' % self.domain)
        if self.path:
            to_add.append('Path=%s' % self.path)
        if self.secure:
            to_add.append('secure')
        if self.http_only:
            to_add.append('httponly')
        return '; '.join(to_add)

    @cookie_str.setter
    def cookie_str(self, val):
        self._from_cookie(val)

    def _parse_cookie_av(self, cookie_av):
        # Handles one attribute of a Set-Cookie header (ie `Path=/` or
        # `secure`). Unknown attributes are ignored.
        if '=' in cookie_av:
            key, val = cookie_av.split('=', 1)
            name = key.lstrip().lower()
            if name == 'expires':
                self.expires = val
            elif name == 'max-age':
                try:
                    self.max_age = int(val)
                except ValueError:
                    # The header comes from the remote server, don't let a
                    # bad number break parsing of the rest of the cookie
                    self.max_age = None
            elif name == 'domain':
                self.domain = val
            elif name == 'path':
                self.path = val
        else:
            flag = cookie_av.lstrip().lower()
            if flag == 'secure':
                self.secure = True
            elif flag == 'httponly':
                self.http_only = True

    def _from_cookie(self, set_cookie_string):
        # Replaces all of our values with those from a Set-Cookie value
        self._reset()

        cookie_pair, has_attrs, rest = set_cookie_string.partition(';')
        key, has_val, val = cookie_pair.partition('=')
        if has_val:
            self.key, self.val = key, val
        elif not cookie_pair.strip():
            # Nothing but whitespace before the first `;`
            self.key = ''
            self.val = ''
        else:
            # A bare name without `=`
            self.key = cookie_pair
            self.val = ''

        if has_attrs:
            for cookie_av in rest.split(';'):
                self._parse_cookie_av(cookie_av)
class Request(object):
:ivar time_end: The datetime that the request ended.
:vartype time_end: datetime.datetime
:ivar time_start: The datetime that the request was made
:vartype time_start: datetime.datetime
:ivar complete: When creating the request with :func:`~pappyproxy.http.Request.add_line`
and :func:`~pappyproxy.http.Request.add_data`, returns whether
the request is complete.
:vartype complete: Bool
:ivar cookies: Cookies sent with the request
:vartype cookies: RepeatableDict
:ivar fragment: The fragment part of the url (The part that comes after the #)
:vartype fragment: String
:ivar url_params: The url parameters of the request (aka the get parameters)
:vartype url_params: RepeatableDict
:ivar headers: The headers of the request
:vartype headers: RepeatableDict
:ivar headers_complete: When creating the request with
:func:`~pappyproxy.http.Request.add_line` and
:func:`~pappyproxy.http.Request.add_data`, returns whether the headers
are complete
:vartype headers_complete: Bool
:ivar path: The path of the request
:vartype path: String
:ivar port: The port that the request was sent to (or will be sent to)
:vartype port: Integer
:ivar post_params: The post parameters of the request
:vartype post_params: RepeatableDict
:ivar reqid: The request id of the request
:vartype reqid: String
:ivar response: The associated response of this request
:vartype response: Response
:ivar submitted: Whether the request has been submitted
:vartype submitted: Bool
:ivar unmangled: If the request was mangled, the version of the request
before it was mangled.
:vartype unmangled: Request
:ivar verb: The HTTP verb of the request (ie POST, GET)
:vartype verb: String
:ivar version: The HTTP version of the request (ie HTTP/1.1)
:vartype version: String
:ivar tags: Tags associated with the request
:vartype tags: List of Strings
9 years ago
def __init__(self, full_request=None, update_content_length=True,
port=None, is_ssl=None):
9 years ago
self.time_end = None
self.time_start = None
self.complete = False
self.cookies = RepeatableDict()
self.fragment = None
self.url_params = RepeatableDict()
9 years ago
self.headers = RepeatableDict(case_insensitive=True)
self.headers_complete = False
self._host = None
self._is_ssl = False
9 years ago
self.path = ''
self.port = None
self.post_params = RepeatableDict()
self._raw_data = ''
self.reqid = None
self.response = None
self.submitted = False
self.unmangled = None
self.verb = ''
self.version = ''
self.tags = []
9 years ago
self._first_line = True
self._data_length = 0
self._partial_data = ''
# Set values from init
if is_ssl:
self.is_ssl = True
if port:
self.port = port
9 years ago
# Get values from the raw request
if full_request is not None:
self._from_full_request(full_request, update_content_length)
def __copy__(self):
if not self.complete:
raise PappyException("Cannot copy incomplete requests")
newreq = Request(self.full_request)
newreq.is_ssl = self.is_ssl
newreq.port = self.port
newreq._host = self._host
newreq.time_start = self.time_start
newreq.time_end = self.time_end
if self.unmangled:
newreq.unmangled = self.unmangled.copy()
if self.response:
newreq.response = self.response.copy()
return newreq
def __eq__(self, other):
if self.full_request != other.full_request:
return False
if self.port != other.port:
return False
if self.is_ssl != other.is_ssl:
return False
if self._host != other._host:
return False
return True
def copy(self):
Returns a copy of the request
:rtype: Request
return self.__copy__()
9 years ago
def rsptime(self):
    """
    The response time of the request

    :getter: Returns the response time of the request, or None if the start
             or the end time is not known
    :type: datetime.timedelta
    """
    if not (self.time_start and self.time_end):
        return None
    return self.time_end - self.time_start
def status_line(self):
The status line of the request. ie `GET / HTTP/1.1`
:getter: Returns the status line of the request
:setter: Sets the status line of the request
:type: string
if not self.verb and not self.path and not self.version:
return ''
return '%s %s %s' % (self.verb, self.full_path, self.version)
def status_line(self, val):
def full_path(self):
The full path of the request including URL params and fragment.
ie `/path/to/stuff?foo=bar&baz=something#somewhere`
:getter: Returns the full path of the request
:type: string
9 years ago
path = self.path
if self.url_params:
9 years ago
path += '?'
pairs = []
for pair in self.url_params.all_pairs():
9 years ago
if pair[1] is None:
path += '&'.join(pairs)
if self.fragment:
path += '#'
path += self.fragment
return path
9 years ago
def raw_headers(self):
The raw text of the headers including the extra newline at the end.
:getter: Returns the raw text of the headers including the extra newline at the end.
:type: string
9 years ago
ret = self.status_line + '\r\n'
for k, v in self.headers.all_pairs():
ret = ret + "%s: %s\r\n" % (k, v)
ret = ret + '\r\n'
return ret
def full_request(self):
The full text of the request including the headers and data.
:getter: Returns the full text of the request
:type: string
if not self.status_line:
return ''
9 years ago
ret = self.raw_headers
ret = ret + self.raw_data
return ret
def raw_data(self):
The data portion of the request
:getter: Returns the data portion of the request
:setter: Set the data of the request and update metadata
:type: string
9 years ago
return self._raw_data
def raw_data(self, val):
self._raw_data = val
9 years ago
self.complete = True
def url(self):
The full url of the request including url params, protocol, etc.
ie `https://www.google.com`, `http://foo.fakewebsite.com:1234/path?a=b`.
When setting the URL, the port, is_ssl, path, url params, host, etc are all
automatically updated.
:getter: Returns the url of the request
:setter: Sets the url of the request and updates metadata
:type: string
if self.is_ssl:
retstr = 'https://'
retstr = 'http://'
retstr += self.host
if not ((self.is_ssl and self.port == 443) or \
(not self.is_ssl and self.port == 80)):
retstr += ':%d' % self.port
if self.path and self.path != '/':
retstr += self.path
if self.url_params:
retstr += '?'
pairs = []
for p in self.url_params.all_pairs():
retstr += '&'.join(pairs)
if self.fragment:
retstr += '#%s' % self.fragment
return retstr
def url(self, val):
def host(self):
The host of the request. ie `www.google.com`.
:getter: Returns the host of the request
:setter: Changes the host of the request and updates the Host header
:type: string
return self._host
def host(self, val):
self._host = val
self.headers.update('Host', val, do_callback=False)
def is_ssl(self):
Whether the request is sent over SSL
:getter: Returns if the request is sent over SSL
:setter: Sets if the request is sent over SSL
:type: Bool
return self._is_ssl
def is_ssl(self, val):
if val:
self._is_ssl = True
if self.port == 80:
self.port = 443
self._is_ssl = False
if self.port == 443:
self.port = 80
9 years ago
def saved(self):
If the request is saved in the data file
:getter: Returns True if the request is saved in the data file
:type: Bool
if self.reqid is None:
return False
_ = int(self.reqid)
return True
except (ValueError, TypeError):
return False
def path_tuple(self):
    """
    The path in tuple form starting with the host. For example, path_parts for
    a request to http://www.example.com/foo/bar.php would be::

      ('www.example.com', 'foo', 'bar.php')

    :getter: Returns the path in tuple form
    :type: Tuple
    """
    segments = [self.host]
    # Splitting gives a blank first element because the path always starts
    # with /, so that one is skipped
    segments.extend(self.path.split('/')[1:])
    # A trailing / leaves one blank element at the end
    if segments[-1] == '':
        del segments[-1]
    return tuple(segments)
def _from_full_request(self, full_request, update_content_length=False):
9 years ago
# Get rid of leading CRLF. Not in spec, should remove eventually
# technically doesn't treat \r\n same as \n, but whatever.
full_request = _strip_leading_newlines(full_request)
if full_request == '':
9 years ago
remaining = full_request
while remaining and not self.headers_complete:
line, remaining = _consume_line(remaining)
9 years ago
if not self.headers_complete:
if not self.complete:
if update_content_length:
self.raw_data = remaining
9 years ago
9 years ago
## Internal update functions
9 years ago
def _set_dict_callbacks(self):
# Add callbacks to dicts
def _update_from_data(self):
9 years ago
# Updates metadata that's based off of data
self.headers.update('Content-Length', str(len(self.raw_data)), do_callback=False)
if 'content-type' in self.headers:
if self.headers['content-type'] == 'application/x-www-form-urlencoded':
self.post_params = repeatable_parse_qs(self.raw_data)
9 years ago
def _update_from_objects(self):
9 years ago
# Updates text values that depend on objects.
if self.cookies:
assignments = []
for ck, cv in self.cookies.all_pairs():
asn = '%s=%s' % (ck, cv)
header_val = '; '.join(assignments)
self.headers.update('Cookie', header_val, do_callback=False)
if self.post_params:
pairs = []
for k, v in self.post_params.all_pairs():
9 years ago
pairs.append('%s=%s' % (k, v))
self.raw_data = '&'.join(pairs)
def _update_from_text(self):
9 years ago
# Updates metadata that depends on header/status line values
self.cookies = RepeatableDict()
9 years ago
for k, v in self.headers.all_pairs():
self._handle_header(k, v)
## Data loading
def add_line(self, line):
Used for building a request from a Twisted protocol.
Add a line (for status line and headers). Lines must be added in order
and the first line must be the status line. The line should not contain
the trailing carriage return/newline. I do not suggest you use this for
:param line: The line to add
:type line: string
if self._first_line and line == '':
# Ignore leading newlines because fuck the spec
if self._first_line:
self._first_line = False
# Either header or newline (end of headers)
if line == '':
self.headers_complete = True
if self._data_length == 0:
self.complete = True
key, val = line.split(':', 1)
val = val.strip()
if self._handle_header(key, val):
self.headers.append(key, val, do_callback=False)
9 years ago
def add_data(self, data):
Used for building a request from a Twisted protocol.
Add data to the request.
I do not suggest that you use this function ever.
:param data: The data to add
:type data: string
9 years ago
# Add data (headers must be complete)
len_remaining = self._data_length - len(self._partial_data)
if len(data) >= len_remaining:
self._partial_data += data[:len_remaining]
self._raw_data = self._partial_data
self.complete = True
9 years ago
self._partial_data += data
## Data parsing
def _process_host(self, hostline):
    """
    Set the host and the port from a ``host`` or ``host:port`` string.

    Only a purely numeric part after the last ``:`` counts as a port, so
    bracketed IPv6 literals (``[::1]`` or ``[::1]:8080``) keep their colons.
    An explicit port of 443 also marks the request as SSL. Without an explicit
    port, the port falls back to 80 unless a port was already set.

    :param hostline: The host, optionally followed by ``:port``
    :type hostline: string
    :returns: True if the port was explicitly stated
    :rtype: Bool
    """
    host, has_colon, port_str = hostline.rpartition(':')
    if has_colon and port_str.isdigit():
        self._host = host
        self.port = int(port_str)
        if self.port == 443:
            self._is_ssl = True
        return True

    # No usable port in the string
    self._host = hostline
    if not self.port:
        self.port = 80
    return False
9 years ago
def _handle_statusline_uri(self, uri):
if not re.match('(?:^.+)://', uri):
uri = '//' + uri
parsed_path = urlparse.urlparse(uri)
netloc = parsed_path.netloc
port_given = False
if netloc:
port_given = self._process_host(netloc)
if re.match('^https://', uri) or self.port == 443:
self._is_ssl = True
if not port_given:
self.port = 443
if re.match('^http://', uri):
self._is_ssl = False
if not self.port:
if self.is_ssl:
self.port = 443
self.port = 80
reqpath = parsed_path.path
if parsed_path.path:
self.path = parsed_path.path
self.path = '/'
if parsed_path.query:
reqpath += '?'
reqpath += parsed_path.query
self.url_params = repeatable_parse_qs(parsed_path.query)
if parsed_path.fragment:
reqpath += '#'
reqpath += parsed_path.fragment
self.fragment = parsed_path.fragment
def _handle_statusline(self, status_line):
9 years ago
parts = status_line.split()
uri = None
if len(parts) == 3:
self.verb, uri, self.version = parts
elif len(parts) == 2:
self.verb, self.version = parts
raise Exception("Unexpected format of first line of request")
# Get path using urlparse
if uri is not None:
9 years ago
def _handle_header(self, key, val):
9 years ago
# We may have duplicate headers
stripped = False
if key.lower() == 'content-length':
self._data_length = int(val)
elif key.lower() == 'cookie':
# We still want the raw key/val for the cookies header
# because it's still a header
cookie_strs = val.split('; ')
# The only whitespace that matters is the space right after the
# semicolon. If actual implementations mess this up, we could
# probably strip whitespace around the key/value
for cookie_str in cookie_strs:
if '=' in cookie_str:
splitted = cookie_str.split('=',1)
assert(len(splitted) == 2)
(cookie_key, cookie_val) = splitted
cookie_key = cookie_str
cookie_val = ''
9 years ago
# we want to parse duplicate cookies
self.cookies.append(cookie_key, cookie_val, do_callback=False)
elif key.lower() == 'host':
elif key.lower() == 'connection':
#stripped = True
return (not stripped)
def _handle_data_end(self):
9 years ago
if 'content-type' in self.headers:
if self.headers['content-type'] == 'application/x-www-form-urlencoded':
self.post_params = repeatable_parse_qs(self.raw_data)
## Serializing
def to_json(self):
Return a JSON encoding of the request that can be used by
:func:`~pappyproxy.http.Request.from_json` to recreate the request.
The `full_request` portion is base64 encoded because json doesn't play
nice with binary blobs.
# We base64 encode the full response because json doesn't paly nice with
# binary blobs
data = {
'full_request': base64.b64encode(self.full_request),
'reqid': self.reqid,
if self.response:
data['response_id'] = self.response.rspid
data['response_id'] = None
if self.unmangled:
data['unmangled_id'] = self.unmangled.reqid
if self.time_start:
data['start'] = self.time_start.isoformat()
if self.time_end:
data['end'] = self.time_end.isoformat()
data['tags'] = self.tags
data['port'] = self.port
data['is_ssl'] = self.is_ssl
return json.dumps(data)
def from_json(self, json_string):
Update the metadata of the request to match data from
:param json_string: The JSON data to use
:type json_string: JSON data in a string
data = json.loads(json_string)
self.port = data['port']
self._is_ssl = data['is_ssl']
if 'tags' in data:
self.tags = data['tags']
self.tags = []
if data['reqid']:
self.reqid = data['reqid']
9 years ago
## Data store functions
9 years ago
def async_save(self):
Save/update the request in the data file. Returns a twisted deferred which
fires when the save is complete.
:rtype: twisted.internet.defer.Deferred
9 years ago
# Check for intyness
_ = int(self.reqid)
9 years ago
# If we have reqid, we're updating
yield dbpool.runInteraction(self._update)
assert(self.reqid is not None)
yield dbpool.runInteraction(self._update_tags)
except (ValueError, TypeError):
# Either no id or in-memory
9 years ago
yield dbpool.runInteraction(self._insert)
assert(self.reqid is not None)
yield dbpool.runInteraction(self._update_tags)
9 years ago
9 years ago
def save(self):
Save/update the request in the data file.
Saves the request, its unmangled version, the response, and the unmanbled response.
Cannot be called from inside an async function.
yield self.async_deep_save()
def async_deep_save(self):
Saves self, unmangled, response, and unmangled response. Returns a deferred
which fires after everything has been saved.
:rtype: twisted.internet.defer.Deferred
9 years ago
if self.response:
if self.response.unmangled:
yield self.response.unmangled.async_save()
yield self.response.async_save()
9 years ago
if self.unmangled:
yield self.unmangled.async_save()
yield self.async_save()
def _update_tags(self, txn):
# This should never be called on an unsaved or in-memory request
DELETE FROM tagged WHERE reqid=?;
tagids = []
tags_to_add = []
# Find ids that already exist
for tag in self.tags:
SELECT id, tag FROM tags WHERE tag=?;
result = txn.fetchall()
if len(result) == 0:
tagid = int(result[0][0])
# Add new tags
for tag in tags_to_add:
INSERT INTO tags (tag) VALUES (?);
# Tag our request
for tagid in tagids:
INSERT INTO tagged (reqid, tagid) VALUES (?, ?);
(int(self.reqid), tagid)
9 years ago
def _update(self, txn):
# If we don't have an reqid, we're creating a new reuqest row
setnames = ["full_request=?", "port=?"]
queryargs = [self.full_request, self.port]
9 years ago
if self.response:
assert(self.response.rspid is not None) # should be saved first
if self.unmangled:
assert(self.unmangled.reqid is not None) # should be saved first
if self.time_start:
if self.time_end:
if self.is_ssl:
9 years ago
if self.submitted:
UPDATE requests SET %s WHERE id=?;
""" % ','.join(setnames),
def _insert(self, txn):
# If we don't have an reqid, we're creating a new reuqest row
colnames = ["full_request", "port"]
colvals = [self.full_request, self.port]
9 years ago
if self.response:
assert(self.response.rspid is not None) # should be saved first
if self.unmangled:
assert(self.unmangled.reqid is not None) # should be saved first
if self.time_start:
if self.time_end:
if self.submitted:
if self.is_ssl:
9 years ago
INSERT INTO requests (%s) VALUES (%s);
""" % (','.join(colnames), ','.join(['?']*len(colvals))),
self.reqid = str(txn.lastrowid)
9 years ago
assert txn.lastrowid is not None
assert self.reqid is not None
9 years ago
def delete(self):
assert(self.reqid is not None)
yield dbpool.runQuery(
9 years ago
DELETE FROM requests WHERE id=?;
yield dbpool.runQuery(
DELETE FROM tagged WHERE reqid=?;
self.reqid = None
9 years ago
def deep_delete(self):
if self.unmangled:
yield self.unmangled.delete()
if self.response:
if self.response.unmangled:
yield self.response.unmangled.delete()
yield self.response.delete()
yield self.delete()
9 years ago
def _gen_sql_row(tablename=None):
    """
    Build the column list used to select a request row.

    :param tablename: If given, every column is prefixed with ``tablename.``
    :type tablename: string
    :returns: The comma separated column names
    :rtype: string
    """
    columns = (
        'full_request',
        'response_id',
        'id',
        'unmangled_id',
        'start_datetime',
        'end_datetime',
        'port',
        'is_ssl',
    )
    prefix = ''
    if tablename:
        prefix = '%s.' % tablename
    return ', '.join([prefix + column for column in columns])
def _from_sql_row(row):
req = Request(row[0])
if row[1]:
rsp = yield Response.load_response(str(row[1]))
req.response = rsp
if row[3]:
unmangled_req = yield Request.load_request(str(row[3]))
req.unmangled = unmangled_req
if row[4]:
req.time_start = datetime.datetime.strptime(row[4], "%Y-%m-%dT%H:%M:%S.%f")
if row[5]:
req.time_end = datetime.datetime.strptime(row[5], "%Y-%m-%dT%H:%M:%S.%f")
if row[6] is not None:
req.port = int(row[6])
if row[7] == 1:
req._is_ssl = True
req.reqid = str(row[2])
# tags
rows = yield dbpool.runQuery(
SELECT tg.tag
FROM tagged tgd, tags tg
WHERE tgd.tagid=tg.id AND tgd.reqid=?;
req.tags = []
for row in rows:
9 years ago
def load_all_requests():
Load all the requests in the data file and return them in a list.
Returns a deferred which calls back with the list of requests when complete.
:rtype: twisted.internet.defer.Deferred
reqs = []
reqs += list(pappyproxy.context.in_memory_requests)
rows = yield dbpool.runQuery(
FROM requests;
""" % Request._gen_sql_row(),
for row in rows:
req = yield Request._from_sql_row(row)
9 years ago
def load_requests_by_tag(tag):
Load all the requests in the data file with a given tag and return them in a list.
Returns a deferred which calls back with the list of requests when complete.
:rtype: twisted.internet.defer.Deferred
# tags
rows = yield dbpool.runQuery(
SELECT tgd.reqid
FROM tagged tgd, tags tg
WHERE tgd.tagid=tg.id AND tg.tag=?;
reqs = []
for row in rows:
req = Request.load_request(row[0])
def load_request(to_load, allow_special=True):
Load a request with the given request id and return it.
Returns a deferred which calls back with the request when complete.
:rtype: twisted.internet.defer.Deferred
9 years ago
if not allow_special:
except (ValueError, TypeError):
raise PappyException('Cannot load special id %s' % to_load)
ret_unmangled = False
rsp_unmangled = False
if to_load[0] == 'u':
ret_unmangled = True
loadid = to_load[1:]
elif to_load[0] == 's':
rsp_unmangled = True
loadid = to_load[1:]
loadid = to_load
def retreq(r):
if ret_unmangled:
if not r.unmangled:
raise PappyException("Request %s was not mangled"%r.reqid)
return r.unmangled
if rsp_unmangled:
if not r.response:
raise PappyException("Request %s does not have a response" % r.reqid)
if not r.response.unmangled:
raise PappyException("Response to request %s was not mangled" % r.reqid)
r.response = r.response.unmangled
return r
return r
for r in pappyproxy.context.in_memory_requests:
if r.reqid == to_load:
for r in pappyproxy.context.all_reqs:
if r.reqid == to_load:
for r in pappyproxy.context.active_requests:
if r.reqid == to_load:
if to_load[0] == 'm':
# An in-memory request should have been loaded in the previous loop
raise PappyException('In-memory request %s not found' % to_load)
9 years ago
rows = yield dbpool.runQuery(
9 years ago
FROM requests
WHERE id=?;
""" % Request._gen_sql_row(),
9 years ago
if len(rows) != 1:
raise PappyException("Request with id %s does not exist" % loadid)
req = yield Request._from_sql_row(rows[0])
req.reqid = to_load
9 years ago
def load_from_filters(filters):
# Not efficient in any way
# But it stays this way until we hit performance issues
rows = yield dbpool.runQuery(
SELECT %s FROM requests r1
9 years ago
LEFT JOIN requests r2 ON r1.id=r2.unmangled_id
WHERE r2.id is NULL;
""" % Request._gen_sql_row('r1'),
9 years ago
reqs = []
for row in rows:
req = yield Request._from_sql_row(row)
reqs += list(pappyproxy.context.in_memory_requests)
(reqs, _) = pappyproxy.context.filter_reqs(reqs, filters)
9 years ago
## Submitting Requests
def submit_new(host, port, is_ssl, full_request):
submit_new(host, port, is_ssl, full_request)
Submits a request with the given parameters and returns a request object
with the response.
:param host: The host to submit to
:type host: string
:param port: The port to submit to
:type port: Integer
:type is_ssl: Whether to use SSL
:param full_request: The request data to send
:type full_request: string
:rtype: Twisted deferred that calls back with a Request
new_obj = Request(full_request)
factory = pappyproxy.proxy.ProxyClientFactory(new_obj, save_all=False)
factory.connection_id = pappyproxy.proxy.get_next_connection_id()
if is_ssl:
reactor.connectSSL(host, port, factory, pappyproxy.proxy.ClientTLSContext())
reactor.connectTCP(host, port, factory)
new_req = yield factory.data_defer
def async_submit(self):
Same as :func:`~pappyproxy.http.Request.submit` but generates deferreds.
Submits the request using its host, port, etc. and updates its response value
to the resulting response.
:rtype: Twisted deferred
new_req = yield Request.submit_new(self.host, self.port, self.is_ssl,
self.response = new_req.response
self.time_start = new_req.time_start
self.time_end = new_req.time_end
def submit(self):
    """
    Submits the request using its host, port, etc. and updates its response value
    to the resulting response.
    Cannot be called in async functions.
    This is what you should use to submit your requests in macros.
    """
    # submit_new takes (host, port, is_ssl, full_request); the request text
    # is the fourth argument.
    new_req = yield Request.submit_new(self.host, self.port, self.is_ssl,
                                       self.full_request)
    # Copy the result of the submitted request back onto this object.
    self.response = new_req.response
    self.time_start = new_req.time_start
    self.time_end = new_req.time_end
9 years ago
class Response(object):
:ivar complete: When creating the response with :func:`~pappyproxy.http.Response.add_line`
and :func:`~pappyproxy.http.Response.add_data`, returns whether
the response is complete.
:vartype complete: Bool
:ivar cookies: Cookies set by the response
:vartype cookies: RepeatableDict of ResponseCookie objects
:ivar headers: The headers of the response
:vartype headers: RepeatableDict
:ivar headers_complete: When creating the response with
:func:`~pappyproxy.http.Response.add_line` and
:func:`~pappyproxy.http.Response.add_data`, returns whether the headers
are complete
:vartype headers_complete: Bool
:ivar response_code: The response code of the response
:vartype response_code: Integer
:ivar response_text: The text associated with the response code (ie OK, NOT FOUND, etc)
:vartype response_text: String
:ivar rspid: If the response is saved in the data file, the id of the response
:vartype rspid: String
:ivar unmangled: If the response was mangled, the unmangled version of the response
:vartype unmangled: Response
:ivar version: The version part of the status line (ie HTTP/1.1)
:vartype version: String
9 years ago
def __init__(self, full_response=None, update_content_length=False):
    """
    Create an empty response, optionally populated from a full response string.

    :param full_response: If not None, handed to ``_from_full_response`` to fill in the object
    :type full_response: string
    :param update_content_length: Passed through to ``_from_full_response``
    :type update_content_length: Bool
    """
    # Publicly visible state
    self.complete = False
    self.cookies = RepeatableDict()
    self.headers = RepeatableDict(case_insensitive=True)
    self.headers_complete = False
    self._raw_data = ''
    self.response_code = 0
    self.response_text = ''
    self.rspid = None
    self.unmangled = None
    self.version = ''

    # Bookkeeping used while the response is being built
    self._encoding_type = ENCODE_NONE
    self._first_line = True
    self._data_obj = None
    self._end_after_headers = False

    if full_response is None:
        return
    self._from_full_response(full_response, update_content_length)
9 years ago
def __copy__(self):
    """
    Build a new Response from this one's full text, copying the unmangled
    version as well when there is one.

    :raises PappyException: if the response is not complete
    """
    if not self.complete:
        raise PappyException("Cannot copy incomplete responses")
    duplicate = Response(self.full_response)
    original = self.unmangled
    if original:
        duplicate.unmangled = original.copy()
    return duplicate
def copy(self):
    """Return a copy of the response (delegates to ``__copy__``)."""
    duplicate = self.__copy__()
    return duplicate
def __eq__(self, other):
if self.full_response != other.full_response:
return False
return True
9 years ago
def raw_headers(self):
    """
    The raw text of the headers including the extra newline at the end.

    :getter: Returns the status line, one ``Name: value`` line per header pair and a closing blank line, all CRLF terminated.
    :type: string
    """
    pieces = [self.status_line + '\r\n']
    for name, value in self.headers.all_pairs():
        pieces.append("%s: %s\r\n" % (name, value))
    # Blank line that terminates the header section
    pieces.append('\r\n')
    return ''.join(pieces)
def status_line(self):
    """
    The status line of the response. ie `HTTP/1.1 200 OK`

    :getter: Returns the status line of the response, or an empty string when no part of it has been set
    :setter: Sets the status line of the response
    :type: string
    """
    # Only a response with nothing set at all has an empty status line. The
    # previous check tested ``self.version`` twice and never looked at the
    # response text.
    if not self.version and self.response_code == 0 and not self.response_text:
        return ''
    return '%s %d %s' % (self.version, self.response_code, self.response_text)
9 years ago
def status_line(self, val):
# Setter counterpart of the status_line getter; `val` is the new status line.
# NOTE(review): the body of this setter is not present in this view of the
# file -- confirm against the full source.
9 years ago
def raw_data(self):
    """
    The data portion of the response

    :getter: Returns the data portion of the response
    :setter: Set the data of the response and update metadata
    :type: string
    """
    body = self._raw_data
    return body
def raw_data(self, val):
# Setter for the response body: stores `val`, replaces the body tracker with
# a LengthData sized to the new data, resets the encoding type to ENCODE_NONE
# and marks the response complete.
self._raw_data = val
self._data_obj = LengthData(len(val))
if len(val) > 0:
9 years ago
# NOTE(review): the statement guarded by the length check above is missing
# from this view of the file.
self._encoding_type = ENCODE_NONE
self.complete = True
9 years ago
def full_response(self):
    """
    The full text of the response including the headers and data.
    Response is automatically converted from compressed/chunked into an
    uncompressed response with a Content-Length header.

    :getter: Returns the full text of the response, or an empty string when there is no status line
    :type: string
    """
    if self.status_line:
        return self.raw_headers + self.raw_data
    return ''
def soup(self):
    """
    Returns a beautifulsoup4 object for parsing the html of the response

    :getter: Returns a BeautifulSoup object built from the response data with the ``lxml`` parser
    """
    markup = self.raw_data
    return bs4.BeautifulSoup(markup, 'lxml')
def _from_full_response(self, full_response, update_content_length=False):
# Populates the response from one string holding the whole response.
# Leading newlines are stripped first, then lines are consumed one at a time
# with _consume_line until the headers are complete. When
# update_content_length is set, the remaining text is assigned to
# self.raw_data.
# NOTE(review): several statements (the bodies of most branches below) are
# missing from this view of the file -- confirm against the full source.
9 years ago
# Get rid of leading CRLF. Not in spec, should remove eventually
full_response = _strip_leading_newlines(full_response)
if full_response == '':
9 years ago
remaining = full_response
while remaining and not self.headers_complete:
line, remaining = _consume_line(remaining)
9 years ago
9 years ago
if not self.headers_complete:
if update_content_length:
self.raw_data = remaining
9 years ago
if not self.complete:
## Internal update functions
def _set_dict_callbacks(self):
# NOTE(review): only the comment below is visible in this view; the
# statements that register the callbacks are not shown here.
# Add callbacks to dicts
9 years ago
def _update_from_data(self):
self.headers.update('Content-Length', str(len(self.raw_data)), do_callback=False)
9 years ago
def _update_from_objects(self):
    """
    Rebuild ``self.headers`` so its Set-Cookie headers match ``self.cookies``.

    Every other header is copied over in its original order. The cookies are
    written where the first Set-Cookie header used to be, or at the end when
    there was none. Later Set-Cookie headers are dropped.
    """
    # Keep the replacement dict case insensitive, like the one created in
    # __init__, so header lookups behave the same after the rebuild.
    new_headers = RepeatableDict(case_insensitive=True)
    cookies_added = False
    for name, value in self.headers.all_pairs():
        if name.lower() == 'set-cookie':
            # If we haven't added our cookies, add them all. Otherwise
            # strip the header (do nothing)
            if not cookies_added:
                for _, cookie in self.cookies.all_pairs():
                    new_headers.append('Set-Cookie', cookie.cookie_str)
                cookies_added = True
        else:
            new_headers.append(name, value)

    if not cookies_added:
        # No Set-Cookie header existed, so add all our cookies to the end
        for _, cookie in self.cookies.all_pairs():
            new_headers.append('Set-Cookie', cookie.cookie_str)

    self.headers = new_headers
def _update_from_text(self):
    """Rebuild ``self.cookies`` from the Set-Cookie headers currently in ``self.headers``."""
    self.cookies = RepeatableDict()
    for name, value in self.headers.all_pairs():
        if name.lower() != 'set-cookie':
            continue
        # Parse the cookie and file it under its own key
        parsed = ResponseCookie(value)
        self.cookies.append(parsed.key, parsed, do_callback=False)
## Data parsing
def _handle_statusline(self, status_line):
9 years ago
self._first_line = False
self.version, self.response_code, self.response_text = \
status_line.split(' ', 2)
self.response_code = int(self.response_code)
if self.response_code == 304 or self.response_code == 204 or \
self.response_code/100 == 1:
self._end_after_headers = True
def _handle_header(self, key, val):
    """
    Process one parsed header and store it unless it has to be stripped.

    Content-Encoding selects the encoding type and is stripped when it names
    gzip or deflate. ``Transfer-Encoding: chunked`` installs a ChunkedData
    object and is stripped. Content-Length installs a LengthData object and
    Set-Cookie is also recorded in ``self.cookies``.

    :param key: The header name
    :type key: string
    :param val: The header value
    :type val: string
    :returns: True if the header was appended to ``self.headers``, False if it was stripped
    :rtype: Bool
    """
    header = key.lower()
    stripped = False
    if header == 'content-encoding':
        # Content codings are case insensitive. Compare for equality: the
        # old ``val in ('deflate')`` was a substring test against a string.
        coding = val.lower()
        if coding in ('gzip', 'x-gzip'):
            self._encoding_type = ENCODE_GZIP
        elif coding == 'deflate':
            self._encoding_type = ENCODE_DEFLATE

        # We send our requests already decoded, so we don't want a header
        # saying it's encoded
        if self._encoding_type != ENCODE_NONE:
            stripped = True
    elif header == 'transfer-encoding' and val.lower() == 'chunked':
        self._data_obj = ChunkedData()
        self.complete = self._data_obj.complete
        stripped = True
    elif header == 'content-length':
        # We use our own content length
        self._data_obj = LengthData(int(val))
    elif header == 'set-cookie':
        cookie = ResponseCookie(val)
        self.cookies.append(cookie.key, cookie, do_callback=False)

    if stripped:
        return False
    self.headers.append(key, val, do_callback=False)
    return True
## Data loading
def add_line(self, line):
# NOTE(review): some statements (the bodies of several branches below) are
# missing from this view of the file -- confirm against the full source.
Used for building a response from a Twisted protocol.
Add a line (for status line and headers). Lines must be added in order
and the first line must be the status line. The line should not contain
the trailing carriage return/newline. I do not suggest you use this for
:param line: The line to add
:type line: string
# Lines may only be added while the header section is still open.
assert(not self.headers_complete)
if not line and self._first_line:
# An empty line ends the header section.
if not line:
self.headers_complete = True
9 years ago
if self._end_after_headers:
self.complete = True
9 years ago
# No header installed a body tracker, so fall back to a zero-length body.
if not self._data_obj:
self._data_obj = LengthData(0)
self.complete = self._data_obj.complete
9 years ago
if self._first_line:
self._first_line = False
# Header lines are "Name: value"; split on the first colon only.
key, val = line.split(':', 1)
val = val.strip()
self._handle_header(key, val)
def add_data(self, data):
# NOTE(review): some statements are missing from this view of the file
# (nothing visible passes `data` on, and the _decode_encoded call is cut
# short) -- confirm against the full source.
Used for building a response from a Twisted protocol.
Add data to the response. The data must conform to the content encoding
and transfer encoding given in the headers passed in to
:func:`~pappyproxy.http.Response.add_line`. Can be any fragment of the data.
I do not suggest that you use this function ever.
:param data: The data to add
:type data: string
9 years ago
# Data may only be added while the body is still incomplete.
assert(not self._data_obj.complete)
assert not self.complete
# Once the body tracker reports completion, decode what it collected and
# mark the response complete.
if self._data_obj.complete:
self._raw_data = _decode_encoded(self._data_obj.raw_data,
9 years ago
self.complete = True
## Cookie management
9 years ago
def add_cookie(self, cookie):
    """
    Add a :class:`pappyproxy.http.ResponseCookie` to the response.

    .. warning::
        This will add duplicate cookies. If you want to add a cookie you're not sure exists,
        use :func:`~pappyproxy.http.Response.set_cookie`
    """
    name = cookie.key
    self.cookies.append(name, cookie)
def set_cookie(self, cookie):
    """
    Set a cookie in the response. ``cookie`` must be a :class:`pappyproxy.http.ResponseCookie`
    """
    name = cookie.key
    self.cookies[name] = cookie
def set_cookie_kv(self, key, val):
    """
    Set a cookie by key and value. Will not have path, secure, etc set at all.
    """
    fresh = ResponseCookie()
    fresh.key, fresh.val = key, val
    self.cookies[fresh.key] = fresh
def delete_cookie(self, key):
    """
    Delete a cookie from the response by its key
    """
    jar = self.cookies
    del jar[key]
## Serializing
9 years ago
def to_json(self):
    """
    Return a JSON encoding of the response that can be used by
    :func:`~pappyproxy.http.Response.from_json` to recreate the response.
    The ``full_response`` portion is base64 encoded because json doesn't play
    nice with binary blobs.
    """
    data = {}
    data['rspid'] = self.rspid
    data['full_response'] = base64.b64encode(self.full_response)
    # Only mangled responses carry a reference to their unmangled version
    if self.unmangled:
        data['unmangled_id'] = self.unmangled.rspid
    return json.dumps(data)
9 years ago
def from_json(self, json_string):
    """
    Update the metadata of the response from JSON data. Only ``rspid`` is
    read, and it is stored as a string when it is set.

    :param json_string: The JSON data to use
    :type json_string: JSON data in a string
    """
    parsed = json.loads(json_string)
    rspid = parsed['rspid']
    if rspid:
        self.rspid = str(rspid)
## Database interaction
9 years ago
def async_save(self):
    """
    Save/update just the response in the data file. Returns a twisted deferred which
    fires when the save is complete. It is suggested that you use
    :func:`~pappyproxy.http.Request.async_deep_save` instead to save responses.

    :rtype: twisted.internet.defer.Deferred
    """
    # Check for intyness. Only the id check is guarded, so a ValueError or
    # TypeError raised by the update itself is no longer mistaken for
    # "no id yet" and turned into an insert.
    try:
        int(self.rspid)
    except (ValueError, TypeError):
        has_id = False
    else:
        has_id = True

    if has_id:
        # If we have rspid, we're updating
        yield dbpool.runInteraction(self._update)
    else:
        yield dbpool.runInteraction(self._insert)
    assert(self.rspid is not None)
# Right now responses without requests are unviewable
# @crochet.wait_for(timeout=180.0)
# @defer.inlineCallbacks
# def save(self):
# yield self.save()
9 years ago
def _update(self, txn):
# Database interaction that async_save hands to dbpool.runInteraction when
# the response already has an id; `txn` is the transaction it is given.
# It issues an UPDATE on the `responses` row with this id. `setnames` holds
# the "column=?" fragments joined into the SET clause and `queryargs` holds
# the values bound to them.
# NOTE(review): the statements under the unmangled check and the execute call
# around the SQL text are missing from this view -- confirm against the full
# source.
setnames = ["full_response=?"]
queryargs = [self.full_response]
if self.unmangled:
assert(self.unmangled.rspid is not None) # should be saved first
UPDATE responses SET %s WHERE id=?;
""" % ','.join(setnames),
assert(self.rspid is not None)
def _insert(self, txn):
# Database interaction that async_save hands to dbpool.runInteraction when
# the response has no usable id; `txn` is the transaction it is given.
# It issues an INSERT into `responses` with one "?" placeholder per entry in
# `colvals`, then takes the new id from txn.lastrowid.
# NOTE(review): the statements under the unmangled check and the execute call
# around the SQL text are missing from this view -- confirm against the full
# source.
# If we don't have an rspid, we're creating a new one
colnames = ["full_response"]
colvals = [self.full_response]
if self.unmangled is not None:
assert(self.unmangled.rspid is not None) # should be saved first
INSERT INTO responses (%s) VALUES (%s);
""" % (','.join(colnames), ','.join(['?']*len(colvals))),
self.rspid = txn.lastrowid
assert(self.rspid is not None)
9 years ago
def delete(self):
# Removes this response's row from the `responses` table and then clears
# rspid. The response must already have an id.
# NOTE(review): the query string delimiters and the bound arguments are
# missing from this view -- confirm against the full source.
assert(self.rspid is not None)
row = yield dbpool.runQuery(
DELETE FROM responses WHERE id=?;
self.rspid = None
9 years ago
def load_response(respid):
Load a response from its response id. Returns a deferred. I don't suggest you use this.
:rtype: twisted.internet.defer.Deferred
9 years ago
rows = yield dbpool.runQuery(
SELECT full_response, id, unmangled_id
FROM responses
WHERE id=?;
if len(rows) != 1:
raise PappyException("Response with request id %s does not exist" % respid)
9 years ago
full_response = rows[0][0]
resp = Response(full_response)
resp.rspid = str(rows[0][1])
9 years ago
if rows[0][2]:
unmangled_response = yield Response.load_response(int(rows[0][2]))
resp.unmangled = unmangled_response