#! /usr/bin/python

import leveldb
import os
import re
import ujson
import urllib

# each of these pairs of strings corresponds to a collection mode as
# described in Section 4. each pair of names contains two synonyms
# which could in principle be consolidated into a single name
collection_modes = [('mode_1', 'cache_off_fresh_image'),
                    ('mode_2', 'cache_on_fresh_image'),
                    ('mode_3', 'cache_on_persistent_image'),
                    ('mode_4', 'cache_on_persistent_image_all_sites')]

# list of all websites tested.
sites_tested = ['aclu', 'bank_of_america', 'kaiser', 'legal_zoom',
                'mayo_clinic', 'netflix', 'planned_parenthood',
                'vanguard', 'wellsfargo', 'youtube']

# list of all defenses tested. note that these generally parallel
# the "no_defense" directory, but are listed separately here since
# we run more tests for the special "no defense" defense.
defense_dirs = ['exponential_defense', 'linear_defense',
                'macro_pad_ambi_max_1.03_range_1',
                'macro_pad_ambi_max_1.05_range_1',
                'macro_pad_ambi_max_1.10_range_1',
                'weak_fragmentation_defense']

# directory corresponding to not applying a defense to the traffic
no_defense_dir = 'no_defense'


# this function checks the contents of the pcaps directory to verify
# that all data meets expectations (as described below).
def check_pcaps(pcap_dir):
    """Validate the layout and contents of the raw pcap directory.

    pcap_dir -- path to the directory containing one subdirectory per
                (collection mode, site) collection job.

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % pcap_dir)
    # build list of collection jobs. notice that there should be one
    # collection job for each (collection mode, site) pair. note that
    # for mode 4 ("all sites") the collection job actually included
    # traffic from each of the sites, where browsing sessions each
    # containing 75 URLs at a single site were executed in sequence in
    # a single VM, and we have separated the resulting pcaps back into
    # separate directories here. This was done for the purpose of
    # maintaining compatibility with other parts of the codebase.
    collection_jobs = set()
    for _, cm in collection_modes:
        for s in sites_tested:
            collection_jobs.add('%s_%s' % (cm, s))
    # check files
    jobs = set(os.listdir(pcap_dir))
    assert collection_jobs == jobs
    for cj in collection_jobs:
        print('\tchecking: %s' % cj)
        files = set(os.listdir(os.path.join(pcap_dir, cj)))
        # this file contains a list of lists, where each list contains
        # 75 URLs which should be visited during a browsing session
        assert 'browse.json' in files
        # this file contains a list of tuples (one per line) where each
        # tuple specifies the start and end of a range (end not inclusive)
        # of browsing sessions which represent contiguous paths through
        # the website. for the browsing sessions outside these ranges,
        # there will be at least one transition between URLs which is
        # outside of the website link structure. this occurs because link
        # structures can make some pages much easier to reach than others,
        # which would mean collection of 16 samples for each page with
        # strict adherence to the link graph during all of collection would
        # require collecting significant redundant samples of pages which are
        # easier to reach.
        assert 'path_ranges' in files
        # this file is a firefox config file which was placed in each VM
        # for the corresponding collection job. note that this file is
        # first augmented as indicated in each of the subdirectories
        # containing pcap files.
        assert 'user.js' in files
        # this file exists to aid our collection infrastructure and
        # configure VM behavior
        assert 'vm_general_config' in files
        # each of session_dirs contains the pcaps from a single, 75 URL
        # browsing session
        session_dirs = filter(lambda x: re.match('\d{4}', x), files)
        # make sure no session_dirs are missing
        assert sorted(map(int, session_dirs)) == range(max(map(int, session_dirs)) + 1)
        # make sure there are no extra files (the 4 named files above,
        # plus one directory per session, plus browse.json counted once)
        assert len(files) == max(map(int, session_dirs)) + 5
        # load browse.json and make sure that the URLs visited in each
        # browsing session correspond to our expectation
        with open(os.path.join(pcap_dir, cj, 'browse.json')) as handle:
            browse_jobs = ujson.load(handle)
        # make sure all browsing jobs have been executed
        assert len(browse_jobs) == len(session_dirs)
        # check the contents of each session directory
        for sd in session_dirs:
            files = set(os.listdir(os.path.join(pcap_dir, cj, sd)))
            # there should be 75 pcap files and 4 additional files
            # note that tcpdump was configured to capture traffic
            # on all protocols and ports, but only the first 128 bytes
            # of each packet since this is enough to provide header
            # information and the contents are encrypted anyways.
            assert files == set(['%.4d.pcap' % d for d in range(75)] +
                                ['labels.idx', 'user.js', 'vm_config', 'key.idx'])
            # load the browser config file which corresponded to this
            # browsing session, make sure that the browsing jobs
            # it contains match our expectations in browse.json
            with open(os.path.join(pcap_dir, cj, sd, 'user.js')) as handle:
                user_js = handle.read()
            browse_job = ujson.loads(
                re.match('.*user_pref\(\'greasemonkey.scriptvals.TAAttack/BrowserDriver.path\', \'(.*)\'\);',
                         user_js, flags = re.DOTALL).group(1))
            assert browse_job == browse_jobs[int(sd)]
            # verify contents of key.idx. there is one entry in
            # key.idx for each sample in the browsing session. the
            # first field is the pcap name, the second field is the
            # URL in the browser address bar when the requested page
            # finished loading, and the third field is the URL requested.
            # note that due to redirection, scripts, etc. the requested
            # URL and the final URL may not be the same.
            with open(os.path.join(pcap_dir, cj, sd, 'key.idx')) as handle:
                browse_job = [re.match('\S+?\s+\S+\s+(.*)', x).group(1)
                              for x in handle.read().strip().split('\n')]
            assert browse_job == map(urllib.unquote, map(str, browse_jobs[int(sd)]))
    print('done')


# check that the contents of the features directory is as expected.
# note that this requires the pcaps directory since one of the expectations
# is that there will be a set of feature files corresponding to each of
# the pcap files. note also that "feature" as used here is a bit of a
# misnomer; the contents of this directory would be better understood as
# the result of preprocessing steps which happen prior to actual feature
# extraction.
def check_features(pcaps, features):
    """Validate the preprocessed "feature" outputs against the pcaps.

    pcaps    -- path to the raw pcap directory (defines what must exist)
    features -- path to the directory of preprocessing outputs

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % features)
    # note that each of these pre-processing "feature" files are generated
    # by FeatureExtractor.py. Each item in the list below corresponds
    # to a directory which should exist for each collection job. Inside
    # the directory, there should be a file corresponding to each pcap
    # collected during the collection job. Due to the large volume of
    # small files, this approach requires many inodes and can be slow.
    # consequently, we have replaced many of the "directories" with LevelDB
    # databases. LevelDB is a key-value store, where the keys correspond to
    # file names and values correspond to file contents in our context.
    # the content of each directory can be summarized as follows
    preprocessing_features = [# these files contain the (outgoing, incoming, domain) tuples
                              # for bursts on each TCP connection.
                              'burst_pair_files',
                              # these files contain the sizes of contiguous traffic bursts
                              # seen independent of TCP connection. note that these burst
                              # sizes may not be the same as seen in burst_pair_files due
                              # to TCP stream interweaving.
                              'burst_stream_files',
                              # these files contain counts of packet sizes
                              'size_count_files',
                              # these files contain a sequence of the raw packet sizes seen
                              'raw_sequence_files',
                              # these files contain a sequence of the raw packet sizes
                              # rounded to the nearest 600 bytes.
                              'rounded_sequence_files']
    expected_files = {}
    for st in sites_tested:
        for mode_num, mode_name in collection_modes:
            print('\tchecking: %s_%s' % (mode_num, st))
            # figure out the file names we expect to see based on the
            # pcaps directory
            full_name = '%s_%s' % (mode_name, st)
            expected_files[full_name] = set()
            session_dirs = filter(lambda x: re.match('\d{4}', x),
                                  os.listdir(os.path.join(pcaps, full_name)))
            for sd in session_dirs:
                for sample in ['%.4d' % x for x in range(75)]:
                    expected_files[full_name].add('%s_%s_%s.dat' % (full_name, sd, sample))
            for d in defense_dirs + [no_defense_dir]:
                files = set(os.listdir(os.path.join(features, d, '%s_%s' % (mode_num, st))))
                # for each defense, make sure we see the groups of
                # preprocessing feature files which we expect
                if d == no_defense_dir:
                    # note that "no_defense" should have all files in the file
                    # system as well as in LevelDB databases
                    assert files == set(['defense_stats', 'stats'] +
                                        preprocessing_features +
                                        ['%s.ldb' % x for x in preprocessing_features])
                else:
                    assert files == set(['defense_stats', 'stats'] +
                                        ['%s.ldb' % x for x in preprocessing_features])
                for pf in preprocessing_features:
                    # for each of the sets of preprocessing features, make sure
                    # that we have an entry corresponding to each pcap
                    if d == no_defense_dir:
                        found_files = set(os.listdir(os.path.join(features, d, '%s_%s' % (mode_num, st), pf)))
                        assert found_files == expected_files[full_name]
                    ldb_path = os.path.join(features, d, '%s_%s' % (mode_num, st), '%s.ldb' % pf)
                    assert os.path.isdir(ldb_path)
                    ldb = leveldb.LevelDB(ldb_path)
                    found_files = set()
                    for k in ldb.RangeIter(include_value = False):
                        found_files.add(k)
                    assert found_files == expected_files[full_name]
    print('done')


# check the contents of the sitemaps directory, which has been
# cleaned up to only include the defense config files for each site.
# these files specify the padding buckets for the Burst defense.
# in this function, we simply make sure that padding has been
# appropriately applied and that all extracted burst values are
# one of the bucket thresholds.
def check_sitemaps(sitemaps, features):
    """Validate that Burst-defense padding was applied correctly.

    sitemaps -- path to the directory of per-site defense config files
    features -- path to the directory of preprocessing outputs

    Raises AssertionError if any padded burst value is not one of the
    configured bucket thresholds.
    """
    print('checking: %s' % sitemaps)
    pad_levels = ['1.03', '1.05', '1.10']
    for st in sites_tested:
        for mode_num, mode_name in collection_modes:
            for pl in pad_levels:
                print('\tchecking: %s_%s:%s' % (mode_num, st, pl))
                # bucket thresholds start at the 4th line of each config
                # file (the first 3 lines are presumably a header --
                # confirm against the generator of these files)
                with open(os.path.join(sitemaps, st, 'defense_config', mode_num,
                                       'max_%s_range_1.out' % pl)) as handle:
                    out_levels = [int(x) for x in handle.read().strip().split('\n')[3:]]
                with open(os.path.join(sitemaps, st, 'defense_config', mode_num,
                                       'max_%s_range_1.in' % pl)) as handle:
                    in_levels = [int(x) for x in handle.read().strip().split('\n')[3:]]
                # note that we compare to burst_pair_files, not burst_stream_files.
                # burst padding is applied to bursts over a single TCP connection,
                # but bursts in burst_stream_files are extracted independent of
                # TCP connection (which means that stream interweaving can cause
                # bursts extracted independent of TCP connection to be fragmented).
                ldb_path = os.path.join(features,
                                        'macro_pad_ambi_max_%s_range_1' % pl,
                                        '%s_%s' % (mode_num, st),
                                        'burst_pair_files.ldb')
                ldb = leveldb.LevelDB(ldb_path)
                for k, v in ldb.RangeIter():
                    # skip empty entries
                    if v.strip().split('\n') == ['']:
                        continue
                    for line in v.strip().split('\n'):
                        (out_val, in_val) = [int(x) for x in line.split()[:2]]
                        assert out_val in out_levels
                        assert in_val in in_levels


# check the folds directory. note that the correctness of the folds
# directory depends on the pcaps directory, so we need the path to
# that directory as well.
# the folds directory describes how to divide the collected data to
# conduct a multi-fold evaluation; namely which samples should be
# used for training, testing and evaluation (holdout) in each fold
# of the evaluation. the testing data exists for the purpose of
# doing parameter search by training models with varying parameters
# on the training data and evaluating the models on the testing data.
# once the best performing parameters are identified, we train a
# model using both the training and testing data, and then evaluate
# the model using the holdout data. In this way, we produce a true
# evaluation of each technique since the evaluation data plays zero
# role in training, model or parameter selection.
# note that within each collection job, there are 5 fold directories:
# fold_00, fold_01, fold_02, traintest and holdout. fold_00, fold_01
# and fold_02 divide the data from a collection job such that a 3-fold
# evaluation can be conducted using the data from that job alone (i.e.
# the training and evaluation data are drawn from the same collection
# mode). traintest divides the data in a collection job such that
# the entire job can serve as training and testing data (with the
# assumption that evaluation (holdout) data will come from a different
# collection mode). holdout describes how to use all of the data from
# a given collection job as evaluation data (i.e. no data is left for
# training or testing).
# within each fold directory, there may be the following files/directories:
# holdout_paths, label_list, sample_key_holdout.dat, sample_key_test.dat
# sample_key_train.dat, trans_matrix.
# holdout_paths: this directory contains files very similar to labels.idx
#     that indicate which sessions in the holdout data for this fold
#     represent contiguous paths through the site graph
# label_list: this file contains a list of all labels (i.e. pages) seen
#     in the training and testing data for this fold. note that due to
#     redirections which occur during data collection, it is possible for
#     some pages to have few samples and hence appear only in holdout
#     data. we make no effort to "correct" this as it is a real challenge
#     an attacker could experience.
# sample_key_holdout.dat: identifies the samples corresponding to each
#     label which should be included in the holdout data for this fold
# sample_key_test.dat: identifies the samples corresponding to each
#     label which should be included in the testing data for this fold
# sample_key_train.dat: identifies the samples corresponding to each label
#     which should be included in the training data for this fold
# trans_matrix: contains a square matrix representing the link graph of
#     the website. the rows (and correspondingly columns) are labeled
#     by their corresponding line in label_list.
# the exact files contained in each fold directory may vary according to
# the purpose of the fold directory. for example, the 'holdout' directory
# does not have a sample_key_train.dat or sample_key_test.dat file since
# the 'holdout' fold is designed to be used in evaluations as a fold which
# uses all of the samples from a given collection mode.
def check_folds(pcaps, folds):
    """Validate the fold definitions against the collected pcap data.

    pcaps -- path to the raw pcap directory (supplies path_ranges and
             labels.idx for each collection job)
    folds -- path to the directory of per-collection-job fold definitions

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % folds)
    for st in sites_tested:
        for mode_num, mode_name in collection_modes:
            print('\tchecking: %s_%s' % (mode_num, st))
            # check that all expected files are here
            for fold in ['fold_00', 'fold_01', 'fold_02']:
                files = set(os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st), fold)))
                assert files == set(['holdout_paths', 'label_list',
                                     'sample_key_holdout.dat', 'sample_key_test.dat',
                                     'sample_key_train.dat', 'trans_matrix'])
            files = set(os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st), 'holdout')))
            assert files == set(['holdout_paths', 'sample_key_holdout.dat'])
            files = set(os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st), 'traintest')))
            assert files == set(['label_list', 'sample_key_test.dat',
                                 'sample_key_train.dat', 'trans_matrix'])
            # load transition matrix in the form of a dictionary where keys
            # correspond to pages, and values correspond to sets of pages
            # reachable from the corresponding key page
            with open(os.path.join(folds, '%s_%s' % (mode_num, st),
                                   'traintest', 'trans_matrix')) as handle:
                trans_matrix_lines = handle.read().strip().split('\n')
            with open(os.path.join(folds, '%s_%s' % (mode_num, st),
                                   'traintest', 'label_list')) as handle:
                label_list = handle.read().strip().split('\n')
            assert len(label_list) == len(trans_matrix_lines)
            trans_matrix = {}
            for label, neighbors in zip(label_list, trans_matrix_lines):
                neighbors = neighbors.split()
                # the matrix must be square
                assert len(neighbors) == len(label_list)
                # a '1' in column j of the row for page i marks a link
                # from page i to page j
                trans_matrix[label] = set(lbl for flag, lbl in zip(neighbors, label_list)
                                          if flag == '1')
            # identify all browsing sessions which should adhere to links
            # according to path_ranges. then, iterate through each browsing
            # session to make sure the sequence of pages actually adheres to
            # links.
            with open(os.path.join(pcaps, '%s_%s' % (mode_name, st),
                                   'path_ranges')) as handle:
                ranges = [map(int, x.split()) for x in handle.read().strip().split('\n')]
            for start, end in ranges:
                for idx in range(start, end):
                    with open(os.path.join(pcaps, '%s_%s' % (mode_name, st),
                                           '%.4d' % idx, 'labels.idx')) as handle:
                        sequence = [re.match('\S+\s+(.*)', x).group(1)
                                    for x in handle.read().strip().split('\n')]
                    for leave, land in zip(sequence, sequence[1:]):
                        assert land in trans_matrix[leave]
            # check holdout_paths (reproduction of path_ranges) in folds directory
            for fold in ['fold_00', 'fold_01', 'fold_02', 'holdout']:
                for session in os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st),
                                                       fold, 'holdout_paths')):
                    with open(os.path.join(folds, '%s_%s' % (mode_num, st),
                                           fold, 'holdout_paths', session)) as handle:
                        sequence = [re.match('\S+\s+(.*)', x).group(1)
                                    for x in handle.read().strip().split('\n')]
                    for leave, land in zip(sequence, sequence[1:]):
                        assert land in trans_matrix[leave]
    print('done')


# this function checks the contents of the "results" directory to
# verify that all expected evaluation results are present. note that
# the format of a test is as follows: A_mode_B_C_mode_D_E, where A,
# B, C, D and E are defined as follows:
# A = {02, 04, 08, 16} and indicates the number of samples per page
#     included in the training data
# B = {1,2,3,4} and indicates the collection mode (see "collection_modes"
#     above) of the training data
# C = {fold_00, fold_01, fold_02, traintest, holdout} indicates the
#     fold which the training data is taken from
# D = {1,2,3,4} and indicates the collection mode of the evaluation
#     data
# E = {fold_00, fold_01, fold_02, traintest, holdout} and indicates
#     the fold which the evaluation data is taken from
def check_results(result_dir):
    """Validate that every expected attack/defense evaluation was run.

    result_dir -- path to the directory of evaluation results, organized
                  as defense / attack / site / test-name.

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % result_dir)
    # note that for each defense, we perform only a single test for
    # each (attack, website) pair. this provides 10 datapoints
    # indicating how each attack will perform on each defense. we
    # perform only a single test because the test uses all available
    # training data in order to get best results, and the cost of
    # using randomized methods (e.g. jackknifing) to develop statistical
    # confidence is prohibitively large. the results of these tests are
    # responsible for figure 7A.
    defense_tests = ['16_mode_2_traintest_mode_4_holdout']
    # for the "no_defense" defense (i.e. how the attacks perform when
    # no defense is applied) we conduct more tests. we present the
    # purpose of each test below.
    no_defense_tests = [# these tests are used to produce figure 5a
                        '02_mode_2_traintest_mode_4_holdout',
                        '04_mode_2_traintest_mode_4_holdout',
                        '08_mode_2_traintest_mode_4_holdout',
                        '16_mode_2_traintest_mode_4_holdout',
                        # these tests are used to produce figure 4.
                        # note that we only use 8 samples of training data
                        # because for some tests training and evaluation data
                        # must be taken from the same collection mode.
                        # note also that we evaluate using a "fold_X" fold
                        # rather than the "holdout" fold which includes all
                        # data so that we can use the same data when evaluating
                        # how models perform on a given collection mode
                        # without ever including training data in the
                        # evaluation data.
                        '08_mode_1_fold_00_mode_1_fold_00',
                        '08_mode_1_fold_01_mode_1_fold_01',
                        '08_mode_1_fold_02_mode_1_fold_02',
                        '08_mode_2_fold_00_mode_2_fold_00',
                        '08_mode_2_fold_01_mode_2_fold_01',
                        '08_mode_2_fold_02_mode_2_fold_02',
                        '08_mode_2_fold_00_mode_3_fold_00',
                        '08_mode_2_fold_02_mode_3_fold_02',
                        '08_mode_2_fold_01_mode_3_fold_01',
                        '08_mode_2_fold_02_mode_4_fold_02',
                        '08_mode_2_fold_00_mode_4_fold_00',
                        '08_mode_2_fold_01_mode_4_fold_01',
                        '08_mode_3_fold_01_mode_2_fold_01',
                        '08_mode_3_fold_00_mode_2_fold_00',
                        '08_mode_3_fold_02_mode_2_fold_02',
                        '08_mode_3_fold_02_mode_3_fold_02',
                        '08_mode_3_fold_01_mode_3_fold_01',
                        '08_mode_3_fold_00_mode_3_fold_00',
                        '08_mode_3_fold_01_mode_4_fold_01',
                        '08_mode_3_fold_00_mode_4_fold_00',
                        '08_mode_3_fold_02_mode_4_fold_02']
    # this dictionary defines the list of files generated by each
    # attack implementation. note that in the case of bog, some
    # of the "files" are directories and "lr_accuracy" presents the
    # accuracy of logistic regression alone, whereas "viterbi_lr_accuracy"
    # presents the accuracy of the entire attack including the HMM.
    attack_files = {'ll': ['gnb_accuracy', 'holdout_features', 'training_features'],
                    'dlsvm': ['raw_sequence_files_accuracy'],
                    'pan': ['custom_kernel_svm_accuracy', 'holdout_features',
                            'libsvm_svm_model.clarge.gsmall', 'training_features'],
                    'bog': ['domain_models', 'holdout_features', 'holdout_points',
                            'lr_accuracy', 'model', 'predict', 'testing_points',
                            'training_features', 'training_points',
                            'viterbi_lr_accuracy']}
    attack_implementations = ['ll', 'pan', 'bog', 'dlsvm']
    # check files
    defenses = set(os.listdir(result_dir))
    # make sure that we have results for each defense
    assert defenses == set(defense_dirs + [no_defense_dir])
    for d in defenses:
        print('\tchecking: %s' % d)
        attacks = set(os.listdir(os.path.join(result_dir, d)))
        # make sure we have results for each attack
        assert attacks == set(attack_implementations)
        for a in attacks:
            sites = set(os.listdir(os.path.join(result_dir, d, a)))
            # make sure we have results for each site
            assert sites == set(sites_tested)
            for s in sites:
                # make sure all proper tests have been run
                tests = set(os.listdir(os.path.join(result_dir, d, a, s)))
                if d == no_defense_dir:
                    assert tests == set(no_defense_tests)
                else:
                    assert d in defense_dirs
                    assert tests == set(defense_tests)
                # make sure all test output files are present
                for t in tests:
                    files = set(os.listdir(os.path.join(result_dir, d, a, s, t)))
                    assert files == set(attack_files[a])
    print('done')


def main():
    """Run every consistency check against the expected directory layout."""
    check_pcaps('pcaps')
    check_features('pcaps', 'features')
    check_sitemaps('sitemaps', 'features')
    check_folds('pcaps', 'folds')
    check_results('results')


if __name__ == '__main__':
    main()