#! /usr/bin/python

import leveldb
import os
import re
import ujson
import urllib

# each of these pairs of strings corresponds to a collection mode as
# described in Section 4. each pair of names contains two synonyms
# which could in principle be consolidated into a single name
collection_modes = [('mode_1', 'cache_off_fresh_image'),
                    ('mode_2', 'cache_on_fresh_image'),
                    ('mode_3', 'cache_on_persistent_image'),
                    ('mode_4', 'cache_on_persistent_image_all_sites')]

# list of all websites tested.
sites_tested = ['aclu', 'bank_of_america', 'kaiser', 'legal_zoom',
                'mayo_clinic', 'netflix', 'planned_parenthood',
                'vanguard', 'wellsfargo', 'youtube']

# list of all defenses tested. note that these generally parallel
# the "no_defense" directory, but are listed separately here since
# we run more tests for the special "no defense" defense.
defense_dirs = ['exponential_defense', 'linear_defense',
                'macro_pad_ambi_max_1.03_range_1',
                'macro_pad_ambi_max_1.05_range_1',
                'macro_pad_ambi_max_1.10_range_1',
                'weak_fragmentation_defense']

# directory corresponding to not applying a defense to the traffic
no_defense_dir = 'no_defense'


# this function checks the contents of the pcaps directory to verify
# that all data meets expectations (as described below).
def check_pcaps(pcap_dir):
    """Validate the layout and contents of the raw pcap directory.

    pcap_dir -- path to the directory containing one subdirectory per
                (collection mode, site) collection job.

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % pcap_dir)
    # build list of collection jobs. notice that there should be one
    # collection job for each (collection mode, site) pair. note that
    # for mode 4 ("all sites") the collection job actually included
    # traffic from each of the sites, where browsing sessions each
    # containing 75 URLs at a single site were executed in sequence in
    # a single VM, and we have separated the resulting pcaps back into
    # separate directories here. This was done for the purpose of
    # maintaining compatibility with other parts of the codebase.
    collection_jobs = set()
    for _, cm in collection_modes:
        for s in sites_tested:
            collection_jobs.add('%s_%s' % (cm, s))
    # check files
    jobs = set(os.listdir(pcap_dir))
    assert collection_jobs == jobs
    for cj in collection_jobs:
        print('\tchecking: %s' % cj)
        files = set(os.listdir(os.path.join(pcap_dir, cj)))
        # this file contains a list of lists, where each list contains
        # 75 URLs which should be visited during a browsing session
        assert 'browse.json' in files
        # this file contains a list of tuples (one per line) where each
        # tuple specifies the start and end of a range (end not inclusive)
        # of browsing sessions which represent contiguous paths through
        # the website. for the browsing sessions outside these ranges,
        # there will be at least one transition between URLs which is
        # outside of the website link structure. this occurs because link
        # structures can make some pages much easier to reach than others,
        # which would mean collection of 16 samples for each page with
        # strict adherence to the link graph during all of collection would
        # require collecting significant redundant samples of pages which are
        # easier to reach.
        assert 'path_ranges' in files
        # this file is a firefox config file which was placed in each VM
        # for the corresponding collection job. note that this file is
        # first augmented as indicated in each of the subdirectories
        # containing pcap files.
        assert 'user.js' in files
        # this file exists to aid our collection infrastructure and
        # configure VM behavior
        assert 'vm_general_config' in files
        # each of session_dirs contains the pcaps from a single, 75 URL
        # browsing session
        session_dirs = filter(lambda x: re.match('\d{4}', x), files)
        # make sure no session_dirs are missing
        assert sorted(map(int, session_dirs)) == range(max(map(int, session_dirs)) + 1)
        # make sure there are no extra files (the 4 named files above,
        # plus one directory per session, plus browse.json counted once)
        assert len(files) == max(map(int, session_dirs)) + 5
        # load browse.json and make sure that the URLs visited in each
        # browsing session correspond to our expectation
        with open(os.path.join(pcap_dir, cj, 'browse.json')) as handle:
            browse_jobs = ujson.load(handle)
        # make sure all browsing jobs have been executed
        assert len(browse_jobs) == len(session_dirs)
        # check the contents of each session directory
        for sd in session_dirs:
            files = set(os.listdir(os.path.join(pcap_dir, cj, sd)))
            # there should be 75 pcap files and 4 additional files
            # note that tcpdump was configured to capture traffic
            # on all protocols and ports, but only the first 128 bytes
            # of each packet since this is enough to provide header
            # information and the contents are encrypted anyways.
            assert files == set(['%.4d.pcap' % d for d in range(75)] +
                                ['labels.idx', 'user.js', 'vm_config', 'key.idx'])
            # load the browser config file which corresponded to this
            # browsing session, make sure that the browsing jobs
            # it contains match our expectations in browse.json
            with open(os.path.join(pcap_dir, cj, sd, 'user.js')) as handle:
                user_js = handle.read()
            browse_job = ujson.loads(
                re.match('.*user_pref\(\'greasemonkey.scriptvals.TAAttack/BrowserDriver.path\', \'(.*)\'\);',
                         user_js, flags = re.DOTALL).group(1))
            assert browse_job == browse_jobs[int(sd)]
            # verify contents of key.idx. there is one entry in
            # key.idx for each sample in the browsing session. the
            # first field is the pcap name, the second field is the
            # URL in the browser address bar when the requested page
            # finished loading, and the third field is the URL requested.
            # note that due to redirection, scripts, etc. the requested
            # URL and the final URL may not be the same.
            with open(os.path.join(pcap_dir, cj, sd, 'key.idx')) as handle:
                browse_job = [re.match('\S+?\s+\S+\s+(.*)', x).group(1)
                              for x in handle.read().strip().split('\n')]
            assert browse_job == map(urllib.unquote, map(str, browse_jobs[int(sd)]))
    print('done')


# check that the contents of the features directory is as expected.
# note that this requires the pcaps directory since one of the expectations
# is that there will be a set of feature files corresponding to each of
# the pcap files. note also that "feature" as used here is a bit of a
# misnomer; the contents of this directory would be better understood as
# the result of preprocessing steps which happen prior to actual feature
# extraction.
def check_features(pcaps, features):
    """Validate the preprocessed "feature" outputs against the pcaps.

    pcaps    -- path to the raw pcap directory (defines what must exist)
    features -- path to the directory of preprocessing outputs

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % features)
    # note that each of these pre-processing "feature" files are generated
    # by FeatureExtractor.py. Each item in the list below corresponds
    # to a directory which should exist for each collection job. Inside
    # the directory, there should be a file corresponding to each pcap
    # collected during the collection job. Due to the large volume of
    # small files, this approach requires many inodes and can be slow.
    # consequently, we have replaced many of the "directories" with LevelDB
    # databases. LevelDB is a key-value store, where the keys correspond to
    # file names and values correspond to file contents in our context.
    # the content of each directory can be summarized as follows
    preprocessing_features = [# these files contain the (outgoing, incoming, domain) tuples
                              # for bursts on each TCP connection.
                              'burst_pair_files',
                              # these files contain the sizes of contiguous traffic bursts
                              # seen independent of TCP connection. note that these burst
                              # sizes may not be the same as seen in burst_pair_files due
                              # to TCP stream interweaving.
                              'burst_stream_files',
                              # these files contain counts of packet sizes
                              'size_count_files',
                              # these files contain a sequence of the raw packet sizes seen
                              'raw_sequence_files',
                              # these files contain a sequence of the raw packet sizes
                              # rounded to the nearest 600 bytes.
                              'rounded_sequence_files']
    expected_files = {}
    for st in sites_tested:
        for mode_num, mode_name in collection_modes:
            print('\tchecking: %s_%s' % (mode_num, st))
            # figure out the file names we expect to see based on the
            # pcaps directory
            full_name = '%s_%s' % (mode_name, st)
            expected_files[full_name] = set()
            session_dirs = filter(lambda x: re.match('\d{4}', x),
                                  os.listdir(os.path.join(pcaps, full_name)))
            for sd in session_dirs:
                for sample in ['%.4d' % x for x in range(75)]:
                    expected_files[full_name].add('%s_%s_%s.dat' % (full_name, sd, sample))
            for d in defense_dirs + [no_defense_dir]:
                files = set(os.listdir(os.path.join(features, d, '%s_%s' % (mode_num, st))))
                # for each defense, make sure we see the groups of
                # preprocessing feature files which we expect
                if d == no_defense_dir:
                    # note that "no_defense" should have all files in the file
                    # system as well as in LevelDB databases
                    assert files == set(['defense_stats', 'stats'] +
                                        preprocessing_features +
                                        ['%s.ldb' % x for x in preprocessing_features])
                else:
                    assert files == set(['defense_stats', 'stats'] +
                                        ['%s.ldb' % x for x in preprocessing_features])
                for pf in preprocessing_features:
                    # for each of the sets of preprocessing features, make sure
                    # that we have an entry corresponding to each pcap
                    if d == no_defense_dir:
                        found_files = set(os.listdir(os.path.join(features, d, '%s_%s' % (mode_num, st), pf)))
                        assert found_files == expected_files[full_name]
                    ldb_path = os.path.join(features, d, '%s_%s' % (mode_num, st), '%s.ldb' % pf)
                    assert os.path.isdir(ldb_path)
                    ldb = leveldb.LevelDB(ldb_path)
                    found_files = set()
                    for k in ldb.RangeIter(include_value = False):
                        found_files.add(k)
                    assert found_files == expected_files[full_name]
    print('done')


# check the contents of the sitemaps directory, which has been
# cleaned up to only include the defense config files for each site.
# these files specify the padding buckets for the Burst defense.
# in this function, we simply make sure that padding has been
# appropriately applied and that all extracted burst values are
# one of the bucket thresholds.
def check_sitemaps(sitemaps, features):
    """Validate that Burst-defense padding was applied correctly.

    sitemaps -- path to the directory of per-site defense config files
    features -- path to the directory of preprocessing outputs

    Raises AssertionError if any padded burst value is not one of the
    configured bucket thresholds.
    """
    print('checking: %s' % sitemaps)
    pad_levels = ['1.03', '1.05', '1.10']
    for st in sites_tested:
        for mode_num, mode_name in collection_modes:
            for pl in pad_levels:
                print('\tchecking: %s_%s:%s' % (mode_num, st, pl))
                # bucket thresholds start at the 4th line of each config
                # file (the first 3 lines are presumably a header --
                # confirm against the generator of these files)
                with open(os.path.join(sitemaps, st, 'defense_config', mode_num,
                                       'max_%s_range_1.out' % pl)) as handle:
                    out_levels = [int(x) for x in handle.read().strip().split('\n')[3:]]
                with open(os.path.join(sitemaps, st, 'defense_config', mode_num,
                                       'max_%s_range_1.in' % pl)) as handle:
                    in_levels = [int(x) for x in handle.read().strip().split('\n')[3:]]
                # note that we compare to burst_pair_files, not burst_stream_files.
                # burst padding is applied to bursts over a single TCP connection,
                # but bursts in burst_stream_files are extracted independent of
                # TCP connection (which means that stream interweaving can cause
                # bursts extracted independent of TCP connection to be fragmented).
                ldb_path = os.path.join(features,
                                        'macro_pad_ambi_max_%s_range_1' % pl,
                                        '%s_%s' % (mode_num, st),
                                        'burst_pair_files.ldb')
                ldb = leveldb.LevelDB(ldb_path)
                for k, v in ldb.RangeIter():
                    # skip empty entries
                    if v.strip().split('\n') == ['']:
                        continue
                    for line in v.strip().split('\n'):
                        (out_val, in_val) = [int(x) for x in line.split()[:2]]
                        assert out_val in out_levels
                        assert in_val in in_levels


# check the folds directory. note that the correctness of the folds
# directory depends on the pcaps directory, so we need the path to
# that directory as well.
# the folds directory describes how to divide the collected data to
# conduct a multi-fold evaluation; namely which samples should be
# used for training, testing and evaluation (holdout) in each fold
# of the evaluation. the testing data exists for the purpose of
# doing parameter search by training models with varying parameters
# on the training data and evaluating the models on the testing data.
# once the best performing parameters are identified, we train a
# model using both the training and testing data, and then evaluate
# the model using the holdout data. In this way, we produce a true
# evaluation of each technique since the evaluation data plays zero
# role in training, model or parameter selection.
# note that within each collection job, there are 5 fold directories:
# fold_00, fold_01, fold_02, traintest and holdout. fold_00, fold_01
# and fold_02 divide the data from a collection job such that a 3-fold
# evaluation can be conducted using the data from that job alone (i.e.
# the training and evaluation data are drawn from the same collection
# mode). traintest divides the data in a collection job such that
# the entire job can serve as training and testing data (with the
# assumption that evaluation (holdout) data will come from a different
# collection mode). holdout describes how to use all of the data from
# a given collection job as evaluation data (i.e. no data is left for
# training or testing).
# within each fold directory, there may be the following files/directories:
# holdout_paths, label_list, sample_key_holdout.dat, sample_key_test.dat
# sample_key_train.dat, trans_matrix.
# holdout_paths: this directory contains files very similar to labels.idx
#     that indicate which sessions in the holdout data for this fold
#     represent contiguous paths through the site graph
# label_list: this file contains a list of all labels (i.e. pages) seen
#     in the training and testing data for this fold. note that due to
#     redirections which occur during data collection, it is possible for
#     some pages to have few samples and hence appear only in holdout
#     data. we make no effort to "correct" this as it is a real challenge
#     an attacker could experience.
# sample_key_holdout.dat: identifies the samples corresponding to each
#     label which should be included in the holdout data for this fold
# sample_key_test.dat: identifies the samples corresponding to each
#     label which should be included in the testing data for this fold
# sample_key_train.dat: identifies the samples corresponding to each label
#     which should be included in the training data for this fold
# trans_matrix: contains a square matrix representing the link graph of
#     the website. the rows (and correspondingly columns) are labeled
#     by their corresponding line in label_list.
# the exact files contained in each fold directory may vary according to
# the purpose of the fold directory. for example, the 'holdout' directory
# does not have a sample_key_train.dat or sample_key_test.dat file since
# the 'holdout' fold is designed to be used in evaluations as a fold which
# uses all of the samples from a given collection mode.
def check_folds(pcaps, folds):
    """Validate the fold definitions against the collected pcap data.

    pcaps -- path to the raw pcap directory (supplies path_ranges and
             labels.idx for each collection job)
    folds -- path to the directory of per-collection-job fold definitions

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % folds)
    for st in sites_tested:
        for mode_num, mode_name in collection_modes:
            print('\tchecking: %s_%s' % (mode_num, st))
            # check that all expected files are here
            for fold in ['fold_00', 'fold_01', 'fold_02']:
                files = set(os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st), fold)))
                assert files == set(['holdout_paths', 'label_list',
                                     'sample_key_holdout.dat', 'sample_key_test.dat',
                                     'sample_key_train.dat', 'trans_matrix'])
            files = set(os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st), 'holdout')))
            assert files == set(['holdout_paths', 'sample_key_holdout.dat'])
            files = set(os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st), 'traintest')))
            assert files == set(['label_list', 'sample_key_test.dat',
                                 'sample_key_train.dat', 'trans_matrix'])
            # load transition matrix in the form of a dictionary where keys
            # correspond to pages, and values correspond to sets of pages
            # reachable from the corresponding key page
            with open(os.path.join(folds, '%s_%s' % (mode_num, st),
                                   'traintest', 'trans_matrix')) as handle:
                trans_matrix_lines = handle.read().strip().split('\n')
            with open(os.path.join(folds, '%s_%s' % (mode_num, st),
                                   'traintest', 'label_list')) as handle:
                label_list = handle.read().strip().split('\n')
            assert len(label_list) == len(trans_matrix_lines)
            trans_matrix = {}
            for label, neighbors in zip(label_list, trans_matrix_lines):
                neighbors = neighbors.split()
                # the matrix must be square
                assert len(neighbors) == len(label_list)
                # a '1' in column j of the row for page i marks a link
                # from page i to page j
                trans_matrix[label] = set(lbl for flag, lbl in zip(neighbors, label_list)
                                          if flag == '1')
            # identify all browsing sessions which should adhere to links
            # according to path_ranges. then, iterate through each browsing
            # session to make sure the sequence of pages actually adheres to
            # links.
            with open(os.path.join(pcaps, '%s_%s' % (mode_name, st),
                                   'path_ranges')) as handle:
                ranges = [map(int, x.split()) for x in handle.read().strip().split('\n')]
            for start, end in ranges:
                for idx in range(start, end):
                    with open(os.path.join(pcaps, '%s_%s' % (mode_name, st),
                                           '%.4d' % idx, 'labels.idx')) as handle:
                        sequence = [re.match('\S+\s+(.*)', x).group(1)
                                    for x in handle.read().strip().split('\n')]
                    for leave, land in zip(sequence, sequence[1:]):
                        assert land in trans_matrix[leave]
            # check holdout_paths (reproduction of path_ranges) in folds directory
            for fold in ['fold_00', 'fold_01', 'fold_02', 'holdout']:
                for session in os.listdir(os.path.join(folds, '%s_%s' % (mode_num, st),
                                                       fold, 'holdout_paths')):
                    with open(os.path.join(folds, '%s_%s' % (mode_num, st),
                                           fold, 'holdout_paths', session)) as handle:
                        sequence = [re.match('\S+\s+(.*)', x).group(1)
                                    for x in handle.read().strip().split('\n')]
                    for leave, land in zip(sequence, sequence[1:]):
                        assert land in trans_matrix[leave]
    print('done')


# this function checks the contents of the "results" directory to
# verify that all expected evaluation results are present. note that
# the format of a test is as follows: A_mode_B_C_mode_D_E, where A,
# B, C, D and E are defined as follows:
# A = {02, 04, 08, 16} and indicates the number of samples per page
#     included in the training data
# B = {1,2,3,4} and indicates the collection mode (see "collection_modes"
#     above) of the training data
# C = {fold_00, fold_01, fold_02, traintest, holdout} indicates the
#     fold which the training data is taken from
# D = {1,2,3,4} and indicates the collection mode of the evaluation
#     data
# E = {fold_00, fold_01, fold_02, traintest, holdout} and indicates
#     the fold which the evaluation data is taken from
def check_results(result_dir):
    """Validate that every expected attack/defense evaluation was run.

    result_dir -- path to the directory of evaluation results, organized
                  as defense / attack / site / test-name.

    Raises AssertionError as soon as any expectation is violated.
    """
    print('\nchecking: %s' % result_dir)
    # note that for each defense, we perform only a single test for
    # each (attack, website) pair. this provides 10 datapoints
    # indicating how each attack will perform on each defense. we
    # perform only a single test because the test uses all available
    # training data in order to get best results, and the cost of
    # using randomized methods (e.g. jackknifing) to develop statistical
    # confidence is prohibitively large. the results of these tests are
    # responsible for figure 7A.
    defense_tests = ['16_mode_2_traintest_mode_4_holdout']
    # for the "no_defense" defense (i.e. how the attacks perform when
    # no defense is applied) we conduct more tests. we present the
    # purpose of each test below.
    no_defense_tests = [# these tests are used to produce figure 5a
                        '02_mode_2_traintest_mode_4_holdout',
                        '04_mode_2_traintest_mode_4_holdout',
                        '08_mode_2_traintest_mode_4_holdout',
                        '16_mode_2_traintest_mode_4_holdout',
                        # these tests are used to produce figure 4.
                        # note that we only use 8 samples of training data
                        # because for some tests training and evaluation data
                        # must be taken from the same collection mode.
                        # note also that we evaluate using a "fold_X" fold
                        # rather than the "holdout" fold which includes all
                        # data so that we can use the same data when evaluating
                        # how models perform on a given collection mode
                        # without ever including training data in the
                        # evaluation data.
                        '08_mode_1_fold_00_mode_1_fold_00',
                        '08_mode_1_fold_01_mode_1_fold_01',
                        '08_mode_1_fold_02_mode_1_fold_02',
                        '08_mode_2_fold_00_mode_2_fold_00',
                        '08_mode_2_fold_01_mode_2_fold_01',
                        '08_mode_2_fold_02_mode_2_fold_02',
                        '08_mode_2_fold_00_mode_3_fold_00',
                        '08_mode_2_fold_02_mode_3_fold_02',
                        '08_mode_2_fold_01_mode_3_fold_01',
                        '08_mode_2_fold_02_mode_4_fold_02',
                        '08_mode_2_fold_00_mode_4_fold_00',
                        '08_mode_2_fold_01_mode_4_fold_01',
                        '08_mode_3_fold_01_mode_2_fold_01',
                        '08_mode_3_fold_00_mode_2_fold_00',
                        '08_mode_3_fold_02_mode_2_fold_02',
                        '08_mode_3_fold_02_mode_3_fold_02',
                        '08_mode_3_fold_01_mode_3_fold_01',
                        '08_mode_3_fold_00_mode_3_fold_00',
                        '08_mode_3_fold_01_mode_4_fold_01',
                        '08_mode_3_fold_00_mode_4_fold_00',
                        '08_mode_3_fold_02_mode_4_fold_02']
    # this dictionary defines the list of files generated by each
    # attack implementation. note that in the case of bog, some
    # of the "files" are directories and "lr_accuracy" presents the
    # accuracy of logistic regression alone, whereas "viterbi_lr_accuracy"
    # presents the accuracy of the entire attack including the HMM.
    attack_files = {'ll': ['gnb_accuracy', 'holdout_features', 'training_features'],
                    'dlsvm': ['raw_sequence_files_accuracy'],
                    'pan': ['custom_kernel_svm_accuracy', 'holdout_features',
                            'libsvm_svm_model.clarge.gsmall', 'training_features'],
                    'bog': ['domain_models', 'holdout_features', 'holdout_points',
                            'lr_accuracy', 'model', 'predict', 'testing_points',
                            'training_features', 'training_points',
                            'viterbi_lr_accuracy']}
    attack_implementations = ['ll', 'pan', 'bog', 'dlsvm']
    # check files
    defenses = set(os.listdir(result_dir))
    # make sure that we have results for each defense
    assert defenses == set(defense_dirs + [no_defense_dir])
    for d in defenses:
        print('\tchecking: %s' % d)
        attacks = set(os.listdir(os.path.join(result_dir, d)))
        # make sure we have results for each attack
        assert attacks == set(attack_implementations)
        for a in attacks:
            sites = set(os.listdir(os.path.join(result_dir, d, a)))
            # make sure we have results for each site
            assert sites == set(sites_tested)
            for s in sites:
                # make sure all proper tests have been run
                tests = set(os.listdir(os.path.join(result_dir, d, a, s)))
                if d == no_defense_dir:
                    assert tests == set(no_defense_tests)
                else:
                    assert d in defense_dirs
                    assert tests == set(defense_tests)
                # make sure all test output files are present
                for t in tests:
                    files = set(os.listdir(os.path.join(result_dir, d, a, s, t)))
                    assert files == set(attack_files[a])
    print('done')


def main():
    """Run every consistency check against the expected directory layout."""
    check_pcaps('pcaps')
    check_features('pcaps', 'features')
    check_sitemaps('sitemaps', 'features')
    check_folds('pcaps', 'folds')
    check_results('results')


if __name__ == '__main__':
    main()