This module contains all the code for running our experiments for Tango. To reproduce our results, please run each of the cells in this notebook.
import pickle
import random
import time
from pathlib import Path

import cv2
from tqdm import tqdm

# Repo-specific helpers (VideoDataset, SIFTExtractor, SimCLRModel, evaluate, etc.)
# are imported from the Tango package.

random.seed(42)
path = Path("/tf/data")

Data Setup

Description:

  • Number of Participants: 14 (10 Ph.D. Students/Developers and 4 authors)
  • Number of Applications: 6
  • Number of Bug Reports per Application: 10
fps = 30  # frame rate of the recorded videos
ds_user = 'user'
# Load the user-recorded bug report videos and derive labels from their file paths
vid_user_ds = VideoDataset.from_path(path/"artifacts/videos", fr = fps).label_from_paths()
vid_user_ds.get_labels()  # inspect the derived labels

Run Visual-Based Models

Model Setup

Configurations:

  • Number of Visual Words: 1,000, 5,000, 10,000
  • Number of image samples for building the codebook: at most ~50,000
  • Number of frames kept: 1, 5
  • Model + Bag of Visual Words
  • Model + Fuzzy LCS
  • Model + LCS
  • Model + Bag of Visual Words + Fuzzy LCS
  • Model + Bag of Visual Words + Weighted LCS
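
The Bag of Visual Words (BoVW) variants above quantize per-frame descriptors against a pre-trained codebook and compare videos by their visual-word histograms, while the (fuzzy/weighted) LCS variants compare the ordered frame sequences. The sketch below illustrates only the BoVW half; it is an illustration under our own assumptions (a KMeans codebook and cosine similarity), not the repo's gen_bovw_similarity.

import numpy as np
from sklearn.cluster import KMeans

def build_codebook(sampled_descriptors, n_visual_words=1_000):
    """Cluster a sample of frame descriptors into visual words (the codebook)."""
    return KMeans(n_clusters=n_visual_words, random_state=42).fit(sampled_descriptors)

def bovw_histogram(video_descriptors, codebook):
    """Quantize one video's descriptors and return its normalized visual-word histogram."""
    words = codebook.predict(video_descriptors)
    hist = np.bincount(words, minlength=codebook.n_clusters).astype(float)
    return hist / (hist.sum() + 1e-9)

def bovw_similarity(hist_a, hist_b):
    """Cosine similarity between two BoVW histograms."""
    return float(hist_a @ hist_b / (np.linalg.norm(hist_a) * np.linalg.norm(hist_b) + 1e-9))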

SIFT - M00

model_00 = 'SIFT'
M00 = SIFTExtractor(cv2.xfeatures2d.SIFT_create(nfeatures = 10)) # limit SIFT features to top 10
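
For reference, extracting SIFT descriptors from a single frame with OpenCV looks roughly like the sketch below (the helper name is ours for illustration; SIFTExtractor handles the per-video iteration, and newer OpenCV builds expose the same detector as cv2.SIFT_create).

def sift_frame_descriptors(frame_bgr, n_features=10):
    # Convert to grayscale and detect up to n_features SIFT keypoints/descriptors
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    sift = cv2.xfeatures2d.SIFT_create(nfeatures=n_features)
    keypoints, descriptors = sift.detectAndCompute(gray, None)
    return descriptors  # None when no keypoints are found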

SimCLR - M01

model_01 = 'SimCLR'
simclr = SimCLRModel.load_from_checkpoint(checkpoint_path = str(path/'artifacts/models/SimCLR/checkpointepoch=98.ckpt')).eval()
M01 = SimCLRExtractor(simclr)

Change these parameters if you only want to run a subset of our experiments.

vwords = [1_000, 5_000, 10_000]
n_imgs = 15_000
n_frames_to_keep = [1, 5]
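
For example, to run only the 1,000-visual-word, 5-frames-kept configuration (a much shorter run), narrow the lists before calling generate_rankings:

# Example: run a single configuration instead of the full grid
vwords = [1_000]
n_frames_to_keep = [5]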
def generate_rankings(
    path, vid_ds, ds_name, model_name, model, sim_func, vwords, n_imgs,
    n_frames_to_keep, fps
):

    for vw in tqdm(vwords):
        for ftk in tqdm(n_frames_to_keep):
            evaluation_metrics = {}
            # Load the pre-trained visual-word codebook for this configuration
            fname = path/f'artifacts/models/{model_name}/cookbook_{model_name}_{vw}vw.model'
            with open(fname, 'rb') as f:
                codebook = pickle.load(f)
            # Extract per-frame visual features and time the extraction
            start = time.time()
            vid_ds_features = gen_extracted_features(vid_ds, model, fps, ftk)
            feature_gen_time = time.time() - start

            # Compute BoVW and (weighted) LCS similarities between all pairs of videos
            df, bovw_vid_ds_sims = gen_bovw_similarity(vid_ds, vid_ds_features, model, codebook, vw, ftk)
            lcs_vid_ds_sims = gen_lcs_similarity(vid_ds, vid_ds_features, sim_func, model, codebook, df, vw, ftk)
            
            rankings = approach(
                vid_ds, vid_ds_features, bovw_vid_ds_sims, lcs_vid_ds_sims, model, sim_func,
                codebook, df, vw, fps = fps, ftk = ftk,
            )
            
            # Evaluate each ranking variant produced by the approach
            for variant in ('bovw', 'lcs', 'weighted_lcs', 'bovw_lcs', 'bovw_weighted_lcs'):
                evaluation_metrics[variant] = evaluate(rankings[variant])
            
            id_name = f'{ds_name}_{n_imgs}n_{vw}vw_{ftk}ftk'
            with open(path/f'outputs/results/{model_name}/rankings_{id_name}.pkl', 'wb') as f:
                pickle.dump(rankings, f, protocol=pickle.HIGHEST_PROTOCOL)

            with open(path/f'outputs/results/{model_name}/evaluation_metrics_{id_name}.pkl', 'wb') as f:
                pickle.dump(evaluation_metrics, f, protocol=pickle.HIGHEST_PROTOCOL)

User Data

The SIFT (M00) model takes a significant amount of time (>24 hours) to run on our machine.

generate_rankings(
    path, vid_user_ds, ds_user, model_00, M00, sift_frame_sim, vwords, n_imgs,
    n_frames_to_keep, fps
)

The SimCLR (M01) model is much faster than SIFT (~6 hours) on our machines.

generate_rankings(
    path, vid_user_ds, ds_user, model_01, M01, simclr_frame_sim, vwords, n_imgs,
    n_frames_to_keep, fps
)

Reviewing Results

# Aggregate metrics stored alongside the per-bug entries; skip them when iterating bugs
SUMMARY_KEYS = {
    'elapsed_time', 'Bug Hit@1', 'Bug Hit@5', 'Bug Hit@10',
    'App std rank', 'App mean rank', 'App median rank', 'App mRR',
    'App mAP', 'App Hit@1', 'App Hit@5', 'App Hit@10',
}

def get_eval_results(evals, app, item):
    """Print `item` (e.g. 'rank') for every query video of every bug in `app`."""
    for bug in evals[app]:
        if bug in SUMMARY_KEYS:
            continue
        for vid in evals[app][bug]:
            try:
                print(evals[app][bug][vid][item])
            except (KeyError, TypeError):
                continue
id_name = f'{ds_user}_15000n_1000vw_5ftk'
fname = path/f'outputs/results/{model_01}/evaluation_metrics_{id_name}.pkl'
with open(fname, 'rb') as f:
    evals = pickle.load(f)
get_eval_results(evals['weighted_lcs'], 'APOD', 'rank')
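
The evaluation dictionaries also store aggregate metrics such as Hit@K and mean reciprocal rank (mRR). As a reminder of how these follow from per-query ranks, here is a small worked sketch (illustrative only, not the repo's evaluate; it assumes 1-indexed ranks of the first correct duplicate):

example_ranks = [1, 3, 12, 2]  # hypothetical ranks of the first correct duplicate per query
hit_at = lambda k: sum(r <= k for r in example_ranks) / len(example_ranks)
mrr = sum(1.0 / r for r in example_ranks) / len(example_ranks)
print(f"Hit@1={hit_at(1):.2f}, Hit@5={hit_at(5):.2f}, Hit@10={hit_at(10):.2f}, mRR={mrr:.2f}")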

Run Textual-Based Model

  1. Generate the settings file
csv_file_path = path/'artifacts/user_assignment.csv'
settings_path = path/'outputs/evaluation_settings'
video_data = read_video_data(csv_file_path)
generate_setting2(video_data, settings_path)
  2. Convert the model's results into the settings format
sim_path = path/'outputs/results'
out_path = path/'outputs'
models = ['SimCLR']
convert_results_format(sim_path, settings_path, out_path, models)
  3. Run OCR text extractor
vid_path = path/'artifacts/videos'
txt_out_path = path/'outputs/extracted_text'
get_all_texts(vid_path, out_path, fps = 1)
get_all_texts(vid_path, out_path, fps = 5)
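
get_all_texts samples frames at the given rate and runs OCR over them; the core per-frame step looks roughly like the sketch below (pytesseract is shown only as one possible OCR backend and the helper name is ours, so this is not necessarily the repo's implementation).

import pytesseract

def frame_text(frame_bgr):
    # Run OCR on a single (BGR) video frame and return the extracted text
    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray)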
  4. Run text preprocessing, build the document index, and run Lucene
%cd {path}/artifacts/models/tango_txt/
! sh build_run.sh {txt_out_path} {settings_path}

Outputs results to /tf/data/artifacts/models/tango_txt/tango_txt_results

Combining Textual Information

  1. Compute the combination of visual and textual information
combo_out_path = path/'outputs/combined'
dl_ranking_path = path/'outputs/user_rankings_weighted_all/all_rankings.csv'
ir_rankings_path = path/'artifacts/models/tango_txt/tango_txt_rankings/all_rankings.json'

best_dl_models = [
    "SimCLR-1000vw-5ftk-bovw", "SimCLR-5000vw-5ftk-bovw_lcs",
    "SimCLR-5000vw-5ftk-bovw_weighted_lcs", "SimCLR-1000vw-5ftk-bovw_weighted_lcs"
]
best_ir_models = [
    "ocr+ir--1ftk-all_text", "ocr+ir--5ftk-all_text",
    "ocr+ir--5ftk-unique_frames", "ocr+ir--5ftk-unique_words"
]

tango_combined(combo_out_path, dl_ranking_path, ir_rankings_path, settings_path, best_dl_models, best_ir_models)
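
One simple way to combine a visual ranking with a textual ranking is reciprocal rank fusion; the sketch below only illustrates the general idea and is not necessarily how tango_combined weights the two sources.

def reciprocal_rank_fusion(visual_ranking, textual_ranking, k=60):
    # Sum reciprocal ranks of each candidate across the two ranked lists
    scores = {}
    for ranking in (visual_ranking, textual_ranking):
        for rank, vid in enumerate(ranking, start=1):
            scores[vid] = scores.get(vid, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Hypothetical candidate duplicate videos, ordered by each component
print(reciprocal_rank_fusion(['vid_b', 'vid_a', 'vid_c'], ['vid_a', 'vid_c', 'vid_b']))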