Source code for truvari.bench

"""
Structural variant benchmarking tool
Given baseline and comparison sets of variants, calculate the recall/precision/f-measure
"""
import os
import sys
import copy
import json
import logging
import argparse
import itertools

from collections import defaultdict, OrderedDict, Counter

import numpy as np

import truvari

##############
# Parameters #
##############


def parse_args(args):
    """
    Pull the command line parameters
    """
    defaults = truvari.VariantParams()
    parser = argparse.ArgumentParser(prog="bench", description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-b", "--base", type=str, required=True,
                        help="Baseline truth-set calls")
    parser.add_argument("-c", "--comp", type=str, required=True,
                        help="Comparison set of calls")
    parser.add_argument("-o", "--output", type=str, required=True,
                        help="Output directory")
    parser.add_argument("-f", "--reference", type=str, default=None,
                        help="Fasta used to call variants. Only needed with symbolic variants.")
    parser.add_argument("-w", "--write-resolved", action="store_true",
                        help="Replace resolved symbolic variants with their sequence")
    parser.add_argument("--short", action="store_true",
                        help="Short circuit comparisions. Faster, but fewer annotations")
    parser.add_argument("--debug", action="store_true", default=False,
                        help="Verbose logging")

    thresg = parser.add_argument_group("Comparison Threshold Arguments")
    thresg.add_argument("-r", "--refdist", type=truvari.restricted_int, default=defaults.refdist,
                        help="Max reference distance (%(default)s)")
    thresg.add_argument("-p", "--pctseq", type=truvari.restricted_float, default=defaults.pctseq,
                        help="Min sequence similarity. Set to 0 to ignore (%(default)s)")
    thresg.add_argument("-P", "--pctsize", type=truvari.restricted_float, default=defaults.pctsize,
                        help="Min variant size similarity (%(default)s)")
    thresg.add_argument("-O", "--pctovl", type=truvari.restricted_float, default=defaults.pctovl,
                        help="Min reciprocal overlap (%(default)s)")
    thresg.add_argument("-t", "--typeignore", action="store_true", default=defaults.typeignore,
                        help="Don't compare variant types (%(default)s)")
    thresg.add_argument("-n", "--no-roll", action="store_false",
                        help="Turn off rolling sequence similarity")
    thresg.add_argument("--pick", type=str, default=defaults.pick, choices=PICKERS.keys(),
                        help="Number of matches reported per-call (%(default)s)")
    thresg.add_argument("-d", "--dup-to-ins", action="store_true",
                        help="Assume DUP svtypes are INS (%(default)s)")
    thresg.add_argument("-B", "--bnddist", type=int, default=defaults.bnddist,
                        help="Maximum distance allowed between BNDs (%(default)s; -1=off)")
    thresg.add_argument("-C", "--chunksize", type=truvari.restricted_int, default=defaults.chunksize,
                        help="Max reference distance to compare calls (%(default)s)")
    thresg.add_argument("-N", "--no-decompose", action="store_false",
                        help="Disallow symbolic variant decomposition to BNDs")
    thresg.add_argument("-m", "--max-resolve", type=int, default=defaults.max_resolve,
                        help="Maximum size of variant to attempt to sequence resolve ($(default)s)")

    genoty = parser.add_argument_group("Genotype Comparison Arguments")
    genoty.add_argument("--bSample", type=str, default=None,
                        help="Baseline calls' sample to use (first)")
    genoty.add_argument("--cSample", type=str, default=None,
                        help="Comparison calls' sample to use (first)")

    filteg = parser.add_argument_group("Filtering Arguments")
    filteg.add_argument("--passonly", action="store_true", default=defaults.passonly,
                        help="Only consider calls with FILTER == PASS")
    filteg.add_argument("-s", "--sizemin", type=truvari.restricted_int, default=defaults.sizemin,
                        help="Minimum variant size to consider from --comp (%(default)s)")
    filteg.add_argument("-S", "--sizefilt", type=truvari.restricted_int, default=None,
                        help="Minimum variant size to consider from --base (30)")
    filteg.add_argument("--sizemax", type=int, default=defaults.sizemax,
                        help="Maximum variant size to consider (-1 = off)")
    filteg.add_argument("--no-ref", default=defaults.no_ref, choices=['a', 'b', 'c'],
                        help="Exclude 0/0 or ./. GT calls from all (a), base (b), or comp (c) vcfs (%(default)s)")
    filteg.add_argument("--includebed", type=str, default=None,
                        help="Bed file of regions to analyze. Only calls within regions are counted")
    filteg.add_argument("--extend", type=truvari.restricted_int, default=0,
                        help="Distance to allow comp entries outside of includebed regions (%(default)s)")

    args = parser.parse_args(args)
    # When sizefilt is not provided and sizemin has been lowered below the default,
    # set sizefilt to sizemin. Otherwise, if sizefilt not provided, set to default
    # This just makes it easier to specify e.g. `-s 1` instead of `-s 1 -S 1`
    if args.sizefilt is None:
        if args.sizemin < defaults.sizefilt:
            args.sizefilt = args.sizemin
        else:
            args.sizefilt = defaults.sizefilt

    # Setup abspaths
    args.base = os.path.abspath(args.base)
    args.comp = os.path.abspath(args.comp)
    args.includebed = os.path.abspath(
        args.includebed) if args.includebed else args.includebed
    args.reference = os.path.abspath(
        args.reference) if args.reference else args.reference

    return args


def check_params(args):
    """
    Checks parameters as much as possible.
    All errors are written to stderr without logging since failures mean no output
    """
    check_fail = False
    if args.chunksize < args.refdist:
        logging.error("--chunksize must be >= --refdist")
        check_fail = True
    if args.extend and args.includebed is None:
        logging.error("--extend can only be used when --includebed is set")
        check_fail = True
    if os.path.isdir(args.output):
        logging.error("Output directory '%s' already exists", args.output)
        check_fail = True
    if not os.path.exists(args.comp):
        logging.error("File %s does not exist", args.comp)
        check_fail = True
    if not os.path.exists(args.base):
        logging.error("File %s does not exist", args.base)
        check_fail = True
    if not args.comp.endswith(".gz"):
        logging.error("Comparison vcf %s does not end with .gz. Must be bgzip'd",
                      args.comp)
        check_fail = True
    if not truvari.check_vcf_index(args.comp):
        logging.error("Comparison vcf '%s' must be indexed.", args.comp)
        check_fail = True
    if not args.base.endswith(".gz"):
        logging.error("Base vcf %s does not end with .gz. Must be bgzip'd",
                      args.base)
        check_fail = True
    if not truvari.check_vcf_index(args.base):
        logging.error("Base vcf '%s' must be indexed.", args.base)
        check_fail = True
    if args.includebed and not os.path.exists(args.includebed):
        logging.error("Include bed %s does not exist", args.includebed)
        check_fail = True
    if args.reference:
        if not os.path.exists(args.reference):
            logging.error("Reference %s does not exist", args.reference)
            check_fail = True
    return check_fail


def check_sample(vcf_fn, sample_id=None):
    """
    Checks that a sample is inside a vcf
    Returns True if check failed
    """
    vcf_file = truvari.VariantFile(vcf_fn)
    check_fail = False
    if sample_id is not None and sample_id not in vcf_file.header.samples:
        logging.error("Sample %s not found in vcf (%s)", sample_id, vcf_fn)
        check_fail = True
    if len(vcf_file.header.samples) == 0:
        logging.error("No SAMPLE columns found in vcf (%s)", vcf_fn)
        check_fail = True
    if sample_id is None:
        sample_id = vcf_file.header.samples[0]
    return check_fail, sample_id


def check_inputs(args):
    """
    Checks the inputs to ensure expected values are found inside of files
    Returns True if check failed
    """
    b_check, args.bSample = check_sample(args.base, args.bSample)
    c_check, args.cSample = check_sample(args.comp, args.cSample)
    return b_check or c_check


###############
# VCF editing #
###############
def edit_header(my_vcf):
    """
    Add INFO for new fields to vcf
    """
    header = my_vcf.header.copy()
    header.add_line(('##INFO=<ID=TruScore,Number=1,Type=Integer,'
                     'Description="Truvari score for similarity of match">'))
    header.add_line(('##INFO=<ID=PctSeqSimilarity,Number=1,Type=Float,'
                     'Description="Pct sequence similarity between this variant and its closest match">'))
    header.add_line(('##INFO=<ID=PctSizeSimilarity,Number=1,Type=Float,'
                     'Description="Pct size similarity between this variant and its closest match">'))
    header.add_line(('##INFO=<ID=PctRecOverlap,Number=1,Type=Float,'
                     'Description="Percent reciprocal overlap percent of the two calls\' coordinates">'))
    header.add_line(('##INFO=<ID=StartDistance,Number=1,Type=Integer,'
                     'Description="Distance of the base call\'s end from comparison call\'s start">'))
    header.add_line(('##INFO=<ID=EndDistance,Number=1,Type=Integer,'
                     'Description="Distance of the base call\'s end from comparison call\'s end">'))
    header.add_line(('##INFO=<ID=SizeDiff,Number=1,Type=Float,'
                     'Description="Difference in size of base and comp calls">'))
    header.add_line(('##INFO=<ID=GTMatch,Number=1,Type=Integer,'
                     'Description="Base/Comparison genotypes AC difference">'))
    header.add_line(('##INFO=<ID=MatchId,Number=.,Type=String,'
                     'Description="Tuple of base and comparison call ids which were matched">'))
    header.add_line(('##INFO=<ID=Multi,Number=0,Type=Flag,'
                     'Description="Call is false due to non-multimatching">'))
    return header


def annotate_entry(entry, match, header):
    """
    Make a new entry with all the information
    """
    entry.translate(header)
    # pylint: disable=unnecessary-lambda
    attributes = [
        ("seqsim", "PctSeqSimilarity", lambda x: round(x, 4)),
        ("sizesim", "PctSizeSimilarity", lambda x: round(x, 4)),
        ("ovlpct", "PctRecOverlap", lambda x: round(x, 4)),
        ("sizediff", "SizeDiff", lambda x: x),
        ("st_dist", "StartDistance", lambda x: x),
        ("ed_dist", "EndDistance", lambda x: x),
        ("gt_match", "GTMatch", lambda x: x),
        ("score", "TruScore", lambda x: int(x)),
        ("matid", "MatchId", lambda x: x),
    ]
    # pylint: enable=unnecessary-lambda

    for attr, key, transform in attributes:
        value = getattr(match, attr)
        if value is not None:
            entry.info[key] = transform(value)

    entry.info["Multi"] = match.multi


#############
# Core code #
#############
[docs] class StatsBox(OrderedDict): """ Make a blank stats box for counting TP/FP/FN and calculating performance """ def __init__(self): super().__init__() self["TP-base"] = 0 self["TP-comp"] = 0 self["FP"] = 0 self["FN"] = 0 self["precision"] = 0 self["recall"] = 0 self["f1"] = 0 self["base cnt"] = 0 self["comp cnt"] = 0 self["TP-comp_TP-gt"] = 0 self["TP-comp_FP-gt"] = 0 self["TP-base_TP-gt"] = 0 self["TP-base_FP-gt"] = 0 self["gt_concordance"] = 0 self["gt_matrix"] = defaultdict(Counter)
[docs] def calc_performance(self): """ Calculate the precision/recall """ if self["TP-base"] == 0 and self["FN"] == 0: logging.warning("No TP or FN calls in --base VCF!") elif self["TP-comp"] == 0 and self["FP"] == 0: logging.warning("No TP or FP calls in --comp VCF!") precision, recall, f1 = truvari.performance_metrics( self["TP-base"], self["TP-comp"], self["FN"], self["FP"]) self["precision"] = precision self["recall"] = recall self["f1"] = f1 if self["TP-comp_TP-gt"] + self["TP-comp_FP-gt"] != 0: self["gt_concordance"] = float(self["TP-comp_TP-gt"]) / (self["TP-comp_TP-gt"] + self["TP-comp_FP-gt"])
[docs] def clean_out(self): """ When reusing a StatsBox (typically inside refine), gt numbers are typically invalidated. This removes those numbers from self to make a cleaner report """ del self["TP-comp_TP-gt"] del self["TP-comp_FP-gt"] del self["TP-base_TP-gt"] del self["TP-base_FP-gt"] del self["gt_concordance"] del self["gt_matrix"]
[docs] def write_json(self, out_name): """ Write stats as json to file """ with open(out_name, 'w') as fout: fout.write(json.dumps(self, indent=4))
[docs] class BenchOutput(): """ Makes all of the output files for a Bench.run The variable `BenchOutput.vcf_filenames` holds a dictonary. The keys are tpb, tpc for true positive base/comp vcf filename and fn, fp. The variable `stats_box` holds a :class:`StatsBox`. """ def __init__(self, bench, params): """ initialize """ self.m_bench = bench self.m_params = params os.mkdir(self.m_bench.outdir) param_dict = self.m_bench.param_dict() param_dict.update(vars(self.m_params)) if self.m_bench.do_logging: truvari.setup_logging(self.m_bench.debug, truvari.LogFileStderr( os.path.join(self.m_bench.outdir, "log.txt")), show_version=True) logging.info("Params:\n%s", json.dumps(param_dict, indent=4)) with open(os.path.join(self.m_bench.outdir, 'params.json'), 'w') as fout: json.dump(param_dict, fout) b_vcf = truvari.VariantFile( self.m_bench.base_vcf, params=self.m_params) c_vcf = truvari.VariantFile( self.m_bench.comp_vcf, params=self.m_params) self.n_headers = {'b': edit_header(b_vcf), 'c': edit_header(c_vcf)} self.vcf_filenames = {'tpb': os.path.join(self.m_bench.outdir, "tp-base.vcf"), 'tpc': os.path.join(self.m_bench.outdir, "tp-comp.vcf"), 'fn': os.path.join(self.m_bench.outdir, "fn.vcf"), 'fp': os.path.join(self.m_bench.outdir, "fp.vcf")} self.out_vcfs = {} for key in ['tpb', 'fn']: self.out_vcfs[key] = truvari.VariantFile( self.vcf_filenames[key], mode='w', header=self.n_headers['b'], params=self.m_params) for key in ['tpc', 'fp']: self.out_vcfs[key] = truvari.VariantFile( self.vcf_filenames[key], mode='w', header=self.n_headers['c'], params=self.m_params) self.stats_box = StatsBox()
[docs] def write_match(self, match): """ Annotate a MatchResults' entries then write to the apppropriate file and do the stats counting. Writer is responsible for handling FPs between sizefilt-sizemin """ box = self.stats_box if match.base: box["base cnt"] += 1 annotate_entry(match.base, match, self.n_headers['b']) if match.state: gtBase = str(match.base_gt) gtComp = str(match.comp_gt) box["gt_matrix"][gtBase][gtComp] += 1 box["TP-base"] += 1 self.out_vcfs["tpb"].write(match.base) if match.gt_match == 0: box["TP-base_TP-gt"] += 1 else: box["TP-base_FP-gt"] += 1 else: box["FN"] += 1 self.out_vcfs["fn"].write(match.base) if match.comp: annotate_entry(match.comp, match, self.n_headers['c']) if match.state: box["comp cnt"] += 1 box["TP-comp"] += 1 self.out_vcfs["tpc"].write(match.comp) if match.gt_match == 0: box["TP-comp_TP-gt"] += 1 else: box["TP-comp_FP-gt"] += 1 elif match.comp.var_size() >= self.m_params.sizemin: # The if is because we don't count FPs between sizefilt-sizemin box["comp cnt"] += 1 box["FP"] += 1 self.out_vcfs["fp"].write(match.comp)
[docs] def close_outputs(self): """ Close all the files """ for i in self.out_vcfs.values(): i.close() for i in self.vcf_filenames.values(): truvari.compress_index_vcf(i) self.stats_box.calc_performance() self.stats_box.write_json(os.path.join( self.m_bench.outdir, "summary.json"))
[docs] class Bench(): """ Object to perform operations of truvari bench You can build a Bench worker with default matching parameters via: .. code-block:: python m_bench = truvari.Bench() If you'd like to customize the parameters, build and edit a :class:`VariantParams` and pass it to the Bench init .. code-block:: python params = truvari.VariantParams(pctseq=0.50) m_bench = truvari.Bench(params) To run on a chunk of :class:`truvari.VariantRecord` already loaded, pass them in as lists to: .. code-block:: python matches = m_bench.compare_calls(base_variants, comp_variants) For advanced parsing, one can build the match matrix for a chunk of calls with: .. code-block:: python match_matrix = m_bench.build_matrix(base_variants, comp_variants) This can then be used for things such as creating custom match pickers or adding new matching checks. If you want to run on existing files, simply supply the arguments to init .. code-block:: python m_bench = truvari.Bench(params, base_vcf, comp_vcf, outdir) output = m_bench.run() .. note:: Running on files must write to an output directory and is the only way to use things like 'includebed'. However, the returned `BenchOutput` has attributes pointing to all the results. """ def __init__(self, params=None, base_vcf=None, comp_vcf=None, outdir=None, includebed=None, extend=0, debug=False, do_logging=False): """ Initilize """ self.params = params if params is not None else truvari.VariantParams() self.base_vcf = base_vcf self.comp_vcf = comp_vcf self.outdir = outdir self.includebed = includebed self.extend = extend self.debug = debug self.do_logging = do_logging self.refine_candidates = []
[docs] def param_dict(self): """ Returns the parameters as a dict """ return {'base': self.base_vcf, "comp": self.comp_vcf, "output": self.outdir, "includebed": self.includebed, "extend": self.extend, "debug": self.debug}
[docs] def run(self): """ Runs bench and returns the resulting :class:`truvari.BenchOutput` """ if self.base_vcf is None or self.comp_vcf is None or self.outdir is None: raise RuntimeError( "Cannot call Bench.run without base/comp vcf filenames and outdir") output = BenchOutput(self, self.params) base = truvari.VariantFile(self.base_vcf, params=self.params) comp = truvari.VariantFile(self.comp_vcf, params=self.params) region_tree = truvari.build_region_tree(base, comp, self.includebed) truvari.merge_region_tree_overlaps(region_tree) regions_extended = (truvari.extend_region_tree(region_tree, self.extend) if self.extend else region_tree) base_i = base.fetch_regions(region_tree) comp_i = comp.fetch_regions(regions_extended) chunks = truvari.chunker(self.params, ('base', base_i), ('comp', comp_i)) for match in itertools.chain.from_iterable(map(self.compare_chunk, chunks)): # setting non-matched comp variants that are not fully contained in the original regions to None # These don't count as FP or TP and don't appear in the output vcf files if (self.extend and (match.comp is not None) and not match.state and not match.comp.within_tree(region_tree)): match.comp = None output.write_match(match) with open(os.path.join(self.outdir, 'candidate.refine.bed'), 'w') as fout: fout.write("\n".join(self.refine_candidates)) output.close_outputs() return output
[docs] def compare_chunk(self, chunk): """ Given a filtered chunk, (from chunker) compare all of the calls """ chunk_dict, chunk_id = chunk logging.debug("Comparing chunk %s", chunk_id) result = self.compare_calls( chunk_dict["base"], chunk_dict["comp"], chunk_id) self.check_refine_candidate(result) return result
[docs] def compare_calls(self, base_variants, comp_variants, chunk_id=0): """ Builds MatchResults, returns them as a numpy matrix if there's at least one base and one comp variant. Otherwise, returns a list of the variants placed in MatchResults """ # All FPs if len(base_variants) == 0: fps = [] for cid, c in enumerate(comp_variants): ret = truvari.MatchResult() ret.comp = c ret.matid = ["", f"{chunk_id}.{cid}"] fps.append(ret) logging.debug("All FP -> %s", ret) return fps # All FNs if len(comp_variants) == 0: fns = [] for bid, b in enumerate(base_variants): ret = truvari.MatchResult() ret.base = b ret.matid = [f"{chunk_id}.{bid}", ""] logging.debug("All FN -> %s", ret) fns.append(ret) return fns # 5k variants takes too long if self.params.short_circuit and (len(base_variants) + len(comp_variants)) > 5000: pos = [] cnt = 0 chrom = None for i in base_variants: cnt += 1 pos.extend(i.boundaries()) chrom = i.chrom for i in comp_variants: cnt += 1 pos.extend(i.boundaries()) chrom = i.chrom logging.warning("Skipping region %s:%d-%d with %d variants", chrom, min(*pos), max(*pos), cnt) return [] match_matrix = self.build_matrix( base_variants, comp_variants, chunk_id) if isinstance(match_matrix, list): return match_matrix return PICKERS[self.params.pick](match_matrix)
[docs] def build_matrix(self, base_variants, comp_variants, chunk_id=0): """ Builds MatchResults, returns them as a numpy matrix """ if not base_variants or not comp_variants: raise RuntimeError( "Expected at least one base and one comp variant") match_matrix = [] for bid, base in enumerate(base_variants): base_matches = [] for cid, comp in enumerate(comp_variants): mat = base.match(comp) mat.matid = [f"{chunk_id}.{bid}", f"{chunk_id}.{cid}"] logging.debug("Made mat -> %s", mat) base_matches.append(mat) match_matrix.append(base_matches) return np.array(match_matrix)
[docs] def check_refine_candidate(self, result): """ Adds this region as a candidate for refinement if there are unmatched variants """ has_unmatched = False pos = [] chrom = None for match in result: has_unmatched |= not match.state if match.base is not None and match.base.var_size() >= self.params.sizemin: chrom = match.base.chrom pos.extend(match.base.boundaries()) if match.comp is not None: chrom = match.comp.chrom pos.extend(match.comp.boundaries()) if has_unmatched and pos: # min(10, self.matcher.chunksize) need to make sure the refine covers the region buf = 10 start = max(0, min(*pos) - buf) end = max(*pos) + buf self.refine_candidates.append(f"{chrom}\t{start}\t{end}")
################# # Match Pickers # ################# def pick_multi_matches(match_matrix): """ Given a numpy array of MatchResults Pick each base/comp call's best match """ ret = [] for b_max in match_matrix.max(axis=1): b_max = copy.copy(b_max) b_max.comp = None ret.append(b_max) for c_max in match_matrix.max(axis=0): c_max = copy.copy(c_max) c_max.base = None ret.append(c_max) return ret def pick_ac_matches(match_matrix): """ Given a numpy array of MatchResults Find upto allele count mumber of matches """ ret = [] base_cnt, comp_cnt = match_matrix.shape match_matrix = np.ravel(match_matrix) match_matrix.sort() used_comp = Counter() used_base = Counter() for match in match_matrix[::-1]: # No more matches to find if base_cnt == 0 and comp_cnt == 0: break b_key = str(match.base) c_key = str(match.comp) # This is a trick base_is_used = used_base[b_key] >= match.base_gt_count comp_is_used = used_comp[c_key] >= match.comp_gt_count # Only write the comp (FP) if base_cnt == 0 and not comp_is_used: to_process = copy.copy(match) to_process.base = None to_process.multi = to_process.state to_process.state = False comp_cnt -= 1 if used_comp[c_key] == 0: # Only write as F if it hasn't been a T ret.append(to_process) used_comp[c_key] = 9 # Only write the base (FN) elif comp_cnt == 0 and not base_is_used: to_process = copy.copy(match) to_process.comp = None to_process.multi = to_process.state to_process.state = False base_cnt -= 1 if used_base[b_key] == 0: # Only write as F if it hasn't been a T ret.append(to_process) used_base[b_key] = 9 # Write both (any state) elif not base_is_used and not comp_is_used: to_process = copy.copy(match) # Don't write twice if used_base[b_key] != 0: to_process.base = None if used_comp[c_key] != 0: to_process.comp = None used_base[b_key] += match.comp_gt_count used_comp[c_key] += match.base_gt_count # All used up if used_base[b_key] >= match.base_gt_count: base_cnt -= 1 if used_comp[c_key] >= match.comp_gt_count: comp_cnt -= 1 # Safety edge case check if to_process.base is not None or to_process.comp is not None: ret.append(to_process) return ret def pick_single_matches(match_matrix): """ Given a numpy array of MatchResults, find the single best match for calls Once all best pairs yielded, the unpaired calls are set to FP/FN and yielded """ hit_order = np.array(np.unravel_index(np.argsort( match_matrix, axis=None), match_matrix.shape)).T top_hit_idx = len(hit_order) - 1 mask = np.ones(match_matrix.shape, dtype='bool') # True = can use ret = [] used_pairs = [] while top_hit_idx >= 0 and mask.any(): idx = tuple(hit_order[top_hit_idx]) if mask[idx]: mask[idx[0], :] = False mask[:, idx[1]] = False used_pairs.append(idx) ret.append(match_matrix[idx]) top_hit_idx -= 1 used_base, used_comp = zip(*used_pairs) if used_pairs else ([], []) # FNs for base_col in set(range(match_matrix.shape[0])) - set(used_base): comp_col = match_matrix[base_col].argmax() to_process = copy.copy(match_matrix[base_col, comp_col]) to_process.comp = None # The comp will be written elsewhere to_process.multi = to_process.state to_process.state = False ret.append(to_process) # FPs for comp_col in set(range(match_matrix.shape[1])) - set(used_comp): base_col = match_matrix[:, comp_col].argmax() to_process = copy.copy(match_matrix[base_col, comp_col]) to_process.base = None # The base will be written elsewhere to_process.multi = to_process.state to_process.state = False ret.append(to_process) return ret PICKERS = {"single": pick_single_matches, "ac": pick_ac_matches, "multi": pick_multi_matches } def bench_main(cmdargs): """ Main - entry point from command line """ args = parse_args(cmdargs) if check_params(args) or check_inputs(args): sys.stderr.write("Couldn't run Truvari. Please fix parameters\n") sys.exit(100) params = truvari.VariantParams(args, short_circuit=args.short, decompose=args.no_decompose) m_bench = Bench(params=params, base_vcf=args.base, comp_vcf=args.comp, outdir=args.output, includebed=args.includebed, extend=args.extend, debug=args.debug, do_logging=True) output = m_bench.run() logging.info("Stats: %s", json.dumps(output.stats_box, indent=4)) logging.info("Finished bench")