Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-06-15 02:45:57 -04:00
commit 61fcbcb190
116 changed files with 105954 additions and 2056 deletions

View File

@ -2,6 +2,7 @@ library(gsalib)
library(ggplot2)
library(gplots)
library(tools)
library(reshape)
#
# Standard command line switch. Can be loaded interactively for development

View File

@ -59,6 +59,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.io.File;
import java.io.FileInputStream;
@ -221,6 +222,10 @@ public class GenomeAnalysisEngine {
if (this.getArguments().nonDeterministicRandomSeed)
resetRandomGenerator(System.currentTimeMillis());
// TODO -- REMOVE ME WHEN WE STOP BCF testing
if ( this.getArguments().USE_SLOW_GENOTYPES )
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
// if the user specified an input BQSR recalibration table then enable on the fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);

View File

@ -336,6 +336,11 @@ public class GATKArgumentCollection {
public boolean generateShadowBCF = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
@Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
@Hidden
public boolean USE_SLOW_GENOTYPES = false;
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
/**
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other

View File

@ -38,10 +38,7 @@ import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
@ -58,6 +55,8 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
*/
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
private final static int BUFFER_SIZE = 1048576;
protected final File file;
protected OutputStream stream;
protected final VariantContextWriter writer;
@ -93,7 +92,7 @@ public class VariantContextWriterStorage implements Storage<VariantContextWriter
if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file);
else
stream = new PrintStream(file);
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);

View File

@ -51,6 +51,8 @@ import java.util.List;
* @version 0.1
*/
public class VariantContextWriterStub implements Stub<VariantContextWriter>, VariantContextWriter {
public final static boolean UPDATE_CONTIG_HEADERS = true;
/**
* The engine, central to the GATK's processing.
*/
@ -215,7 +217,8 @@ public class VariantContextWriterStub implements Stub<VariantContextWriter>, Var
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
}
//vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
if ( UPDATE_CONTIG_HEADERS )
vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
}
outputTracker.getStorage(this).writeHeader(vcfHeader);

View File

@ -251,7 +251,7 @@ public class VariantContextAdaptors {
Map<String, Object> attributes = new HashMap<String, Object>();
Collection<Genotype> genotypes = new ArrayList<Genotype>();
Genotype call = new Genotype(name, genotypeAlleles);
Genotype call = GenotypeBuilder.create(name, genotypeAlleles);
// add the call to the genotype list, and then use this list to create a VariantContext
genotypes.add(call);
@ -344,7 +344,7 @@ public class VariantContextAdaptors {
alleles.add(allele2);
}
Genotype g = new Genotype(samples[i], myAlleles);
Genotype g = GenotypeBuilder.create(samples[i], myAlleles);
genotypes.add(g);
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -21,15 +22,12 @@ import java.util.*;
*/
public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
public void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, final GenotypeBuilder gb) {
Double ratio = annotateSNP(stratifiedContext, vc, g);
if (ratio == null)
return null;
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%.2f", ratio.doubleValue()));
return map;
return;
gb.attribute(getKeyNames().get(0), Double.valueOf(String.format("%.2f", ratio.doubleValue())));
}
private Double annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
@ -14,6 +15,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -44,22 +46,20 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
private static final String DEL = "DEL"; // constant, for speed: no need to create a key string for deletion allele every time
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g) {
public void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, GenotypeBuilder gb) {
if ( g == null || !g.isCalled() )
return null;
return;
if ( vc.isSNP() )
return annotateSNP(stratifiedContext, vc);
if ( vc.isIndel() )
return annotateIndel(stratifiedContext, vc);
return null;
annotateSNP(stratifiedContext, vc, gb);
else if ( vc.isIndel() )
annotateIndel(stratifiedContext, vc, gb);
}
private Map<String,Object> annotateSNP(AlignmentContext stratifiedContext, VariantContext vc) {
private void annotateSNP(AlignmentContext stratifiedContext, VariantContext vc, GenotypeBuilder gb) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
return;
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
for ( Allele allele : vc.getAlleles() )
@ -72,22 +72,21 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
}
// we need to add counts in the correct order
Integer[] counts = new Integer[alleleCounts.size()];
int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
return toADAnnotation(counts);
gb.AD(counts);
}
private Map<String,Object> annotateIndel(AlignmentContext stratifiedContext, VariantContext vc) {
private void annotateIndel(AlignmentContext stratifiedContext, VariantContext vc, GenotypeBuilder gb) {
if ( ! stratifiedContext.hasBasePileup() )
return null;
return;
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
if ( pileup == null )
return null;
return;
final HashMap<String, Integer> alleleCounts = new HashMap<String, Integer>();
alleleCounts.put(REF_ALLELE, 0);
@ -123,16 +122,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
}
}
Integer[] counts = new Integer[alleleCounts.size()];
int[] counts = new int[alleleCounts.size()];
counts[0] = alleleCounts.get(REF_ALLELE);
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
counts[i+1] = alleleCounts.get( getAlleleRepresentation(vc.getAlternateAllele(i)) );
return toADAnnotation(counts);
}
private final Map<String, Object> toADAnnotation(final Integer[] counts) {
return Collections.singletonMap(getKeyNames().get(0), (Object)Arrays.asList(counts));
gb.AD(counts);
}
private String getAlleleRepresentation(Allele allele) {
@ -145,7 +140,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
}
// public String getIndelBases()
public List<String> getKeyNames() { return Arrays.asList("AD"); }
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(

View File

@ -36,6 +36,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
@ -47,10 +48,11 @@ import java.util.Map;
* Count for each sample of mapping quality zero reads
*/
public class MappingQualityZeroBySample extends GenotypeAnnotation {
public Map<String, Object> annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context, VariantContext vc, Genotype g) {
public void annotate(RefMetaDataTracker tracker,
AnnotatorCompatibleWalker walker, ReferenceContext ref, AlignmentContext context,
VariantContext vc, Genotype g, GenotypeBuilder gb) {
if ( g == null || !g.isCalled() )
return null;
return;
int mq0 = 0;
if ( context.hasBasePileup() ) {
@ -60,9 +62,8 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation {
mq0++;
}
}
Map<String, Object> map = new HashMap<String, Object>();
map.put(getKeyNames().get(0), String.format("%d", mq0));
return map;
gb.attribute(getKeyNames().get(0), mq0);
}
public List<String> getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); }

View File

@ -261,24 +261,22 @@ public class VariantAnnotatorEngine {
}
private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( requestedGenotypeAnnotations.size() == 0 )
if ( requestedGenotypeAnnotations.isEmpty() )
return vc.getGenotypes();
GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
for ( final Genotype genotype : vc.getGenotypes() ) {
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null ) {
genotypes.add(genotype);
continue;
} else {
final GenotypeBuilder gb = new GenotypeBuilder(genotype);
for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
annotation.annotate(tracker, walker, ref, context, vc, genotype, gb);
}
genotypes.add(gb.make());
}
Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
Map<String, Object> result = annotation.annotate(tracker, walker, ref, context, vc, genotype);
if ( result != null )
genotypeAnnotations.putAll(result);
}
genotypes.add(new Genotype(genotype.getSampleName(), genotype.getAlleles(), genotype.getLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.isPhased()));
}
return genotypes;

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.List;
@ -13,8 +14,9 @@ import java.util.Map;
public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation {
// annotate the given contexts/genotype, writing any results into the supplied GenotypeBuilder
public abstract Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker,
ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g);
public abstract void annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker,
ReferenceContext ref, AlignmentContext stratifiedContext,
VariantContext vc, Genotype g, GenotypeBuilder gb );
// return the descriptions used for the VCF FORMAT meta field
public abstract List<VCFFormatHeaderLine> getDescriptions();

View File

@ -204,8 +204,6 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
}
for ( final Genotype g : vc_input.getGenotypes() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
boolean genotypeIsPhased = true;
String sample = g.getSampleName();
@ -271,7 +269,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
// Compute new GQ field = -10*log10Pr(Genotype call is wrong)
// Beagle gives probability that genotype is AA, AB and BB.
// Which, by definition, are prob of hom ref, het and hom var.
Double probWrongGenotype, genotypeQuality;
double probWrongGenotype, genotypeQuality;
Double homRefProbability = Double.valueOf(beagleProbabilities.get(0));
Double hetProbability = Double.valueOf(beagleProbabilities.get(1));
Double homVarProbability = Double.valueOf(beagleProbabilities.get(2));
@ -300,7 +298,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else
genotypeQuality = log10(probWrongGenotype);
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getAttributes());
HashMap<String,Object> originalAttributes = new HashMap<String,Object>(g.getExtendedAttributes());
// get original encoding and add to genotype attributes
String a1, a2, og;
@ -328,7 +326,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
else {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(g.getSampleName(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
Genotype imputedGenotype = new GenotypeBuilder(g).alleles(alleles).log10PError(genotypeQuality).attributes(originalAttributes).phased(genotypeIsPhased).make();
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
beagleVarCounts++;
}

View File

@ -36,10 +36,7 @@ import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import java.util.*;
@ -260,7 +257,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStart(), alleles);
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
vcb.filters(statusesToStrings(stats.callableStatuses(thresholds)));
vcb.filters(new HashSet<String>(statusesToStrings(stats.callableStatuses(thresholds))));
attributes.put(VCFConstants.END_KEY, interval.getStop());
attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage());
@ -270,21 +267,20 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage());
}
for (String sample : samples) {
Map<String, Object> infos = new HashMap<String, Object>();
SampleStatistics sampleStat = stats.getSample(sample);
infos.put(VCFConstants.DEPTH_KEY, sampleStat.averageCoverage());
infos.put("Q1", sampleStat.getQuantileDepth(0.25));
infos.put("MED", sampleStat.getQuantileDepth(0.50));
infos.put("Q3", sampleStat.getQuantileDepth(0.75));
final GenotypeBuilder gb = new GenotypeBuilder(sample);
Set<String> filters = new HashSet<String>();
filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
SampleStatistics sampleStat = stats.getSample(sample);
gb.DP((int)sampleStat.averageCoverage());
gb.attribute("Q1", sampleStat.getQuantileDepth(0.25));
gb.attribute("MED", sampleStat.getQuantileDepth(0.50));
gb.attribute("Q3", sampleStat.getQuantileDepth(0.75));
if (debug) {
System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads());
}
gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds)));
genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false));
genotypes.add(gb.make());
}
vcb = vcb.genotypes(genotypes);
@ -299,8 +295,8 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
* @param statuses the set of statuses to be converted
* @return a matching set of strings
*/
private Set<String> statusesToStrings(Set<CallableStatus> statuses) {
Set<String> output = new HashSet<String>(statuses.size());
private List<String> statusesToStrings(Set<CallableStatus> statuses) {
List<String> output = new ArrayList<String>(statuses.size());
for (CallableStatus status : statuses)
output.add(status.name());

View File

@ -79,14 +79,12 @@ class SampleStatistics {
* @return the callable statuses of the entire sample
*/
public Set<CallableStatus> getCallableStatuses(ThresHolder thresholds) {
Set<CallableStatus> output = new HashSet<CallableStatus>();
// We check if reads are present to prevent div / 0 exceptions
if (nReads == 0) {
output.add(CallableStatus.NO_READS);
return output;
return Collections.singleton(CallableStatus.NO_READS);
}
Set<CallableStatus> output = new HashSet<CallableStatus>();
Map<CallableStatus, Double> totals = new HashMap<CallableStatus, Double>(CallableStatus.values().length);
// initialize map
@ -126,6 +124,7 @@ class SampleStatistics {
if (output.isEmpty()) {
output.add(CallableStatus.PASS);
}
return output;
}

View File

@ -55,8 +55,6 @@ public class BAMDiffableReader implements DiffableReader {
int count = 0;
while ( iterator.hasNext() ) {
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
break;
final SAMRecord record = iterator.next();
// name is the read name + first of pair
@ -88,6 +86,9 @@ public class BAMDiffableReader implements DiffableReader {
if ( ! root.hasElement(name) )
// protect ourselves from malformed files
root.add(readRoot);
count += readRoot.size();
if ( count > maxElementsToRead && maxElementsToRead != -1)
break;
}
reader.close();

View File

@ -147,7 +147,7 @@ public class DiffEngine {
* @param diffs the list of differences to summarize
*/
public void reportSummarizedDifferences(List<Difference> diffs, SummaryReportParams params ) {
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.maxRawDiffsToSummarize), params );
printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params );
}
final protected static String[] diffNameToPath(String diffName) {
@ -161,9 +161,17 @@ public class DiffEngine {
diffs.add(new Difference(diff));
}
return summarizedDifferencesOfPaths(diffs, -1);
return summarizedDifferencesOfPaths(diffs, true, -1);
}
/**
* Computes a minimum set of potential differences between all singleton differences
* in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm.
*
* @param singletonDiffs the individual differences to summarize
* @param maxRawDiffsToSummarize the maximum number of summaries to build, or -1 for no limit
* @return a map from each summarized path to its corresponding Difference
*/
private Map<String, Difference> initialPairwiseSummaries(final List<? extends Difference> singletonDiffs,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>();
@ -191,9 +199,41 @@ public class DiffEngine {
return summaries;
}
/**
* Computes the possible leaf differences among the singleton diffs.
*
* The leaf differences are all of the form *.*...*.X where all internal
* path elements are wildcards and the only summarized difference considered
* interesting to compute is the leaf element X itself.
*
* @param singletonDiffs the individual differences to summarize
* @param maxRawDiffsToSummarize the maximum number of summaries to build, or -1 for no limit
* @return a map from each summarized leaf path to its corresponding Difference
*/
private Map<String, Difference> initialLeafSummaries(final List<? extends Difference> singletonDiffs,
                                                             final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = new HashMap<String, Difference>();
// create the initial set of differences
for ( final Difference d : singletonDiffs ) {
// collapse all but the final path element to wildcards, keeping only the leaf name
final String path = summarizedPath(d.getParts(), 1);
Difference sumDiff = new Difference(path, d.getMaster(), d.getTest());
// start with a zero count; occurrences are tallied separately by the caller
sumDiff.setCount(0);
addSummaryIfMissing(summaries, sumDiff);
// stop early once we exceed the cap (-1 means unlimited)
if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize)
return summaries;
}
return summaries;
}
protected List<Difference> summarizedDifferencesOfPaths(final List<? extends Difference> singletonDiffs,
final boolean doPairwise,
final int maxRawDiffsToSummarize) {
Map<String, Difference> summaries = initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize);
final Map<String, Difference> summaries = doPairwise
? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize)
: initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize);
// count differences
for ( Difference diffPath : singletonDiffs ) {
@ -372,18 +412,21 @@ public class DiffEngine {
final int maxCountOneItems;
final int minSumDiffToShow;
final int maxRawDiffsToSummarize;
final boolean doPairwise;
boolean descending = true;
public SummaryReportParams(PrintStream out,
int maxItemsToDisplay,
int maxCountOneItems,
int minSumDiffToShow,
int maxRawDiffsToSummarize) {
int maxRawDiffsToSummarize,
final boolean doPairwise) {
this.out = out;
this.maxItemsToDisplay = maxItemsToDisplay;
this.maxCountOneItems = maxCountOneItems;
this.minSumDiffToShow = minSumDiffToShow;
this.maxRawDiffsToSummarize = maxRawDiffsToSummarize;
this.doPairwise = doPairwise;
}
public void setDescending(boolean descending) {

View File

@ -111,21 +111,21 @@ import java.util.List;
* <p>
*
* <pre>
[testng] path count
[testng] *.*.*.AC 6
[testng] *.*.*.AF 6
[testng] *.*.*.AN 6
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1
</pre>
[testng] path count
[testng] *.*.*.AC 6
[testng] *.*.*.AF 6
[testng] *.*.*.AN 6
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN 1
[testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC 1
</pre>
*
* @author Mark DePristo
* @since 7/4/11
@ -165,6 +165,8 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="maxRawDiffsToSummary", shortName="maxRawDiffsToSummary", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false)
int maxRawDiffsToSummary = -1;
@Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false)
boolean doPairwise = false;
/**
* The max number of differences to display when summarizing. For example, if there are 10M differences, but
@ -199,11 +201,14 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false)
boolean showItemizedDifferences = false;
@Argument(fullName="iterations", doc="Number of iterations to perform, should be 1 unless you are doing memory testing", required=false)
int iterations = 1;
DiffEngine diffEngine;
@Override
public void initialize() {
this.diffEngine = new DiffEngine();
this.diffEngine = new DiffEngine();
}
@Override
@ -223,29 +228,39 @@ public class DiffObjectsWalker extends RodWalker<Integer, Integer> {
@Override
public void onTraversalDone(Integer sum) {
//out.printf("Reading master file %s%n", masterFile);
DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", master.size()));
//out.printf("Reading test file %s%n", testFile);
DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", test.size()));
if ( iterations > 1 ) {
for ( int i = 0; i < iterations; i++ ) {
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, 20, 10, 0, -1, false);
boolean success = DiffEngine.simpleDiffFiles(masterFile, testFile, MAX_OBJECTS_TO_READ, params);
logger.info("Iteration " + i + " success " + success);
}
} else {
//out.printf("Reading master file %s%n", masterFile);
DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", master.size()));
//out.printf("Reading test file %s%n", testFile);
DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ);
logger.info(String.format("Read %d objects", test.size()));
// out.printf("Master diff objects%n");
// out.println(master.toString());
// out.printf("Test diff objects%n");
// out.println(test.toString());
List<Difference> diffs = diffEngine.diff(master, test);
logger.info(String.format("Done computing diff with %d differences found", diffs.size()));
if ( showItemizedDifferences ) {
out.printf("Itemized results%n");
for ( Difference diff : diffs )
out.printf("DIFF: %s%n", diff.toString());
}
List<Difference> diffs = diffEngine.diff(master, test);
logger.info(String.format("Done computing diff with %d differences found", diffs.size()));
if ( showItemizedDifferences ) {
out.printf("Itemized results%n");
for ( Difference diff : diffs )
out.printf("DIFF: %s%n", diff.toString());
}
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, maxRawDiffsToSummary);
params.setDescending(false);
diffEngine.reportSummarizedDifferences(diffs, params);
logger.info(String.format("Done summarizing differences"));
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out,
MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff,
maxRawDiffsToSummary, doPairwise);
params.setDescending(false);
diffEngine.reportSummarizedDifferences(diffs, params);
logger.info(String.format("Done summarizing differences"));
}
}
}
}

View File

@ -29,11 +29,13 @@ import org.broad.tribble.AbstractFeatureReader;
import org.broad.tribble.FeatureReader;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.LineReader;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
@ -79,9 +81,6 @@ public class VCFDiffableReader implements DiffableReader {
String prevName = "";
Iterator<VariantContext> it = reader.iterator();
while ( it.hasNext() ) {
if ( count++ > maxElementsToRead && maxElementsToRead != -1)
break;
VariantContext vc = it.next();
String name = vc.getChr() + ":" + vc.getStart();
if ( name.equals(prevName) ) {
@ -109,9 +108,12 @@ public class VCFDiffableReader implements DiffableReader {
for (Genotype g : vc.getGenotypes() ) {
DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot);
gRoot.add("GT", g.getGenotypeString());
gRoot.add("GQ", g.hasLog10PError() ? g.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4 );
if ( g.hasGQ() ) gRoot.add("GQ", g.getGQ() );
if ( g.hasDP() ) gRoot.add("DP", g.getDP() );
if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD()));
if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL()));
for (Map.Entry<String, Object> attribute : g.getAttributes().entrySet()) {
for (Map.Entry<String, Object> attribute : g.getExtendedAttributes().entrySet()) {
if ( ! attribute.getKey().startsWith("_") )
gRoot.add(attribute.getKey(), attribute.getValue());
}
@ -120,6 +122,9 @@ public class VCFDiffableReader implements DiffableReader {
}
root.add(vcRoot);
count += vcRoot.size();
if ( count > maxElementsToRead && maxElementsToRead != -1)
break;
}
reader.close();

View File

@ -297,13 +297,14 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
// for each genotype, check filters then create a new object
for ( final Genotype g : vc.getGenotypes() ) {
if ( g.isCalled() ) {
Set<String> filters = new LinkedHashSet<String>(g.getFilters());
List<String> filters = new ArrayList<String>(g.getFilters());
for ( VariantContextUtils.JexlVCMatchExp exp : genotypeFilterExps ) {
if ( VariantContextUtils.match(vc, g, exp) )
filters.add(exp.name);
}
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), filters, g.getAttributes(), g.isPhased()));
genotypes.add(new GenotypeBuilder(g).filters(filters).make());
} else {
genotypes.add(g);
}

View File

@ -141,13 +141,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (context.hasBasePileup()) {
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods);
final HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup));
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
b.PL(genotypeLikelihoods);
b.DP(getFilteredDepth(pileup));
genotypes.add(b.make());
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:", sample.getKey(), alleleList.toString());

View File

@ -158,12 +158,11 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
myLikelihoods[i] = allLikelihoods[PLordering[i]];
// normalize in log space so that max element is zero.
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
final HashMap<String, Object> attributes = new HashMap<String, Object>();
attributes.put(VCFConstants.DEPTH_KEY, sampleData.depth);
attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods);
genotypes.add(new Genotype(sampleData.name, noCall, Genotype.NO_LOG10_PERROR, null, attributes, false));
final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name);
final double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(myLikelihoods, false, true);
gb.PL(genotypeLikelihoods);
gb.DP(sampleData.depth);
genotypes.add(gb.make());
}
return builder.genotypes(genotypes).make();

View File

@ -617,7 +617,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
throw new StingException("Read "+read.getReadName()+" from "+getToolkit().getSourceFileForReaderID(getToolkit().getReaderIDForRead(read))+
"has no Normal/Tumor tag associated with it");
// String rg = (String)read.getAttribute("RG");
// String rg = (String)read.getExtendedAttribute("RG");
// if ( rg == null )
// throw new UserException.MalformedBam(read, "Read "+read.getReadName()+" has no read group in merged stream. RG is required for somatic calls.");
@ -1148,13 +1148,12 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
GenotypesContext genotypes = GenotypesContext.create();
for ( String sample : normalSamples ) {
Map<String,Object> attrs = call.makeStatsAttributes(null);
if ( ! discard_event ) // we made a call - put actual het genotype here:
genotypes.add(new Genotype(sample,alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
else // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
genotypes.add(new Genotype(sample, homref_alleles,Genotype.NO_LOG10_PERROR,null,attrs,false));
final GenotypeBuilder gb = new GenotypeBuilder(sample);
gb.attributes(call.makeStatsAttributes(null));
gb.alleles(! discard_event
? alleles // we made a call - put actual het genotype here:
: homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all)
genotypes.add(gb.make());
}
Set<String> filters = null;
@ -1238,11 +1237,11 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
GenotypesContext genotypes = GenotypesContext.create();
for ( String sample : normalSamples ) {
genotypes.add(new Genotype(sample, homRefN ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsNormal,false));
genotypes.add(GenotypeBuilder.create(sample, homRefN ? homRefAlleles : alleles, attrsNormal));
}
for ( String sample : tumorSamples ) {
genotypes.add(new Genotype(sample, homRefT ? homRefAlleles : alleles,Genotype.NO_LOG10_PERROR,null,attrsTumor,false) );
genotypes.add(GenotypeBuilder.create(sample, homRefT ? homRefAlleles : alleles, attrsTumor));
}
Set<String> filters = null;
@ -2144,7 +2143,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
class VCFIndelAttributes {
public static String ALLELIC_DEPTH_KEY = "AD";
public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY;
public static String MAPQ_KEY = "MQS";

View File

@ -97,10 +97,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
private ArrayList<Sample> trios = new ArrayList<Sample>();
//Matrix of priors for all genotype combinations
private EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> mvCountMatrix;
private EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>> mvCountMatrix;
//Matrix of allele transmission
private EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>> transmissionMatrix;
private EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>> transmissionMatrix;
//Metrics counters hash keys
private final Byte NUM_TRIO_GENOTYPES_CALLED = 0;
@ -138,17 +138,17 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
private EnumMap<FamilyMember,Genotype> trioPhasedGenotypes = new EnumMap<FamilyMember, Genotype>(FamilyMember.class);
private ArrayList<Allele> getAlleles(Genotype.Type genotype){
private ArrayList<Allele> getAlleles(GenotypeType genotype){
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
if(genotype == Genotype.Type.HOM_REF){
if(genotype == GenotypeType.HOM_REF){
alleles.add(REF);
alleles.add(REF);
}
else if(genotype == Genotype.Type.HET){
else if(genotype == GenotypeType.HET){
alleles.add(REF);
alleles.add(VAR);
}
else if(genotype == Genotype.Type.HOM_VAR){
else if(genotype == GenotypeType.HOM_VAR){
alleles.add(VAR);
alleles.add(VAR);
}
@ -158,27 +158,34 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
return alleles;
}
private boolean isPhasable(Genotype.Type genotype){
return genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HET || genotype == Genotype.Type.HOM_VAR;
private boolean isPhasable(GenotypeType genotype){
return genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HET || genotype == GenotypeType.HOM_VAR;
}
//Create a new Genotype based on information from a single individual
//Homozygous genotypes will be set as phased, heterozygous won't be
private void phaseSingleIndividualAlleles(Genotype.Type genotype, FamilyMember familyMember){
if(genotype == Genotype.Type.HOM_REF || genotype == Genotype.Type.HOM_VAR){
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME, getAlleles(genotype), Genotype.NO_LOG10_PERROR, null, null, true));
}
else
trioPhasedGenotypes.put(familyMember, new Genotype(DUMMY_NAME,getAlleles(genotype),Genotype.NO_LOG10_PERROR,null,null,false));
private void phaseSingleIndividualAlleles(GenotypeType genotype, FamilyMember familyMember){
boolean phase = genotype == GenotypeType.HOM_REF || genotype == GenotypeType.HOM_VAR;
trioPhasedGenotypes.put(familyMember, makeGenotype(genotype, phase));
}
private Genotype makeGenotype(final GenotypeType type, boolean phase) {
return makeGenotype(getAlleles(type), phase);
}
private Genotype makeGenotype(final List<Allele> alleles, boolean phase) {
final GenotypeBuilder gb = new GenotypeBuilder(DUMMY_NAME, alleles);
gb.phased(phase);
return gb.make();
}
//Find the phase for a parent/child pair
private void phasePairAlleles(Genotype.Type parentGenotype, Genotype.Type childGenotype, FamilyMember parent){
private void phasePairAlleles(GenotypeType parentGenotype, GenotypeType childGenotype, FamilyMember parent){
//Special case for Het/Het as it is ambiguous
if(parentGenotype == Genotype.Type.HET && childGenotype == Genotype.Type.HET){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, getAlleles(parentGenotype), Genotype.NO_LOG10_PERROR, null, null, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
if(parentGenotype == GenotypeType.HET && childGenotype == GenotypeType.HET){
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false));
return;
}
@ -190,34 +197,34 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//If there is a possible phasing between the parent and child => phase
int childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(0));
if(childTransmittedAlleleIndex > -1){
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(parent, makeGenotype(parentAlleles, true));
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
if(parent.equals(FamilyMember.MOTHER))
childPhasedAlleles.add(childAlleles.get(0));
else
childPhasedAlleles.add(0,childAlleles.get(0));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true));
}
else if((childTransmittedAlleleIndex = childAlleles.indexOf(parentAlleles.get(1))) > -1){
parentPhasedAlleles.add(parentAlleles.get(1));
parentPhasedAlleles.add(parentAlleles.get(0));
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME, parentPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(parent, makeGenotype(parentPhasedAlleles, true));
childPhasedAlleles.add(childAlleles.remove(childTransmittedAlleleIndex));
if(parent.equals(FamilyMember.MOTHER))
childPhasedAlleles.add(childAlleles.get(0));
else
childPhasedAlleles.add(0,childAlleles.get(0));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME, childPhasedAlleles, Genotype.NO_LOG10_PERROR, null, null, true));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAlleles, true));
}
//This is a Mendelian Violation => Do not phase
else{
trioPhasedGenotypes.put(parent, new Genotype(DUMMY_NAME,getAlleles(parentGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(childGenotype),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(parent, makeGenotype(parentGenotype, false));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childGenotype, false));
}
}
//Phases a family by transmission
private void phaseFamilyAlleles(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
private void phaseFamilyAlleles(GenotypeType mother, GenotypeType father, GenotypeType child){
Set<ArrayList<Allele>> possiblePhasedChildGenotypes = new HashSet<ArrayList<Allele>>();
ArrayList<Allele> motherAlleles = getAlleles(mother);
@ -246,7 +253,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
motherPhasedAlleles.add(motherAlleles.get(0));
else
motherPhasedAlleles.add(motherAlleles.get(1));
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,motherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(motherPhasedAlleles, true));
//Create father's genotype
ArrayList<Allele> fatherPhasedAlleles = new ArrayList<Allele>(2);
@ -255,10 +262,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
fatherPhasedAlleles.add(fatherAlleles.get(0));
else
fatherPhasedAlleles.add(fatherAlleles.get(1));
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,fatherPhasedAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(fatherPhasedAlleles,true));
//Create child's genotype
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,childPhasedAllelesAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(childPhasedAllelesAlleles,true));
//Once a phased combination is found; exit
return;
@ -266,16 +273,16 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
}
//If this is reached then no phasing could be found
trioPhasedGenotypes.put(FamilyMember.MOTHER, new Genotype(DUMMY_NAME,getAlleles(mother),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.FATHER, new Genotype(DUMMY_NAME,getAlleles(father),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, new Genotype(DUMMY_NAME,getAlleles(child),Genotype.NO_LOG10_PERROR,null,null,false));
trioPhasedGenotypes.put(FamilyMember.MOTHER, makeGenotype(mother,false));
trioPhasedGenotypes.put(FamilyMember.FATHER, makeGenotype(father,false));
trioPhasedGenotypes.put(FamilyMember.CHILD, makeGenotype(child,false));
}
/* Constructor: Creates a conceptual trio genotype combination from the given genotypes.
If one or more genotypes are set as NO_CALL or UNAVAILABLE, it will phase them like a pair
or single individual.
*/
public TrioPhase(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
public TrioPhase(GenotypeType mother, GenotypeType father, GenotypeType child){
//Take care of cases where one or more family members are no call
if(!isPhasable(child)){
@ -297,7 +304,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
}
//Special case for Het/Het/Het as it is ambiguous
else if(mother == Genotype.Type.HET && father == Genotype.Type.HET && child == Genotype.Type.HET){
else if(mother == GenotypeType.HET && father == GenotypeType.HET && child == GenotypeType.HET){
phaseSingleIndividualAlleles(mother, FamilyMember.MOTHER);
phaseSingleIndividualAlleles(father, FamilyMember.FATHER);
phaseSingleIndividualAlleles(child, FamilyMember.CHILD);
@ -311,7 +318,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
if(fatherFAlleleFirst && trioPhasedGenotypes.get(FamilyMember.CHILD).isPhased()){
ArrayList<Allele> childAlleles = new ArrayList<Allele>(trioPhasedGenotypes.get(FamilyMember.CHILD).getAlleles());
childAlleles.add(childAlleles.remove(0));
trioPhasedGenotypes.put(FamilyMember.CHILD,new Genotype(DUMMY_NAME,childAlleles,Genotype.NO_LOG10_PERROR,null,null,true));
trioPhasedGenotypes.put(FamilyMember.CHILD,makeGenotype(childAlleles,true));
}
}
@ -347,7 +354,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Add the transmission probability
Map<String, Object> genotypeAttributes = new HashMap<String, Object>();
genotypeAttributes.putAll(genotype.getAttributes());
genotypeAttributes.putAll(genotype.getExtendedAttributes());
if(transmissionProb>NO_TRANSMISSION_PROB)
genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission);
@ -370,7 +377,10 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
else
log10Error = genotype.getLikelihoods().getLog10GQ(phasedGenotype.getType());
return new Genotype(genotype.getSampleName(), phasedAlleles, log10Error, null, genotypeAttributes, phasedGenotype.isPhased());
return new GenotypeBuilder(genotype).alleles(phasedAlleles)
.log10PError(log10Error)
.attributes(genotypeAttributes)
.phased(phasedGenotype.isPhased()).make();
}
@ -438,15 +448,15 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Create the transmission matrices
private void buildMatrices(){
mvCountMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
transmissionMatrix = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>>(Genotype.Type.class);
for(Genotype.Type mother : Genotype.Type.values()){
mvCountMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
transmissionMatrix.put(mother,new EnumMap<Genotype.Type,EnumMap<Genotype.Type,TrioPhase>>(Genotype.Type.class));
for(Genotype.Type father : Genotype.Type.values()){
mvCountMatrix.get(mother).put(father,new EnumMap<Genotype.Type, Integer>(Genotype.Type.class));
transmissionMatrix.get(mother).put(father,new EnumMap<Genotype.Type,TrioPhase>(Genotype.Type.class));
for(Genotype.Type child : Genotype.Type.values()){
mvCountMatrix = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>>(GenotypeType.class);
transmissionMatrix = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>>(GenotypeType.class);
for(GenotypeType mother : GenotypeType.values()){
mvCountMatrix.put(mother,new EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>(GenotypeType.class));
transmissionMatrix.put(mother,new EnumMap<GenotypeType,EnumMap<GenotypeType,TrioPhase>>(GenotypeType.class));
for(GenotypeType father : GenotypeType.values()){
mvCountMatrix.get(mother).put(father,new EnumMap<GenotypeType, Integer>(GenotypeType.class));
transmissionMatrix.get(mother).put(father,new EnumMap<GenotypeType,TrioPhase>(GenotypeType.class));
for(GenotypeType child : GenotypeType.values()){
mvCountMatrix.get(mother).get(father).put(child, getCombinationMVCount(mother, father, child));
transmissionMatrix.get(mother).get(father).put(child,new TrioPhase(mother,father,child));
}
@ -457,16 +467,16 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Returns the number of Mendelian Violations for a given genotype combination.
//If one of the parents genotype is missing, it will consider it as a parent/child pair
//If the child genotype or both parents genotypes are missing, 0 is returned.
private int getCombinationMVCount(Genotype.Type mother, Genotype.Type father, Genotype.Type child){
private int getCombinationMVCount(GenotypeType mother, GenotypeType father, GenotypeType child){
//Child is no call => No MV
if(child == Genotype.Type.NO_CALL || child == Genotype.Type.UNAVAILABLE)
if(child == GenotypeType.NO_CALL || child == GenotypeType.UNAVAILABLE)
return 0;
//Add parents with genotypes for the evaluation
ArrayList<Genotype.Type> parents = new ArrayList<Genotype.Type>();
if (!(mother == Genotype.Type.NO_CALL || mother == Genotype.Type.UNAVAILABLE))
ArrayList<GenotypeType> parents = new ArrayList<GenotypeType>();
if (!(mother == GenotypeType.NO_CALL || mother == GenotypeType.UNAVAILABLE))
parents.add(mother);
if (!(father == Genotype.Type.NO_CALL || father == Genotype.Type.UNAVAILABLE))
if (!(father == GenotypeType.NO_CALL || father == GenotypeType.UNAVAILABLE))
parents.add(father);
//Both parents no calls => No MV
@ -477,35 +487,35 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
int parentsNumRefAlleles = 0;
int parentsNumAltAlleles = 0;
for(Genotype.Type parent : parents){
if(parent == Genotype.Type.HOM_REF){
for(GenotypeType parent : parents){
if(parent == GenotypeType.HOM_REF){
parentsNumRefAlleles++;
}
else if(parent == Genotype.Type.HET){
else if(parent == GenotypeType.HET){
parentsNumRefAlleles++;
parentsNumAltAlleles++;
}
else if(parent == Genotype.Type.HOM_VAR){
else if(parent == GenotypeType.HOM_VAR){
parentsNumAltAlleles++;
}
}
//Case Child is HomRef
if(child == Genotype.Type.HOM_REF){
if(child == GenotypeType.HOM_REF){
if(parentsNumRefAlleles == parents.size())
return 0;
else return (parents.size()-parentsNumRefAlleles);
}
//Case child is HomVar
if(child == Genotype.Type.HOM_VAR){
if(child == GenotypeType.HOM_VAR){
if(parentsNumAltAlleles == parents.size())
return 0;
else return parents.size()-parentsNumAltAlleles;
}
//Case child is Het
if(child == Genotype.Type.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
if(child == GenotypeType.HET && ((parentsNumRefAlleles > 0 && parentsNumAltAlleles > 0) || parents.size()<2))
return 0;
//MV
@ -513,7 +523,7 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
}
//Given two trio genotypes combinations, returns the number of different genotypes between the two combinations.
private int countFamilyGenotypeDiff(Genotype.Type motherOriginal,Genotype.Type fatherOriginal,Genotype.Type childOriginal,Genotype.Type motherNew,Genotype.Type fatherNew,Genotype.Type childNew){
private int countFamilyGenotypeDiff(GenotypeType motherOriginal,GenotypeType fatherOriginal,GenotypeType childOriginal,GenotypeType motherNew,GenotypeType fatherNew,GenotypeType childNew){
int count = 0;
if(motherOriginal!=motherNew)
count++;
@ -526,21 +536,21 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Get a Map of genotype likelihoods.
//In case of null, unavailable or no call, all likelihoods are 1/3.
private EnumMap<Genotype.Type,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){
private EnumMap<GenotypeType,Double> getLikelihoodsAsMapSafeNull(Genotype genotype){
if(genotype == null || !genotype.isCalled()){
EnumMap<Genotype.Type,Double> likelihoods = new EnumMap<Genotype.Type, Double>(Genotype.Type.class);
likelihoods.put(Genotype.Type.HOM_REF,1.0/3.0);
likelihoods.put(Genotype.Type.HET,1.0/3.0);
likelihoods.put(Genotype.Type.HOM_VAR,1.0/3.0);
EnumMap<GenotypeType,Double> likelihoods = new EnumMap<GenotypeType, Double>(GenotypeType.class);
likelihoods.put(GenotypeType.HOM_REF,1.0/3.0);
likelihoods.put(GenotypeType.HET,1.0/3.0);
likelihoods.put(GenotypeType.HOM_VAR,1.0/3.0);
return likelihoods;
}
return genotype.getLikelihoods().getAsMap(true);
}
//Returns the Genotype.Type; returns UNVAILABLE if given null
private Genotype.Type getTypeSafeNull(Genotype genotype){
//Returns the GenotypeType; returns UNVAILABLE if given null
private GenotypeType getTypeSafeNull(Genotype genotype){
if(genotype == null)
return Genotype.Type.UNAVAILABLE;
return GenotypeType.UNAVAILABLE;
return genotype.getType();
}
@ -561,18 +571,18 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
//Always assign the first parent as the parent having genotype information in pairs
//Always assign the mother as the first parent in trios
int parentsCalled = 0;
Map<Genotype.Type,Double> firstParentLikelihoods;
Map<Genotype.Type,Double> secondParentLikelihoods;
ArrayList<Genotype.Type> bestFirstParentGenotype = new ArrayList<Genotype.Type>();
ArrayList<Genotype.Type> bestSecondParentGenotype = new ArrayList<Genotype.Type>();
ArrayList<Genotype.Type> bestChildGenotype = new ArrayList<Genotype.Type>();
Genotype.Type pairSecondParentGenotype = null;
Map<GenotypeType,Double> firstParentLikelihoods;
Map<GenotypeType,Double> secondParentLikelihoods;
ArrayList<GenotypeType> bestFirstParentGenotype = new ArrayList<GenotypeType>();
ArrayList<GenotypeType> bestSecondParentGenotype = new ArrayList<GenotypeType>();
ArrayList<GenotypeType> bestChildGenotype = new ArrayList<GenotypeType>();
GenotypeType pairSecondParentGenotype = null;
if(mother == null || !mother.isCalled()){
firstParentLikelihoods = getLikelihoodsAsMapSafeNull(father);
secondParentLikelihoods = getLikelihoodsAsMapSafeNull(mother);
bestFirstParentGenotype.add(getTypeSafeNull(father));
bestSecondParentGenotype.add(getTypeSafeNull(mother));
pairSecondParentGenotype = mother == null ? Genotype.Type.UNAVAILABLE : mother.getType();
pairSecondParentGenotype = mother == null ? GenotypeType.UNAVAILABLE : mother.getType();
if(father != null && father.isCalled())
parentsCalled = 1;
}
@ -583,12 +593,12 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
bestSecondParentGenotype.add(getTypeSafeNull(father));
if(father == null || !father.isCalled()){
parentsCalled = 1;
pairSecondParentGenotype = father == null ? Genotype.Type.UNAVAILABLE : father.getType();
pairSecondParentGenotype = father == null ? GenotypeType.UNAVAILABLE : father.getType();
}else{
parentsCalled = 2;
}
}
Map<Genotype.Type,Double> childLikelihoods = getLikelihoodsAsMapSafeNull(child);
Map<GenotypeType,Double> childLikelihoods = getLikelihoodsAsMapSafeNull(child);
bestChildGenotype.add(getTypeSafeNull(child));
//Prior vars
@ -604,9 +614,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
int mvCount;
int cumulativeMVCount = 0;
double configurationLikelihood = 0;
for(Map.Entry<Genotype.Type,Double> childGenotype : childLikelihoods.entrySet()){
for(Map.Entry<Genotype.Type,Double> firstParentGenotype : firstParentLikelihoods.entrySet()){
for(Map.Entry<Genotype.Type,Double> secondParentGenotype : secondParentLikelihoods.entrySet()){
for(Map.Entry<GenotypeType,Double> childGenotype : childLikelihoods.entrySet()){
for(Map.Entry<GenotypeType,Double> firstParentGenotype : firstParentLikelihoods.entrySet()){
for(Map.Entry<GenotypeType,Double> secondParentGenotype : secondParentLikelihoods.entrySet()){
mvCount = mvCountMatrix.get(firstParentGenotype.getKey()).get(secondParentGenotype.getKey()).get(childGenotype.getKey());
//For parent/child pairs, sum over the possible genotype configurations of the missing parent
if(parentsCalled<2){
@ -797,9 +807,9 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
updateTrioMetricsCounters(phasedMother,phasedFather,phasedChild,mvCount,metricsCounters);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(),
phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString());
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),
phasedMother.getLikelihoodsString(), phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),Arrays.asList(phasedChild.getDP()),phasedChild.getAD(),phasedChild.getLikelihoodsString());
if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType()))
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
}
@ -809,8 +819,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString());
phasedMother.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getDP(),Arrays.asList(phasedMother.getAD()),phasedMother.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
}
}
else{
@ -820,8 +830,8 @@ public class PhaseByTransmission extends RodWalker<HashMap<Byte,Integer>, HashMa
metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1);
mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s",
vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(),
phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString());
phasedFather.getExtendedAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getDP(),Arrays.asList(phasedFather.getAD()),phasedFather.getLikelihoodsString(),
phasedChild.getGenotypeString(),phasedChild.getDP(),Arrays.asList(phasedChild.getAD()),phasedChild.getLikelihoodsString());
}
//Report violation if set so

View File

@ -109,14 +109,13 @@ class PhasingUtils {
}
double mergedGQ = Math.max(gt1.getLog10PError(), gt2.getLog10PError());
Set<String> mergedGtFilters = new HashSet<String>(); // Since gt1 and gt2 were unfiltered, the Genotype remains unfiltered
Map<String, Object> mergedGtAttribs = new HashMap<String, Object>();
PhaseAndQuality phaseQual = calcPhaseForMergedGenotypes(gt1, gt2);
if (phaseQual.PQ != null)
mergedGtAttribs.put(ReadBackedPhasingWalker.PQ_KEY, phaseQual.PQ);
Genotype mergedGt = new Genotype(gt1.getSampleName(), mergedAllelesForSample, mergedGQ, mergedGtFilters, mergedGtAttribs, phaseQual.isPhased);
Genotype mergedGt = new GenotypeBuilder(gt1.getSampleName(), mergedAllelesForSample).log10PError(mergedGQ).attributes(mergedGtAttribs).phased(phaseQual.isPhased).make();
mergedGenotypes.add(mergedGt);
}

View File

@ -288,7 +288,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
private VariantContext reduceVCToSamples(VariantContext vc, Set<String> samplesToPhase) {
// for ( String sample : samplesToPhase )
// logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() ));
VariantContext subvc = vc.subContextFromSamples(samplesToPhase);
VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true);
// logger.debug("original VC = " + vc);
// logger.debug("sub VC = " + subvc);
return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF);
@ -374,7 +374,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (isUnfilteredCalledDiploidGenotype(gt)) {
if (gt.isHom()) { // Note that this Genotype may be replaced later to contain the PQ of a downstream het site that was phased relative to a het site lying upstream of this hom site:
// true <-> can trivially phase a hom site relative to ANY previous site:
Genotype phasedGt = new Genotype(gt.getSampleName(), gt.getAlleles(), gt.getLog10PError(), gt.getFilters(), gt.getAttributes(), true);
Genotype phasedGt = new GenotypeBuilder(gt).phased(true).make();
uvc.setGenotype(samp, phasedGt);
}
else if (gt.isHet()) { // Attempt to phase this het genotype relative to the previous het genotype
@ -408,9 +408,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
if (DEBUG) logger.debug("THE PHASE CHOSEN HERE:\n" + allelePair + "\n\n");
ensurePhasing(allelePair, prevAllelePair, pr.haplotype);
Map<String, Object> gtAttribs = new HashMap<String, Object>(gt.getAttributes());
gtAttribs.put(PQ_KEY, pr.phaseQuality);
Genotype phasedGt = new Genotype(gt.getSampleName(), allelePair.getAllelesAsList(), gt.getLog10PError(), gt.getFilters(), gtAttribs, genotypesArePhased);
Genotype phasedGt = new GenotypeBuilder(gt)
.alleles(allelePair.getAllelesAsList())
.attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
uvc.setGenotype(samp, phasedGt);
}
@ -428,9 +429,9 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
interiorUvc.setPhasingInconsistent();
if (genotypesArePhased) {
Map<String, Object> handledGtAttribs = new HashMap<String, Object>(handledGt.getAttributes());
handledGtAttribs.put(PQ_KEY, pr.phaseQuality);
Genotype phasedHomGt = new Genotype(handledGt.getSampleName(), handledGt.getAlleles(), handledGt.getLog10PError(), handledGt.getFilters(), handledGtAttribs, genotypesArePhased);
Genotype phasedHomGt = new GenotypeBuilder(handledGt)
.attribute(PQ_KEY, pr.phaseQuality)
.phased(genotypesArePhased).make();
interiorUvc.setGenotype(samp, phasedHomGt);
}
}
@ -1439,7 +1440,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
}
public static boolean isUnfilteredCalledDiploidGenotype(Genotype gt) {
return (gt.isNotFiltered() && gt.isCalled() && gt.getPloidy() == 2);
return (! gt.isFiltered() && gt.isCalled() && gt.getPloidy() == 2);
}
private class MultipleBaseCountsWriter {

View File

@ -423,7 +423,7 @@ public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalk
}
}
else {
// if (!vcComp.hasAttribute("GV"))
// if (!vcComp.hasExtendedAttribute("GV"))
// throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
if (call.isCalledAlt(callConf)) {

View File

@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector {
return true;
// want to include a site in the given samples if it is *likely* to be variant (via the EXACT model)
// first subset to the samples
VariantContext subContext = vc.subContextFromSamples(samples);
VariantContext subContext = vc.subContextFromSamples(samples, true);
// now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec?

View File

@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{
if ( samples == null || samples.isEmpty() )
return true;
VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles());
VariantContext subContext = vc.subContextFromSamples(samples, false);
if ( subContext.isPolymorphicInSamples() ) {
return true;
}

View File

@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Molten;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeType;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -54,7 +55,7 @@ public class GenotypeConcordance extends VariantEvaluator {
* Initialize this object
*/
public GenotypeConcordance() {
final int nGenotypeTypes = Genotype.Type.values().length;
final int nGenotypeTypes = GenotypeType.values().length;
truthByCalledGenotypeCounts = new long[nGenotypeTypes][nGenotypeTypes];
}
@ -75,11 +76,11 @@ public class GenotypeConcordance extends VariantEvaluator {
if (eval != null) {
for (final Genotype g : eval.getGenotypes() ) {
final String sample = g.getSampleName();
final Genotype.Type called = g.getType();
final Genotype.Type truth;
final GenotypeType called = g.getType();
final GenotypeType truth;
if (!validationIsValidVC || !validation.hasGenotype(sample)) {
truth = Genotype.Type.NO_CALL;
truth = GenotypeType.NO_CALL;
} else {
truth = validation.getGenotype(sample).getType();
}
@ -90,19 +91,19 @@ public class GenotypeConcordance extends VariantEvaluator {
// otherwise, mark no-calls for all samples
else {
final Genotype.Type called = Genotype.Type.NO_CALL;
final GenotypeType called = GenotypeType.NO_CALL;
for (final Genotype g : validation.getGenotypes()) {
final Genotype.Type truth = g.getType();
final GenotypeType truth = g.getType();
incrValue(truth, called);
// print out interesting sites
/*
if ( PRINT_INTERESTING_SITES && super.getVEWalker().gcLog != null ) {
if ( (truth == Genotype.Type.HOM_VAR || truth == Genotype.Type.HET) && called == Genotype.Type.NO_CALL ) {
if ( (truth == GenotypeType.HOM_VAR || truth == GenotypeType.HET) && called == GenotypeType.NO_CALL ) {
super.getVEWalker().gcLog.printf("%s FN %s%n", group, validation);
}
if ( (called == Genotype.Type.HOM_VAR || called == Genotype.Type.HET) && truth == Genotype.Type.HOM_REF ) {
if ( (called == GenotypeType.HOM_VAR || called == GenotypeType.HET) && truth == GenotypeType.HOM_REF ) {
super.getVEWalker().gcLog.printf("%s FP %s%n", group, validation);
}
}
@ -121,36 +122,36 @@ public class GenotypeConcordance extends VariantEvaluator {
* @param truth the truth type
* @param called the called type
*/
private void incrValue(final Genotype.Type truth, final Genotype.Type called) {
private void incrValue(final GenotypeType truth, final GenotypeType called) {
truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()]++;
}
private long count(final Genotype.Type truth, final Genotype.Type called) {
private long count(final GenotypeType truth, final GenotypeType called) {
return truthByCalledGenotypeCounts[truth.ordinal()][called.ordinal()];
}
private long count(final EnumSet<Genotype.Type> truth, final Genotype.Type called) {
private long count(final EnumSet<GenotypeType> truth, final GenotypeType called) {
return count(truth, EnumSet.of(called));
}
private long count(final Genotype.Type truth, final EnumSet<Genotype.Type> called) {
private long count(final GenotypeType truth, final EnumSet<GenotypeType> called) {
return count(EnumSet.of(truth), called);
}
private long count(final EnumSet<Genotype.Type> truth, final EnumSet<Genotype.Type> called) {
private long count(final EnumSet<GenotypeType> truth, final EnumSet<GenotypeType> called) {
long sum = 0;
for ( final Genotype.Type truth1 : truth ) {
for ( final Genotype.Type called1 : called ) {
for ( final GenotypeType truth1 : truth ) {
for ( final GenotypeType called1 : called ) {
sum += count(truth1, called1);
}
}
return sum;
}
private long countDiag( final EnumSet<Genotype.Type> d1 ) {
private long countDiag( final EnumSet<GenotypeType> d1 ) {
long sum = 0;
for(final Genotype.Type e1 : d1 ) {
for(final GenotypeType e1 : d1 ) {
sum += truthByCalledGenotypeCounts[e1.ordinal()][e1.ordinal()];
}
@ -159,13 +160,13 @@ public class GenotypeConcordance extends VariantEvaluator {
@Override
public void finalizeEvaluation() {
final EnumSet<Genotype.Type> allVariantGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET);
final EnumSet<Genotype.Type> allCalledGenotypes = EnumSet.of(Genotype.Type.HOM_VAR, Genotype.Type.HET, Genotype.Type.HOM_REF);
final EnumSet<Genotype.Type> allGenotypes = EnumSet.allOf(Genotype.Type.class);
final EnumSet<GenotypeType> allVariantGenotypes = EnumSet.of(GenotypeType.HOM_VAR, GenotypeType.HET);
final EnumSet<GenotypeType> allCalledGenotypes = EnumSet.of(GenotypeType.HOM_VAR, GenotypeType.HET, GenotypeType.HOM_REF);
final EnumSet<GenotypeType> allGenotypes = EnumSet.allOf(GenotypeType.class);
// exact values of the table
for ( final Genotype.Type truth : Genotype.Type.values() ) {
for ( final Genotype.Type called : Genotype.Type.values() ) {
for ( final GenotypeType truth : GenotypeType.values() ) {
for ( final GenotypeType called : GenotypeType.values() ) {
final String field = String.format("n_true_%s_called_%s", truth, called);
final Long value = count(truth, called);
map.put(field, value.toString());
@ -173,20 +174,20 @@ public class GenotypeConcordance extends VariantEvaluator {
}
// counts of called genotypes
for ( final Genotype.Type called : Genotype.Type.values() ) {
for ( final GenotypeType called : GenotypeType.values() ) {
final String field = String.format("total_called_%s", called);
final Long value = count(allGenotypes, called);
map.put(field, value.toString());
}
// counts of true genotypes
for ( final Genotype.Type truth : Genotype.Type.values() ) {
for ( final GenotypeType truth : GenotypeType.values() ) {
final String field = String.format("total_true_%s", truth);
final Long value = count(truth, allGenotypes);
map.put(field, value.toString());
}
for ( final Genotype.Type genotype : Genotype.Type.values() ) {
for ( final GenotypeType genotype : GenotypeType.values() ) {
final String field = String.format("percent_%s_called_%s", genotype, genotype);
long numer = count(genotype, genotype);
long denom = count(EnumSet.of(genotype), allGenotypes);
@ -215,7 +216,7 @@ public class GenotypeConcordance extends VariantEvaluator {
// overall genotype concordance of sites called non-ref in eval track
// MAD: this is the non-reference discrepancy rate
final String field = "percent_non_reference_discrepancy_rate";
long homrefConcords = count(Genotype.Type.HOM_REF, Genotype.Type.HOM_REF);
long homrefConcords = count(GenotypeType.HOM_REF, GenotypeType.HOM_REF);
long allNoHomRef = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords;
long numer = allNoHomRef - countDiag(allVariantGenotypes);
long denom = count(allCalledGenotypes, allCalledGenotypes) - homrefConcords;

View File

@ -121,9 +121,9 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
int ac = 0;
if ( vc.getNAlleles() > 2 ) {
return SiteStatus.POLY;
//// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
//// System.out.printf("multiple alleles %s = %s%n", vc.getAlleles(), vc.getExtendedAttribute(VCFConstants.ALLELE_COUNT_KEY));
// // todo -- omg this is painful. We need a better approach to dealing with multi-valued attributes
// for ( String v : (List<String>)vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY) )
// for ( String v : (List<String>)vc.getExtendedAttribute(VCFConstants.ALLELE_COUNT_KEY) )
// ac += Integer.valueOf(v);
//// System.out.printf(" ac = %d%n", ac);
}

View File

@ -241,7 +241,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval {
// update transition / transversion ratio
if ( titvTable != null ) titvTable.inc(type, g.getSampleName());
if ( g.hasAttribute(VCFConstants.DEPTH_KEY) )
if ( g.hasDP() )
depthPerSample.inc(type, g.getSampleName());
}
}

View File

@ -199,7 +199,7 @@ public class VariantEvalUtils {
* @return a new VariantContext with just the requested samples
*/
public VariantContext getSubsetOfVariantContext(VariantContext vc, Set<String> sampleNames) {
VariantContext vcsub = vc.subContextFromSamples(sampleNames, vc.getAlleles());
VariantContext vcsub = vc.subContextFromSamples(sampleNames, false);
VariantContextBuilder builder = new VariantContextBuilder(vcsub);
final int originalAlleleCount = vc.getHetCount() + 2 * vc.getHomVarCount();

View File

@ -223,7 +223,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
newA = Allele.NO_CALL;
newAlleles.add(newA);
}
newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles));
newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make());
}
return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).referenceBaseForIndel(refBaseForIndel).make();

View File

@ -315,6 +315,15 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="fullyDecode", doc="If true, the incoming VariantContext will be fully decoded", required=false)
private boolean fullyDecode = false;
@Hidden
@Argument(fullName="forceGenotypesDecode", doc="If true, the incoming VariantContext will have its genotypes forcibly decoded by computing AC across all genotypes. For efficiency testing only", required=false)
private boolean forceGenotypesDecode = false;
@Hidden
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
private boolean justRead = false;
/* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure {
private VariantContext vc;
@ -392,11 +401,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
samples.removeAll(XLsamplesFromFile);
samples.removeAll(XLsampleNames);
NO_SAMPLES_SPECIFIED = NO_SAMPLES_SPECIFIED && XLsampleNames.isEmpty();
if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
throw new UserException("All samples requested to be included were also requested to be excluded.");
for ( String sample : samples )
if ( ! NO_SAMPLES_SPECIFIED )
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
@ -494,7 +505,16 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
}
for (VariantContext vc : vcs) {
if ( fullyDecode ) vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
// an option for performance testing only
if ( fullyDecode )
vc = vc.fullyDecode(vcfRods.get(vc.getSource()));
// an option for performance testing only
if ( forceGenotypesDecode ) {
final int x = vc.getCalledChrCount();
//logger.info("forceGenotypesDecode with getCalledChrCount() = " + x);
}
if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) )
continue;
@ -538,7 +558,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples, EXCLUDE_NON_VARIANTS);
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(tracker, ref, context, sub)).filters(sub.getFiltersMaybeNull());
@ -559,7 +579,8 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
randomlyAddVariant(++variantNumber, sub);
}
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
vcfWriter.add(sub);
if ( ! justRead )
vcfWriter.add(sub);
}
}
}
@ -687,18 +708,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
* Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF).
*
* @param vc the VariantContext record to subset
* @param samples the samples to extract
* @return the subsetted VariantContext
*/
private VariantContext subsetRecord(final VariantContext vc, final Set<String> samples, final boolean excludeNonVariants) {
if ( samples == null || samples.isEmpty() )
private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) {
if ( NO_SAMPLES_SPECIFIED || samples.isEmpty() )
return vc;
final VariantContext sub;
if ( excludeNonVariants )
sub = vc.subContextFromSamples(samples); // strip out the alternate alleles that aren't being used
else
sub = vc.subContextFromSamples(samples, vc.getAlleles());
final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used
VariantContextBuilder builder = new VariantContextBuilder(sub);
GenotypesContext newGC = sub.getGenotypes();
@ -708,15 +725,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
newGC = VariantContextUtils.stripPLs(sub.getGenotypes());
//Remove a fraction of the genotypes if needed
if(fractionGenotypes>0){
if ( fractionGenotypes > 0 ){
ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
for ( Genotype genotype : newGC ) {
//Set genotype to no call if it falls in the fraction.
if(fractionGenotypes>0 && randomGenotypes.nextDouble()<fractionGenotypes){
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
alleles.add(Allele.create((byte)'.'));
alleles.add(Allele.create((byte)'.'));
genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap<String, Object>(),false));
List<Allele> alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make());
}
else{
genotypes.add(genotype);
@ -750,14 +765,12 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
for (String sample : originalVC.getSampleNames()) {
Genotype g = originalVC.getGenotype(sample);
if ( g.isNotFiltered() ) {
String dp = (String) g.getAttribute("DP");
if (dp != null && ! dp.equals(VCFConstants.MISSING_DEPTH_v3) && ! dp.equals(VCFConstants.MISSING_VALUE_v4) ) {
depth += Integer.valueOf(dp);
}
if ( ! g.isFiltered() ) {
if ( g.hasDP() )
depth += g.getDP();
}
}
builder.attribute("DP", depth);
}

View File

@ -288,8 +288,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getStandardEncoding(Genotype g, int offset) {
byte b;
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
b = NO_CALL;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_REF;
} else if ( g.isHomVar() ) {
@ -305,7 +305,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getFlippedEncoding(Genotype g, int offset) {
byte b;
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_VAR;

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -314,8 +315,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( addGenotypeFields ) {
for ( final String sample : samples ) {
for ( final String gf : genotypeFields ) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAttribute(gf) )
addFieldValue(vc.getGenotype(sample).getAttribute(gf), records);
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) {
if ( gf.equals(VCFConstants.GENOTYPE_KEY) )
addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records);
else
addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records);
}
else
addFieldValue(MISSING_DATA, records);
}

View File

@ -132,7 +132,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
// set the appropriate sample name if necessary
if ( sampleName != null && vc.hasGenotypes() && vc.hasGenotype(variants.getName()) ) {
Genotype g = Genotype.modifyName(vc.getGenotype(variants.getName()), sampleName);
Genotype g = new GenotypeBuilder(vc.getGenotype(variants.getName())).name(sampleName).make();
builder.genotypes(g);
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.samples.Sample;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeType;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -30,7 +31,7 @@ public class MendelianViolation {
private boolean allCalledOnly = true;
//Stores occurrences of inheritance
private EnumMap<Genotype.Type, EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>> inheritance;
private EnumMap<GenotypeType, EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>> inheritance;
private int violations_total=0;
@ -74,119 +75,119 @@ public class MendelianViolation {
//Count of HomRef/HomRef/HomRef trios
public int getRefRefRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
}
//Count of HomVar/HomVar/HomVar trios
public int getVarVarVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR);
}
//Count of HomRef/HomVar/Het trios
public int getRefVarHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) +
inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HET) +
inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
}
//Count of Het/Het/Het trios
public int getHetHetHet(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET);
}
//Count of Het/Het/HomRef trios
public int getHetHetHomRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
}
//Count of Het/Het/HomVar trios
public int getHetHetHomVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR);
}
//Count of ref alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET)
+ 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
//return parentsHetHet_childRef;
}
//Count of var alleles inherited from Het/Het parents (no violation)
public int getParentsHetHetInheritedVar(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET)
+ 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR);
//return parentsHetHet_childVar;
}
//Count of ref alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_REF)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
//return parentsHomRefHet_childRef;
}
//Count of var alleles inherited from HomRef/Het parents (no violation)
public int getParentsRefHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HET)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
//return parentsHomRefHet_childVar;
}
//Count of ref alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HET)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HET);
//return parentsHomVarHet_childRef;
}
//Count of var alleles inherited from HomVar/Het parents (no violation)
public int getParentsVarHetInheritedVar(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_VAR)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR);
//return parentsHomVarHet_childVar;
}
//Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR
public int getParentsRefRefChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_REF -> HET
public int getParentsRefRefChildHet(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HET);
}
//Count of violations of the type HOM_REF/HET -> HOM_VAR
public int getParentsRefHetChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_VAR)
+ inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR
public int getParentsRefVarChildVar(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR)
+ inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR);
}
//Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF
public int getParentsRefVarChildRef(){
return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF)
+ inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF);
}
//Count of violations of the type HOM_VAR/HET -> HOM_REF
public int getParentsVarHetChildRef(){
return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF)
+ inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF)
+ inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_REF);
}
//Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF
public int getParentsVarVarChildRef(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF);
}
//Count of violations of the type HOM_VAR/HOM_VAR -> HET
public int getParentsVarVarChildHet(){
return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET);
return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HET);
}
@ -362,12 +363,12 @@ public class MendelianViolation {
private void createInheritanceMap(){
inheritance = new EnumMap<Genotype.Type,EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>>(Genotype.Type.class);
for(Genotype.Type mType : Genotype.Type.values()){
inheritance.put(mType, new EnumMap<Genotype.Type,EnumMap<Genotype.Type,Integer>>(Genotype.Type.class));
for(Genotype.Type dType : Genotype.Type.values()){
inheritance.get(mType).put(dType, new EnumMap<Genotype.Type,Integer>(Genotype.Type.class));
for(Genotype.Type cType : Genotype.Type.values()){
inheritance = new EnumMap<GenotypeType,EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>>(GenotypeType.class);
for(GenotypeType mType : GenotypeType.values()){
inheritance.put(mType, new EnumMap<GenotypeType,EnumMap<GenotypeType,Integer>>(GenotypeType.class));
for(GenotypeType dType : GenotypeType.values()){
inheritance.get(mType).put(dType, new EnumMap<GenotypeType,Integer>(GenotypeType.class));
for(GenotypeType cType : GenotypeType.values()){
inheritance.get(mType).get(dType).put(cType, 0);
}
}
@ -376,9 +377,9 @@ public class MendelianViolation {
}
private void clearInheritanceMap(){
for(Genotype.Type mType : Genotype.Type.values()){
for(Genotype.Type dType : Genotype.Type.values()){
for(Genotype.Type cType : Genotype.Type.values()){
for(GenotypeType mType : GenotypeType.values()){
for(GenotypeType dType : GenotypeType.values()){
for(GenotypeType cType : GenotypeType.values()){
inheritance.get(mType).get(dType).put(cType, 0);
}
}

View File

@ -225,9 +225,9 @@ public class SequenceDictionaryUtils {
return false;
// todo -- reenable if we want to be really strict here
// if (me.getAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getAttribute(SAMSequenceRecord.MD5_TAG) != null) {
// final BigInteger thisMd5 = new BigInteger((String)me.getAttribute(SAMSequenceRecord.MD5_TAG), 16);
// final BigInteger thatMd5 = new BigInteger((String)that.getAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) {
// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16);
// if (!thisMd5.equals(thatMd5)) {
// return false;
// }

View File

@ -223,6 +223,20 @@ public class Utils {
return ret.toString();
}
/**
 * Joins an array of ints into a single String, placing the given separator
 * between consecutive values (no leading or trailing separator).
 *
 * @param separator the text inserted between consecutive values
 * @param ints      the values to join; may be null or empty
 * @return the joined representation, or the empty string if ints is null or empty
 */
public static String join(String separator, int[] ints) {
    // A null or empty array short-circuits to the empty string.
    if ( ints == null || ints.length == 0)
        return "";

    final StringBuilder joined = new StringBuilder();
    boolean first = true;
    for ( final int value : ints ) {
        if ( !first )
            joined.append(separator);
        joined.append(value);
        first = false;
    }
    return joined.toString();
}
/**
* Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of
* elti objects (note there's no actual space between sep and the elti elements). Returns

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
@ -33,9 +35,7 @@ import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -45,15 +45,45 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
/**
* Decode BCF2 files
*/
public final class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDependentFeatureCodec {
final protected static Logger logger = Logger.getLogger(BCF2Codec.class);
private VCFHeader header = null;
/**
* Maps offsets (encoded in BCF) into contig names (from header) for the CHROM field
*/
private final ArrayList<String> contigNames = new ArrayList<String>();
/**
* Maps header string names (encoded in VCF) into strings found in the BCF header
*
* Initialized when processing the header
*/
private ArrayList<String> dictionary;
/**
* Our decoder that reads low-level objects from the BCF2 records
*/
private final BCF2Decoder decoder = new BCF2Decoder();
private boolean skipGenotypes = false;
/**
* Provides some sanity checking on the header
*/
private final static int MAX_HEADER_SIZE = 0x08000000;
/**
* Genotype field decoders that are initialized when the header is read
*/
private BCF2GenotypeFieldDecoders gtFieldDecoders = null;
// for error handling
private int recordNo = 0;
private int pos = 0;
// ----------------------------------------------------------------------
//
// Feature codec interface functions
@ -62,28 +92,30 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
@Override
public Feature decodeLoc( final PositionalBufferedStream inputStream ) {
return decode(inputStream);
// TODO: a less expensive version of decodeLoc() that doesn't use VariantContext
// TODO: very easy -- just decodeSitesBlock, and then skip to end of end of sites block
// TODO: and then skip genotypes block
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream); // necessary because it's in the stream
decoder.readNextBlock(sitesBlockSize, inputStream);
decodeSiteLoc(builder);
return builder.fullyDecoded(true).make();
}
@Override
public VariantContext decode( final PositionalBufferedStream inputStream ) {
recordNo++;
final VariantContextBuilder builder = new VariantContextBuilder();
final int sitesBlockSize = decoder.readBlockSize(inputStream);
final int genotypeBlockSize = decoder.readBlockSize(inputStream);
decoder.readNextBlock(sitesBlockSize, inputStream);
final SitesInfoForDecoding info = decodeSitesBlock(builder);
if ( isSkippingGenotypes() ) {
decoder.skipNextBlock(genotypeBlockSize, inputStream);
} else {
decoder.readNextBlock(genotypeBlockSize, inputStream);
decodeGenotypes(info, builder);
}
decodeSiteLoc(builder);
final SitesInfoForDecoding info = decodeSitesExtendedInfo(builder);
decoder.readNextBlock(genotypeBlockSize, inputStream);
createLazyGenotypesDecoder(info, builder);
return builder.fullyDecoded(true).make();
}
@ -97,16 +129,16 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
try {
// note that this reads the magic as well, and so does double duty
if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) )
throw new UserException.MalformedBCF2("Input stream does not begin with BCF2 magic");
error("Input stream does not begin with BCF2 magic");
final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream);
if ( headerSizeInBytes <= 0 || headerSizeInBytes > MAX_HEADER_SIZE) // no bigger than 8 MB
throw new UserException.MalformedBCF2("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
error("BCF2 header has invalid length: " + headerSizeInBytes + " must be >= 0 and < "+ MAX_HEADER_SIZE);
final byte[] headerBytes = new byte[headerSizeInBytes];
if ( inputStream.read(headerBytes) != headerSizeInBytes )
throw new UserException.MalformedBCF2("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
error("Couldn't read all of the bytes specified in the header length = " + headerSizeInBytes);
final PositionalBufferedStream bps = new PositionalBufferedStream(new ByteArrayInputStream(headerBytes));
final AsciiLineReader headerReader = new AsciiLineReader(bps);
@ -118,12 +150,24 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
}
// create the config offsets
for ( final VCFContigHeaderLine contig : header.getContigLines())
contigNames.add(contig.getID());
if ( ! header.getContigLines().isEmpty() ) {
logger.info("Found contig lines in BCF2 file, using those");
contigNames.clear();
for ( final VCFContigHeaderLine contig : header.getContigLines()) {
if ( contig.getID() == null || contig.getID().equals("") )
error("found a contig with an invalid ID " + contig);
contigNames.add(contig.getID());
}
} else {
logger.info("Didn't find any contig lines in BCF2 file, falling back (dangerously) to GATK reference dictionary");
}
// create the string dictionary
dictionary = parseDictionary(header);
// prepare the genotype field decoders
gtFieldDecoders = new BCF2GenotypeFieldDecoders(header);
// position right before next line (would be right before first real record byte at end of header)
return new FeatureCodecHeader(header, inputStream.getPosition());
}
@ -153,7 +197,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
//
// --------------------------------------------------------------------------------
@Override
public void setGenomeLocParser(final GenomeLocParser genomeLocParser) {
// initialize contigNames to standard ones in reference
@ -161,14 +204,6 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
contigNames.add(contig.getSequenceName());
}
public boolean isSkippingGenotypes() {
return skipGenotypes;
}
public void setSkipGenotypes(final boolean skipGenotypes) {
this.skipGenotypes = skipGenotypes;
}
// --------------------------------------------------------------------------------
//
// implicit block
@ -182,50 +217,83 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
//
// --------------------------------------------------------------------------------
private final SitesInfoForDecoding decodeSitesBlock(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null"})
private final void decodeSiteLoc(final VariantContextBuilder builder) {
final int contigOffset = decoder.decodeInt(BCF2Type.INT32);
final String contig = lookupContigName(contigOffset);
builder.chr(contig);
final int pos = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
final int refLength = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
this.pos = decoder.decodeInt(BCF2Type.INT32);
final int refLength = decoder.decodeInt(BCF2Type.INT32);
builder.start((long)pos);
builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals
}
/**
* Decode the sites level data from this classes decoder
*
* @param builder
* @return
*/
@Requires({"builder != null", "decoder != null"})
@Ensures({"result != null", "result.isValid()"})
private final SitesInfoForDecoding decodeSitesExtendedInfo(final VariantContextBuilder builder) {
final Object qual = decoder.decodeSingleValue(BCF2Type.FLOAT);
if ( qual != null ) {
builder.log10PError(((Double)qual) / -10.0);
}
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32.getSizeInBytes());
final int nAlleleInfo = decoder.decodeInt(BCF2Type.INT32);
final int nFormatSamples = decoder.decodeInt(BCF2Type.INT32);
final int nAlleles = nAlleleInfo >> 16;
final int nInfo = nAlleleInfo & 0x00FF;
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x0FFF;
final int nInfo = nAlleleInfo & 0x0000FFFF;
final int nFormatFields = nFormatSamples >> 24;
final int nSamples = nFormatSamples & 0x00FFFFF;
decodeID(builder);
final ArrayList<Allele> alleles = decodeAlleles(builder, pos, nAlleles);
decodeFilter(builder);
decodeInfo(builder, nInfo);
return new SitesInfoForDecoding(pos, nFormatFields, nSamples, alleles);
final SitesInfoForDecoding info = new SitesInfoForDecoding(nFormatFields, nSamples, alleles);
if ( ! info.isValid() )
error("Sites info is malformed: " + info);
return info;
}
private final static class SitesInfoForDecoding {
final int pos;
protected final static class SitesInfoForDecoding {
final int nFormatFields;
final int nSamples;
final ArrayList<Allele> alleles;
private SitesInfoForDecoding(final int pos, final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.pos = pos;
private SitesInfoForDecoding(final int nFormatFields, final int nSamples, final ArrayList<Allele> alleles) {
this.nFormatFields = nFormatFields;
this.nSamples = nSamples;
this.alleles = alleles;
}
public boolean isValid() {
return nFormatFields >= 0 &&
nSamples >= 0 &&
alleles != null && ! alleles.isEmpty() && alleles.get(0).isReference();
}
@Override
public String toString() {
return String.format("nFormatFields = %d, nSamples = %d, alleles = %s", nFormatFields, nSamples, alleles);
}
}
/**
* Decode the id field in this BCF2 file and store it in the builder
* @param builder
*/
private void decodeID( final VariantContextBuilder builder ) {
final String id = (String)decoder.decodeTypedValue();
@ -235,6 +303,15 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.id(id);
}
/**
* Annoying routine that deals with allele clipping from the BCF2 encoding to the standard
* GATK encoding.
*
* @param position
* @param ref
* @param unclippedAlleles
* @return
*/
protected static ArrayList<Allele> clipAllelesIfNecessary(int position, String ref, ArrayList<Allele> unclippedAlleles) {
if ( ! AbstractVCFCodec.isSingleNucleotideEvent(unclippedAlleles) ) {
ArrayList<Allele> clippedAlleles = new ArrayList<Allele>(unclippedAlleles.size());
@ -244,6 +321,14 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
return unclippedAlleles;
}
/**
* Decode the alleles from this BCF2 file and put the results in builder
* @param builder
* @param pos
* @param nAlleles
* @return the alleles
*/
@Requires("nAlleles > 0")
private ArrayList<Allele> decodeAlleles( final VariantContextBuilder builder, final int pos, final int nAlleles ) {
// TODO -- probably need inline decoder for efficiency here (no sense in going bytes -> string -> vector -> bytes
ArrayList<Allele> alleles = new ArrayList<Allele>(nAlleles);
@ -259,15 +344,21 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
alleles.add(Allele.create(allele, false));
}
}
assert ref != null;
alleles = clipAllelesIfNecessary(pos, ref, alleles);
builder.alleles(alleles);
assert ref.length() > 0;
builder.referenceBaseForIndel(ref.getBytes()[0]);
return alleles;
}
/**
* Decode the filter field of this BCF2 file and store the result in the builder
* @param builder
*/
private void decodeFilter( final VariantContextBuilder builder ) {
final Object value = decoder.decodeTypedValue();
@ -275,17 +366,28 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.unfiltered();
else {
if ( value instanceof Integer )
// fast path for single integer result
builder.filter(getDictionaryString((Integer)value));
else {
for ( int offset : (List<Integer>)value )
for ( final int offset : (List<Integer>)value )
builder.filter(getDictionaryString(offset));
}
}
}
/**
* Loop over the info field key / value pairs in this BCF2 file and decode them into the builder
*
* @param builder
* @param numInfoFields
*/
@Requires("numInfoFields >= 0")
private void decodeInfo( final VariantContextBuilder builder, final int numInfoFields ) {
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
if ( numInfoFields == 0 )
// fast path, don't bother doing any work if there are no fields
return;
final Map<String, Object> infoFieldEntries = new HashMap<String, Object>(numInfoFields);
for ( int i = 0; i < numInfoFields; i++ ) {
final String key = getDictionaryString();
Object value = decoder.decodeTypedValue();
@ -297,143 +399,98 @@ public class BCF2Codec implements FeatureCodec<VariantContext>, ReferenceDepende
builder.attributes(infoFieldEntries);
}
private void decodeGenotypes( final SitesInfoForDecoding siteInfo, final VariantContextBuilder builder ) {
final List<String> samples = new ArrayList<String>(header.getGenotypeSamples());
final int nSamples = siteInfo.nSamples;
final int nFields = siteInfo.nFormatFields;
// --------------------------------------------------------------------------------
//
// Decoding Genotypes
//
// --------------------------------------------------------------------------------
if ( samples.size() != nSamples )
throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
"different numbers of samples per record. Saw " + samples.size() +
" samples in header but have a record with " + nSamples + " samples");
/**
* Create the lazy loader for the genotypes data, and store it in the builder
* so that the VC will be able to decode on demand the genotypes data
*
* @param siteInfo
* @param builder
*/
private void createLazyGenotypesDecoder( final SitesInfoForDecoding siteInfo,
final VariantContextBuilder builder ) {
if (siteInfo.nSamples > 0) {
final LazyGenotypesContext.LazyParser lazyParser =
new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields);
final int nGenotypes = header.getGenotypeSamples().size();
LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser,
new LazyData(siteInfo.nFormatFields, decoder.getRecordBytes()),
nGenotypes);
final Map<String, List<Object>> fieldValues = decodeGenotypeFieldValues(nFields, nSamples);
final List<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
for ( int i = 0; i < nSamples; i++ ) {
// all of the information we need for each genotype, with default values
final String sampleName = samples.get(i);
List<Allele> alleles = null;
boolean isPhased = false;
double log10PError = VariantContext.NO_LOG10_PERROR;
Set<String> filters = null;
Map<String, Object> attributes = null;
double[] log10Likelihoods = null;
// did we resort the sample names? If so, we need to load the genotype data
if ( !header.samplesWereAlreadySorted() )
lazy.decode();
for ( final Map.Entry<String, List<Object>> entry : fieldValues.entrySet() ) {
final String field = entry.getKey();
Object value = entry.getValue().get(i);
try {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
alleles = decodeGenotypeAlleles(siteInfo.alleles, (List<Integer>)value);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
if ( value != BCF2Type.INT8.getMissingJavaValue() )
log10PError = ((Integer)value) / -10.0;
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
final List<Integer> pls = (List<Integer>)value;
if ( pls != null ) { // we have a PL field
log10Likelihoods = new double[pls.size()];
for ( int j = 0; j < log10Likelihoods.length; j++ ) {
final double d = pls.get(j);
log10Likelihoods[j] = d == -0.0 ? 0.0 : d / -10.0;
}
}
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2");
//filters = new HashSet<String>(values.get(i));
} else { // add to attributes
if ( value != null ) { // don't add missing values
if ( attributes == null ) attributes = new HashMap<String, Object>(nFields);
if ( value instanceof List && ((List)value).size() == 1)
value = ((List)value).get(0);
attributes.put(field, value);
}
}
} catch ( ClassCastException e ) {
throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
+ " inconsistent with the value observed in the decoded value in the "
+ " BCF file. Value was " + value);
}
}
if ( alleles == null ) throw new UserException.MalformedBCF2("BUG: no alleles found");
final Genotype g = new Genotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10Likelihoods);
genotypes.add(g);
}
builder.genotypes(genotypes);
}
private final List<Allele> decodeGenotypeAlleles(final ArrayList<Allele> siteAlleles, final List<Integer> encoded) {
if ( encoded == null )
// no called sample GT = .
return Collections.emptyList();
else {
// we have at least some alleles to decode
final List<Allele> gt = new ArrayList<Allele>(encoded.size());
for ( final Integer encode : encoded ) {
if ( encode == null ) // absent, as are all following by definition
return gt;
else {
final int offset = encode >> 1;
if ( offset == 0 )
gt.add(Allele.NO_CALL);
else
gt.add(siteAlleles.get(offset - 1));
}
}
return gt;
builder.genotypesNoValidation(lazy);
}
}
private final Map<String, List<Object>> decodeGenotypeFieldValues(final int nFields, final int nSamples) {
assert (nFields > 0 && nSamples > 0) || (nFields == 0 && nSamples == 0);
public static class LazyData {
final public int nGenotypeFields;
final public byte[] bytes;
if ( nFields == 0 ) // fast path exit for sites only file
return Collections.emptyMap();
else {
final Map<String, List<Object>> map = new LinkedHashMap<String, List<Object>>(nFields);
for ( int i = 0; i < nFields; i++ ) {
final String field = getDictionaryString();
final byte typeDescriptor = decoder.readTypeDescriptor();
final List<Object> values = new ArrayList<Object>(nSamples);
for ( int j = 0; j < nSamples; j++ )
values.add(decoder.decodeTypedValue(typeDescriptor));
map.put(field, values);
}
return map;
@Requires({"nGenotypeFields > 0", "bytes != null"})
public LazyData(final int nGenotypeFields, final byte[] bytes) {
this.nGenotypeFields = nGenotypeFields;
this.bytes = bytes;
}
}
@Ensures("result != null")
private final String getDictionaryString() {
return getDictionaryString((Integer) decoder.decodeTypedValue());
}
private final String getDictionaryString(final int offset) {
if ( offset >= dictionary.size() ) throw new UserException.MalformedBCF2("BUG: no dictionary field found at offset " + offset);
final String field = dictionary.get(offset);
return field;
@Requires("offset < dictionary.size()")
@Ensures("result != null")
protected final String getDictionaryString(final int offset) {
return dictionary.get(offset);
}
/**
* Translate the config offset as encoded in the BCF file into the actual string
* name of the contig from the dictionary
*
* @param contigOffset
* @return
*/
@Requires({"contigOffset >= 0", "contigOffset < contigNames.size()"})
@Ensures("result != null")
private final String lookupContigName( final int contigOffset ) {
if ( contigOffset < contigNames.size() ) {
return contigNames.get(contigOffset);
}
else {
throw new UserException.MalformedBCF2(String.format("No contig at index %d present in the sequence dictionary from the BCF2 header (%s)", contigOffset, contigNames));
}
return contigNames.get(contigOffset);
}
@Requires("header != null")
@Ensures({"result != null", "! result.isEmpty()"})
private final ArrayList<String> parseDictionary(final VCFHeader header) {
final ArrayList<String> dict = BCF2Utils.makeDictionary(header);
// if we got here we never found a dictionary, or there are no elements in the dictionary
if ( dict.size() == 0 )
throw new UserException.MalformedBCF2("Dictionary header element was absent or empty");
if ( dict.isEmpty() )
error("Dictionary header element was absent or empty");
return dict;
}
/**
* @return the VCFHeader we found in this BCF2 file
*/
protected VCFHeader getHeader() {
return header;
}
@Requires("field != null")
@Ensures("result != null")
protected BCF2GenotypeFieldDecoders.Decoder getGenotypeFieldDecoder(final String field) {
return gtFieldDecoders.getDecoder(field);
}
private final void error(final String message) throws RuntimeException {
throw new UserException.MalformedBCF2(String.format("At record %d with position %d:", recordNo, pos, message));
}
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broad.tribble.FeatureCodec;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -33,12 +35,13 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
public class BCF2Decoder {
public final class BCF2Decoder {
final protected static Logger logger = Logger.getLogger(FeatureCodec.class);
byte[] recordBytes;
ByteArrayInputStream recordStream;
byte[] recordBytes = null;
ByteArrayInputStream recordStream = null;
public BCF2Decoder() {
// nothing to do
@ -66,6 +69,7 @@ public class BCF2Decoder {
* @return
*/
public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
if ( blockSizeInBytes < 0 ) throw new UserException.MalformedBCF2("Invalid block size " + blockSizeInBytes);
setRecordBytes(readRecordBytes(blockSizeInBytes, stream));
}
@ -112,9 +116,9 @@ public class BCF2Decoder {
*
* @param recordBytes
*/
@Requires("recordBytes != null")
@Ensures({"this.recordBytes == recordBytes", "recordStream != null"})
public void setRecordBytes(final byte[] recordBytes) {
assert recordBytes != null;
this.recordBytes = recordBytes;
this.recordStream = new ByteArrayInputStream(recordBytes);
}
@ -131,7 +135,7 @@ public class BCF2Decoder {
}
public final Object decodeTypedValue(final byte typeDescriptor) {
final int size = BCF2Utils.sizeIsOverflow(typeDescriptor) ? decodeVectorSize() : BCF2Utils.decodeSize(typeDescriptor);
final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size >= 0;
@ -155,7 +159,7 @@ public class BCF2Decoder {
public final Object decodeSingleValue(final BCF2Type type) {
// TODO -- decodeTypedValue should integrate this routine
final int value = BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
final int value = decodeInt(type);
if ( value == type.getMissingBytes() )
return null;
@ -184,26 +188,107 @@ public class BCF2Decoder {
final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
try {
recordStream.read(bytes);
final String s = new String(bytes);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
int goodLength = 0;
for ( ; goodLength < bytes.length ; goodLength++ )
if ( bytes[goodLength] == 0 ) break;
if ( goodLength == 0 )
return null;
else {
final String s = new String(bytes, 0, goodLength);
return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s;
}
} catch ( IOException e ) {
throw new ReviewedStingException("readByte failure", e);
}
}
private final int decodeVectorSize() {
final byte typeDescriptor = readTypeDescriptor();
final int size = BCF2Utils.decodeSize(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
assert size == 1;
assert type == BCF2Type.INT8 || type == BCF2Type.INT16 || type == BCF2Type.INT32;
return decodeInt(type.getSizeInBytes());
@Ensures("result >= 0")
public final int decodeNumberOfElements(final byte typeDescriptor) {
if ( BCF2Utils.sizeIsOverflow(typeDescriptor) )
// -1 ensures we explode immediately with a bad size if the result is missing
return decodeInt(readTypeDescriptor(), -1);
else
// the size is inline, so just decode it
return BCF2Utils.decodeSize(typeDescriptor);
}
public final int decodeInt(int bytesForEachInt) {
return BCF2Utils.readInt(bytesForEachInt, recordStream);
/**
* Decode an int from the stream. If the value in the stream is missing,
* returns missingValue. Requires the typeDescriptor indicate an inline
* single element event
*
* @param typeDescriptor
* @return
*/
@Requires("BCF2Utils.decodeSize(typeDescriptor) == 1")
public final int decodeInt(final byte typeDescriptor, final int missingValue) {
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
final int i = decodeInt(type);
return i == type.getMissingBytes() ? missingValue : i;
}
@Requires("type != null")
public final int decodeInt(final BCF2Type type) {
return BCF2Utils.readInt(type.getSizeInBytes(), recordStream);
}
/**
* Low-level reader for int[]
*
* Requires a typeDescriptor so the function knows how many elements to read,
* and how they are encoded.
*
* If size == 0 => result is null
* If size > 0 => result depends on the actual values in the stream
* -- If the first element read is MISSING, result is null (all values are missing)
* -- Else result = int[N] where N is the first N non-missing values decoded
*
* @param maybeDest if not null we'll not allocate space for the vector, but instead use
* the externally allocated array of ints to store values. If the
* size of this vector is < the actual size of the elements, we'll be
* forced to use freshly allocated arrays. Also note that padded
* int elements are still forced to do a fresh allocation as well.
* @return see description
*/
@Requires({"BCF2Type.INTEGERS.contains(type)", "size >= 0", "type != null"})
public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) {
if ( size == 0 ) {
return null;
} else {
if ( maybeDest != null && maybeDest.length < size )
maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small
final int val1 = decodeInt(type);
if ( val1 == type.getMissingBytes() ) {
// fast path for first element being missing
for ( int i = 1; i < size; i++ ) decodeInt(type);
return null;
} else {
// we know we will have at least 1 element, so making the int[] is worth it
final int[] ints = maybeDest == null ? new int[size] : maybeDest;
ints[0] = val1; // we already read the first one
for ( int i = 1; i < size; i++ ) {
ints[i] = decodeInt(type);
if ( ints[i] == type.getMissingBytes() ) {
// read the rest of the missing values, dropping them
for ( int j = i + 1; j < size; j++ ) decodeInt(type);
// deal with auto-pruning by returning an int[] containing
// only the non-MISSING values. We do this by copying the first
// i elements, as i itself is missing
return Arrays.copyOf(ints, i);
}
}
return ints; // all of the elements were non-MISSING
}
}
}
public final int[] decodeIntArray(final byte typeDescriptor) {
final int size = decodeNumberOfElements(typeDescriptor);
final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
return decodeIntArray(size, type, null);
}
public final double rawFloatToFloat(final int rawFloat) {

View File

@ -0,0 +1,282 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import java.util.*;
/**
* An efficient scheme for building and obtaining specialized
* genotype field decoders. Used by the BCFCodec to parse
* with little overhead the fields from BCF2 encoded genotype
* records
*
* @author Mark DePristo
* @since 6/12
*/
public class BCF2GenotypeFieldDecoders {
final protected static Logger logger = Logger.getLogger(BCF2GenotypeFieldDecoders.class);
private final static boolean ENABLE_FASTPATH_GT = true;
private final static int MIN_SAMPLES_FOR_FASTPATH_GENOTYPES = 0; // TODO -- update to reasonable number
// initialized once per writer to allow parallel writers to work
private final HashMap<String, Decoder> genotypeFieldDecoder = new HashMap<String, Decoder>();
private final Decoder defaultDecoder = new GenericDecoder();
public BCF2GenotypeFieldDecoders(final VCFHeader header) {
// TODO -- fill in appropriate decoders for each FORMAT field in the header
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_KEY, new GTDecoder());
// currently the generic decoder handles FILTER values properly, in so far as we don't tolerate multiple filter field values per genotype
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_FILTER_KEY, new FTDecoder());
genotypeFieldDecoder.put(VCFConstants.DEPTH_KEY, new DPDecoder());
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new ADDecoder());
genotypeFieldDecoder.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new PLDecoder());
genotypeFieldDecoder.put(VCFConstants.GENOTYPE_QUALITY_KEY, new GQDecoder());
}
// -----------------------------------------------------------------
//
// Genotype field decoder
//
// -----------------------------------------------------------------
/**
* Return decoder appropriate for field, or the generic decoder if no
* specialized one is bound
* @param field the GT field to decode
* @return a non-null decoder
*/
@Requires("field != null")
@Ensures("result != null")
public Decoder getDecoder(final String field) {
final Decoder d = genotypeFieldDecoder.get(field);
return d == null ? defaultDecoder : d;
}
/**
* Decoder a field (implicit from creation) encoded as
* typeDescriptor in the decoder object in the GenotypeBuilders
* one for each sample in order.
*
* The way this works is that this decode method
* iterates over the builders, decoding a genotype field
* in BCF2 for each sample from decoder.
*
* This system allows us to easily use specialized
* decoders for specific genotype field values. For example,
* we use a special decoder to directly read the BCF2 data for
* the PL field into a int[] rather than the generic List of Integer
*/
public interface Decoder {
@Requires({"siteAlleles != null", "! siteAlleles.isEmpty()",
"field != null", "decoder != null", "gbs != null", "! gbs.isEmpty()"})
public void decode(final List<Allele> siteAlleles,
final String field,
final BCF2Decoder decoder,
final byte typeDescriptor,
final List<GenotypeBuilder> gbs);
}
/**
 * Decoder for the GT (genotype) field.
 *
 * Each allele is encoded as an integer whose low bit is the phasing flag and whose
 * remaining high bits hold (allele index + 1); an encoded offset of 0 therefore
 * denotes a no-call (see getAlleleFromEncoded).  For biallelic diploid sites with
 * many samples a fast path caches the possible genotype allele lists so they are
 * shared across samples instead of re-allocated per sample.
 */
private class GTDecoder implements Decoder {
    @Override
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        // we have to do a bit of low-level processing here as we want to know the size upfront
        final int ploidy = decoder.decodeNumberOfElements(typeDescriptor);

        // only take the caching fast path for diploid biallelic sites with enough samples to amortize the cache
        if ( ENABLE_FASTPATH_GT && siteAlleles.size() == 2 && ploidy == 2 && gbs.size() >= MIN_SAMPLES_FOR_FASTPATH_GENOTYPES )
            fastBiallelicDiploidDecode(siteAlleles, decoder, typeDescriptor, gbs);
        else {
            generalDecode(siteAlleles, ploidy, decoder, typeDescriptor, gbs);
        }
    }

    /**
     * fast path for many samples with diploid genotypes
     *
     * The way this would work is simple.  Create a List<Allele> diploidGenotypes[] object
     * After decoding the offset, if that sample is diploid compute the
     * offset into the alleles vector which is simply offset = allele0 * nAlleles + allele1
     * if there's a value at diploidGenotypes[offset], use it, otherwise create the genotype
     * cache it and use that
     *
     * Some notes.  If there are nAlleles at the site, there are implicitly actually
     * n + 1 options per position, including the no-call (encoded offset 0); hence
     * the 3 * 3 cache size for a biallelic site.
     *
     * @param siteAlleles the (exactly two) alleles segregating at this site
     * @param decoder     the low-level decoder positioned at this sample block
     * @param typeDescriptor the BCF2 type descriptor byte for the GT values
     * @param gbs         one builder per sample, updated in place
     */
    @Requires("siteAlleles.size() == 2")
    @SuppressWarnings({"unchecked"})
    private final void fastBiallelicDiploidDecode(final List<Allele> siteAlleles,
                                                  final BCF2Decoder decoder,
                                                  final byte typeDescriptor,
                                                  final List<GenotypeBuilder> gbs) {
        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

        // 3 options per position (ref, alt, no-call), two positions => 9 cache slots
        final int nPossibleGenotypes = 3 * 3;
        final Object allGenotypes[] = new Object[nPossibleGenotypes];

        for ( final GenotypeBuilder gb : gbs ) {
            final int a1 = decoder.decodeInt(type);
            final int a2 = decoder.decodeInt(type);

            if ( a1 == type.getMissingBytes() ) {
                assert a2 == type.getMissingBytes();
                // no called sample GT = .
                gb.alleles(null);
            } else if ( a2 == type.getMissingBytes() ) {
                // haploid call: only the first allele is present
                gb.alleles(Arrays.asList(getAlleleFromEncoded(siteAlleles, a1)));
            } else {
                // downshift to remove phase
                final int offset = (a1 >> 1) * 3 + (a2 >> 1);
                assert offset < allGenotypes.length;

                // TODO -- how can I get rid of this cast?
                List<Allele> gt = (List<Allele>)allGenotypes[offset];
                if ( gt == null ) {
                    // first time we've seen this genotype; build and cache it
                    final Allele allele1 = getAlleleFromEncoded(siteAlleles, a1);
                    final Allele allele2 = getAlleleFromEncoded(siteAlleles, a2);
                    gt = Arrays.asList(allele1, allele2);
                    allGenotypes[offset] = gt;
                }

                gb.alleles(gt);
            }
        }
    }

    /**
     * General-purpose GT decoding: any ploidy, any number of alleles.
     *
     * @param siteAlleles alleles segregating at this site
     * @param ploidy      maximum number of encoded alleles per sample
     * @param decoder     the low-level decoder positioned at this sample block
     * @param typeDescriptor the BCF2 type descriptor byte for the GT values
     * @param gbs         one builder per sample, updated in place
     */
    private final void generalDecode(final List<Allele> siteAlleles,
                                     final int ploidy,
                                     final BCF2Decoder decoder,
                                     final byte typeDescriptor,
                                     final List<GenotypeBuilder> gbs) {
        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);

        // a single cache for the encoded genotypes, since we don't actually need this vector
        final int[] tmp = new int[ploidy];

        for ( final GenotypeBuilder gb : gbs ) {
            final int[] encoded = decoder.decodeIntArray(ploidy, type, tmp);
            if ( encoded == null )
                // no called sample GT = .
                gb.alleles(null);
            else {
                assert encoded.length > 0;

                // we have at least some alleles to decode
                final List<Allele> gt = new ArrayList<Allele>(encoded.length);

                // note that the auto-pruning of fields magically handles different
                // ploidy per sample at a site
                for ( final int encode : encoded )
                    gt.add(getAlleleFromEncoded(siteAlleles, encode));
                gb.alleles(gt);
            }
        }
    }

    /**
     * Map a single BCF2-encoded allele value back to an Allele.
     *
     * The encoded value's low bit (phase) is discarded by the downshift; an
     * offset of 0 is the no-call, otherwise offset-1 indexes into siteAlleles.
     */
    @Requires({"siteAlleles != null && ! siteAlleles.isEmpty()", "encode >= 0"})
    @Ensures("result != null")
    private final Allele getAlleleFromEncoded(final List<Allele> siteAlleles, final int encode) {
        final int offset = encode >> 1;
        return offset == 0 ? Allele.NO_CALL : siteAlleles.get(offset - 1);
    }
}
/** Decoder for the per-sample DP (read depth) FORMAT field. */
private class DPDecoder implements Decoder {
    @Override
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        final int nSamples = gbs.size();
        for ( int i = 0; i < nSamples; i++ ) {
            // -1 signals a missing depth to the builder
            final int depth = decoder.decodeInt(typeDescriptor, -1);
            gbs.get(i).DP(depth);
        }
    }
}
/** Decoder for the per-sample GQ (genotype quality) FORMAT field. */
private class GQDecoder implements Decoder {
    @Override
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        for ( int i = 0, n = gbs.size(); i < n; i++ ) {
            // decodeInt maps the BCF2 missing value onto -1, the builder's "no GQ" sentinel
            gbs.get(i).GQ(decoder.decodeInt(typeDescriptor, -1));
        }
    }
}
/** Decoder for the per-sample AD (allele depths) FORMAT field. */
private class ADDecoder implements Decoder {
    @Override
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        for ( final GenotypeBuilder gb : gbs ) {
            // one int per allele; a null array means the field was missing for this sample
            final int[] depths = decoder.decodeIntArray(typeDescriptor);
            gb.AD(depths);
        }
    }
}
/** Decoder for the per-sample PL (phred-scaled genotype likelihoods) FORMAT field. */
private class PLDecoder implements Decoder {
    @Override
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        final int nSamples = gbs.size();
        for ( int i = 0; i < nSamples; i++ ) {
            // one PL value per possible genotype; null means the field was missing
            gbs.get(i).PL(decoder.decodeIntArray(typeDescriptor));
        }
    }
}
/**
 * Fallback decoder for FORMAT fields without a specialized decoder: the decoded
 * value is stored verbatim as a generic genotype attribute.
 */
private class GenericDecoder implements Decoder {
    @Override
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        for ( final GenotypeBuilder gb : gbs ) {
            final Object decoded = decoder.decodeTypedValue(typeDescriptor);
            if ( decoded == null )
                continue; // missing value -- don't add an attribute at all
            gb.attribute(field, unwrapSingleton(decoded));
        }
    }

    /**
     * Collapse a one-element list to its single atomic value.
     *
     * todo -- I really hate this, and it suggests that the code isn't completely right:
     * a vector can be pruned down to a singleton, and the contract is that such a
     * value comes back as an atomic value, not a vector of size 1.
     */
    private Object unwrapSingleton(final Object value) {
        if ( value instanceof List && ((List)value).size() == 1 )
            return ((List)value).get(0);
        return value;
    }
}
/** Decoder for the per-sample FT (genotype filter) FORMAT field. */
private class FTDecoder implements Decoder {
    @Override
    @SuppressWarnings("unchecked")
    public void decode(final List<Allele> siteAlleles, final String field, final BCF2Decoder decoder, final byte typeDescriptor, final List<GenotypeBuilder> gbs) {
        for ( final GenotypeBuilder gb : gbs ) {
            final Object value = decoder.decodeTypedValue(typeDescriptor);
            if ( value == null )
                continue; // don't add missing values
            if ( value instanceof String )
                gb.filters(Collections.singletonList((String)value));
            else
                gb.filters((List<String>)value);
        }
    }
}
}

View File

@ -0,0 +1,103 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
* Lazy version of genotypes decoder for BCF2 genotypes
*
* @author Mark DePristo
* @since 5/12
*/
class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser {
    final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class);

    // the essential information for us to use to decode the genotypes data
    // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec
    // and its stored here again for code cleanliness
    private final BCF2Codec codec;      // source of the header, dictionary, and per-field decoders
    private final ArrayList<Allele> siteAlleles;  // alleles segregating at this site, used to decode GT
    private final int nSamples;         // number of samples encoded in the genotype block
    private final int nFields;          // number of FORMAT fields encoded per sample

    BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList<Allele> alleles, final int nSamples, final int nFields) {
        this.codec = codec;
        this.siteAlleles = alleles;
        this.nSamples = nSamples;
        this.nFields = nFields;
    }

    /**
     * Decode the raw genotype bytes captured by BCF2Codec into fully-realized Genotype objects.
     *
     * @param data a BCF2Codec.LazyData holding the undecoded genotype block bytes
     * @return the decoded genotypes plus the header's sample-name ordering information
     */
    @Override
    public LazyGenotypesContext.LazyData parse(final Object data) {
        if ( logger.isDebugEnabled() )
            logger.debug("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each");

        // load our byte[] data into the decoder
        final BCF2Decoder decoder = new BCF2Decoder(((BCF2Codec.LazyData)data).bytes);

        // TODO -- fast path for sites only

        // go ahead and decode everyone
        final List<String> samples = new ArrayList<String>(codec.getHeader().getGenotypeSamples());

        // sanity check: the record must carry exactly as many samples as the header declares
        if ( samples.size() != nSamples )
            throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " +
                    "different numbers of samples per record.  Saw " + samples.size() +
                    " samples in header but have a record with " + nSamples + " samples");

        // create and initialize the genotypes array
        final ArrayList<GenotypeBuilder> builders = new ArrayList<GenotypeBuilder>(nSamples);
        for ( int i = 0; i < nSamples; i++ ) {
            builders.add(new GenotypeBuilder(samples.get(i)));
        }

        // each field is laid out as: dictionary offset, type descriptor, then one value per sample;
        // the field decoder fills in that value for every builder in order
        for ( int i = 0; i < nFields; i++ ) {
            // get the field name
            final int offset = (Integer) decoder.decodeTypedValue();
            final String field = codec.getDictionaryString(offset);

            // the type of each element
            final byte typeDescriptor = decoder.readTypeDescriptor();

            final BCF2GenotypeFieldDecoders.Decoder fieldDecoder = codec.getGenotypeFieldDecoder(field);
            try {
                fieldDecoder.decode(siteAlleles, field, decoder, typeDescriptor, builders);
            } catch ( ClassCastException e ) {
                // a specialized decoder (e.g. DP expecting ints) hit a value of another type
                throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field
                        + " inconsistent with the value observed in the decoded value");
            }
        }

        // freeze the builders into immutable Genotype objects
        final ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nSamples);
        for ( final GenotypeBuilder gb : builders )
            genotypes.add(gb.make());

        return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset());
    }
}

View File

@ -1,143 +0,0 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
import org.broad.tribble.FeatureCodecHeader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
import java.io.*;
import java.util.*;
/**
* Testing BCF2
*
* @author Mark DePristo
* @since 2012
*/
public class BCF2TestWalker extends RodWalker<Integer, Integer> {
    /**
     * Variants from this VCF file are used by this tool as input.
     * The file must at least contain the standard VCF header lines, but
     * can be empty (i.e., no variants are contained in the file).
     */
    @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
    public RodBinding<VariantContext> variants;

    // if true, every written variant is also cached in memory so it can be printed
    // side-by-side with the decoded BCF2 record in onTraversalDone
    @Argument(doc="keep variants", required=false)
    public boolean keepVariants = false;

    @Argument(doc="quiet", required=false)
    public boolean quiet = false;

    @Argument(doc="dontIndexOnTheFly", required=false)
    public boolean dontIndexOnTheFly = false;

    @Output(doc="File to which results should be written",required=true)
    protected File bcfFile;

    private final List<VariantContext> vcs = new ArrayList<VariantContext>();
    protected VariantContextWriter writer;

    /**
     * Open the BCF2 writer on bcfFile and emit the (contig-updated) header from
     * the input VCF rod.
     */
    @Override
    public void initialize() {
        final Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), Collections.singletonList(variants));
        final VCFHeader header = VCFUtils.withUpdatedContigs(vcfRods.values().iterator().next(), getToolkit());

        try {
            final EnumSet<Options> options = EnumSet.of(Options.FORCE_BCF);
            if ( !dontIndexOnTheFly ) options.add(Options.INDEX_ON_THE_FLY);
            writer = VariantContextWriterFactory.create(bcfFile, new FileOutputStream(bcfFile), getToolkit().getMasterSequenceDictionary(), options);
            writer.writeHeader(header);
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotCreateOutputFile(bcfFile, e);
        }
    }

    /**
     * Write every variant at this locus to the BCF2 writer, optionally caching it
     * for the round-trip comparison at traversal end.
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        if ( tracker == null ) // RodWalkers can make funky map calls
            return 0;

        for ( VariantContext vc : tracker.getValues(variants, context.getLocation())) {
            writer.add(vc);
            if ( keepVariants ) vcs.add(vc);
        }

        return 1;
    }

    //
    // default reduce -- doesn't do anything at all
    //
    public Integer reduceInit() { return 0; }
    public Integer reduce(Integer counter, Integer sum) { return counter + sum; }

    /**
     * Close the writer, then re-read the BCF2 file we just wrote and (unless quiet)
     * print each decoded record, optionally next to the cached original VCF record.
     *
     * BUGFIX: the PositionalBufferedStreams are now closed in finally blocks --
     * previously the second stream was never closed at all, and the first leaked
     * if readHeader threw.
     */
    public void onTraversalDone(Integer sum) {
        try {
            writer.close();
            logger.info("Closed writer");

            final BCF2Codec codec = new BCF2Codec();

            // pass 1: read just the header so we know where the record data begins
            final FeatureCodecHeader header;
            final PositionalBufferedStream headerStream = new PositionalBufferedStream(new FileInputStream(bcfFile));
            try {
                header = codec.readHeader(headerStream);
            } finally {
                headerStream.close();
            }

            // pass 2: skip past the header and decode every record
            final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(bcfFile));
            try {
                pbs.skip(header.getHeaderEnd());

                final Iterator<VariantContext> it = vcs.iterator();
                while ( ! pbs.isDone() ) {
                    if ( keepVariants ) {
                        final VariantContext expected = it.next();
                        if ( ! quiet )
                            System.out.printf("vcf = %s %d %s%n", expected.getChr(), expected.getStart(), expected);
                    }

                    final VariantContext bcfRaw = codec.decode(pbs);
                    // re-source so it matches the rod name used on the VCF side
                    final VariantContext bcf = new VariantContextBuilder(bcfRaw).source("variant").make();

                    if ( ! quiet ) {
                        System.out.printf("bcf = %s %d %s%n", bcf.getChr(), bcf.getStart(), bcf.toString());
                        System.out.printf("--------------------------------------------------%n");
                    }
                }
            } finally {
                pbs.close();
            }
        } catch ( IOException e ) {
            throw new UserException.CouldNotCreateOutputFile(bcfFile, "bad user!");
        }
    }
}

View File

@ -24,18 +24,22 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Requires;
import java.util.EnumSet;
/**
* BCF2 types and information
* BCF2 types and associated information
*
* @author depristo
* @since 05/12
*/
public enum BCF2Type {
INT8(1, 1, BCF2Utils.INT8_MISSING_VALUE, -127, 127), // todo -- confirm range
INT16(2, 2, BCF2Utils.INT16_MISSING_VALUE, -32767, 32767),
INT32(3, 4, BCF2Utils.INT32_MISSING_VALUE, -2147483647, 2147483647),
FLOAT(5, 4, BCF2Utils.FLOAT_MISSING_VALUE),
CHAR(7);
INT8 (1, 1, 0xFFFFFF80, -127, 127), // todo -- confirm range
INT16(2, 2, 0xFFFF8000, -32767, 32767),
INT32(3, 4, 0x80000000, -2147483647, 2147483647),
FLOAT(5, 4, 0x7F800001),
CHAR (7, 1, 0x00000000);
private final int id;
private final Object missingJavaValue;
@ -60,11 +64,53 @@ public enum BCF2Type {
this.maxValue = maxValue;
}
/**
 * How many bytes are used to represent this type on disk?
 * @return the fixed on-disk size of one value of this type, in bytes
 */
public int getSizeInBytes() {
    return sizeInBytes;
}
/**
 * The ID according to the BCF2 specification
 * @return the numeric type ID used in BCF2 type descriptors
 */
public int getID() { return id; }
/**
 * Can we encode value v in this type, according to its declared range.
 *
 * Only makes sense for integer values
 *
 * @param v the value to test
 * @return true if v lies within [minValue, maxValue] for this type
 */
@Requires("INTEGERS.contains(this)")
public final boolean withinRange(final long v) { return v >= minValue && v <= maxValue; }
/**
 * Return the java object (aka null) that is used to represent a missing value for this
 * type in Java
 *
 * @return the Java-side missing-value sentinel for this type
 */
public Object getMissingJavaValue() { return missingJavaValue; }
/**
 * The bytes (encoded as an int) that are used to represent a missing value
 * for this type in BCF2
 *
 * @return the on-disk missing-value bit pattern, widened to an int
 */
public int getMissingBytes() { return missingBytes; }
/**
 * An enum set of the types that might represent Integer values
 */
public final static EnumSet<BCF2Type> INTEGERS = EnumSet.of(INT8, INT16, INT32);

// convenience test for membership in INTEGERS
public boolean isIntegerType() {
    return INTEGERS.contains(this);
}
}

View File

@ -24,6 +24,8 @@
package org.broadinstitute.sting.utils.codecs.bcf2;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
@ -33,9 +35,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.io.OutputStream;
import java.util.*;
/**
* Common utilities for working with BCF2 files
@ -45,7 +46,7 @@ import java.util.List;
* @author depristo
* @since 5/12
*/
public class BCF2Utils {
public final class BCF2Utils {
public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes();
public static final int MAX_ALLELES_IN_GENOTYPES = 127;
@ -53,12 +54,6 @@ public class BCF2Utils {
public static final int OVERFLOW_ELEMENT_MARKER = 15;
public static final int MAX_INLINE_ELEMENTS = 14;
// Note that these values are prefixed by FFFFFF for convenience
public static final int INT8_MISSING_VALUE = 0xFFFFFF80;
public static final int INT16_MISSING_VALUE = 0xFFFF8000;
public static final int INT32_MISSING_VALUE = 0x80000000;
public static final int FLOAT_MISSING_VALUE = 0x7F800001;
public final static BCF2Type[] INTEGER_TYPES_BY_SIZE = new BCF2Type[]{BCF2Type.INT8, BCF2Type.INT16, BCF2Type.INT32};
public final static BCF2Type[] ID_TO_ENUM;
@ -77,11 +72,17 @@ public class BCF2Utils {
* The dictionary is an ordered list of common VCF identifers (FILTER, INFO, and FORMAT)
* fields.
*
* Note that it's critical that the list be deduplicated and sorted in a consistent manner each time:
* the BCF2 offsets are encoded relative to this dictionary, so if it isn't constructed exactly
* the same way as in the header each time, decoding will be badly corrupted.
*
* @param header the VCFHeader from which to build the dictionary
* @return a non-null dictionary of elements, may be empty
*/
@Requires("header != null")
@Ensures({"result != null", "new HashSet(result).size() == result.size()"})
public final static ArrayList<String> makeDictionary(final VCFHeader header) {
final ArrayList<String> dict = new ArrayList<String>();
final Set<String> dict = new TreeSet<String>();
// set up the strings dictionary
dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field
@ -92,23 +93,27 @@ public class BCF2Utils {
}
}
return dict;
return new ArrayList<String>(dict);
}
/**
 * Pack an element count and a BCF2 type into a single type-descriptor byte:
 * the (possibly overflow-capped) size goes in the high nibble, the type ID
 * in the low nibble.
 */
@Requires({"nElements >= 0", "type != null"})
public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) {
    final int sizeNibble = Math.min(nElements, OVERFLOW_ELEMENT_MARKER) & 0x0F;
    return (byte)((sizeNibble << 4) | (type.getID() & 0x0F));
}
/** Extract the element count from the high nibble of a type-descriptor byte. */
@Ensures("result >= 0")
public final static int decodeSize(final byte typeDescriptor) {
    return (typeDescriptor & 0xF0) >> 4;
}
/** Extract the numeric type ID from the low nibble of a type-descriptor byte. */
@Ensures("result >= 0")
public final static int decodeTypeID(final byte typeDescriptor) {
    return 0x0F & typeDescriptor;
}
// map the low-nibble type ID of the descriptor onto its BCF2Type enum constant
@Ensures("result != null")
public final static BCF2Type decodeType(final byte typeDescriptor) {
    return ID_TO_ENUM[decodeTypeID(typeDescriptor)];
}
@ -117,6 +122,7 @@ public class BCF2Utils {
return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER;
}
@Requires("nElements >= 0")
public final static boolean willOverflow(final long nElements) {
return nElements > MAX_INLINE_ELEMENTS;
}
@ -128,6 +134,7 @@ public class BCF2Utils {
}
public final static byte readByte(final InputStream stream) {
// TODO -- shouldn't be capturing error here
try {
return (byte)(stream.read() & 0xFF);
} catch ( IOException e ) {
@ -135,6 +142,7 @@ public class BCF2Utils {
}
}
@Requires({"stream != null", "bytesForEachInt > 0"})
public final static int readInt(int bytesForEachInt, final InputStream stream) {
switch ( bytesForEachInt ) {
case 1: {
@ -161,10 +169,10 @@ public class BCF2Utils {
* @param strings size > 1 list of strings
* @return
*/
@Requires({"strings != null", "strings.size() > 1"})
@Ensures("result != null")
public static final String collapseStringList(final List<String> strings) {
assert strings.size() > 1;
StringBuilder b = new StringBuilder();
final StringBuilder b = new StringBuilder();
for ( final String s : strings ) {
assert s.indexOf(",") == -1; // no commas in individual strings
b.append(",").append(s);
@ -181,12 +189,15 @@ public class BCF2Utils {
* @param collapsed
* @return
*/
/**
 * Split a collapsed (leading-comma, comma-joined) string back into its
 * component strings.
 */
@Requires({"collapsed != null", "isCollapsedString(collapsed)"})
@Ensures("result != null")
public static final List<String> exploreStringList(final String collapsed) {
    assert isCollapsedString(collapsed);
    // drop the leading comma, then split on the remaining separators
    return Arrays.asList(collapsed.substring(1).split(","));
}
/**
 * Is s a collapsed string list, i.e. does it start with the ',' marker?
 *
 * BUGFIX: guard the empty string -- charAt(0) on "" threw
 * StringIndexOutOfBoundsException; an empty string is simply not collapsed.
 */
@Requires("s != null")
public static final boolean isCollapsedString(final String s) {
    return s.length() > 0 && s.charAt(0) == ',';
}
@ -200,6 +211,8 @@ public class BCF2Utils {
* @param vcfFile
* @return
*/
@Requires("vcfFile != null")
@Ensures("result != null")
public static final File shadowBCF(final File vcfFile) {
final String path = vcfFile.getAbsolutePath();
if ( path.contains(".vcf") )
@ -207,4 +220,109 @@ public class BCF2Utils {
else
return new File( path + ".bcf" );
}
/** Find the narrowest BCF2 integer type that can hold value. */
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int value) {
    // INTEGER_TYPES_BY_SIZE is ordered narrowest to widest; take the first that fits
    for ( int i = 0; i < INTEGER_TYPES_BY_SIZE.length; i++ ) {
        final BCF2Type candidate = INTEGER_TYPES_BY_SIZE[i];
        if ( candidate.withinRange(value) )
            return candidate;
    }
    throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
}
/** Find the narrowest BCF2 integer type wide enough for every value in the array. */
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final int[] values) {
    // literally a copy of the List variant below, but there's no general way to unify lists and arrays in java
    BCF2Type widest = BCF2Type.INT8;
    for ( final int value : values ) {
        final BCF2Type needed = determineIntegerType(value);
        if ( needed == BCF2Type.INT32 )
            return BCF2Type.INT32;       // fast path: nothing is wider
        else if ( needed == BCF2Type.INT16 )
            widest = BCF2Type.INT16;
        else if ( needed != BCF2Type.INT8 )
            throw new ReviewedStingException("Unexpected integer type " + needed );
    }
    return widest;
}
/**
 * Returns the maximum BCF2 integer size of t1 and t2
 *
 * For example, if t1 == INT8 and t2 == INT16 returns INT16
 *
 * @param t1 an integer BCF2 type
 * @param t2 an integer BCF2 type
 * @return the wider of t1 and t2
 */
@Requires({"BCF2Type.INTEGERS.contains(t1)","BCF2Type.INTEGERS.contains(t2)"})
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) {
    if ( t1 == BCF2Type.INT8 )
        return t2; // INT8 is never wider than anything
    if ( t1 == BCF2Type.INT16 )
        return t2 == BCF2Type.INT32 ? BCF2Type.INT32 : BCF2Type.INT16;
    if ( t1 == BCF2Type.INT32 )
        return BCF2Type.INT32;
    throw new ReviewedStingException("BUG: unexpected BCF2Type " + t1);
}
/** Find the narrowest BCF2 integer type wide enough for every value in the list. */
@Ensures("BCF2Type.INTEGERS.contains(result)")
public final static BCF2Type determineIntegerType(final List<Integer> values) {
    BCF2Type widest = BCF2Type.INT8;
    for ( final int value : values ) {
        final BCF2Type needed = determineIntegerType(value);
        if ( needed == BCF2Type.INT32 )
            return BCF2Type.INT32;       // fast path: nothing is wider
        else if ( needed == BCF2Type.INT16 )
            widest = BCF2Type.INT16;
        else if ( needed != BCF2Type.INT8 )
            throw new ReviewedStingException("Unexpected integer type " + needed );
    }
    return widest;
}
/**
 * Helper function that takes an object and returns a list representation
 * of it:
 *
 *   o == null => []
 *   o is a list => o
 *   else => [o]
 *
 * @param o the value to wrap
 * @return a list view of o, never null
 */
@SuppressWarnings("unchecked")
public final static List<Object> toList(final Object o) {
    // note: null is not an instance of List, so the null case is still reached
    if ( o instanceof List ) return (List<Object>)o;
    if ( o == null ) return Collections.emptyList();
    return Collections.singletonList(o);
}
/**
 * Write the low getSizeInBytes() bytes of value to encodeStream in big-endian
 * (most-significant byte first) order.
 *
 * @param value        the value to write, already masked to the type's width
 * @param type         determines how many bytes are emitted (1, 2, or 4)
 * @param encodeStream destination stream
 * @throws IOException if the stream write fails
 */
public final static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
    final int size = type.getSizeInBytes();
    if ( size != 1 && size != 2 && size != 4 )
        throw new ReviewedStingException("BUG: unexpected type size " + type);

    // big-endian: emit the most significant byte first
    for ( int i = size - 1; i >= 0; i-- ) {
        encodeStream.write((value >> (i * 8)) & 0xFF);
    }
}
}

View File

@ -28,6 +28,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
protected VCFHeaderVersion version = null;
// a mapping of the allele
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);
@ -48,7 +49,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
protected final String[] locParts = new String[6];
// for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();
protected HashMap<String,List<String>> filterHash = new HashMap<String,List<String>>();
// we store a name to give to each of the variant contexts we emit
protected String name = "Unknown";
@ -91,24 +92,12 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
*/
public abstract Object readHeader(LineReader reader);
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public abstract LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);
/**
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
* @param filterString the string to parse
* @return a set of the filters applied
*/
protected abstract Set<String> parseFilters(String filterString);
protected abstract List<String> parseFilters(String filterString);
/**
* create a VCF header from a set of header record lines
@ -117,6 +106,8 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
* @return a VCFHeader object
*/
protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) {
this.version = version;
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> sampleNames = new LinkedHashSet<String>();
int contigCounter = 0;
@ -320,7 +311,9 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
String ref = getCachedString(parts[3].toUpperCase());
String alts = getCachedString(parts[4].toUpperCase());
builder.log10PError(parseQual(parts[5]));
builder.filters(parseFilters(getCachedString(parts[6])));
final List<String> filters = parseFilters(getCachedString(parts[6]));
if ( filters != null ) builder.filters(new HashSet<String>(filters));
final Map<String, Object> attrs = parseInfo(parts[7]);
builder.attributes(attrs);
@ -719,4 +712,115 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext>
try { stream.close(); } catch ( IOException e ) {}
}
}
/**
 * create a genotype map
 *
 * Parses the FORMAT column and all per-sample genotype columns of a VCF line
 * into a LazyData holding one Genotype per sample.
 *
 * @param str the tab-separated FORMAT + sample-genotype portion of the line
 * @param alleles the list of alleles segregating at this site
 * @param chr chromosome of the record, used only for error reporting
 * @param pos position of the record, used only for error reporting
 * @return a mapping of sample name to genotype object
 */
public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
                                                       final List<Allele> alleles,
                                                       final String chr,
                                                       final int pos) {
    // lazily allocate the split buffer: one slot for FORMAT plus one per sample
    if (genotypeParts == null)
        genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];

    int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
    if ( nParts != genotypeParts.length )
        generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);

    ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);

    // get the format keys (genotypeParts[0] is the FORMAT column)
    int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);

    // cycle through the sample names
    Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();

    // clear out our allele mapping
    alleleMap.clear();

    // cycle through the genotype strings
    for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
        int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);

        final String sampleName = sampleNameIterator.next();
        final GenotypeBuilder gb = new GenotypeBuilder(sampleName);

        // check to see if the value list is longer than the key list, which is a problem
        if (nGTKeys < GTValueSplitSize)
            generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);

        // records where in the key list the GT key was found; -1 means no GT key
        int genotypeAlleleLocation = -1;
        if (nGTKeys >= 1) {
            gb.maxAttributes(nGTKeys - 1);

            for (int i = 0; i < nGTKeys; i++) {
                final String gtKey = genotypeKeyArray[i];
                // trailing keys with no corresponding value are treated as missing
                boolean missing = i >= GTValueSplitSize;

                // todo -- all of these on the fly parsing of the missing value should be static constants
                if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
                    genotypeAlleleLocation = i;
                } else if ( missing ) {
                    // if its truly missing (there no provided value) skip adding it to the attributes
                } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
                    final List<String> filters = parseFilters(getCachedString(GTValueArray[i]));
                    if ( filters != null ) gb.filters(filters);
                } else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) {
                    // don't add missing values to the map
                } else {
                    // well-known typed keys get dedicated builder slots; everything else is a generic attribute
                    if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
                        if ( GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) )
                            gb.noGQ();
                        else
                            gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i])));
                    } else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
                        gb.AD(decodeInts(GTValueArray[i]));
                    } else if (gtKey.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)) {
                        gb.PL(decodeInts(GTValueArray[i]));
                    } else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
                        gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs());
                    } else if (gtKey.equals(VCFConstants.DEPTH_KEY)) {
                        gb.DP(Integer.valueOf(GTValueArray[i]));
                    } else {
                        gb.attribute(gtKey, GTValueArray[i]);
                    }
                }
            }
        }

        // check to make sure we found a genotype field if our version is less than 4.1 file
        if ( version != VCFHeaderVersion.VCF4_1 && genotypeAlleleLocation == -1 )
            generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
        if ( genotypeAlleleLocation > 0 )
            generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");

        final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
        gb.alleles(GTalleles);
        // phasing is signalled by the '|' separator inside the GT value itself
        gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1);

        // add it to the list
        try {
            genotypes.add(gb.make());
        } catch (TribbleException e) {
            throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
        }
    }

    return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset());
}
// shared scratch buffer for splitting comma-separated integer lists (AD/PL).
// NOTE(review): this static buffer is shared across ALL codec instances, so
// decodeInts is not thread-safe -- confirm codecs are only ever used from a
// single thread before parsing VCFs concurrently.
private final static String[] INT_DECODE_ARRAY = new String[10000];

/**
 * Parse a comma-separated list of integers (e.g. an AD or PL value) into an int[].
 *
 * @param string the comma-separated values, e.g. "10,0,32"
 * @return the parsed values, one per comma-separated token
 */
private final static int[] decodeInts(final String string) {
    final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ',');
    final int[] values = new int[nValues];
    for ( int i = 0; i < nValues; i++ )
        values[i] = Integer.parseInt(INT_DECODE_ARRAY[i]); // parseInt avoids the Integer boxing of valueOf
    return values;
}
}

View File

@ -1,3 +1,27 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.TribbleException;
@ -78,24 +102,24 @@ public class VCF3Codec extends AbstractVCFCodec {
* @param filterString the string to parse
* @return a set of the filters applied
*/
protected Set<String> parseFilters(String filterString) {
protected List<String> parseFilters(String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
List<String> fFields = new ArrayList<String>();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
return fFields;
return new ArrayList<String>(fFields);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status");
// do we have the filter string cached?
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
return new ArrayList<String>(filterHash.get(filterString));
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
@ -108,93 +132,6 @@ public class VCF3Codec extends AbstractVCFCodec {
return fFields;
}
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @param chr chrom
* @param pos position
* @return a mapping of sample name to genotype object
*/
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_LOG10_PERROR;
Set<String> genotypeFilters = null;
Map<String, Object> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, Object>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) {
gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
} else {
gtAttributes.put(gtKey, new String(GTValueArray[i]));
}
}
}
// check to make sure we found a genotype field
if ( genotypeAlleleLocation < 0 )
generateException("Unable to find the GT field for the record; the GT field is required");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");
boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.add(new Genotype(sampleName,
parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
GTQual,
genotypeFilters,
gtAttributes,
phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
}
@Override
public boolean canDecode(final String potentialInput) {
return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER);

View File

@ -48,7 +48,6 @@ import java.util.*;
public class VCFCodec extends AbstractVCFCodec {
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
private VCFHeaderVersion version = null;
/**
* A VCF header the contains master info/filter/format records that we use to 'fill in'
@ -127,121 +126,33 @@ public class VCFCodec extends AbstractVCFCodec {
* @param filterString the string to parse
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/
protected Set<String> parseFilters(String filterString) {
return parseFilters(filterHash, lineNo, filterString);
}
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
protected List<String> parseFilters(String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
return Collections.emptySet();
return Collections.emptyList();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached?
if ( cache != null && cache.containsKey(filterString) )
return Collections.unmodifiableSet(cache.get(filterString));
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
List<String> fFields = new LinkedList<String>();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
fFields = fFields;
if ( cache != null ) cache.put(filterString, fFields);
filterHash.put(filterString, Collections.unmodifiableList(fFields));
return Collections.unmodifiableSet(fFields);
}
/**
* create a genotype map
*
* @param str the string
* @param alleles the list of alleles
* @return a mapping of sample name to genotype object
*/
public LazyGenotypesContext.LazyData createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
if (genotypeParts == null)
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
if ( nParts != genotypeParts.length )
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
// get the format keys
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
// cycle through the sample names
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
// clear out our allele mapping
alleleMap.clear();
// cycle through the genotype strings
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
double GTQual = VariantContext.NO_LOG10_PERROR;
Set<String> genotypeFilters = null;
Map<String, Object> gtAttributes = null;
String sampleName = sampleNameIterator.next();
// check to see if the value list is longer than the key list, which is a problem
if (nGTKeys < GTValueSplitSize)
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
int genotypeAlleleLocation = -1;
if (nGTKeys >= 1) {
gtAttributes = new HashMap<String, Object>(nGTKeys - 1);
for (int i = 0; i < nGTKeys; i++) {
final String gtKey = new String(genotypeKeyArray[i]);
boolean missing = i >= GTValueSplitSize;
// todo -- all of these on the fly parsing of the missing value should be static constants
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
genotypeAlleleLocation = i;
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
} else if ( missing ) {
// if its truly missing (there no provided value) skip adding it to the attributes
} else {
gtAttributes.put(gtKey, GTValueArray[i]);
}
}
}
// check to make sure we found a genotype field if we are a VCF4.0 file
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
if ( genotypeAlleleLocation > 0 )
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
// add it to the list
try {
genotypes.add(new Genotype(sampleName, GTalleles, GTQual, genotypeFilters, gtAttributes, phased));
} catch (TribbleException e) {
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
}
}
return new LazyGenotypesContext.LazyData(genotypes, header.sampleNamesInOrder, header.sampleNameToOffset);
return fFields;
}
@Override

View File

@ -56,8 +56,9 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF
public String getDescription() { return description; }
public VCFHeaderLineType getType() { return type; }
public VCFHeaderLineCount getCountType() { return countType; }
public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; }
public int getCount() {
if ( countType != VCFHeaderLineCount.INTEGER )
if ( ! isFixedCount() )
throw new ReviewedStingException("Asking for header line count when type is not an integer");
return count;
}

View File

@ -48,6 +48,7 @@ public final class VCFConstants {
public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods
public static final String GENOTYPE_POSTERIORS_KEY = "GP";
public static final String GENOTYPE_QUALITY_KEY = "GQ";
public static final String GENOTYPE_ALLELE_DEPTHS = "AD";
public static final String HAPMAP2_KEY = "H2";
public static final String HAPMAP3_KEY = "H3";
public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
@ -113,7 +114,5 @@ public final class VCFConstants {
public static final String EMPTY_GENOTYPE = "./.";
public static final int MAX_GENOTYPE_QUAL = 99;
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00";
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
}

View File

@ -24,12 +24,18 @@
package org.broadinstitute.sting.utils.codecs.vcf;
import org.apache.log4j.Logger;
import org.broad.tribble.util.ParsingUtils;
import java.util.*;
/**
* This class is really a POS. It allows duplicate entries in the metadata,
* stores header lines in lots of places, and all around f*cking sucks.
*
* todo -- clean this POS up
*
* @author aaron
* <p/>
* Class VCFHeader
@ -37,6 +43,7 @@ import java.util.*;
* A class representing the VCF header
*/
public class VCFHeader {
final protected static Logger logger = Logger.getLogger(VCFHeader.class);
// the mandatory header fields
public enum HEADER_FIELDS {
@ -68,8 +75,8 @@ public class VCFHeader {
private boolean samplesWereAlreadySorted = true;
// cache for efficient conversion of VCF -> VariantContext
protected ArrayList<String> sampleNamesInOrder = null;
protected HashMap<String, Integer> sampleNameToOffset = null;
private ArrayList<String> sampleNamesInOrder = null;
private HashMap<String, Integer> sampleNameToOffset = null;
private boolean writeEngineHeaders = true;
private boolean writeCommandLine = true;
@ -164,10 +171,10 @@ public class VCFHeader {
for ( VCFHeaderLine line : mMetaData ) {
if ( line instanceof VCFInfoHeaderLine ) {
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
mInfoMetaData.put(infoLine.getID(), infoLine);
addMetaDataMapBinding(mInfoMetaData, infoLine);
} else if ( line instanceof VCFFormatHeaderLine ) {
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
mFormatMetaData.put(formatLine.getID(), formatLine);
addMetaDataMapBinding(mFormatMetaData, formatLine);
} else if ( line instanceof VCFContigHeaderLine ) {
contigMetaData.add((VCFContigHeaderLine)line);
} else {
@ -176,6 +183,21 @@ public class VCFHeader {
}
}
/**
* Add line to map, issuing warnings about duplicates
*
* @param map
* @param line
* @param <T>
*/
private final <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, T line) {
final String key = line.getID();
if ( map.containsKey(key) )
logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" );
else
map.put(key, line);
}
/**
* get the header fields in order they're presented in the input file (which is now required to be
* the order presented in the spec).
@ -193,7 +215,7 @@ public class VCFHeader {
*/
public Set<VCFHeaderLine> getMetaData() {
Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString()));
lines.addAll(mMetaData);
return Collections.unmodifiableSet(lines);
}
@ -221,13 +243,17 @@ public class VCFHeader {
return mGenotypeSampleNames;
}
public int getNGenotypeSamples() {
return mGenotypeSampleNames.size();
}
/**
* do we have genotyping data?
*
* @return true if we have genotyping columns, false otherwise
*/
public boolean hasGenotypingData() {
return mGenotypeSampleNames.size() > 0;
return getNGenotypeSamples() > 0;
}
/**
@ -244,6 +270,14 @@ public class VCFHeader {
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
}
public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
return mInfoMetaData.values();
}
public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
return mFormatMetaData.values();
}
/**
* @param id the header key name
* @return the meta data line, or null if there is none
@ -299,4 +333,12 @@ public class VCFHeader {
public void setWriteCommandLine(boolean writeCommandLine) {
this.writeCommandLine = writeCommandLine;
}
public ArrayList<String> getSampleNamesInOrder() {
return sampleNamesInOrder;
}
public HashMap<String, Integer> getSampleNameToOffset() {
return sampleNameToOffset;
}
}

View File

@ -336,9 +336,15 @@ public class GATKSAMRecord extends BAMRecord {
* Clears all attributes except ReadGroup of the read.
*/
public GATKSAMRecord simplify () {
GATKSAMReadGroupRecord rg = getReadGroup();
this.clearAttributes();
setReadGroup(rg);
GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information
byte[] insQuals = (this.getAttribute(BQSR_BASE_INSERTION_QUALITIES) == null) ? null : getBaseInsertionQualities();
byte[] delQuals = (this.getAttribute(BQSR_BASE_DELETION_QUALITIES) == null) ? null : getBaseDeletionQualities();
this.clearAttributes(); // clear all attributes from the read
this.setReadGroup(rg); // restore read group
if (insQuals != null)
this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any
if (delQuals != null)
this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any
return this;
}

View File

@ -336,7 +336,7 @@ public class Allele implements Comparable<Allele> {
*
* @return the segregating bases
*/
public String getBaseString() { return new String(getBases()); }
public String getBaseString() { return isNoCall() ? NO_CALL_STRING : new String(getBases()); }
/**
* Return the printed representation of this allele.

View File

@ -226,12 +226,12 @@ final class CommonInfo {
return Boolean.valueOf((String)x); // throws an exception if this isn't a string
}
// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
// public String getAttributeAsString(String key) { return (String.valueOf(getExtendedAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
// public int getAttributeAsInt(String key) { Object x = getExtendedAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
// public double getAttributeAsDouble(String key) { Object x = getExtendedAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
// public boolean getAttributeAsBoolean(String key) { Object x = getExtendedAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
// public String getAttributeAsStringNoException(String key) { if (getExtendedAttribute(key) == null) return null; return getAttributeAsString(key); }
// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
}

View File

@ -0,0 +1,190 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import java.util.*;
/**
* This class encompasses all the basic information about a genotype. It is immutable.
*
* A genotype has several key fields
*
* -- a sample name, must be a non-null string
*
 * -- an ordered list of alleles, interpreted as the genotype of the sample,
* each allele for each chromosome given in order. If alleles = [a*, t]
* then the sample is a/t, with a (the reference from the *) the first
* chromosome and t on the second chromosome
*
 * -- an isPhased marker indicating whether the alleles are phased with respect to some global
* coordinate system. See VCF4.1 spec for a detailed discussion
*
* -- Inline, optimized ints and int[] values for:
 * -- GQ: the phred-scaled genotype quality, or -1 if it's missing
*
 * -- DP: the count of reads at this locus for this sample, or -1 if missing
*
* -- AD: an array of counts of reads at this locus, one for each Allele at the site.
* that is, for each allele in the surrounding VariantContext. Null if missing.
*
* -- PL: phred-scaled genotype likelihoods in standard VCF4.1 order for
* all combinations of the alleles in the surrounding VariantContext, given
* the ploidy of the sample (from the alleles vector). Null if missing.
*
* -- A general map from String keys to -> Object values for all other attributes in
* this genotype. Note that this map should not contain duplicate values for the
* standard bindings for GQ, DP, AD, and PL. Genotype filters can be put into
* this genotype, but it isn't respected by the GATK in analyses
*
* The only way to build a Genotype object is with a GenotypeBuilder, which permits values
 * to be set in any order, which means that GenotypeBuilder may at some point in the
* sets pass through invalid states that are not permitted in a fully formed immutable
* Genotype.
*
* Note this is a simplified, refactored Genotype object based on the original
* generic (and slow) implementation from the original VariantContext + Genotype
* codebase.
*
* @author Mark DePristo
* @since 05/12
*/
public final class FastGenotype extends Genotype {
    // The inline, optimized genotype payload.  None of these are defensively
    // copied: GenotypeBuilder (the sole producer) owns the handoff, and this
    // class is treated as immutable from construction onward.
    private final List<Allele> alleles;
    private final boolean isPhased;
    private final int GQ;
    private final int DP;
    private final int[] AD;
    private final int[] PL;
    private final Map<String, Object> extendedAttributes;

    /**
     * Construct a fully-specified fast genotype.
     *
     * Intentionally protected: the only supported way to build one of these is
     * through GenotypeBuilder, which is responsible for supplying values that
     * already satisfy the contracts below — no validation or copying happens here.
     *
     * @param sampleName         non-null sample identifier
     * @param alleles            non-null (possibly empty) ordered list of called alleles
     * @param isPhased           whether the alleles are phased w.r.t. the global phasing system
     * @param GQ                 phred-scaled genotype quality, or -1 when missing
     * @param DP                 read depth for this sample at this site, or -1 when missing
     * @param AD                 per-allele read counts, or null when missing
     * @param PL                 phred-scaled genotype likelihoods, or null when missing
     * @param extendedAttributes non-null map of all remaining genotype fields; must not
     *                           duplicate the primary keys (GT/GQ/DP/AD/PL) managed inline
     */
    @Requires({
            "sampleName != null",
            "alleles != null",
            "GQ >= -1",
            "DP >= -1",
            "validADorPLField(AD)",
            "validADorPLField(PL)",
            "extendedAttributes != null",
            "! hasForbiddenKey(extendedAttributes)"})
    protected FastGenotype(final String sampleName,
                           final List<Allele> alleles,
                           final boolean isPhased,
                           final int GQ,
                           final int DP,
                           final int[] AD,
                           final int[] PL,
                           final Map<String, Object> extendedAttributes) {
        super(sampleName);
        this.alleles = alleles;
        this.isPhased = isPhased;
        this.GQ = GQ;
        this.DP = DP;
        this.AD = AD;
        this.PL = PL;
        this.extendedAttributes = extendedAttributes;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // Implementations of the abstract accessors: each simply hands back the
    // value captured at construction time.
    //
    // ---------------------------------------------------------------------------------------------------------

    @Override
    public List<Allele> getAlleles() {
        return alleles;
    }

    @Override
    public Allele getAllele(int i) {
        return alleles.get(i);
    }

    @Override
    public boolean isPhased() {
        return isPhased;
    }

    @Override
    public int getGQ() {
        return GQ;
    }

    @Override
    public int getDP() {
        return DP;
    }

    @Override
    public int[] getAD() {
        return AD;
    }

    @Override
    public int[] getPL() {
        return PL;
    }

    /**
     * Genotype-level filters, if any were stored in the extended attributes.
     * Falls back to an empty list when the FT key is absent.
     */
    @Override
    public List<String> getFilters() {
        final Object filters = getExtendedAttribute(VCFConstants.GENOTYPE_FILTER_KEY, Collections.emptyList());
        return (List<String>) filters;
    }

    /** True when a genotype filter (FT) value was recorded for this sample. */
    @Override
    public boolean filtersWereApplied() {
        return hasExtendedAttribute(VCFConstants.GENOTYPE_FILTER_KEY);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // get routines for extended attributes
    //
    // ---------------------------------------------------------------------------------------------------------

    // NOTE(review): this hands out the internal map itself — callers must not
    // modify it; confirm no caller relies on mutating the returned map.
    public Map<String, Object> getExtendedAttributes() {
        return extendedAttributes;
    }

    /**
     * Is values an acceptable AD or PL payload?  Acceptable means either
     * missing (null) or containing no negative entries.  Referenced by the
     * @Requires contract on the constructor — do not rename.
     *
     * @param values the candidate array, possibly null
     * @return true if values is null or entirely non-negative
     */
    private final static boolean validADorPLField(final int[] values) {
        if ( values == null )
            return true;
        for ( final int v : values ) {
            if ( v < 0 )
                return false;
        }
        return true;
    }
}

View File

@ -1,6 +1,9 @@
package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -12,132 +15,187 @@ import java.util.*;
*
* @author Mark DePristo
*/
public class Genotype implements Comparable<Genotype> {
@Invariant({
"getAlleles() != null",
"getSampleName() != null",
"getPloidy() >= 0",
"! hasForbiddenKey(getExtendedAttributes())"})
public abstract class Genotype implements Comparable<Genotype> {
/**
* A list of genotype field keys corresponding to values we
* manage inline in the Genotype object. They must not appear in the
* extended attributes map
*/
public final static Collection<String> PRIMARY_KEYS = Arrays.asList(
VCFConstants.GENOTYPE_KEY,
VCFConstants.GENOTYPE_QUALITY_KEY,
VCFConstants.DEPTH_KEY,
VCFConstants.GENOTYPE_ALLELE_DEPTHS,
VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
public final static String PHASED_ALLELE_SEPARATOR = "|";
public final static String UNPHASED_ALLELE_SEPARATOR = "/";
protected CommonInfo commonInfo;
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
protected List<Allele> alleles = null; // new ArrayList<Allele>();
protected Type type = null;
private final String sampleName;
private GenotypeType type = null;
protected boolean isPhased = false;
public Genotype(String sampleName, List<Allele> alleles, double log10PError, Set<String> filters, Map<String, Object> attributes, boolean isPhased) {
this(sampleName, alleles, log10PError, filters, attributes, isPhased, null);
protected Genotype(final String sampleName) {
this.sampleName = sampleName;
}
public Genotype(String sampleName, List<Allele> alleles, double log10PError, Set<String> filters, Map<String, Object> attributes, boolean isPhased, double[] log10Likelihoods) {
if ( alleles == null || alleles.isEmpty() )
this.alleles = Collections.emptyList();
else
this.alleles = Collections.unmodifiableList(alleles);
commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes);
if ( log10Likelihoods != null )
commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods));
this.isPhased = isPhased;
validate();
protected Genotype(final String sampleName, final GenotypeType type) {
this.sampleName = sampleName;
this.type = type;
}
/**
* Creates a new Genotype for sampleName with genotype according to alleles.
* @param sampleName
* @param alleles
* @param log10PError the confidence in these alleles
* @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known
* @return the alleles for this genotype. Cannot be null. May be empty
*/
public Genotype(String sampleName, List<Allele> alleles, double log10PError, double[] log10Likelihoods) {
this(sampleName, alleles, log10PError, null, null, false, log10Likelihoods);
}
public Genotype(String sampleName, List<Allele> alleles, double log10PError) {
this(sampleName, alleles, log10PError, null, null, false);
}
public Genotype(String sampleName, List<Allele> alleles) {
this(sampleName, alleles, NO_LOG10_PERROR, null, null, false);
}
public Genotype(String sampleName, Genotype parent) {
this(sampleName, parent.getAlleles(), parent.getLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
}
// ---------------------------------------------------------------------------------------------------------
//
// Partial-cloning routines (because Genotype is immutable).
//
// ---------------------------------------------------------------------------------------------------------
public static Genotype modifyName(Genotype g, String name) {
return new Genotype(name, g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
}
public static Genotype modifyAttributes(Genotype g, Map<String, Object> attributes) {
return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased());
}
public static Genotype modifyAlleles(Genotype g, List<Allele> alleles) {
return new Genotype(g.getSampleName(), alleles, g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
}
@Ensures("result != null")
public abstract List<Allele> getAlleles();
/**
* @return the alleles for this genotype
* Returns how many times allele appears in this genotype object?
*
* @param allele
* @return a value >= 0 indicating how many times the allele occurred in this sample's genotype
*/
public List<Allele> getAlleles() {
return alleles;
}
public List<Allele> getAlleles(Allele allele) {
List<Allele> al = new ArrayList<Allele>();
for ( Allele a : alleles )
@Requires("allele != null")
@Ensures("result >= 0")
public int countAllele(final Allele allele) {
int c = 0;
for ( final Allele a : getAlleles() )
if ( a.equals(allele) )
al.add(a);
c++;
return Collections.unmodifiableList(al);
return c;
}
public Allele getAllele(int i) {
if ( getType() == Type.UNAVAILABLE )
throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
return alleles.get(i);
}
// True when the alleles are phased w.r.t. the global phasing system.
public boolean isPhased() { return isPhased; }
/**
* Get the ith allele in this genotype
*
* @param i the ith allele, must be < the ploidy, starting with 0
* @return the allele at position i, which cannot be null
*/
@Requires({"i >=0 && i < getPloidy()", "getType() != GenotypeType.UNAVAILABLE"})
@Ensures("result != null")
public abstract Allele getAllele(int i);
/**
* Are the alleles phased w.r.t. the global phasing system?
*
* @return true if yes
*/
public abstract boolean isPhased();
/**
* What is the ploidy of this sample?
*
* @return the ploidy of this genotype. 0 if the site is no-called.
*/
@Ensures("result >= 0")
public int getPloidy() {
return alleles.size();
return getAlleles().size();
}
public enum Type {
NO_CALL,
HOM_REF,
HET,
HOM_VAR,
UNAVAILABLE,
MIXED // no-call and call in the same genotype
/**
* @return the sequencing depth of this sample, or -1 if this value is missing
*/
@Ensures("result >= -1")
public abstract int getDP();
/**
* @return the count of reads, one for each allele in the surrounding Variant context,
* matching the corresponding allele, or null if this value is missing. MUST
* NOT BE MODIFIED!
*/
public abstract int[] getAD();
/**
* Returns the name associated with this sample.
*
* @return a non-null String
*/
@Ensures("result != null")
public String getSampleName() {
    // fixed at construction time; never null
    return sampleName;
}
public Type getType() {
/**
* Returns a phred-scaled quality score, or -1 if none is available
* @return
*/
@Ensures("result >= -1")
public abstract int getGQ();
/**
* Does the PL field have a value?
* @return true if there's a PL field value
*/
@Ensures("(result == false && getPL() == null) || (result == true && getPL() != null)")
public boolean hasPL() {
    // PL is "missing" exactly when the underlying array is null
    return getPL() != null;
}
/**
* Does the AD field have a value?
* @return true if there's a AD field value
*/
@Ensures("(result == false && getAD() == null) || (result == true && getAD() != null)")
public boolean hasAD() {
    // AD is "missing" exactly when the underlying array is null
    return getAD() != null;
}
/**
* Does the GQ field have a value?
* @return true if there's a GQ field value
*/
@Ensures("(result == false && getGQ() == -1) || (result == true && getGQ() >= 0)")
public boolean hasGQ() {
    // -1 is the missing sentinel for GQ
    return getGQ() != -1;
}
/**
* Does the DP field have a value?
* @return true if there's a DP field value
*/
@Ensures("(result == false && getDP() == -1) || (result == true && getDP() >= 0)")
public boolean hasDP() {
    // -1 is the missing sentinel for DP
    return getDP() != -1;
}
// ---------------------------------------------------------------------------------------------------------
//
// The type of this genotype
//
// ---------------------------------------------------------------------------------------------------------
/**
* @return the high-level type of this sample's genotype
*/
@Ensures({"type != null", "result != null"})
public GenotypeType getType() {
    // lazily computed on the first request and cached thereafter
    if ( type != null )
        return type;
    type = determineType();
    return type;
}
protected Type determineType() {
if ( alleles.size() == 0 )
return Type.UNAVAILABLE;
/**
* Internal code to determine the type of the genotype from the alleles vector
* @return the type
*/
@Requires("type == null") // we should never call if already calculated
protected GenotypeType determineType() {
// TODO -- this code is slow and could be optimized for the diploid case
final List<Allele> alleles = getAlleles();
if ( alleles.isEmpty() )
return GenotypeType.UNAVAILABLE;
boolean sawNoCall = false, sawMultipleAlleles = false;
Allele observedAllele = null;
for ( Allele allele : alleles ) {
for ( final Allele allele : alleles ) {
if ( allele.isNoCall() )
sawNoCall = true;
else if ( observedAllele == null )
@ -148,14 +206,14 @@ public class Genotype implements Comparable<Genotype> {
if ( sawNoCall ) {
if ( observedAllele == null )
return Type.NO_CALL;
return Type.MIXED;
return GenotypeType.NO_CALL;
return GenotypeType.MIXED;
}
if ( observedAllele == null )
throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null");
return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR;
return sawMultipleAlleles ? GenotypeType.HET : observedAllele.isReference() ? GenotypeType.HOM_REF : GenotypeType.HOM_VAR;
}
/**
@ -166,101 +224,108 @@ public class Genotype implements Comparable<Genotype> {
/**
* @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false.
*/
public boolean isHomRef() { return getType() == Type.HOM_REF; }
public boolean isHomRef() { return getType() == GenotypeType.HOM_REF; }
/**
* @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false.
*/
public boolean isHomVar() { return getType() == Type.HOM_VAR; }
public boolean isHomVar() { return getType() == GenotypeType.HOM_VAR; }
/**
* @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false.
*/
public boolean isHet() { return getType() == Type.HET; }
public boolean isHet() { return getType() == GenotypeType.HET; }
/**
* @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false.
*/
public boolean isNoCall() { return getType() == Type.NO_CALL; }
public boolean isNoCall() { return getType() == GenotypeType.NO_CALL; }
/**
* @return true if this genotype is comprised of any alleles that are not no-calls (even if some are).
*/
public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
public boolean isCalled() { return getType() != GenotypeType.NO_CALL && getType() != GenotypeType.UNAVAILABLE; }
/**
* @return true if this genotype is comprised of both calls and no-calls.
*/
public boolean isMixed() { return getType() == Type.MIXED; }
public boolean isMixed() { return getType() == GenotypeType.MIXED; }
/**
* @return true if the type of this genotype is set.
*/
public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
public boolean isAvailable() { return getType() != GenotypeType.UNAVAILABLE; }
// ------------------------------------------------------------------------------
//
// Useful methods for getting genotype likelihoods for a genotype object, if present
// methods for getting genotype likelihoods for a genotype object, if present
//
// ------------------------------------------------------------------------------
/**
* @return Returns true if this Genotype has PL field values
*/
@Ensures("(result && getLikelihoods() != null) || (! result && getLikelihoods() == null)")
public boolean hasLikelihoods() {
return (hasAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)) ||
(hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4));
return getPL() != null;
}
/**
* Convenience function that returns a string representation of the PL field of this
* genotype, or . if none is available.
*
* @return
* @return a non-null String representation for the PL of this sample
*/
@Ensures("result != null")
public String getLikelihoodsString() {
GenotypeLikelihoods gl = getLikelihoods();
return gl == null ? VCFConstants.MISSING_VALUE_v4 : gl.toString();
return hasLikelihoods() ? getLikelihoods().toString() : VCFConstants.MISSING_VALUE_v4;
}
/**
* Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing
* @return null or a GenotypesLikelihood object for this sample's PL field
*/
@Ensures("(hasLikelihoods() && result != null) || (! hasLikelihoods() && result == null)")
public GenotypeLikelihoods getLikelihoods() {
GenotypeLikelihoods x = getLikelihoods(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, true);
if ( x != null )
return x;
else {
x = getLikelihoods(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, false);
return x;
}
return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null;
}
/**
 * Fetches the likelihoods stored under key, decoding a String value either as a
 * PL field (phred-scaled ints) or as a legacy GL field (log10 doubles).
 *
 * @param key the attribute key to look up
 * @param asPL true to decode a String value as PL, false to decode it as GL
 * @return the GenotypeLikelihoods, or null when absent or of an unrecognized type
 */
private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) {
    final Object value = getAttribute(key);
    if ( value instanceof GenotypeLikelihoods )
        return (GenotypeLikelihoods) value;
    if ( value instanceof String ) {
        final String field = (String) value;
        return asPL ? GenotypeLikelihoods.fromPLField(field) : GenotypeLikelihoods.fromGLField(field);
    }
    return null;
}
/**
* Unsafe low-level accessor the PL field itself, may be null.
*
* @return a pointer to the underlying PL data. MUST NOT BE MODIFIED!
*/
public abstract int[] getPL();
/**
 * Sanity-checks this genotype: every allele must be non-null.
 *
 * Note that the VCF spec permits a mixture of called and no-call alleles,
 * so that combination is deliberately not rejected here. A genotype with
 * no alleles at all is also considered valid.
 *
 * @throws IllegalArgumentException when any allele is null
 */
public void validate() {
    for ( final Allele allele : alleles ) {
        if ( allele == null )
            throw new IllegalArgumentException("BUG: allele cannot be null in Genotype");
    }
}
// ---------------------------------------------------------------------------------------------------------
//
// Many different string representations
//
// ---------------------------------------------------------------------------------------------------------
/**
* Return a VCF-like string representation for the alleles of this genotype.
*
* Does not append the reference * marker on the alleles.
*
* @return a string representing the genotypes, or null if the type is unavailable.
*/
@Ensures("result != null || ! isAvailable()")
public String getGenotypeString() {
    // default behavior: do not append the reference * marker
    return getGenotypeString(true);
}
/**
* Return a VCF-like string representation for the alleles of this genotype.
*
* If ignoreRefState is true, will not append the reference * marker on the alleles.
*
* @return a string representing the genotypes, or null if the type is unavailable.
*/
@Ensures("result != null || ! isAvailable()")
public String getGenotypeString(boolean ignoreRefState) {
if ( alleles.size() == 0 )
return null;
if ( getPloidy() == 0 )
return "NA";
// Notes:
// 1. Make sure to use the appropriate separator depending on whether the genotype is phased
@ -270,29 +335,54 @@ public class Genotype implements Comparable<Genotype> {
ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles())));
}
private List<String> getAlleleStrings() {
List<String> al = new ArrayList<String>();
for ( Allele a : alleles )
/**
* Utility that returns a list of allele strings corresponding to the alleles in this sample
* @return
*/
protected List<String> getAlleleStrings() {
final List<String> al = new ArrayList<String>(getPloidy());
for ( Allele a : getAlleles() )
al.add(a.getBaseString());
return al;
}
public String toString() {
int Q = getPhredScaledQual();
return String.format("[%s %s Q%s %s]", getSampleName(), getGenotypeString(false),
Q == -1 ? "." : String.format("%2d",Q), sortedString(getAttributes()));
return String.format("[%s %s%s%s%s%s%s]",
getSampleName(),
getGenotypeString(false),
toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, getGQ()),
toStringIfExists(VCFConstants.DEPTH_KEY, getDP()),
toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, getAD()),
toStringIfExists(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, getPL()),
sortedString(getExtendedAttributes()));
}
public String toBriefString() {
return String.format("%s:Q%d", getGenotypeString(false), getPhredScaledQual());
return String.format("%s:Q%d", getGenotypeString(false), getGQ());
}
public boolean sameGenotype(Genotype other) {
// ---------------------------------------------------------------------------------------------------------
//
// Comparison operations
//
// ---------------------------------------------------------------------------------------------------------
/**
* comparable genotypes -> compareTo on the sample names
* @param genotype
* @return
*/
@Override
public int compareTo(final Genotype genotype) {
    // genotypes order lexicographically by their sample names
    final String thisName = getSampleName();
    final String thatName = genotype.getSampleName();
    return thisName.compareTo(thatName);
}
/** Same as {@link #sameGenotype(Genotype, boolean)} with phasing ignored. */
public boolean sameGenotype(final Genotype other) {
    return sameGenotype(other, true);
}
public boolean sameGenotype(Genotype other, boolean ignorePhase) {
public boolean sameGenotype(final Genotype other, boolean ignorePhase) {
if (getPloidy() != other.getPloidy())
return false; // gotta have the same number of allele to be equal
@ -308,6 +398,146 @@ public class Genotype implements Comparable<Genotype> {
return thisAlleles.equals(otherAlleles);
}
// ---------------------------------------------------------------------------------------------------------
//
// get routines for extended attributes
//
// ---------------------------------------------------------------------------------------------------------
/**
* Returns the extended attributes for this object
* @return is never null, but is often isEmpty()
*/
@Ensures({"result != null", "! hasForbiddenKey(result)"})
public abstract Map<String, Object> getExtendedAttributes();
/**
* Is key associated with a value (even a null one) in the extended attributes?
*
* Note this will not return true for the inline attributes DP, GQ, AD, or PL
*
* @param key a non-null string key to check for an association
* @return true if key has a value in the extendedAttributes
*/
@Requires({"key != null", "! isForbiddenKey(key)"})
public boolean hasExtendedAttribute(final String key) {
    // containsKey: a key explicitly mapped to null still counts as present
    return getExtendedAttributes().containsKey(key);
}
/**
* Get the extended attribute value associated with key, if possible
*
* @param key a non-null string key to fetch a value for
* @param defaultValue the value to return if key isn't in the extended attributes
* @return a value (potentially) null associated with key, or defaultValue if no association exists
*/
@Requires({"key != null", "! isForbiddenKey(key)"})
@Ensures("hasExtendedAttribute(key) || result == defaultValue")
public Object getExtendedAttribute(final String key, final Object defaultValue) {
    // distinguish "mapped to null" from "absent": only the latter yields defaultValue
    if ( ! hasExtendedAttribute(key) )
        return defaultValue;
    return getExtendedAttributes().get(key);
}
/**
 * Same as {@link #getExtendedAttribute(String, Object)} with a null default value.
 *
 * @param key a non-null extended-attribute key
 * @return the value associated with key, or null if there is no association
 */
public Object getExtendedAttribute(final String key) {
    // convenience overload: a missing key yields null
    return getExtendedAttribute(key, null);
}
/**
 * Returns the filters applied to this genotype.
 *
 * @return a non-null list of filter names; empty when no filters were applied
 */
@Ensures({"result != null", "filtersWereApplied() || result.isEmpty()"})
public abstract List<String> getFilters();
@Ensures({"result != getFilters().isEmpty()"})
public boolean isFiltered() {
    // filtered <=> at least one filter name has been recorded
    return ! getFilters().isEmpty();
}
@Ensures("result == true || getFilters().isEmpty()")
public abstract boolean filtersWereApplied();
// Legacy adaptors from the old log10-error API onto the GQ field; GQ == -1 means missing.
@Deprecated public boolean hasLog10PError() { return hasGQ(); }
@Deprecated public double getLog10PError() { return getGQ() / -10.0; }
@Deprecated public int getPhredScaledQual() { return getGQ(); }
/**
 * Legacy accessor: fetches an extended attribute as a String.
 *
 * @param key the extended-attribute key
 * @param defaultValue returned when the attribute is absent
 * @return the attribute rendered as a String
 */
@Deprecated
public String getAttributeAsString(String key, String defaultValue) {
    Object x = getExtendedAttribute(key);
    if ( x == null ) return defaultValue;
    if ( x instanceof String ) return (String)x;
    return String.valueOf(x); // non-String values fall back to their string form
}
/**
 * Legacy accessor: fetches an extended attribute as an int.
 *
 * @param key the extended-attribute key
 * @param defaultValue returned when the attribute is absent or is the VCF missing marker
 * @return the attribute as an int
 * @throws NumberFormatException / ClassCastException when the value is neither an Integer nor a parseable String
 */
@Deprecated
public int getAttributeAsInt(String key, int defaultValue) {
    Object x = getExtendedAttribute(key);
    // use equals(), not ==: a "." read from a file is a distinct String instance,
    // so reference comparison would silently miss the VCF missing marker
    if ( x == null || VCFConstants.MISSING_VALUE_v4.equals(x) ) return defaultValue;
    if ( x instanceof Integer ) return (Integer)x;
    return Integer.valueOf((String)x); // throws an exception if this isn't a string
}
/**
 * Legacy accessor: fetches an extended attribute as a double.
 *
 * @param key the extended-attribute key
 * @param defaultValue returned when the attribute is absent or is the VCF missing marker
 * @return the attribute as a double
 * @throws NumberFormatException / ClassCastException when the value is neither a Double nor a parseable String
 */
@Deprecated
public double getAttributeAsDouble(String key, double defaultValue) {
    Object x = getExtendedAttribute(key);
    // treat the VCF missing marker ('.') like an absent value, consistent with getAttributeAsInt
    if ( x == null || VCFConstants.MISSING_VALUE_v4.equals(x) ) return defaultValue;
    if ( x instanceof Double ) return (Double)x;
    return Double.valueOf((String)x); // throws an exception if this isn't a string
}
/**
 * A totally generic getter covering both the inline fields (GT, GQ, AD, PL, DP)
 * and the extended attributes. Can be very expensive. int[] values are
 * converted inline into List&lt;Integer&gt; for convenience.
 *
 * @param key the field or extended-attribute name
 * @return the value for key, or null when missing
 */
public Object getAnyAttribute(final String key) {
    if (key.equals(VCFConstants.GENOTYPE_KEY)) {
        return getAlleles();
    } else if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
        return getGQ();
    } else if (key.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
        return intArrayAsList(getAD());
    } else if (key.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)) {
        return intArrayAsList(getPL());
    } else if (key.equals(VCFConstants.DEPTH_KEY)) {
        return getDP();
    } else {
        return getExtendedAttribute(key);
    }
}

/**
 * Boxes an int[] into a List&lt;Integer&gt;. Note that Arrays.asList(int[])
 * would instead produce a single-element List&lt;int[]&gt; (varargs capture the
 * whole array as one Object), which is never what callers of
 * {@link #getAnyAttribute} want; it would also NPE on a missing (null) array.
 *
 * @param values the array to box, may be null
 * @return a List of the boxed values, or null when values is null
 */
private static List<Integer> intArrayAsList(final int[] values) {
    if ( values == null )
        return null;
    final List<Integer> boxed = new ArrayList<Integer>(values.length);
    for ( final int v : values )
        boxed.add(v);
    return boxed;
}
/**
 * Companion to {@link #getAnyAttribute}: reports whether key has a value,
 * checking the inline fields (GT, GQ, AD, PL, DP) first and falling back to
 * the extended attributes.
 *
 * @param key the field or extended-attribute name
 * @return true when this genotype carries a value for key
 */
public boolean hasAnyAttribute(final String key) {
    if (key.equals(VCFConstants.GENOTYPE_KEY))
        return isAvailable();
    if (key.equals(VCFConstants.GENOTYPE_QUALITY_KEY))
        return hasGQ();
    if (key.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS))
        return hasAD();
    if (key.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY))
        return hasPL();
    if (key.equals(VCFConstants.DEPTH_KEY))
        return hasDP();
    return hasExtendedAttribute(key);
}
// TODO -- add getAttributesAsX interface here
// ------------------------------------------------------------------------------
//
// private utilities
//
// ------------------------------------------------------------------------------
/**
* a utility method for generating sorted strings from a map key set.
* @param c the map
@ -315,63 +545,70 @@ public class Genotype implements Comparable<Genotype> {
* @param <V> the value type
* @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys
*/
private static <T extends Comparable<T>, V> String sortedString(Map<T, V> c) {
@Requires("c != null")
protected static <T extends Comparable<T>, V> String sortedString(Map<T, V> c) {
// NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS
List<T> t = new ArrayList<T>(c.keySet());
final List<T> t = new ArrayList<T>(c.keySet());
Collections.sort(t);
List<String> pairs = new ArrayList<String>();
for (T k : t) {
final List<String> pairs = new ArrayList<String>();
for (final T k : t) {
pairs.add(k + "=" + c.get(k));
}
return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}";
}
// ---------------------------------------------------------------------------------------------------------
//
// get routines to access context info fields
//
// ---------------------------------------------------------------------------------------------------------
public String getSampleName() { return commonInfo.getName(); }
public Set<String> getFilters() { return commonInfo.getFilters(); }
public Set<String> getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); }
public boolean isFiltered() { return commonInfo.isFiltered(); }
public boolean isNotFiltered() { return commonInfo.isNotFiltered(); }
public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); }
public boolean hasLog10PError() { return commonInfo.hasLog10PError(); }
public double getLog10PError() { return commonInfo.getLog10PError(); }
/**
* Returns a phred-scaled quality score, or -1 if none is available
* @return
* Returns a display name for field name with value v if this isn't -1. Otherwise returns ""
* @param name of the field ("AD")
* @param v the value of the field, or -1 if missing
* @return a non-null string for display if the field is not missing
*/
public int getPhredScaledQual() {
final int i = (int)Math.round(commonInfo.getPhredScaledQual());
return i < 0 ? -1 : i;
@Requires("name != null")
@Ensures("result != null")
protected final static String toStringIfExists(final String name, final int v) {
    // -1 is the missing sentinel: render nothing in that case
    if ( v == -1 )
        return "";
    return " " + name + " " + v;
}
public Map<String, Object> getAttributes() { return commonInfo.getAttributes(); }
public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); }
public Object getAttribute(String key) { return commonInfo.getAttribute(key); }
public Object getAttribute(String key, Object defaultValue) {
return commonInfo.getAttribute(key, defaultValue);
/**
* Returns a display name for field name with values vs if this isn't null. Otherwise returns ""
* @param name of the field ("AD")
* @param vs the value of the field, or null if missing
* @return a non-null string for display if the field is not missing
*/
@Requires("name != null")
@Ensures("result != null")
protected final static String toStringIfExists(final String name, final int[] vs) {
    // null means the field is missing: render nothing
    if ( vs == null )
        return "";
    final StringBuilder out = new StringBuilder(" ").append(name).append(" ");
    String sep = "";
    for ( final int v : vs ) {
        out.append(sep).append(v);
        sep = ",";
    }
    return out.toString();
}
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
/**
* comparable genotypes -> compareTo on the sample names
* @param genotype
* Does the attribute map have a mapping involving a forbidden key (i.e.,
* one that's managed inline by this Genotypes object?
*
* @param attributes the extended attributes key
* @return
*/
@Override
public int compareTo(final Genotype genotype) {
return getSampleName().compareTo(genotype.getSampleName());
protected final static boolean hasForbiddenKey(final Map<String, Object> attributes) {
    // equivalent to probing attributes for each primary key; true when any
    // inline-managed field (GQ/DP/AD/PL/GT) appears in the extended map
    for ( final String key : attributes.keySet() ) {
        if ( PRIMARY_KEYS.contains(key) )
            return true;
    }
    return false;
}
// True when key is one of the inline-managed (primary) genotype fields.
protected final static boolean isForbiddenKey(final String key) {
    return PRIMARY_KEYS.contains(key);
}
}

View File

@ -0,0 +1,417 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import java.util.*;
/**
* A builder class for genotypes
*
* Provides convenience setter methods for all of the Genotype field
* values. Setter methods can be used in any order, allowing you to
* pass through states that wouldn't be allowed in the highly regulated
* immutable Genotype class.
*
* All fields default to meaningful MISSING values.
*
* Call make() to actually create the corresponding Genotype object from
* this builder. Can be called multiple times to create independent copies,
* or with intervening sets to conveniently make similar Genotypes with
* slight modifications.
*
* @author Mark DePristo
* @since 06/12
*/
public final class GenotypeBuilder {
public static boolean MAKE_FAST_BY_DEFAULT = true;
private String sampleName = null;
private List<Allele> alleles = Collections.emptyList();
private boolean isPhased = false;
private int GQ = -1;
private int DP = -1;
private int[] AD = null;
private int[] PL = null;
private Map<String, Object> extendedAttributes = null;
private int initialAttributeMapSize = 5;
private boolean useFast = MAKE_FAST_BY_DEFAULT;
private final static Map<String, Object> NO_ATTRIBUTES =
Collections.unmodifiableMap(new HashMap<String, Object>(0));
// -----------------------------------------------------------------
//
// Factory methods
//
// -----------------------------------------------------------------
public final static Genotype create(final String sampleName, final List<Allele> alleles) {
return new GenotypeBuilder(sampleName, alleles).make();
}
public final static Genotype create(final String sampleName,
final List<Allele> alleles,
final Map<String, Object> attributes) {
return new GenotypeBuilder(sampleName, alleles).attributes(attributes).make();
}
protected final static Genotype create(final String sampleName,
final List<Allele> alleles,
final double[] gls) {
return new GenotypeBuilder(sampleName, alleles).PL(gls).make();
}
public final static Genotype create(final String sampleName,
final List<Allele> alleles,
final double log10Perror,
final Map<String, Object> attributes) {
return new GenotypeBuilder(sampleName, alleles)
.GQ(log10Perror == SlowGenotype.NO_LOG10_PERROR ? -1 : (int)(log10Perror * -10))
.attributes(attributes).make();
}
/**
* Create a empty builder. Both a sampleName and alleles must be provided
* before trying to make a Genotype from this builder.
*/
public GenotypeBuilder() {}
/**
* Create a builder using sampleName. Alleles must be provided
* before trying to make a Genotype from this builder.
* @param sampleName
*/
public GenotypeBuilder(final String sampleName) {
name(sampleName);
}
/**
* Make a builder using sampleName and alleles for starting values
* @param sampleName
* @param alleles
*/
public GenotypeBuilder(final String sampleName, final List<Allele> alleles) {
name(sampleName);
alleles(alleles);
}
/**
* Create a new builder starting with the values in Genotype g
* @param g
*/
public GenotypeBuilder(final Genotype g) {
copy(g);
}
/**
* Copy all of the values for this builder from Genotype g
* @param g
* @return
*/
public GenotypeBuilder copy(final Genotype g) {
name(g.getSampleName());
alleles(g.getAlleles());
phased(g.isPhased());
GQ(g.getGQ());
DP(g.getDP());
AD(g.getAD());
PL(g.getPL());
attributes(g.getExtendedAttributes());
return this;
}
/**
* Reset all of the builder attributes to their defaults. After this
* function you must provide sampleName and alleles before trying to
* make more Genotypes.
*/
public final void reset() {
sampleName = null;
alleles = null;
isPhased = false;
GQ = -1;
DP = -1;
AD = null;
PL = null;
extendedAttributes = null;
}
/**
* Create a new Genotype object using the values set in this builder.
*
* After creation the values in this builder can be modified and more Genotypes
* created, althrough the contents of array values like PL should never be modified
* inline as they are not copied for efficiency reasons.
*
* @return a newly minted Genotype object with values provided from this builder
*/
@Ensures({"result != null"})
public Genotype make() {
if ( useFast ) {
final Map<String, Object> ea = extendedAttributes == null ? NO_ATTRIBUTES : extendedAttributes;
return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, ea);
} else {
final Map<String, Object> attributes = new LinkedHashMap<String, Object>();
if ( extendedAttributes != null ) attributes.putAll(extendedAttributes);
final double log10PError = GQ == -1 ? SlowGenotype.NO_LOG10_PERROR : (GQ == 0 ? 0 : GQ / -10.0);
Set<String> filters = null;
if ( extendedAttributes != null && extendedAttributes.containsKey(VCFConstants.GENOTYPE_FILTER_KEY) )
{
final Object f = extendedAttributes.get(VCFConstants.GENOTYPE_FILTER_KEY);
if ( f != null )
filters = new LinkedHashSet<String>((List<String>)f);
attributes.remove(VCFConstants.GENOTYPE_FILTER_KEY);
}
if ( DP != -1 ) attributes.put(VCFConstants.DEPTH_KEY, DP);
if ( AD != null ) attributes.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, AD);
final double[] log10likelihoods = PL != null ? GenotypeLikelihoods.fromPLs(PL).getAsVector() : null;
return new SlowGenotype(sampleName, alleles, log10PError, filters, attributes, isPhased, log10likelihoods);
}
}
public GenotypeBuilder useFast(boolean useFast) {
this.useFast = useFast;
return this;
}
/**
* Set this genotype's name
* @param sampleName
* @return
*/
@Requires({"sampleName != null"})
@Ensures({"this.sampleName != null"})
public GenotypeBuilder name(final String sampleName) {
this.sampleName = sampleName;
return this;
}
/**
* Set this genotype's alleles
* @param alleles
* @return
*/
@Ensures({"this.alleles != null"})
public GenotypeBuilder alleles(final List<Allele> alleles) {
if ( alleles == null )
this.alleles = Collections.emptyList();
else
this.alleles = alleles;
return this;
}
/**
* Is this genotype phased?
* @param phased
* @return
*/
public GenotypeBuilder phased(final boolean phased) {
isPhased = phased;
return this;
}
@Requires({"GQ >= -1"})
@Ensures({"this.GQ == GQ", "this.GQ >= -1"})
public GenotypeBuilder GQ(final int GQ) {
this.GQ = GQ;
return this;
}
/**
* Adaptor interface from the pLog10Error system.
*
* Will be retired when
*
* @param pLog10Error
* @return
*/
@Deprecated
public GenotypeBuilder log10PError(final double pLog10Error) {
if ( pLog10Error == CommonInfo.NO_LOG10_PERROR )
return GQ(-1);
else
return GQ((int)Math.round(pLog10Error * -10));
}
/**
* This genotype has no GQ value
* @return
*/
public GenotypeBuilder noGQ() { GQ = -1; return this; }
/**
* This genotype has no AD value
* @return
*/
public GenotypeBuilder noAD() { AD = null; return this; }
/**
* This genotype has no DP value
* @return
*/
public GenotypeBuilder noDP() { DP = -1; return this; }
/**
* This genotype has no PL value
* @return
*/
public GenotypeBuilder noPL() { PL = null; return this; }
/**
* This genotype has this DP value
* @return
*/
@Requires({"DP >= -1"})
@Ensures({"this.DP == DP"})
public GenotypeBuilder DP(final int DP) {
this.DP = DP;
return this;
}
/**
* This genotype has this AD value
* @return
*/
@Requires({"AD == null || AD.length > 0"})
@Ensures({"this.AD == AD"})
public GenotypeBuilder AD(final int[] AD) {
this.AD = AD;
return this;
}
/**
* This genotype has this PL value, as int[]. FAST
* @return
*/
@Requires("PL == null || PL.length > 0")
@Ensures({"this.PL == PL"})
public GenotypeBuilder PL(final int[] PL) {
this.PL = PL;
return this;
}
/**
* This genotype has this PL value, converted from double[]. SLOW
* @return
*/
@Requires("PL == null || PL.length > 0")
@Ensures({"this.PL == PL"})
public GenotypeBuilder PL(final double[] GLs) {
this.PL = GenotypeLikelihoods.fromLog10Likelihoods(GLs).getAsPLs();
return this;
}
/**
* This genotype has these attributes.
*
* Cannot contain inline attributes (DP, AD, GQ, PL)
* @return
*/
@Requires("attributes != null")
@Ensures("attributes.isEmpty() || extendedAttributes != null")
public GenotypeBuilder attributes(final Map<String, Object> attributes) {
for ( Map.Entry<String, Object> pair : attributes.entrySet() )
attribute(pair.getKey(), pair.getValue());
return this;
}
/**
* Tells this builder to remove all extended attributes
*
* @return
*/
public GenotypeBuilder noAttributes() {
this.extendedAttributes = null;
return this;
}
/**
* This genotype has this attribute key / value pair.
*
* Cannot contain inline attributes (DP, AD, GQ, PL)
* @return
*/
@Requires({"key != null"})
@Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"})
public GenotypeBuilder attribute(final String key, final Object value) {
if ( extendedAttributes == null )
extendedAttributes = new HashMap<String, Object>(initialAttributeMapSize);
extendedAttributes.put(key, value);
return this;
}
/**
 * Tells this builder to make a Genotype object that has had filters applied,
 * which may be empty (passes) or have some value indicating the reasons
 * why it's been filtered.
 *
 * The list is stored as an extended attribute under the standard VCF
 * genotype-filter (FT) key.
 *
 * @param filters non-null list of filters. empty list => PASS
 * @return this builder
 */
@Requires("filters != null")
public GenotypeBuilder filters(final List<String> filters) {
    attribute(VCFConstants.GENOTYPE_FILTER_KEY, filters);
    return this;
}
/**
 * Varargs convenience version of {@link #filters(List)}.
 *
 * @param filters non-null filter names; zero arguments => PASS
 * @return this builder
 */
@Requires("filters != null")
public GenotypeBuilder filters(final String ... filters) {
    return filters(Arrays.asList(filters));
}
/**
 * Marks this genotype as unfiltered by dropping any previously set
 * genotype-filter (FT) attribute. A no-op when no attributes exist yet.
 *
 * @return this builder, for call chaining
 */
public GenotypeBuilder unfiltered() {
    final Map<String, Object> attrs = extendedAttributes;
    if ( attrs != null ) {
        attrs.remove(VCFConstants.GENOTYPE_FILTER_KEY);
    }
    return this;
}
/**
 * Tells this builder that the resulting genotype will carry at most this many
 * extended attributes; used only to presize the internal attribute map.
 *
 * @param i expected upper bound on the number of extended attributes
 * @return this builder, for call chaining
 */
public GenotypeBuilder maxAttributes(final int i) {
    initialAttributeMapSize = i;
    return this;
}
}

View File

@ -48,6 +48,7 @@ public class GenotypeLikelihoods {
return new GenotypeLikelihoods(PLs);
}
@Deprecated
public final static GenotypeLikelihoods fromGLField(String GLs) {
return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
}
@ -122,25 +123,25 @@ public class GenotypeLikelihoods {
//Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
//Returns null in case of missing likelihoods
public EnumMap<Genotype.Type,Double> getAsMap(boolean normalizeFromLog10){
public EnumMap<GenotypeType,Double> getAsMap(boolean normalizeFromLog10){
//Make sure that the log10likelihoods are set
double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
if(likelihoods == null)
return null;
EnumMap<Genotype.Type,Double> likelihoodsMap = new EnumMap<Genotype.Type, Double>(Genotype.Type.class);
likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
EnumMap<GenotypeType,Double> likelihoodsMap = new EnumMap<GenotypeType, Double>(GenotypeType.class);
likelihoodsMap.put(GenotypeType.HOM_REF,likelihoods[GenotypeType.HOM_REF.ordinal()-1]);
likelihoodsMap.put(GenotypeType.HET,likelihoods[GenotypeType.HET.ordinal()-1]);
likelihoodsMap.put(GenotypeType.HOM_VAR, likelihoods[GenotypeType.HOM_VAR.ordinal() - 1]);
return likelihoodsMap;
}
//Return the neg log10 Genotype Quality (GQ) for the given genotype
//Returns Double.NEGATIVE_INFINITY in case of missing genotype
public double getLog10GQ(Genotype.Type genotype){
return getQualFromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector());
public double getLog10GQ(GenotypeType genotype){
return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector());
}
public static double getQualFromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
if(likelihoods == null)
return Double.NEGATIVE_INFINITY;

View File

@ -0,0 +1,46 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext;
/**
 * Summary types for Genotype objects
 */
public enum GenotypeType {
    /** The sample is no-called (all alleles are NO_CALL) */
    NO_CALL,
    /** The sample is homozygous reference */
    HOM_REF,
    /** The sample is heterozygous, with at least one ref and at least one alt in any order */
    HET,
    /** All alleles are non-reference */
    HOM_VAR,
    /** There is no allele data available for this sample (alleles.isEmpty) */
    UNAVAILABLE,
    /** Some chromosomes are NO_CALL and others are called */
    MIXED // no-call and call in the same genotype
}

View File

@ -272,6 +272,17 @@ public class GenotypesContext implements List<Genotype> {
}
}
// ---------------------------------------------------------------------------
//
// Lazy methods
//
// ---------------------------------------------------------------------------
/**
 * Is this context a LazyGenotypesContext that still holds raw, unparsed
 * genotype data (i.e., whose genotypes have not yet been decoded)?
 *
 * @return true iff this is a lazy context and its unparsed data is present
 */
public boolean isLazyWithData() {
    if ( ! (this instanceof LazyGenotypesContext) )
        return false;
    final LazyGenotypesContext lazy = (LazyGenotypesContext) this;
    return lazy.getUnparsedGenotypeData() != null;
}
// ---------------------------------------------------------------------------
//
// Map methods

View File

@ -0,0 +1,189 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
 * This class encompasses all the basic information about a genotype.
 * Immutable in practice: all state is assigned during construction and the
 * allele list is wrapped unmodifiable.
 *
 * Legacy, attribute-map-backed Genotype implementation: PL, DP, AD, GQ, etc.
 * all live in a generic CommonInfo attribute map and are decoded on every
 * access -- hence "slow" relative to the inline-field implementation.
 *
 * @author Mark DePristo
 */
@Deprecated
public class SlowGenotype extends Genotype {
    // generic filter / error-probability / attribute container
    protected CommonInfo commonInfo;
    public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
    // unmodifiable list of called alleles; empty list when unavailable
    protected List<Allele> alleles = null;
    protected boolean isPhased = false;

    protected SlowGenotype(String sampleName, List<Allele> alleles, double log10PError, Set<String> filters, Map<String, Object> attributes, boolean isPhased, double[] log10Likelihoods) {
        super(sampleName);
        if ( alleles == null || alleles.isEmpty() )
            this.alleles = Collections.emptyList();
        else
            this.alleles = Collections.unmodifiableList(alleles);
        commonInfo = new CommonInfo(sampleName, log10PError, filters, attributes);
        // likelihoods are stashed in the attribute map under the PL key
        if ( log10Likelihoods != null )
            commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods));
        this.isPhased = isPhased;
        validate();
    }

    @Override public List<Allele> getAlleles() {
        return alleles;
    }

    @Override public Allele getAllele(int i) {
        if ( getType() == GenotypeType.UNAVAILABLE )
            throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
        return alleles.get(i);
    }

    @Override public boolean isPhased() { return isPhased; }

    //
    // Useful methods for getting genotype likelihoods for a genotype object, if present
    //
    @Override public boolean hasLikelihoods() {
        // likelihoods may be present under either the PL (phred) key or the
        // deprecated GL (log10) key; the VCF missing marker counts as absent
        return (commonInfo.hasAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) && !commonInfo.getAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)) ||
                (commonInfo.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !commonInfo.getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4));
    }

    @Override public GenotypeLikelihoods getLikelihoods() {
        // prefer the PL field, falling back to the deprecated GL field
        GenotypeLikelihoods x = getLikelihoods(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, true);
        if ( x != null )
            return x;
        else {
            x = getLikelihoods(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, false);
            return x;
        }
    }

    // Decodes the likelihoods attribute stored under key: String values are
    // parsed (as PL when asPL, otherwise as GL); already-decoded
    // GenotypeLikelihoods objects are returned as-is; anything else yields null.
    private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) {
        Object x = commonInfo.getAttribute(key);
        if ( x instanceof String ) {
            if ( asPL )
                return GenotypeLikelihoods.fromPLField((String)x);
            else
                return GenotypeLikelihoods.fromGLField((String)x);
        }
        else if ( x instanceof GenotypeLikelihoods ) return (GenotypeLikelihoods)x;
        else return null;
    }

    // construction-time sanity check: null alleles are never allowed
    private final void validate() {
        if ( alleles.size() == 0) return;

        for ( Allele allele : alleles ) {
            if ( allele == null )
                throw new IllegalArgumentException("BUG: allele cannot be null in Genotype");
        }
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // get routines to access context info fields
    //
    // ---------------------------------------------------------------------------------------------------------
    @Override public List<String> getFilters() { return new ArrayList<String>(commonInfo.getFilters()); }
    @Override public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); }
    @Override public boolean hasLog10PError() { return commonInfo.hasLog10PError(); }
    @Override public double getLog10PError() { return commonInfo.getLog10PError(); }

    @Override
    public boolean hasExtendedAttribute(String key) { return commonInfo.hasAttribute(key); }
    @Override
    public Object getExtendedAttribute(String key) { return commonInfo.getAttribute(key); }
    @Override
    public Object getExtendedAttribute(String key, Object defaultValue) {
        return commonInfo.getAttribute(key, defaultValue);
    }

    @Override
    public int[] getPL() {
        return hasPL() ? getLikelihoods().getAsPLs() : null;
    }

    @Override
    public boolean hasPL() {
        return hasLikelihoods();
    }

    @Override
    public int getDP() {
        // -1 is the conventional "no DP available" sentinel
        return commonInfo.getAttributeAsInt(VCFConstants.DEPTH_KEY, -1);
    }

    @Override
    public boolean hasDP() {
        return commonInfo.hasAttribute(VCFConstants.DEPTH_KEY);
    }

    @Override
    public int[] getAD() {
        if ( hasAD() ) {
            return (int[])commonInfo.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        } else
            return null;
    }

    @Override
    public boolean hasAD() {
        return commonInfo.hasAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
    }

    @Override
    public int getGQ() {
        // GQ is derived from the phred-scaled genotype error probability;
        // -1 is the "no GQ available" sentinel
        if ( commonInfo.hasLog10PError() )
            return (int)Math.round(commonInfo.getPhredScaledQual());
        else
            return -1;
    }

    @Override
    public boolean hasGQ() {
        return hasLog10PError();
    }

    @Override
    public Map<String, Object> getExtendedAttributes() {
        // everything in the attribute map except the inline keys (GT/GQ/DP/AD/PL)
        final Map<String, Object> ea = new LinkedHashMap<String, Object>(commonInfo.getAttributes());
        for ( final String primary : FastGenotype.PRIMARY_KEYS )
            ea.remove(primary);
        return ea;
    }
}

View File

@ -327,19 +327,36 @@ public class VariantContext implements Feature { // to enable tribble integratio
//
// ---------------------------------------------------------------------------------------------------------
public VariantContext subContextFromSamples(Set<String> sampleNames, Collection<Allele> alleles) {
VariantContextBuilder builder = new VariantContextBuilder(this);
return builder.genotypes(genotypes.subsetToSamples(sampleNames)).alleles(alleles).make();
}
/**
* This method subsets down to a set of samples.
*
* At the same time returns the alleles to just those in use by the samples,
* if rederiveAllelesFromGenotypes is true, otherwise the full set of alleles
* in this VC is returned as the set of alleles in the subContext, even if
* some of those alleles aren't in the samples
*
* @param sampleNames
* @return
*/
public VariantContext subContextFromSamples(Set<String> sampleNames, final boolean rederiveAllelesFromGenotypes ) {
if ( ! rederiveAllelesFromGenotypes && sampleNames.containsAll(getSampleNames()) ) {
return this; // fast path when you don't have any work to do
} else {
VariantContextBuilder builder = new VariantContextBuilder(this);
GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames);
public VariantContext subContextFromSamples(Set<String> sampleNames) {
VariantContextBuilder builder = new VariantContextBuilder(this);
GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames);
return builder.genotypes(newGenotypes).alleles(allelesOfGenotypes(newGenotypes)).make();
if ( rederiveAllelesFromGenotypes )
builder.alleles(allelesOfGenotypes(newGenotypes));
else {
builder.alleles(alleles);
}
return builder.genotypes(newGenotypes).make();
}
}
public VariantContext subContextFromSample(String sampleName) {
return subContextFromSamples(Collections.singleton(sampleName));
return subContextFromSamples(Collections.singleton(sampleName), true);
}
/**
@ -849,7 +866,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
* @return chromosome count
*/
public int getCalledChrCount() {
return getCalledChrCount(new HashSet<String>(0));
final Set<String> noSamples = Collections.emptySet();
return getCalledChrCount(noSamples);
}
/**
@ -892,7 +910,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds);
for ( final Genotype g : genotypes ) {
n += g.getAlleles(a).size();
n += g.countAllele(a);
}
return n;
@ -922,7 +940,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
private void calculateGenotypeCounts() {
if ( genotypeCounts == null ) {
genotypeCounts = new int[Genotype.Type.values().length];
genotypeCounts = new int[GenotypeType.values().length];
for ( final Genotype g : getGenotypes() ) {
genotypeCounts[g.getType().ordinal()]++;
@ -937,7 +955,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
*/
public int getNoCallCount() {
calculateGenotypeCounts();
return genotypeCounts[Genotype.Type.NO_CALL.ordinal()];
return genotypeCounts[GenotypeType.NO_CALL.ordinal()];
}
/**
@ -947,7 +965,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
*/
public int getHomRefCount() {
calculateGenotypeCounts();
return genotypeCounts[Genotype.Type.HOM_REF.ordinal()];
return genotypeCounts[GenotypeType.HOM_REF.ordinal()];
}
/**
@ -957,7 +975,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
*/
public int getHetCount() {
calculateGenotypeCounts();
return genotypeCounts[Genotype.Type.HET.ordinal()];
return genotypeCounts[GenotypeType.HET.ordinal()];
}
/**
@ -967,7 +985,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
*/
public int getHomVarCount() {
calculateGenotypeCounts();
return genotypeCounts[Genotype.Type.HOM_VAR.ordinal()];
return genotypeCounts[GenotypeType.HOM_VAR.ordinal()];
}
/**
@ -977,7 +995,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
*/
public int getMixedCount() {
calculateGenotypeCounts();
return genotypeCounts[Genotype.Type.MIXED.ordinal()];
return genotypeCounts[GenotypeType.MIXED.ordinal()];
}
// ---------------------------------------------------------------------------------------------------------
@ -1412,8 +1430,8 @@ public class VariantContext implements Feature { // to enable tribble integratio
}
private final Genotype fullyDecodeGenotypes(final Genotype g, final VCFHeader header) {
final Map<String, Object> map = fullyDecodeAttributes(g.getAttributes(), header);
return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.getFilters(), map, g.isPhased());
final Map<String, Object> map = fullyDecodeAttributes(g.getExtendedAttributes(), header);
return new GenotypeBuilder(g).attributes(map).make();
}
// ---------------------------------------------------------------------------------------------------------

View File

@ -99,7 +99,7 @@ public class VariantContextUtils {
// if there are alternate alleles, record the relevant tags
if ( vc.getAlternateAlleles().size() > 0 ) {
ArrayList<String> alleleFreqs = new ArrayList<String>();
ArrayList<Double> alleleFreqs = new ArrayList<Double>();
ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
ArrayList<Integer> foundersAlleleCounts = new ArrayList<Integer>();
double totalFoundersChromosomes = (double)vc.getCalledChrCount(founderIds);
@ -109,10 +109,9 @@ public class VariantContextUtils {
alleleCounts.add(vc.getCalledChrCount(allele));
foundersAlleleCounts.add(foundersAltChromosomes);
if ( AN == 0 ) {
alleleFreqs.add("0.0");
alleleFreqs.add(0.0);
} else {
// todo -- this is a performance problem
final String freq = String.format(makePrecisionFormatStringFromDenominatorValue(totalFoundersChromosomes), ((double)foundersAltChromosomes / totalFoundersChromosomes));
final Double freq = (double)foundersAltChromosomes / totalFoundersChromosomes;
alleleFreqs.add(freq);
}
}
@ -155,22 +154,11 @@ public class VariantContextUtils {
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
}
public static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
int precision = 1;
while ( maxValue > 1 ) {
precision++;
maxValue /= 10.0;
}
return "%." + precision + "f";
}
public static Genotype removePLs(Genotype g) {
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY);
return new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased());
if ( g.hasLikelihoods() )
return new GenotypeBuilder(g).noPL().make();
else
return g;
}
/**
@ -257,8 +245,7 @@ public class VariantContextUtils {
newGenotypeAlleles.add(Allele.NO_CALL);
}
}
genotypes.add(new Genotype(g.getSampleName(), newGenotypeAlleles, g.getLog10PError(),
g.getFilters(), g.getAttributes(), g.isPhased()));
genotypes.add(new GenotypeBuilder(g).alleles(newGenotypeAlleles).make());
}
@ -475,9 +462,10 @@ public class VariantContextUtils {
// Genotypes
final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples());
for ( final Genotype g : vc.getGenotypes() ) {
Map<String, Object> genotypeAttributes = subsetAttributes(g.commonInfo, keysToPreserve);
genotypes.add(new Genotype(g.getSampleName(), g.getAlleles(), g.getLog10PError(), g.getFilters(),
genotypeAttributes, g.isPhased()));
final GenotypeBuilder gb = new GenotypeBuilder(g);
// remove AD, DP, PL, and all extended attributes, keeping just GT and GQ
gb.noAD().noDP().noPL().noAttributes();
genotypes.add(gb.make());
}
return builder.genotypes(genotypes).attributes(attributes);
@ -833,7 +821,7 @@ public class VariantContextUtils {
else
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make());
}
@ -878,7 +866,7 @@ public class VariantContextUtils {
else
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.add(Genotype.modifyAlleles(genotype, trimmedAlleles));
genotypes.add(new GenotypeBuilder(genotype).alleles(trimmedAlleles).make());
}
return new VariantContextBuilder(inputVC).stop(inputVC.getStart() + alleles.get(0).length() + (inputVC.isMixed() ? -1 : 0)).alleles(alleles).genotypes(genotypes).make();
@ -1073,7 +1061,7 @@ public class VariantContextUtils {
if ( uniqifySamples || alleleMapping.needsRemapping() ) {
final List<Allele> alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles();
newG = new Genotype(name, alleles, g.getLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased());
newG = new GenotypeBuilder(g).name(name).alleles(alleles).make();
}
mergedGenotypes.add(newG);
@ -1113,7 +1101,7 @@ public class VariantContextUtils {
newAllele = Allele.NO_CALL;
newAlleles.add(newAllele);
}
newGenotypes.add(Genotype.modifyAlleles(genotype, newAlleles));
newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make());
}
return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make();
@ -1126,11 +1114,11 @@ public class VariantContextUtils {
GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples());
for ( final Genotype genotype : vc.getGenotypes() ) {
Map<String, Object> attrs = new HashMap<String, Object>();
for ( Map.Entry<String, Object> attr : genotype.getAttributes().entrySet() ) {
for ( Map.Entry<String, Object> attr : genotype.getExtendedAttributes().entrySet() ) {
if ( allowedAttributes.contains(attr.getKey()) )
attrs.put(attr.getKey(), attr.getValue());
}
newGenotypes.add(Genotype.modifyAttributes(genotype, attrs));
newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make());
}
return new VariantContextBuilder(vc).genotypes(newGenotypes).make();
@ -1247,7 +1235,7 @@ public class VariantContextUtils {
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
if ( !g.hasLikelihoods() ) {
newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
continue;
}
@ -1268,51 +1256,35 @@ public class VariantContextUtils {
// if there is no mass on the (new) likelihoods, then just no-call the sample
if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
}
else {
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
final GenotypeBuilder gb = new GenotypeBuilder(g);
if ( numNewAltAlleles == 0 )
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
gb.noPL();
else
attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods));
gb.PL(newLikelihoods);
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL )
newGTs.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, attrs, false));
else
newGTs.add(assignDiploidGenotype(g, newLikelihoods, allelesToUse, attrs));
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
gb.alleles(NO_CALL_ALLELES);
}
else {
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2)));
if ( numNewAltAlleles != 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
}
newGTs.add(gb.make());
}
}
return newGTs;
}
/**
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
*
* @param originalGT the original genotype
* @param newLikelihoods the PL array
* @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
* @param attrs the annotations to use when creating the genotype
*
* @return genotype
*/
private static Genotype assignDiploidGenotype(final Genotype originalGT, final double[] newLikelihoods, final List<Allele> allelesToUse, final Map<String, Object> attrs) {
final int numNewAltAlleles = allelesToUse.size() - 1;
// find the genotype with maximum likelihoods
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(allelesToUse.get(alleles.alleleIndex1));
myAlleles.add(allelesToUse.get(alleles.alleleIndex2));
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
}
/**
* Returns true iff VC is an non-complex indel where every allele represents an expansion or
* contraction of a series of identical bases in the reference.

View File

@ -192,8 +192,8 @@ class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
infoMap.put("isHomRef", g.isHomRef() ? "1" : "0");
infoMap.put("isHet", g.isHet() ? "1" : "0");
infoMap.put("isHomVar", g.isHomVar() ? "1" : "0");
infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, g.getPhredScaledQual());
for ( Map.Entry<String, Object> e : g.getAttributes().entrySet() ) {
infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, g.getGQ());
for ( Map.Entry<String, Object> e : g.getExtendedAttributes().entrySet() ) {
if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) )
infoMap.put(e.getKey(), e.getValue());
}

View File

@ -22,22 +22,25 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.bcf2;
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.*;
/**
* Simple BCF2 encoder
* See #BCFWriter for documentation on this classes role in encoding BCF2 files
*
* @author depristo
* @since 5/12
* @author Mark DePristo
* @since 06/12
*/
public class BCF2Encoder {
public final class BCF2Encoder {
// TODO -- increase default size?
public static final int WRITE_BUFFER_INITIAL_SIZE = 16384;
private ByteArrayOutputStream encodeStream = new ByteArrayOutputStream(WRITE_BUFFER_INITIAL_SIZE);
@ -48,10 +51,7 @@ public class BCF2Encoder {
//
// --------------------------------------------------------------------------------
public int getRecordSizeInBytes() {
return encodeStream.size();
}
@Ensures("result != null")
public byte[] getRecordBytes() {
byte[] bytes = encodeStream.toByteArray();
encodeStream.reset();
@ -64,18 +64,67 @@ public class BCF2Encoder {
//
// --------------------------------------------------------------------------------
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedMissing(final BCF2Type type) throws IOException {
encodeTyped(Collections.emptyList(), type);
encodeType(0, type);
}
// todo -- should be specialized for each object type for efficiency
public final void encodeTyped(final Object v, final BCF2Type type) throws IOException {
encodeTyped(Collections.singletonList(v), type);
/**
 * Writes a single typed value (type descriptor followed by payload) to the
 * encode stream. A null value is written as the typed missing value for
 * {@code type}.
 *
 * @param value a boxed Integer, Double, or String matching {@code type}; may be null
 * @param type  the BCF2 primitive type to encode as
 * @throws IOException on stream failure
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTyped(final Object value, final BCF2Type type) throws IOException {
    if ( value == null )
        encodeTypedMissing(type);
    else {
        switch ( type ) {
            case INT8:
            case INT16:
            case INT32: encodeTypedInt((Integer)value, type); break;
            case FLOAT: encodeTypedFloat((Double) value); break;
            case CHAR: encodeTypedString((String) value); break;
            default: throw new ReviewedStingException("Illegal type encountered " + type);
        }
    }
}
/**
 * Writes a typed int, first choosing the smallest BCF2 integer type
 * (INT8/INT16/INT32) that can represent v.
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedInt(final int v) throws IOException {
    final BCF2Type type = BCF2Utils.determineIntegerType(v);
    encodeTypedInt(v, type);
}
/**
 * Writes a typed int using the caller-specified integer type: a one-element
 * type descriptor followed by the value's raw bytes.
 */
@Requires("type.isIntegerType()")
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedInt(final int v, final BCF2Type type) throws IOException {
    encodeType(1, type);
    encodeRawInt(v, type);
}
/**
 * Writes a typed string as CHAR bytes. Note: s.getBytes() uses the platform
 * default charset.
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedString(final String s) throws IOException {
    encodeTypedString(s.getBytes());
}
/**
 * Writes a typed string from raw bytes: a CHAR type descriptor sized to the
 * array, followed by each byte in order. A null array is written as a
 * zero-length CHAR.
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedString(final byte[] s) throws IOException {
    if ( s == null )
        encodeType(0, BCF2Type.CHAR);
    else {
        encodeType(s.length, BCF2Type.CHAR);
        for ( int i = 0; i < s.length; i++ ) {
            encodeRawChar(s[i]);
        }
    }
}
/**
 * Writes a typed single-element FLOAT. The double is narrowed to a 32-bit
 * float by the raw encoder.
 */
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTypedFloat(final double d) throws IOException {
    encodeType(1, BCF2Type.FLOAT);
    encodeRawFloat(d);
}
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeTyped(List<? extends Object> v, final BCF2Type type) throws IOException {
if ( type == BCF2Type.CHAR && v.size() != 0 ) {
final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List<String>)v) : (String)v.get(0);
final String s = v.size() > 1 ? BCF2Utils.collapseStringList((List<String>) v) : (String)v.get(0);
v = stringToBytes(s);
}
@ -103,7 +152,7 @@ public class BCF2Encoder {
switch (type) {
case INT8:
case INT16:
case INT32: encodePrimitive((Integer)value, type); break;
case INT32: encodeRawBytes((Integer) value, type); break;
case FLOAT: encodeRawFloat((Double) value); break;
case CHAR: encodeRawChar((Byte) value); break;
default: throw new ReviewedStingException("Illegal type encountered " + type);
@ -114,13 +163,13 @@ public class BCF2Encoder {
}
}
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeRawMissingValue(final BCF2Type type) throws IOException {
encodePrimitive(type.getMissingBytes(), type);
encodeRawBytes(type.getMissingBytes(), type);
}
@Requires("size >= 0")
public final void encodeRawMissingValues(final int size, final BCF2Type type) throws IOException {
if ( size <= 0 ) throw new ReviewedStingException("BUG: size <= 0");
for ( int i = 0; i < size; i++ )
encodeRawMissingValue(type);
}
@ -136,26 +185,28 @@ public class BCF2Encoder {
}
public final void encodeRawFloat(final double value) throws IOException {
encodePrimitive(Float.floatToIntBits((float)value), BCF2Type.FLOAT);
encodeRawBytes(Float.floatToIntBits((float) value), BCF2Type.FLOAT);
}
@Requires("size >= 0")
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeType(final int size, final BCF2Type type) throws IOException {
if ( size < 0 ) throw new ReviewedStingException("BUG: size < 0");
final byte typeByte = BCF2Utils.encodeTypeDescriptor(size, type);
encodeStream.write(typeByte);
if ( BCF2Utils.willOverflow(size) ) {
// write in the overflow size
encodeTyped(size, determineIntegerType(size));
encodeTypedInt(size);
}
}
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeRawInt(final int value, final BCF2Type type) throws IOException {
encodePrimitive(value, type, encodeStream);
BCF2Utils.encodeRawBytes(value, type, encodeStream);
}
public final void encodePrimitive(final int value, final BCF2Type type) throws IOException {
encodePrimitive(value, type, encodeStream);
@Ensures("encodeStream.size() > old(encodeStream.size())")
public final void encodeRawBytes(final int value, final BCF2Type type) throws IOException {
BCF2Utils.encodeRawBytes(value, type, encodeStream);
}
// --------------------------------------------------------------------------------
@ -164,42 +215,14 @@ public class BCF2Encoder {
//
// --------------------------------------------------------------------------------
public final BCF2Type determineIntegerType(final int[] values) {
// literally a copy of the code below, but there's no general way to unify lists and arrays in java
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {
final BCF2Type type1 = determineIntegerType(value);
switch ( type1 ) {
case INT8: break;
case INT16: maxType = BCF2Type.INT16; break;
case INT32: return BCF2Type.INT32; // fast path for largest possible value
default: throw new ReviewedStingException("Unexpected integer type " + type1 );
}
}
return maxType;
}
public final BCF2Type determineIntegerType(final List<Integer> values) {
BCF2Type maxType = BCF2Type.INT8;
for ( final int value : values ) {
final BCF2Type type1 = determineIntegerType(value);
switch ( type1 ) {
case INT8: break;
case INT16: maxType = BCF2Type.INT16; break;
case INT32: return BCF2Type.INT32; // fast path for largest possible value
default: throw new ReviewedStingException("Unexpected integer type " + type1 );
}
}
return maxType;
}
public final BCF2Type determineIntegerType(final int value) {
for ( final BCF2Type potentialType : BCF2Utils.INTEGER_TYPES_BY_SIZE ) {
if ( potentialType.withinRange(value) )
return potentialType;
}
throw new ReviewedStingException("Integer cannot be encoded in allowable range of even INT32: " + value);
@Requires({"s != null", "sizeToWrite >= 0"})
public void encodeRawString(final String s, final int sizeToWrite) throws IOException {
final byte[] bytes = s.getBytes();
for ( int i = 0; i < sizeToWrite; i++ )
if ( i < bytes.length )
encodeRawChar(bytes[i]);
else
encodeRawMissingValue(BCF2Type.CHAR);
}
/**
@ -210,7 +233,8 @@ public class BCF2Encoder {
* @param o
* @return
*/
protected final BCF2Type encode(final Object o) throws IOException {
@Requires("o != null")
public final BCF2Type encode(final Object o) throws IOException {
if ( o == null ) throw new ReviewedStingException("Generic encode cannot deal with null values");
if ( o instanceof List ) {
@ -224,11 +248,12 @@ public class BCF2Encoder {
}
}
@Requires("arg != null")
private final BCF2Type determineBCFType(final Object arg) {
final Object toType = arg instanceof List ? ((List)arg).get(0) : arg;
if ( toType instanceof Integer )
return determineIntegerType((Integer)toType);
return BCF2Utils.determineIntegerType((Integer) toType);
else if ( toType instanceof String )
return BCF2Type.CHAR;
else if ( toType instanceof Double )
@ -237,15 +262,6 @@ public class BCF2Encoder {
throw new ReviewedStingException("No native encoding for Object of type " + arg.getClass().getSimpleName());
}
public final static void encodePrimitive(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException {
for ( int i = type.getSizeInBytes() - 1; i >= 0; i-- ) {
final int shift = i * 8;
int mask = 0xFF << shift;
int byteValue = (mask & value) >> shift;
encodeStream.write(byteValue);
}
}
private final List<Byte> stringToBytes(final String v) throws IOException {
if ( v == null || v.equals("") )
return Collections.emptyList();

View File

@ -0,0 +1,529 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Invariant;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCompoundHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* See #BCFWriter for documentation on this class's role in encoding BCF2 files
*
* @author Mark DePristo
* @since 06/12
*/
@Invariant({
        "headerLine != null",
        "BCF2Type.INTEGERS.contains(dictionaryOffsetType)",
        "dictionaryOffset >= 0"
})
public abstract class BCF2FieldEncoder {
    /**
     * The header line describing the field we will encode values of.
     */
    final VCFCompoundHeaderLine headerLine;

    /**
     * The BCF2 type we'll use to encode this field, if it can be determined statically.
     * If not (the type depends on the actual values), this variable must be null.
     */
    final BCF2Type staticType;

    /**
     * The integer offset into the strings map of the BCF2 file corresponding to this
     * field.
     */
    final int dictionaryOffset;

    /**
     * The integer type we use to encode our dictionary offset in the BCF2 file.
     */
    final BCF2Type dictionaryOffsetType;

    // ----------------------------------------------------------------------
    //
    // Constructor
    //
    // ----------------------------------------------------------------------

    /**
     * Create an encoder for the field described by headerLine, resolving the field's
     * dictionary offset from the header-string -> offset map required by BCF2.
     *
     * @param headerLine the INFO/FORMAT header line describing the field to encode
     * @param dict       map from header strings to their BCF2 dictionary offsets
     * @param staticType the fixed BCF2 type for this field, or null when the type
     *                   must be determined dynamically from each value
     * @throws ReviewedStingException if the field's ID is not present in dict
     */
    @Requires({"headerLine != null", "dict != null"})
    private BCF2FieldEncoder(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict, final BCF2Type staticType) {
        this.headerLine = headerLine;
        this.staticType = staticType;

        final Integer offset = dict.get(getField());
        if ( offset == null ) throw new ReviewedStingException("Format error: could not find string " + getField() + " in header as required by BCF");
        this.dictionaryOffset = offset;
        // use the smallest integer type capable of representing the offset
        dictionaryOffsetType = BCF2Utils.determineIntegerType(offset);
    }

    // ----------------------------------------------------------------------
    //
    // Basic accessors
    //
    // ----------------------------------------------------------------------

    /** @return the ID of the field this encoder handles (from the header line) */
    @Ensures("result != null")
    public final String getField() { return headerLine.getID(); }

    /**
     * Write the field key (dictionary offset and type) into the BCF2Encoder stream
     *
     * @param encoder where we write our dictionary offset
     * @throws IOException
     */
    @Requires("encoder != null")
    public final void writeFieldKey(final BCF2Encoder encoder) throws IOException {
        encoder.encodeTypedInt(dictionaryOffset, dictionaryOffsetType);
    }

    @Override
    public String toString() {
        return "BCF2FieldEncoder for " + getField() + " with count " + getCountType() + " encoded with " + getClass().getSimpleName();
    }

    // ----------------------------------------------------------------------
    //
    // methods to determine the number of encoded elements
    //
    // ----------------------------------------------------------------------

    /** @return the count category declared by the header line (fixed integer, unbounded, etc.) */
    @Ensures("result != null")
    protected final VCFHeaderLineCount getCountType() {
        return headerLine.getCountType();
    }

    /**
     * True if this field has a constant, fixed number of elements (such as 1 for an atomic integer)
     *
     * Exactly one of the three has*NumElements() predicates is true for any encoder.
     *
     * @return true when the header declares a fixed integer count
     */
    @Ensures("result != (hasValueDeterminedNumElements() || hasContextDeterminedNumElements())")
    public boolean hasConstantNumElements() {
        return getCountType() == VCFHeaderLineCount.INTEGER;
    }

    /**
     * True if the only way to determine how many elements this field contains is by
     * inspecting the actual value directly, such as when the number of elements
     * is a variable length list per site or per genotype.
     * @return true when the header declares an unbounded count
     */
    @Ensures("result != (hasConstantNumElements() || hasContextDeterminedNumElements())")
    public boolean hasValueDeterminedNumElements() {
        return getCountType() == VCFHeaderLineCount.UNBOUNDED;
    }

    /**
     * True if this field has a non-fixed number of elements that depends only on the properties
     * of the current VariantContext, such as one value per Allele or per genotype configuration.
     *
     * @return true when the count is neither fixed nor unbounded (i.e., context determined)
     */
    @Ensures("result != (hasValueDeterminedNumElements() || hasConstantNumElements())")
    public boolean hasContextDeterminedNumElements() {
        return ! hasConstantNumElements() && ! hasValueDeterminedNumElements();
    }

    /**
     * Get the number of elements, assuming this field has a constant number of elements.
     * @return the fixed count declared in the header
     */
    @Requires("hasConstantNumElements()")
    @Ensures("result >= 0")
    public int numElements() {
        return headerLine.getCount();
    }

    /**
     * Get the number of elements by looking at the actual value provided
     * @return the number of elements encoded for value
     */
    @Requires("hasValueDeterminedNumElements()")
    @Ensures("result >= 0")
    public int numElements(final Object value) {
        return numElementsFromValue(value);
    }

    /**
     * Get the number of elements, assuming this field has context-determined number of elements.
     * @return the count implied by vc (computed from its number of alt alleles)
     */
    @Requires("hasContextDeterminedNumElements()")
    @Ensures("result >= 0")
    public int numElements(final VariantContext vc) {
        // getNAlleles() - 1 is the number of alt alleles at this site
        return headerLine.getCount(vc.getNAlleles() - 1);
    }

    /**
     * A convenience access for the number of elements, returning
     * the number of encoded elements, either from the fixed number
     * it has, from the VC, or from the value itself.
     * @param vc    the variant context, used when the count is context determined
     * @param value the raw field value, used when the count is value determined
     * @return the number of elements to encode
     */
    @Ensures("result >= 0")
    public final int numElements(final VariantContext vc, final Object value) {
        if ( hasConstantNumElements() ) return numElements();
        else if ( hasContextDeterminedNumElements() ) return numElements(vc);
        else return numElements(value);
    }

    /**
     * Given a value, return the number of elements we will encode for it.
     *
     * Assumes the value is encoded as a List
     *
     * @param value the raw field value (may be null, atomic, or a List)
     * @return 0 for null, the list size for a List, otherwise 1
     */
    @Requires("hasValueDeterminedNumElements()")
    @Ensures("result >= 0")
    protected int numElementsFromValue(final Object value) {
        if ( value == null ) return 0;
        else if ( value instanceof List ) return ((List) value).size();
        else return 1;
    }

    // ----------------------------------------------------------------------
    //
    // methods to determine the BCF2 type of the encoded values
    //
    // ----------------------------------------------------------------------

    /**
     * Is the BCF2 type of this field static, or does it have to be determine from
     * the actual field value itself?
     * @return true when staticType was provided at construction
     */
    @Ensures("result || isDynamicallyTyped()")
    public final boolean isStaticallyTyped() { return ! isDynamicallyTyped(); }

    /**
     * Is the BCF2 type of this field static, or does it have to be determine from
     * the actual field value itself?
     * @return true when the type must be computed per-value
     */
    @Ensures("result || isStaticallyTyped()")
    public final boolean isDynamicallyTyped() { return staticType == null; }

    /**
     * Get the BCF2 type for this field, either from the static type of the
     * field itself or by inspecting the value itself.
     *
     * @return the BCF2 type to use when encoding value
     */
    public final BCF2Type getType(final Object value) {
        return isDynamicallyTyped() ? getDynamicType(value) : getStaticType();
    }

    @Requires("isStaticallyTyped()")
    @Ensures("result != null")
    public final BCF2Type getStaticType() {
        return staticType;
    }

    // Subclasses with dynamic typing must override; calling this on a statically
    // typed encoder is a programming error.
    @Requires("isDynamicallyTyped()")
    @Ensures("result != null")
    public BCF2Type getDynamicType(final Object value) {
        throw new ReviewedStingException("BUG: cannot get dynamic type for statically typed BCF2 field");
    }

    // ----------------------------------------------------------------------
    //
    // methods to encode values, including the key abstract method
    //
    // ----------------------------------------------------------------------

    /**
     * Convenience method that just called encodeValue with a no minimum for the number of values.
     *
     * Primarily useful for encoding site values
     *
     * @param encoder destination for the encoded bytes
     * @param value   the raw field value to encode
     * @param type    the BCF2 type to encode value with
     * @throws IOException
     */
    @Requires({"encoder != null", "isDynamicallyTyped() || type == getStaticType()"})
    public void encodeOneValue(final BCF2Encoder encoder, final Object value, final BCF2Type type) throws IOException {
        encodeValue(encoder, value, type, 0);
    }

    /**
     * Key abstract method that should encode a value of the given type into the encoder.
     *
     * Value will be of a type appropriate to the underlying encoder. If the genotype field is represented as
     * an int[], this will be value, and the encoder needs to handle encoding all of the values in the int[].
     *
     * The argument should be used, not the getType() method in the superclass as an outer loop might have
     * decided a more general type (int16) to use, even through this encoder could have been done with int8.
     *
     * If minValues > 0, then encodeValue must write in at least minValues items from value. If value is atomic,
     * this means that minValues - 1 MISSING values should be added to the encoder. If minValues is a collection
     * type (int[]) then minValues - values.length should be added. This argument is intended to handle padding
     * of values in genotype fields.
     *
     * @param encoder   destination for the encoded bytes
     * @param value     the raw field value (may be null)
     * @param type      the BCF2 type to encode with (may be wider than this field's own choice)
     * @param minValues pad with MISSING values up to this count
     * @throws IOException
     */
    @Requires({"encoder != null", "isDynamicallyTyped() || type == getStaticType()", "minValues >= 0"})
    public abstract void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException;

    // ----------------------------------------------------------------------
    //
    // Subclass to encode Strings
    //
    // ----------------------------------------------------------------------

    /** Encoder for String and Character fields; both are written as CHAR arrays. */
    public static class StringOrCharacter extends BCF2FieldEncoder {
        public StringOrCharacter(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
            super(headerLine, dict, BCF2Type.CHAR);
        }

        @Override
        public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
            final String s = javaStringToBCF2String(value);
            // pad to minValues with MISSING chars inside encodeRawString
            encoder.encodeRawString(s, Math.max(s.length(), minValues));
        }

        //
        // Regardless of what the header says, BCF2 strings and characters are always encoded
        // as arrays of CHAR type, which has a variable number of elements depending on the
        // exact string being encoded
        //
        @Override public boolean hasConstantNumElements()          { return false; }
        @Override public boolean hasContextDeterminedNumElements() { return false; }
        @Override public boolean hasValueDeterminedNumElements()   { return true; }
        @Override protected int numElementsFromValue(final Object value) {
            return value == null ? 0 : javaStringToBCF2String(value).length();
        }

        /**
         * Recode the incoming object to a String, compacting it into a
         * BCF2 string if the value is a list.
         *
         * @param value a String or List<String> to encode, or null
         * @return a non-null string to encode
         */
        @Ensures("result != null")
        private String javaStringToBCF2String(final Object value) {
            if ( value == null )
                return "";
            else if (value instanceof List) {
                if ( ((List) value).size() == 1 )
                    return (String)((List) value).get(0);
                else
                    return BCF2Utils.collapseStringList((List<String>)value);
            } else
                return (String)value;
        }
    }

    // ----------------------------------------------------------------------
    //
    // Subclass to encode FLAG
    //
    // ----------------------------------------------------------------------

    /** Encoder for atomic FLAG fields: the presence of the field is encoded as a single INT8 value of 1. */
    public static class Flag extends BCF2FieldEncoder {
        public Flag(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
            super(headerLine, dict, BCF2Type.INT8);
            // a VCF flag has declared count 0; anything else is unsupported here
            if ( ! headerLine.isFixedCount() || headerLine.getCount() != 0 )
                throw new ReviewedStingException("Flag encoder only suppports atomic flags!");
        }

        @Override
        public int numElements() {
            return 1; // the header says 0 but we will write 1 value
        }

        @Override
        @Requires("minValues <= 1")
        public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
            // the value itself is irrelevant; a present flag is always written as 1
            encoder.encodeRawBytes(1, getStaticType());
        }
    }

    // ----------------------------------------------------------------------
    //
    // Subclass to encode FLOAT
    //
    // ----------------------------------------------------------------------

    /** Encoder for FLOAT fields, with a fast path for atomic (single fixed value) fields. */
    public static class Float extends BCF2FieldEncoder {
        // true when the header declares exactly one fixed value for this field
        final boolean isAtomic;

        public Float(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
            super(headerLine, dict, BCF2Type.FLOAT);
            isAtomic = hasConstantNumElements() && numElements() == 1;
        }

        @Override
        public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
            int count = 0;
            // TODO -- can be restructured to avoid toList operation
            if ( isAtomic ) {
                // fast path for fields with 1 fixed float value
                if ( value != null ) {
                    encoder.encodeRawFloat((Double)value);
                    count++;
                }
            } else {
                // handle generic case
                final List<Double> doubles = toList(Double.class, value);
                for ( final double d : doubles ) {
                    encoder.encodeRawFloat(d);
                    count++;
                }
            }
            // pad out with MISSING to satisfy the minValues contract
            for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
        }
    }

    // ----------------------------------------------------------------------
    //
    // Subclass to encode int[]
    //
    // ----------------------------------------------------------------------

    /** Encoder for fields whose values arrive as primitive int[] arrays (dynamically typed). */
    public static class IntArray extends BCF2FieldEncoder {
        public IntArray(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
            super(headerLine, dict, null);
        }

        @Override
        protected int numElementsFromValue(final Object value) {
            return value == null ? 0 : ((int[])value).length;
        }

        @Override
        public BCF2Type getDynamicType(final Object value) {
            // null can be encoded in the narrowest type
            return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((int[])value);
        }

        @Override
        public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
            int count = 0;
            if ( value != null ) {
                for ( final int i : (int[])value ) {
                    encoder.encodeRawInt(i, type);
                    count++;
                }
            }
            // pad out with MISSING to satisfy the minValues contract
            for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
        }
    }

    // ----------------------------------------------------------------------
    //
    // Subclass to encode List<Integer>
    //
    // ----------------------------------------------------------------------

    /**
     * Specialized int encoder for atomic (non-list) integers
     */
    public static class AtomicInt extends BCF2FieldEncoder {
        public AtomicInt(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
            super(headerLine, dict, null);
        }

        @Override
        public BCF2Type getDynamicType(final Object value) {
            return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType((Integer)value);
        }

        @Override
        public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
            int count = 0;
            if ( value != null ) {
                encoder.encodeRawInt((Integer)value, type);
                count++;
            }
            // pad out with MISSING to satisfy the minValues contract
            for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
        }
    }

    /** Generic encoder for integer fields whose values may be atomic or a List<Integer>. */
    public static class GenericInts extends BCF2FieldEncoder {
        public GenericInts(final VCFCompoundHeaderLine headerLine, final Map<String, Integer> dict ) {
            super(headerLine, dict, null);
        }

        @Override
        public BCF2Type getDynamicType(final Object value) {
            return value == null ? BCF2Type.INT8 : BCF2Utils.determineIntegerType(toList(Integer.class, value));
        }

        @Override
        public void encodeValue(final BCF2Encoder encoder, final Object value, final BCF2Type type, final int minValues) throws IOException {
            int count = 0;
            for ( final int i : toList(Integer.class, value) ) {
                encoder.encodeRawInt(i, type);
                count++;
            }
            // pad out with MISSING to satisfy the minValues contract
            for ( ; count < minValues; count++ ) encoder.encodeRawMissingValue(type);
        }
    }

    // ----------------------------------------------------------------------
    //
    // Helper methods
    //
    // ----------------------------------------------------------------------

    /**
     * Helper function that takes an object and returns a list representation
     * of it:
     *
     *   o == null => []
     *   o is a list => o
     *   else => [o]
     *
     * @param c the element class (used only for type inference; no runtime check is performed)
     * @param o the object to view as a list
     * @return a (possibly empty) list view of o
     */
    private final static <T> List<T> toList(final Class<T> c, final Object o) {
        if ( o == null ) return Collections.emptyList();
        else if ( o instanceof List ) return (List<T>)o;
        else return Collections.singletonList((T)o);
    }
}

View File

@ -0,0 +1,310 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* See #BCFWriter for documentation on this class's role in encoding BCF2 files
*
* @author Mark DePristo
* @since 06/12
*/
public abstract class BCF2FieldWriter {
    // the VCF header describing the fields we can write
    private final VCFHeader header;
    // the encoder that knows how to serialize this field's values into BCF2 bytes
    private final BCF2FieldEncoder fieldEncoder;

    protected BCF2FieldWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
        this.header = header;
        this.fieldEncoder = fieldEncoder;
    }

    protected VCFHeader getHeader() { return header; }
    protected BCF2FieldEncoder getFieldEncoder() {
        return fieldEncoder;
    }
    protected String getField() { return getFieldEncoder().getField(); }

    /**
     * Begin writing this field for vc: writes the field key (dictionary offset + type)
     * into the encoder. Subclasses extend this to emit their type/size descriptor.
     *
     * @param encoder destination for the encoded bytes
     * @param vc      the variant context being written
     * @throws IOException
     */
    public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
        fieldEncoder.writeFieldKey(encoder);
    }

    public void done(final BCF2Encoder encoder, final VariantContext vc) throws IOException { } // TODO -- overload done so that we null out values and test for correctness

    @Override
    public String toString() {
        return "BCF2FieldWriter " + getClass().getSimpleName() + " with encoder " + getFieldEncoder();
    }

    // --------------------------------------------------------------------------------
    //
    // Sites writers
    //
    // --------------------------------------------------------------------------------

    /** Base class for writers of site-level (INFO) fields. */
    public static abstract class SiteWriter extends BCF2FieldWriter {
        protected SiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        /** Write this field's value (if any) for the site vc. */
        public abstract void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException;
    }

    /** Generic site writer: fetches the attribute from vc and delegates to the field encoder. */
    public static class GenericSiteWriter extends SiteWriter {
        public GenericSiteWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        @Override
        public void site(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            final Object rawValue = vc.getAttribute(getField(), null);
            final BCF2Type type = getFieldEncoder().getType(rawValue);
            if ( rawValue == null ) {
                // the value is missing, just write in null
                encoder.encodeType(0, type);
            } else {
                final int valueCount = getFieldEncoder().numElements(vc, rawValue);
                encoder.encodeType(valueCount, type);
                getFieldEncoder().encodeOneValue(encoder, rawValue, type);
            }
        }
    }

    // --------------------------------------------------------------------------------
    //
    // Genotypes writers
    //
    // --------------------------------------------------------------------------------

    /**
     * Base class for writers of genotype (FORMAT) fields.
     *
     * Protocol per site: start() emits the key and the (count, type) descriptor,
     * then addGenotype() is called once per sample, then done().
     *
     * Subclasses with dynamic typing must set encodingType BEFORE calling
     * super.start(), since start() writes the type descriptor.
     */
    public static abstract class GenotypesWriter extends BCF2FieldWriter {
        // number of values written per genotype at the current site (-1 until known)
        int nValuesPerGenotype = -1;
        // BCF2 type used for all genotypes at the current site; set by subclasses or in start()
        BCF2Type encodingType = null;

        protected GenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);

            if ( fieldEncoder.hasConstantNumElements() ) {
                nValuesPerGenotype = getFieldEncoder().numElements();
            }
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            // writes the key information
            super.start(encoder, vc);

            // only update if we need to
            if ( ! getFieldEncoder().hasConstantNumElements() ) {
                if ( getFieldEncoder().hasContextDeterminedNumElements() )
                    // we are cheap -- just depends on genotype of allele counts
                    nValuesPerGenotype = getFieldEncoder().numElements(vc);
                else
                    // we have to go fishing through the values themselves (expensive)
                    nValuesPerGenotype = computeMaxSizeOfGenotypeFieldFromValues(vc);
            }

            encoder.encodeType(nValuesPerGenotype, encodingType);
        }

        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            final Object fieldValue = g.getExtendedAttribute(getField(), null);
            // nValuesPerGenotype pads shorter values with MISSING so every sample has equal width
            getFieldEncoder().encodeValue(encoder, fieldValue, encodingType, nValuesPerGenotype);
        }

        protected int numElements(final VariantContext vc, final Genotype g) {
            return getFieldEncoder().numElements(vc, g.getExtendedAttribute(getField()));
        }

        // NOTE(review): returns -1 when vc has no genotypes — presumably never the case
        // for genotype writers; confirm callers guarantee at least one sample
        private final int computeMaxSizeOfGenotypeFieldFromValues(final VariantContext vc) {
            int size = -1;

            for ( final Genotype g : vc.getGenotypes() ) {
                size = Math.max(size, numElements(vc, g));
            }

            return size;
        }
    }

    /** Genotypes writer for fields whose BCF2 type is fixed by the field encoder. */
    public static class StaticallyTypeGenotypesWriter extends GenotypesWriter {
        public StaticallyTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
            encodingType = getFieldEncoder().getStaticType();
        }
    }

    /**
     * Genotypes writer for integer fields: scans every sample's values up front to
     * pick the narrowest integer type that fits all of them at this site.
     */
    public static class IntegerTypeGenotypesWriter extends GenotypesWriter {
        public IntegerTypeGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            // the only value that is dynamic are integers
            final List<Integer> values = new ArrayList<Integer>(vc.getNSamples());
            for ( final Genotype g : vc.getGenotypes() ) {
                for ( final Object i : BCF2Utils.toList(g.getExtendedAttribute(getField(), null)) ) {
                    values.add((Integer)i); // we know they are all integers
                }
            }

            encodingType = BCF2Utils.determineIntegerType(values);
            super.start(encoder, vc);
        }
    }

    /**
     * Genotypes writer for int fields with a fast accessor (e.g. PL/AD/DP-style
     * fields reachable via IntGenotypeFieldAccessors), avoiding the generic
     * attribute-map path.
     */
    public static class IGFGenotypesWriter extends GenotypesWriter {
        // fast accessor for this field's int[] values on a Genotype
        final IntGenotypeFieldAccessors.Accessor ige;

        public IGFGenotypesWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder, final IntGenotypeFieldAccessors.Accessor ige) {
            super(header, fieldEncoder);
            this.ige = ige;

            if ( ! (fieldEncoder instanceof BCF2FieldEncoder.IntArray) )
                throw new ReviewedStingException("BUG: IntGenotypesWriter requires IntArray encoder for field " + getField());
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            // TODO
            // TODO this piece of code consumes like 10% of the runtime alone because fo the vc.getGenotypes() iteration
            // TODO
            encodingType = BCF2Type.INT8;
            for ( final Genotype g : vc.getGenotypes() ) {
                final int[] pls = ige.getValues(g);
                final BCF2Type plsType = getFieldEncoder().getType(pls);
                encodingType = BCF2Utils.maxIntegerType(encodingType, plsType);
                if ( encodingType == BCF2Type.INT32 )
                    break; // stop early
            }

            super.start(encoder, vc);
        }

        @Override
        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            getFieldEncoder().encodeValue(encoder, ige.getValues(g), encodingType, nValuesPerGenotype);
        }

        @Override
        protected int numElements(final VariantContext vc, final Genotype g) {
            return ige.getSize(g);
        }
    }

    /**
     * Writer for the GT (genotype call) field. Each allele is encoded as
     * ((offset + 1) << 1) | phased-bit, with MISSING padding when a sample's
     * ploidy is below the site maximum.
     */
    public static class GTWriter extends GenotypesWriter {
        // lookup used only for sites with 3+ alleles; ref/alt1 use the fast path fields below
        final Map<Allele, Integer> alleleMapForTriPlus = new HashMap<Allele, Integer>(5);
        Allele ref, alt1;

        public GTWriter(final VCFHeader header, final BCF2FieldEncoder fieldEncoder) {
            super(header, fieldEncoder);
        }

        @Override
        public void start(final BCF2Encoder encoder, final VariantContext vc) throws IOException {
            if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
                throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
                        "with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
                        + vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());

            encodingType = BCF2Type.INT8;
            buildAlleleMap(vc);
            nValuesPerGenotype = vc.getMaxPloidy();

            super.start(encoder, vc);
        }

        @Override
        public void addGenotype(final BCF2Encoder encoder, final VariantContext vc, final Genotype g) throws IOException {
            final int samplePloidy = g.getPloidy();
            for ( int i = 0; i < nValuesPerGenotype; i++ ) {
                if ( i < samplePloidy ) {
                    // we encode the actual allele
                    final Allele a = g.getAllele(i);
                    final int offset = getAlleleOffset(a);
                    final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
                    encoder.encodeRawBytes(encoded, encodingType);
                } else {
                    // we need to pad with missing as we have ploidy < max for this sample
                    encoder.encodeRawBytes(encodingType.getMissingBytes(), encodingType);
                }
            }
        }

        /**
         * Fast path code to determine the offset.
         *
         * Inline tests for == against ref (most common, first test)
         * == alt1 (second most common, second test)
         * == NO_CALL (third)
         * and finally in the map from allele => offset for all alt 2+ alleles
         *
         * @param a the allele whose offset we wish to determine
         * @return the offset (from 0) of the allele in the list of variant context alleles (-1 means NO_CALL)
         */
        @Requires("a != null")
        private final int getAlleleOffset(final Allele a) {
            // note that these are reference (==) comparisons by design: the fast-path
            // fields are the exact Allele objects held by the VariantContext
            if ( a == ref ) return 0;
            else if ( a == alt1 ) return 1;
            else if ( a == Allele.NO_CALL ) return -1;
            else {
                final Integer o = alleleMapForTriPlus.get(a);
                if ( o == null ) throw new ReviewedStingException("BUG: Couldn't find allele offset for allele " + a);
                return o;
            }
        }

        private final void buildAlleleMap(final VariantContext vc) {
            // these are fast path options to determine the offsets for
            final int nAlleles = vc.getNAlleles();
            ref = vc.getReference();
            alt1 = nAlleles > 1 ? vc.getAlternateAllele(0) : null;

            if ( nAlleles > 2 ) {
                // for multi-allelics we need to clear the map, and add additional looks
                alleleMapForTriPlus.clear();
                alleleMapForTriPlus.put(Allele.NO_CALL, -1); // convenience for lookup
                final List<Allele> alleles = vc.getAlleles();
                for ( int i = 2; i < alleles.size(); i++ ) {
                    alleleMapForTriPlus.put(alleles.get(i), i);
                }
            }
        }
    }
}

View File

@ -0,0 +1,183 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.HashMap;
import java.util.Map;
/**
* See #BCFWriter for documentation on this classes role in encoding BCF2 files
*
* @author Mark DePristo
* @since 06/12
*/
/**
 * Builds and hands out the specialized BCF2FieldWriter instances used to encode
 * each INFO (site) and FORMAT (genotype) field declared in a VCF header.
 *
 * setup() must be called with the header before any of the getter methods work.
 * See #BCFWriter for documentation on this class's role in encoding BCF2 files.
 */
public class BCF2FieldWriterManager {
    final protected static Logger logger = Logger.getLogger(BCF2FieldWriterManager.class);

    // One writer per INFO field ID, keyed by the VCF header's field ID.
    final Map<String, BCF2FieldWriter.SiteWriter> siteWriters = new HashMap<String, BCF2FieldWriter.SiteWriter>();
    // One writer per FORMAT field ID, keyed by the VCF header's field ID.
    final Map<String, BCF2FieldWriter.GenotypesWriter> genotypesWriters = new HashMap<String, BCF2FieldWriter.GenotypesWriter>();
    // Fast accessors for the inline int genotype fields (DP, AD, PL, GQ).
    final IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();

    public BCF2FieldWriterManager() { }

    /**
     * Setup the FieldWriters appropriate to each INFO and FORMAT in the VCF header
     *
     * Must be called before any of the getter methods will work
     *
     * @param header a VCFHeader containing description for every INFO and FORMAT field we'll attempt to write out to BCF
     * @param encoder the encoder we are going to use to write out the BCF2 data
     * @param stringDictionary a map from VCFHeader strings to their offsets for encoding
     */
    public void setup(final VCFHeader header, final BCF2Encoder encoder, final Map<String, Integer> stringDictionary) {
        for (final VCFInfoHeaderLine line : header.getInfoHeaderLines()) {
            final String field = line.getID();
            final BCF2FieldWriter.SiteWriter writer = createInfoWriter(header, line, encoder, stringDictionary);
            add(siteWriters, field, writer);
        }

        for (final VCFFormatHeaderLine line : header.getFormatHeaderLines()) {
            final String field = line.getID();
            final BCF2FieldWriter.GenotypesWriter writer = createGenotypesWriter(header, line, encoder, stringDictionary);
            add(genotypesWriters, field, writer);
        }
    }

    /**
     * Register {@code writer} under {@code field} in {@code map}, rejecting
     * duplicate field IDs (which would indicate a malformed VCF header).
     * Note: logs every registered writer at INFO level.
     */
    @Requires({"field != null", "writer != null"})
    @Ensures("map.containsKey(field)")
    private final <T> void add(final Map<String, T> map, final String field, final T writer) {
        if ( map.containsKey(field) )
            throw new ReviewedStingException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders");
        map.put(field, writer);
        logger.info(writer);
    }

    // -----------------------------------------------------------------
    //
    // Master routine to look at the header, a specific line, and
    // build an appropriate SiteWriter for that header element
    //
    // -----------------------------------------------------------------

    private BCF2FieldWriter.SiteWriter createInfoWriter(final VCFHeader header,
                                                        final VCFInfoHeaderLine line,
                                                        final BCF2Encoder encoder,
                                                        final Map<String, Integer> dict) {
        // All INFO fields share the generic site writer; only the field encoder varies.
        return new BCF2FieldWriter.GenericSiteWriter(header, createFieldEncoder(line, encoder, dict, false));
    }

    /**
     * Choose the BCF2FieldEncoder matching this header line's declared type.
     *
     * When createGenotypesEncoders is true, the inline int genotype fields
     * (DP/AD/PL/GQ) and the GT field get int-based encoders regardless of the
     * generic type dispatch below.
     */
    private BCF2FieldEncoder createFieldEncoder(final VCFCompoundHeaderLine line,
                                                final BCF2Encoder encoder,
                                                final Map<String, Integer> dict,
                                                final boolean createGenotypesEncoders ) {
        if ( createGenotypesEncoders && intGenotypeFieldAccessors.getAccessor(line.getID()) != null ) {
            // Inline int field (DP/AD/PL/GQ): warn if the header disagrees, but encode as ints anyway.
            if ( line.getType() != VCFHeaderLineType.Integer )
                logger.warn("Warning: field " + line.getID() + " expected to encode an integer but saw " + line.getType() + " for record " + line);
            return new BCF2FieldEncoder.IntArray(line, dict);
        } else if ( createGenotypesEncoders && line.getID().equals(VCFConstants.GENOTYPE_KEY) ) {
            // GT is encoded as packed allele-offset ints.
            return new BCF2FieldEncoder.GenericInts(line, dict);
        } else {
            switch ( line.getType() ) {
                case Character:
                case String:
                    return new BCF2FieldEncoder.StringOrCharacter(line, dict);
                case Flag:
                    return new BCF2FieldEncoder.Flag(line, dict);
                case Float:
                    return new BCF2FieldEncoder.Float(line, dict);
                case Integer:
                    // Scalar ints get the cheaper atomic encoder; vectors the general one.
                    if ( line.isFixedCount() && line.getCount() == 1 )
                        return new BCF2FieldEncoder.AtomicInt(line, dict);
                    else
                        return new BCF2FieldEncoder.GenericInts(line, dict);
                default:
                    throw new ReviewedStingException("Unexpected type for field " + line.getID());
            }
        }
    }

    // -----------------------------------------------------------------
    //
    // Master routine to look at the header, a specific line, and
    // build an appropriate Genotypes for that header element
    //
    // -----------------------------------------------------------------

    private BCF2FieldWriter.GenotypesWriter createGenotypesWriter(final VCFHeader header,
                                                                  final VCFFormatHeaderLine line,
                                                                  final BCF2Encoder encoder,
                                                                  final Map<String, Integer> dict) {
        final String field = line.getID();
        final BCF2FieldEncoder fieldEncoder = createFieldEncoder(line, encoder, dict, true);

        // Dispatch from most to least specialized writer.
        if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
            return new BCF2FieldWriter.GTWriter(header, fieldEncoder);
        } else if ( intGenotypeFieldAccessors.getAccessor(field) != null ) {
            return new BCF2FieldWriter.IGFGenotypesWriter(header, fieldEncoder, intGenotypeFieldAccessors.getAccessor(field));
        } else if ( line.getType() == VCFHeaderLineType.Integer ) {
            return new BCF2FieldWriter.IntegerTypeGenotypesWriter(header, fieldEncoder);
        } else {
            return new BCF2FieldWriter.StaticallyTypeGenotypesWriter(header, fieldEncoder);
        }
    }

    // -----------------------------------------------------------------
    //
    // Accessors to get site / genotype writers
    //
    // -----------------------------------------------------------------

    /**
     * Get a site writer specialized to encode values for site info field
     * @param field key found in the VCF header INFO records
     * @return the writer registered for field; never null
     */
    public BCF2FieldWriter.SiteWriter getSiteFieldWriter(final String field) {
        return getWriter(field, siteWriters);
    }

    /**
     * Get a genotypes writer specialized to encode values for genotypes field
     * @param field key found in the VCF header FORMAT records
     * @return the writer registered for field; never null
     */
    public BCF2FieldWriter.GenotypesWriter getGenotypeFieldWriter(final String field) {
        return getWriter(field, genotypesWriters);
    }

    /**
     * Shared lookup helper: fetch the writer for key, failing loudly if the
     * field was never registered via setup().
     */
    @Requires({"map != null", "key != null"})
    @Ensures("result != null")
    public <T> T getWriter(final String key, final Map<String, T> map) {
        final T writer = map.get(key);
        if ( writer == null ) throw new ReviewedStingException("BUG: no writer found for " + key);
        return writer;
    }
}

View File

@ -24,9 +24,11 @@
package org.broadinstitute.sting.utils.variantcontext.writer;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Encoder;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type;
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
@ -37,6 +39,49 @@ import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.*;
import java.util.*;
/**
* VariantContextWriter that emits BCF2 binary encoding
*
* Overall structure of this writer is complex for efficiency reasons
*
* -- The BCF2Writer manages the low-level BCF2 encoder, the mappings
* from contigs and strings to offsets, the VCF header, and holds the
* lower-level encoders that map from VC and Genotype fields to their
* specific encoders. This class also writes out the standard BCF2 fields
* like POS, contig, the size of info and genotype data, QUAL, etc. It
* has loops over the INFO and GENOTYPES to encode each individual datum
* with the generic field encoders, but the actual encoding work is
* done by the FieldWriter classes themselves
*
* -- BCF2FieldWriter are specialized classes for writing out SITE and
* genotype information for specific SITE/GENOTYPE fields (like AC for
* sites and GQ for genotypes). These are objects in themselves because
* they manage all of the complexity of relating the types in the VCF header
* with the proper encoding in BCF as well as the type representing this
* in java. Relating all three of these pieces of information together
* is the main complexity challenge in the encoder. The piece of code
* that determines which FieldWriters to associate with each SITE and
* GENOTYPE field is the BCF2FieldWriterManager. These FieldWriters
* are specialized for specific combinations of encoders (see below)
* and contexts (genotypes) for efficiency, so they smartly manage
* the writing of PLs (encoded as int[]) directly into the lowest
* level BCFEncoder.
*
* -- At the third level is the BCF2FieldEncoder, relatively simple
* pieces of code that handle the task of determining the right
* BCF2 type for specific field values, as well as reporting back
* information such as the number of elements used to encode it
* (simple for atomic values like Integer but complex for PLs
* or lists of strings)
*
* -- At the lowest level is the BCF2Encoder itself. This provides
* just the limited encoding methods specified by the BCF2 specification. This encoder
* doesn't do anything but make it possible to conveniently write out valid low-level
* BCF2 constructs.
*
* @author Mark DePristo
* @since 06/12
*/
class BCF2Writer extends IndexingVariantContextWriter {
final protected static Logger logger = Logger.getLogger(BCF2Writer.class);
@ -45,8 +90,10 @@ class BCF2Writer extends IndexingVariantContextWriter {
private final Map<String, Integer> contigDictionary = new HashMap<String, Integer>();
private final Map<String, Integer> stringDictionaryMap = new LinkedHashMap<String, Integer>();
private final boolean doNotWriteGenotypes;
private String[] sampleNames = null;
private final BCF2Encoder encoder = new BCF2Encoder(); // initialized after the header arrives
final BCF2FieldWriterManager fieldManager = new BCF2FieldWriterManager();
public BCF2Writer(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, final boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
@ -60,11 +107,6 @@ class BCF2Writer extends IndexingVariantContextWriter {
//
// --------------------------------------------------------------------------------
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
for ( final VCFContigHeaderLine contig : contigLines )
contigDictionary.put(contig.getID(), contig.getContigIndex());
}
@Override
public void writeHeader(final VCFHeader header) {
// create the config offsets map
@ -81,6 +123,11 @@ class BCF2Writer extends IndexingVariantContextWriter {
stringDictionaryMap.put(dict.get(i), i);
}
sampleNames = header.getGenotypeSamples().toArray(new String[header.getNGenotypeSamples()]);
// setup the field encodings
fieldManager.setup(header, encoder, stringDictionaryMap);
try {
// write out the header into a byte stream, get it's length, and write everything to the file
final ByteArrayOutputStream capture = new ByteArrayOutputStream();
@ -91,7 +138,7 @@ class BCF2Writer extends IndexingVariantContextWriter {
final byte[] headerBytes = capture.toByteArray();
outputStream.write(BCF2Utils.MAGIC_HEADER_LINE);
BCF2Encoder.encodePrimitive(headerBytes.length, BCF2Type.INT32, outputStream);
BCF2Utils.encodeRawBytes(headerBytes.length, BCF2Type.INT32, outputStream);
outputStream.write(headerBytes);
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile("BCF2 stream", "Got IOException while trying to write BCF2 header", e);
@ -99,8 +146,11 @@ class BCF2Writer extends IndexingVariantContextWriter {
}
@Override
public void add( final VariantContext initialVC ) {
final VariantContext vc = initialVC.fullyDecode(header);
public void add( VariantContext vc ) {
if ( doNotWriteGenotypes )
vc = new VariantContextBuilder(vc).noGenotypes().make();
vc = vc.fullyDecode(header);
super.add(vc); // allow on the fly indexing
try {
@ -162,11 +212,11 @@ class BCF2Writer extends IndexingVariantContextWriter {
// info fields
final int nAlleles = vc.getNAlleles();
final int nInfo = vc.getAttributes().size();
final int nGenotypeFormatFields = VCFWriter.calcVCFGenotypeKeys(vc, header).size();
final int nGenotypeFormatFields = getNGenotypeFormatFields(vc);
final int nSamples = vc.getNSamples();
encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x00FF), BCF2Type.INT32);
encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x0FFF), BCF2Type.INT32);
encoder.encodeRawInt((nAlleles << 16) | (nInfo & 0x0000FFFF), BCF2Type.INT32);
encoder.encodeRawInt((nGenotypeFormatFields << 24) | (nSamples & 0x00FFFFF), BCF2Type.INT32);
buildID(vc);
buildAlleles(vc);
@ -176,15 +226,41 @@ class BCF2Writer extends IndexingVariantContextWriter {
return encoder.getRecordBytes();
}
/**
 * Return the undecoded BCF2 genotype payload attached to this context, if any.
 *
 * If the genotypes were read lazily from a BCF2 source and never decoded,
 * the raw LazyData can be passed straight back to the output stream without
 * re-encoding.  Otherwise returns null and the caller must encode the
 * genotypes from the decoded data.
 *
 * @param vc the variant context being written
 * @return the lazy BCF2 payload, or null if the genotypes are not lazy BCF2 data
 */
private BCF2Codec.LazyData getLazyData(final VariantContext vc) {
    // Hoist the repeated virtual calls of the original into locals:
    // one getGenotypes() and one getUnparsedGenotypeData() call suffice.
    final GenotypesContext gc = vc.getGenotypes();
    if ( gc.isLazyWithData() ) {
        final Object unparsed = ((LazyGenotypesContext) gc).getUnparsedGenotypeData();
        if ( unparsed instanceof BCF2Codec.LazyData )
            return (BCF2Codec.LazyData) unparsed;
    }
    return null;
}
/**
 * Determine the number of genotype FORMAT fields for this record as cheaply as possible.
 *
 * If the context still carries a lazy, undecoded BCF2 genotype block we read the
 * field count directly from it; otherwise we fall back to the full
 * count-by-types scan over the actual genotype data.
 *
 * @param vc the variant context being written
 * @return the number of genotype format fields to emit for vc
 */
private final int getNGenotypeFormatFields(final VariantContext vc) {
    final BCF2Codec.LazyData lazyData = getLazyData(vc);
    if ( lazyData != null )
        return lazyData.nGenotypeFields;
    return VCFWriter.calcVCFGenotypeKeys(vc, header).size();
}
private void buildID( VariantContext vc ) throws IOException {
encoder.encodeTyped(vc.getID(), BCF2Type.CHAR);
encoder.encodeTypedString(vc.getID());
}
private void buildAlleles( VariantContext vc ) throws IOException {
final boolean needsPadding = VariantContextUtils.needsPadding(vc);
for ( final Allele allele : vc.getAlleles() ) {
final String s = needsPadding ? VariantContextUtils.padAllele(vc,allele) : allele.getDisplayString();
encoder.encodeTyped(s, BCF2Type.CHAR);
byte[] s = allele.getBases();
if ( needsPadding )
s = VariantContextUtils.padAllele(vc,allele).getBytes();
encoder.encodeTypedString(s);
}
}
@ -199,233 +275,43 @@ class BCF2Writer extends IndexingVariantContextWriter {
private void buildInfo( VariantContext vc ) throws IOException {
for ( Map.Entry<String, Object> infoFieldEntry : vc.getAttributes().entrySet() ) {
final String key = infoFieldEntry.getKey();
final VCFToBCFEncoding encoding = prepFieldValueForEncoding(key, infoFieldEntry.getValue());
encodeStringByRef(key);
encoder.encodeTyped(encoding.valuesToEncode, encoding.BCF2Type);
final BCF2FieldWriter.SiteWriter writer = fieldManager.getSiteFieldWriter(key);
writer.start(encoder, vc);
writer.site(encoder, vc);
writer.done(encoder, vc);
}
}
private byte[] buildSamplesData(final VariantContext vc) throws IOException {
List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
for ( final String field : genotypeFields ) {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
addGenotypes(vc);
} else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
addGQ(vc);
} else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
addGenotypeFilters(vc);
} else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) {
addPLs(vc);
} else {
addGenericGenotypeField(vc, field);
}
}
return encoder.getRecordBytes();
}
private final int getNGenotypeFieldValues(final String field, final VariantContext vc) {
final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field);
assert metaData != null; // field is supposed to be in header
int nFields = metaData.getCount(vc.getNAlleles() - 1);
if ( nFields == -1 ) { // unbounded, need to look at values
return computeMaxSizeOfGenotypeFieldFromValues(field, vc);
final BCF2Codec.LazyData lazyData = getLazyData(vc);
if ( lazyData != null ) {
// we never decoded any data from this BCF file, so just pass it back
return lazyData.bytes;
} else {
return nFields;
}
}
// we have to do work to convert the VC into a BCF2 byte stream
final List<String> genotypeFields = VCFWriter.calcVCFGenotypeKeys(vc, header);
for ( final String field : genotypeFields ) {
final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field);
private final int computeMaxSizeOfGenotypeFieldFromValues(final String field, final VariantContext vc) {
int size = -1;
final GenotypesContext gc = vc.getGenotypes();
for ( final Genotype g : gc ) {
final Object o = g.getAttribute(field);
if ( o == null ) continue;
if ( o instanceof List ) {
// only do compute if first value is of type list
size = Math.max(size, ((List)o).size());
} else if ( size == -1 )
size = 1;
}
return size;
}
private final void addGenericGenotypeField(final VariantContext vc, final String field) throws IOException {
final int numInFormatField = getNGenotypeFieldValues(field, vc);
final VCFToBCFEncoding encoding = prepFieldValueForEncoding(field, null);
startGenotypeField(field, numInFormatField, encoding.BCF2Type);
for ( final String name : header.getGenotypeSamples() ) {
final Genotype g = vc.getGenotype(name); // todo -- can we optimize this?
try {
final Object fieldValue = g.getAttribute(field);
if ( numInFormatField == 1 ) {
// we encode the actual allele, encodeRawValue handles the missing case where fieldValue == null
encoder.encodeRawValue(fieldValue, encoding.BCF2Type);
} else {
// multiple values, need to handle general case
final List<Object> asList = toList(fieldValue);
final int nSampleValues = asList.size();
for ( int i = 0; i < numInFormatField; i++ ) {
encoder.encodeRawValue(i < nSampleValues ? asList.get(i) : null, encoding.BCF2Type);
}
writer.start(encoder, vc);
for ( final String name : sampleNames ) {
Genotype g = vc.getGenotype(name);
if ( g == null )
// we don't have any data about g at all
g = new GenotypeBuilder(name).make();
writer.addGenotype(encoder, vc, g);
}
} catch ( ClassCastException e ) {
throw new ReviewedStingException("Value stored in VariantContext incompatible with VCF header type for field " + field, e);
writer.done(encoder, vc);
}
return encoder.getRecordBytes();
}
}
/**
 * Normalize a genotype attribute value to a List view.
 *
 * null becomes the empty list, an existing List is returned as-is (unchecked
 * cast to List&lt;Object&gt;), and any other value is wrapped in a singleton list.
 *
 * @param o the raw attribute value, possibly null
 * @return a non-null List view of o
 */
private final static List<Object> toList(final Object o) {
    // instanceof is null-safe, so testing it first preserves the original ordering's behavior.
    if ( o instanceof List )
        return (List<Object>) o;
    return o == null ? Collections.<Object>emptyList() : Collections.singletonList(o);
}
/**
 * Simple value holder pairing a VCF header type with the BCF2 wire type chosen
 * for it and the list of values to encode under that type.
 *
 * NOTE: the field named BCF2Type deliberately shadows the BCF2Type enum name,
 * which is legal but confusing to read.
 */
private final class VCFToBCFEncoding {
    // The declared VCF type of the field (from the header line).
    VCFHeaderLineType vcfType;
    // The concrete BCF2 wire type selected to encode the values.
    BCF2Type BCF2Type;
    // The values to write, already normalized to a list.
    List<? extends Object> valuesToEncode;

    private VCFToBCFEncoding(final VCFHeaderLineType vcfType, final BCF2Type BCF2Type, final List<? extends Object> valuesToEncode) {
        this.vcfType = vcfType;
        this.BCF2Type = BCF2Type;
        this.valuesToEncode = valuesToEncode;
    }
}
// TODO -- we really need explicit converters as first class objects
// TODO -- need to generalize so we can enable vectors of compressed genotype ints
// TODO -- no sense in allocating these over and over
private final VCFToBCFEncoding prepFieldValueForEncoding(final String field, final Object value) {
final VCFCompoundHeaderLine metaData = VariantContextUtils.getMetaDataForField(header, field);
final boolean isList = value instanceof List;
final Object toType = isList ? ((List)value).get(0) : value;
try {
switch ( metaData.getType() ) {
case Character:
assert toType instanceof String;
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, Collections.singletonList(value));
case Flag:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.INT8, Collections.singletonList(1));
case String:
final List<String> s = isList ? (List<String>)value : Collections.singletonList((String) value);
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.CHAR, s);
case Integer: // note integer calculation is a bit complex because of the need to determine sizes
List<Integer> l;
BCF2Type intType;
if ( isList ) {
l = (List<Integer>)value;
intType = encoder.determineIntegerType(l);
} else if ( value != null ) {
intType = encoder.determineIntegerType((Integer)value);
l = Collections.singletonList((Integer)value);
} else {
intType = BCF2Type.INT8;
l = Collections.singletonList((Integer) null);
}
return new VCFToBCFEncoding(metaData.getType(), intType, l);
case Float:
return new VCFToBCFEncoding(metaData.getType(), BCF2Type.FLOAT, isList ? (List<Double>)value : Collections.singletonList(value));
default:
throw new ReviewedStingException("Unexpected type for field " + field);
}
} catch ( ClassCastException e ) {
throw new ReviewedStingException("Error computing VCF -> BCF encoding. Received cast class exception"
+ " indicating that the VCF header for " + metaData + " is inconsistent with the" +
" value seen in the VariantContext object = " + value, e);
}
}
private final void addGenotypeFilters(final VariantContext vc) throws IOException {
logger.warn("Skipping genotype filter field");
// // TODO -- FIXME -- string is wrong here -- need to compute string size...
// startGenotypeField(VCFConstants.GENOTYPE_FILTER_KEY, 1, BCFType.CHAR);
// for ( final Genotype g : vc.getGenotypes() ) {
// if ( g.filtersWereApplied() && g.isFiltered() ) {
// encoder.encodeString(ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())));
// } else {
// encoder.encodeRawMissingValues(1, BCFType.CHAR); // todo fixme
// }
// }
}
private final void addGQ(final VariantContext vc) throws IOException {
startGenotypeField(VCFConstants.GENOTYPE_QUALITY_KEY, 1, BCF2Type.INT8);
for ( final String name : header.getGenotypeSamples() ) {
final Genotype g = vc.getGenotype(name); // todo -- can we optimize this?
if ( g.hasLog10PError() ) {
final int GQ = Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL);
if ( GQ > VCFConstants.MAX_GENOTYPE_QUAL ) throw new ReviewedStingException("Unexpectedly large GQ " + GQ + " at " + vc);
encoder.encodeRawValue(GQ, BCF2Type.INT8);
} else {
encoder.encodeRawMissingValues(1, BCF2Type.INT8);
}
}
}
/**
* Horrible special case to deal with the GenotypeLikelihoods class
* @param vc
* @throws IOException
*/
private final void addPLs(final VariantContext vc) throws IOException {
final String field = VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY;
final int numPLs = getNGenotypeFieldValues(field, vc);
final int[] allPLs = new int[numPLs * vc.getNSamples()];
// collect all of the PLs into a single vector of values
int i = 0;
for ( final String name : header.getGenotypeSamples() ) {
final Genotype g = vc.getGenotype(name); // todo -- can we optimize this?
final GenotypeLikelihoods gls = g.getLikelihoods();
final int[] pls = gls != null ? g.getLikelihoods().getAsPLs() : null;
if ( pls == null )
for ( int j = 0; j < numPLs; j++) allPLs[i++] = -1;
else
for ( int pl : pls ) allPLs[i++] = pl;
}
// determine the best size
final BCF2Type type = encoder.determineIntegerType(allPLs);
startGenotypeField(field, numPLs, type);
for ( int pl : allPLs )
encoder.encodePrimitive(pl == -1 ? type.getMissingBytes() : pl, type);
}
private final void addGenotypes(final VariantContext vc) throws IOException {
if ( vc.getNAlleles() > BCF2Utils.MAX_ALLELES_IN_GENOTYPES )
throw new ReviewedStingException("Current BCF2 encoder cannot handle sites " +
"with > " + BCF2Utils.MAX_ALLELES_IN_GENOTYPES + " alleles, but you have "
+ vc.getNAlleles() + " at " + vc.getChr() + ":" + vc.getStart());
final Map<Allele, String> alleleMap = VCFWriter.buildAlleleMap(vc);
final int maxPloidy = vc.getMaxPloidy();
startGenotypeField(VCFConstants.GENOTYPE_KEY, maxPloidy, BCF2Type.INT8);
for ( final String name : header.getGenotypeSamples() ) {
final Genotype g = vc.getGenotype(name); // todo -- can we optimize this?
final List<Allele> alleles = g.getAlleles();
final int samplePloidy = alleles.size();
for ( int i = 0; i < maxPloidy; i++ ) {
if ( i < samplePloidy ) {
// we encode the actual allele
final Allele a = alleles.get(i);
final int offset = a.isNoCall() ? -1 : Integer.valueOf(alleleMap.get(a));
final int encoded = ((offset+1) << 1) | (g.isPhased() ? 0x01 : 0x00);
encoder.encodePrimitive(encoded, BCF2Type.INT8);
} else {
// we need to pad with missing as we have ploidy < max for this sample
encoder.encodePrimitive(BCF2Type.INT8.getMissingBytes(), BCF2Type.INT8);
}
}
}
}
// --------------------------------------------------------------------------------
//
// Low-level block encoding
//
// --------------------------------------------------------------------------------
/**
* Write the data in the encoder to the outputstream as a length encoded
@ -434,29 +320,18 @@ class BCF2Writer extends IndexingVariantContextWriter {
*
* @throws IOException
*/
@Requires({"infoBlock.length > 0", "genotypesBlock.length >= 0"})
private void writeBlock(final byte[] infoBlock, final byte[] genotypesBlock) throws IOException {
assert infoBlock.length > 0;
assert genotypesBlock.length >= 0;
BCF2Encoder.encodePrimitive(infoBlock.length, BCF2Type.INT32, outputStream);
BCF2Encoder.encodePrimitive(genotypesBlock.length, BCF2Type.INT32, outputStream);
BCF2Utils.encodeRawBytes(infoBlock.length, BCF2Type.INT32, outputStream);
BCF2Utils.encodeRawBytes(genotypesBlock.length, BCF2Type.INT32, outputStream);
outputStream.write(infoBlock);
outputStream.write(genotypesBlock);
}
// TODO -- obvious optimization case
private final BCF2Type encodeStringByRef(final String string) throws IOException {
assert string != null;
return encodeStringsByRef(Collections.singletonList(string));
}
// TODO -- in size == 1 case branch to singleoton fast-path
@Requires("! strings.isEmpty()")
@Ensures("BCF2Type.INTEGERS.contains(result)")
private final BCF2Type encodeStringsByRef(final Collection<String> strings) throws IOException {
assert ! strings.isEmpty();
final List<Integer> offsets = new ArrayList<Integer>(strings.size());
BCF2Type maxType = BCF2Type.INT8; // start with the smallest size
// iterate over strings until we find one that needs 16 bits, and break
for ( final String string : strings ) {
@ -464,28 +339,22 @@ class BCF2Writer extends IndexingVariantContextWriter {
if ( got == null ) throw new ReviewedStingException("Format error: could not find string " + string + " in header as required by BCF");
final int offset = got;
offsets.add(offset);
if ( maxType != BCF2Type.INT32) { // don't bother looking if we already are at 32 bit ints
final BCF2Type type1 = encoder.determineIntegerType(offset);
switch ( type1 ) {
case INT8: break;
case INT16: if ( maxType == BCF2Type.INT8 ) maxType = BCF2Type.INT16; break;
case INT32: maxType = BCF2Type.INT32; break;
default: throw new ReviewedStingException("Unexpected type " + type1);
}
}
}
// we've checked the types for all strings, so write them out
encoder.encodeTyped(offsets, maxType);
return maxType;
final BCF2Type type = BCF2Utils.determineIntegerType(offsets);
encoder.encodeTyped(offsets, type);
return type;
}
private final void startGenotypeField(final String key, final int size, final BCF2Type valueType) throws IOException {
assert key != null && ! key.equals("");
assert size >= 0;
encodeStringByRef(key);
encoder.encodeType(size, valueType);
/**
* Create the contigDictionary from the contigLines extracted from the VCF header
*
* @param contigLines
*/
@Requires("contigDictionary.isEmpty()")
private final void createContigDictionary(final Collection<VCFContigHeaderLine> contigLines) {
int offset = 0;
for ( VCFContigHeaderLine contig : contigLines )
contigDictionary.put(contig.getID(), offset++);
}
}

View File

@ -159,4 +159,10 @@ final class PositionalOutputStream extends OutputStream {
}
public final long getPosition() { return position; }
/**
 * Close this stream and the decorated output stream.
 *
 * Uses try/finally so that {@code out} is closed even when the superclass
 * close fails — the original would leak {@code out} if super.close() threw.
 *
 * @throws IOException if either close fails (the superclass failure wins)
 */
@Override
public void close() throws IOException {
    try {
        super.close();
    } finally {
        out.close();
    }
}
}

View File

@ -0,0 +1,96 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.variantcontext.writer;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.util.HashMap;
/**
 * A convenient way to provide a single view on the many int and int[] field values we work with,
 * for writing out the values. This class makes writing out the inline AD, GQ, PL, DP fields
 * easy and fast
 *
 * @author Mark DePristo
 * @since 6/12
 */
class IntGenotypeFieldAccessors {
    // initialized once per writer to allow parallel writers to work
    private final HashMap<String, Accessor> intGenotypeFieldEncoders = new HashMap<String, Accessor>();

    public IntGenotypeFieldAccessors() {
        // Register an accessor for each inline int genotype field: DP, AD, PL, GQ.
        intGenotypeFieldEncoders.put(VCFConstants.DEPTH_KEY, new IntGenotypeFieldAccessors.DPAccessor());
        intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_ALLELE_DEPTHS, new IntGenotypeFieldAccessors.ADAccessor());
        intGenotypeFieldEncoders.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, new IntGenotypeFieldAccessors.PLAccessor());
        intGenotypeFieldEncoders.put(VCFConstants.GENOTYPE_QUALITY_KEY, new IntGenotypeFieldAccessors.GQAccessor());
    }

    /**
     * Return an accessor for field, or null if none exists
     * @param field the FORMAT field ID (e.g. "DP", "AD", "PL", "GQ")
     * @return the registered accessor, or null if field has no int accessor
     */
    public Accessor getAccessor(final String field) {
        return intGenotypeFieldEncoders.get(field);
    }

    /**
     * Uniform view of a genotype field as an int[] (null when the field is
     * missing for this genotype).
     */
    public static abstract class Accessor {
        public abstract int[] getValues(final Genotype g);

        // Number of values this genotype carries for the field; 0 when missing.
        public final int getSize(final Genotype g) {
            final int[] v = getValues(g);
            return v == null ? 0 : v.length;
        }
    }

    /**
     * Accessor for single-int fields (DP, GQ).
     *
     * WARNING: getValues() returns a SHARED, reused array for speed — callers
     * must consume (or copy) the value before the next getValues() call, and
     * a single accessor instance must not be used from multiple threads.
     * A value of -1 is treated as "missing" and mapped to null.
     */
    private static abstract class AtomicAccessor extends Accessor {
        // Reused buffer: avoids a per-genotype allocation on the hot write path.
        private final int[] singleton = new int[1];

        @Override
        public int[] getValues(final Genotype g) {
            singleton[0] = getValue(g);
            return singleton[0] == -1 ? null : singleton;
        }

        // Fetch the scalar value for g, or -1 if missing.
        public abstract int getValue(final Genotype g);
    }

    // GQ, capped at the VCF maximum genotype quality.
    public static class GQAccessor extends AtomicAccessor {
        @Override public int getValue(final Genotype g) { return Math.min(g.getGQ(), VCFConstants.MAX_GENOTYPE_QUAL); }
    }

    // DP (read depth).
    public static class DPAccessor extends AtomicAccessor {
        @Override public int getValue(final Genotype g) { return g.getDP(); }
    }

    // AD (per-allele depths); returns the genotype's own array, not a copy.
    public static class ADAccessor extends Accessor {
        @Override public int[] getValues(final Genotype g) { return g.getAD(); }
    }

    // PL (phred-scaled likelihoods); returns the genotype's own array, not a copy.
    public static class PLAccessor extends Accessor {
        @Override public int[] getValues(final Genotype g) { return g.getPL(); }
    }
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.variantcontext.writer;
import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -53,28 +54,7 @@ class VCFWriter extends IndexingVariantContextWriter {
// were filters applied?
protected boolean filtersWereAppliedToContext = false;
// /**
// * create a VCF writer, given a file to write to
// *
// * @param location the file location to write to
// */
// public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) {
// this(location, openOutputStream(location), refDict, true, false);
// }
//
// public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) {
// this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false);
// }
//
// /**
// * create a VCF writer, given a stream to write to
// *
// * @param output the file location to write to
// * @param doNotWriteGenotypes do not write genotypes
// */
// public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) {
// this(null, output, refDict, false, doNotWriteGenotypes);
// }
private IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();
public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
@ -230,7 +210,7 @@ class VCFWriter extends IndexingVariantContextWriter {
if ( !vc.hasLog10PError() )
mWriter.write(VCFConstants.MISSING_VALUE_v4);
else
mWriter.write(getQualValue(vc.getPhredScaledQual()));
mWriter.write(formatQualValue(vc.getPhredScaledQual()));
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// FILTER
@ -250,7 +230,7 @@ class VCFWriter extends IndexingVariantContextWriter {
// FORMAT
final GenotypesContext gc = vc.getGenotypes();
if ( gc instanceof LazyGenotypesContext && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() != null) {
if ( gc.isLazyWithData() && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() instanceof String ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString());
} else {
@ -272,7 +252,7 @@ class VCFWriter extends IndexingVariantContextWriter {
}
}
public static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
private static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()+1);
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
@ -298,10 +278,13 @@ class VCFWriter extends IndexingVariantContextWriter {
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
private String getQualValue(double qual) {
String s = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, qual);
if ( s.endsWith(VCFConstants.DOUBLE_PRECISION_INT_SUFFIX) )
s = s.substring(0, s.length() - VCFConstants.DOUBLE_PRECISION_INT_SUFFIX.length());
private static final String QUAL_FORMAT_STRING = "%.2f";
private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00";
private String formatQualValue(double qual) {
String s = String.format(QUAL_FORMAT_STRING, qual);
if ( s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) )
s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length());
return s;
}
@ -347,6 +330,13 @@ class VCFWriter extends IndexingVariantContextWriter {
*/
private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
throws IOException {
if ( ! mHeader.getGenotypeSamples().containsAll(vc.getSampleNames()) ) {
final List<String> badSampleNames = new ArrayList<String>();
for ( final Genotype g : vc.getGenotypes() )
if ( ! mHeader.getGenotypeSamples().contains(g.getSampleName()) )
badSampleNames.add(g.getSampleName());
throw new ReviewedStingException("BUG: VariantContext contains some samples not in the VCF header: bad samples are " + Utils.join(",",badSampleNames));
}
for ( String sample : mHeader.getGenotypeSamples() ) {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -360,9 +350,9 @@ class VCFWriter extends IndexingVariantContextWriter {
}
List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for ( String key : genotypeFormatKeys ) {
for ( String field : genotypeFormatKeys ) {
if ( key.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
if ( !g.isAvailable() ) {
throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
}
@ -376,36 +366,50 @@ class VCFWriter extends IndexingVariantContextWriter {
continue;
}
Object val = g.hasAttribute(key) ? g.getAttribute(key) : VCFConstants.MISSING_VALUE_v4;
// some exceptions
if ( key.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
if ( ! g.hasLog10PError() )
val = VCFConstants.MISSING_VALUE_v4;
String outputValue;
final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field);
if ( accessor != null ) {
final int[] intValues = accessor.getValues(g);
if ( intValues == null )
outputValue = VCFConstants.MISSING_VALUE_v4;
else if ( intValues.length == 1 ) // fast path
outputValue = Integer.toString(intValues[0]);
else {
val = getQualValue(Math.min(g.getPhredScaledQual(), VCFConstants.MAX_GENOTYPE_QUAL));
}
} else if ( key.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) {
val = g.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())) : (g.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(key);
if ( metaData != null ) {
int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size());
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."
StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
for ( int i = 1; i < numInFormatField; i++ ) {
StringBuilder sb = new StringBuilder();
sb.append(intValues[0]);
for ( int i = 1; i < intValues.length; i++) {
sb.append(",");
sb.append(VCFConstants.MISSING_VALUE_v4);
sb.append(intValues[i]);
}
val = sb.toString();
outputValue = sb.toString();
}
} else {
Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;
// some exceptions
if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) {
val = g.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(g.getFilters())) : (g.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
if ( metaData != null ) {
int numInFormatField = metaData.getCount(vc.getAlternateAlleles().size());
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."
StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
for ( int i = 1; i < numInFormatField; i++ ) {
sb.append(",");
sb.append(VCFConstants.MISSING_VALUE_v4);
}
val = sb.toString();
}
}
// assume that if key is absent, then the given string encoding suffices
outputValue = formatVCFField(val);
}
// assume that if key is absent, then the given string encoding suffices
String outputValue = formatVCFField(val);
if ( outputValue != null )
attrs.add(outputValue);
}
@ -438,12 +442,41 @@ class VCFWriter extends IndexingVariantContextWriter {
mWriter.write(encoding);
}
/**
* Takes a double value and pretty prints it to a String for display
*
* Large doubles => gets %.2f style formatting
* Doubles < 1 but >= 1/100 => get %.3f style formatting
* Double < 1/100 => %.3e formatting
* @param d
* @return
*/
public static final String formatVCFDouble(final double d) {
String format;
if ( d < 1 ) {
if ( d < 0.01 ) {
if ( Math.abs(d) >= 1e-20 )
format = "%.3e";
else {
// return a zero format
return "0.00";
}
} else {
format = "%.3f";
}
} else {
format = "%.2f";
}
return String.format(format, d);
}
public static String formatVCFField(Object val) {
String result;
if ( val == null )
result = VCFConstants.MISSING_VALUE_v4;
else if ( val instanceof Double )
result = String.format(VCFConstants.DOUBLE_PRECISION_FORMAT_STRING, (Double)val);
result = formatVCFDouble((Double) val);
else if ( val instanceof Boolean )
result = (Boolean)val ? "" : null; // empty string for true, null for false
else if ( val instanceof List ) {
@ -475,21 +508,24 @@ class VCFWriter extends IndexingVariantContextWriter {
boolean sawGoodGT = false;
boolean sawGoodQual = false;
boolean sawGenotypeFilter = false;
boolean sawDP = false;
boolean sawAD = false;
boolean sawPL = false;
for ( final Genotype g : vc.getGenotypes() ) {
keys.addAll(g.getAttributes().keySet());
if ( g.isAvailable() )
sawGoodGT = true;
if ( g.hasLog10PError() )
sawGoodQual = true;
if (g.isFiltered() && g.isCalled())
sawGenotypeFilter = true;
keys.addAll(g.getExtendedAttributes().keySet());
if ( g.isAvailable() ) sawGoodGT = true;
if ( g.hasGQ() ) sawGoodQual = true;
if ( g.hasDP() ) sawDP = true;
if ( g.hasAD() ) sawAD = true;
if ( g.hasPL() ) sawPL = true;
if (g.isFiltered() && g.isCalled()) sawGenotypeFilter = true;
}
if ( sawGoodQual )
keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
if (sawGenotypeFilter)
keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
if ( sawDP ) keys.add(VCFConstants.DEPTH_KEY);
if ( sawAD ) keys.add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
if ( sawPL ) keys.add(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));

View File

@ -87,8 +87,10 @@ public abstract class BaseTest {
private static final String networkTempDir;
private static final File networkTempDirFile;
public static final File testDirFile = new File("public/testdata/");
protected static final String testDirRelative = "public/testdata/";
public static final File testDirFile = new File(testDirRelative);
public static final String testDir = testDirFile.getAbsolutePath() + "/";
protected static final String testDirRoot = testDir.replace(testDirRelative, "");
public static final String keysDataLocation = validationDataLocation + "keys/";
public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key";

View File

@ -25,14 +25,13 @@
package org.broadinstitute.sting;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.testng.Assert;
import java.io.*;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
/**
@ -44,14 +43,46 @@ import java.util.Arrays;
* Utilities for manipulating the MD5 database of previous results
*/
public class MD5DB {
public static final Logger logger = Logger.getLogger(MD5DB.class);
/**
* Subdirectory under the ant build directory where we store integration test md5 results
*/
private static final int MAX_RECORDS_TO_READ = 10000;
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = 1000;
private static final int MAX_RECORDS_TO_READ = 100000;
private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1;
public static final String LOCAL_MD5_DB_DIR = "integrationtests";
public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests";
// tracking and emitting a data file of origina and new md5s
private final File MD5MismatchesFile;
private final PrintStream md5MismatchStream;
public MD5DB() {
this(new File(MD5DB.LOCAL_MD5_DB_DIR + "/md5mismatches.txt"));
}
public MD5DB(final File MD5MismatchesFile) {
this.MD5MismatchesFile = MD5MismatchesFile;
ensureMd5DbDirectory();
logger.debug("Creating md5 mismatch db at " + MD5MismatchesFile);
try {
md5MismatchStream = new PrintStream(new FileOutputStream(MD5MismatchesFile));
md5MismatchStream.printf("%s\t%s\t%s%n", "expected", "observed", "test");
} catch ( FileNotFoundException e ) {
throw new ReviewedStingException("Failed to open md5 mismatch file", e);
}
}
public void close() {
if ( md5MismatchStream != null ) {
logger.debug("Closeing md5 mismatch db at " + MD5MismatchesFile);
md5MismatchStream.close();
}
}
// ----------------------------------------------------------------------
//
// MD5 DB stuff
@ -61,7 +92,7 @@ public class MD5DB {
/**
* Create the MD5 file directories if necessary
*/
protected static void ensureMd5DbDirectory() {
private void ensureMd5DbDirectory() {
File dir = new File(LOCAL_MD5_DB_DIR);
if ( ! dir.exists() ) {
System.out.printf("##### Creating MD5 db %s%n", LOCAL_MD5_DB_DIR);
@ -79,7 +110,7 @@ public class MD5DB {
* @param valueIfNotFound
* @return
*/
public static String getMD5FilePath(final String md5, final String valueIfNotFound) {
public String getMD5FilePath(final String md5, final String valueIfNotFound) {
// we prefer the global db to the local DB, so match it first
for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) {
File f = getFileForMD5(md5, dir);
@ -99,7 +130,7 @@ public class MD5DB {
* @param dbPath
* @return
*/
private static File getFileForMD5(final String md5, final String dbPath) {
private File getFileForMD5(final String md5, final String dbPath) {
final String basename = String.format("%s.integrationtest", md5);
return new File(dbPath + "/" + basename);
}
@ -110,7 +141,7 @@ public class MD5DB {
* @param md5
* @param resultsFile
*/
private static void updateMD5Db(final String md5, final File resultsFile) {
private void updateMD5Db(final String md5, final File resultsFile) {
copyFileToDB(getFileForMD5(md5, LOCAL_MD5_DB_DIR), resultsFile);
copyFileToDB(getFileForMD5(md5, GLOBAL_MD5_DB_DIR), resultsFile);
}
@ -120,7 +151,7 @@ public class MD5DB {
* @param dbFile
* @param resultsFile
*/
private static void copyFileToDB(File dbFile, final File resultsFile) {
private void copyFileToDB(File dbFile, final File resultsFile) {
if ( ! dbFile.exists() ) {
// the file isn't already in the db, copy it over
System.out.printf("##### Updating MD5 file: %s%n", dbFile.getPath());
@ -192,7 +223,7 @@ public class MD5DB {
* @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text.
* @return The calculated MD5.
*/
public static MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) {
public MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) {
final String actualMD5 = testFileMD5(name, resultsFile, expectedMD5, parameterize);
String failMessage = null;
boolean failed = false;
@ -218,7 +249,7 @@ public class MD5DB {
* @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text.
* @return The calculated MD5.
*/
public static String testFileMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) {
public String testFileMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) {
try {
byte[] bytesOfMessage = getBytesFromFile(resultsFile);
byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage);
@ -247,11 +278,13 @@ public class MD5DB {
BaseTest.log(String.format("calculated %s", filemd5sum));
BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File));
md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, filemd5sum, name);
md5MismatchStream.flush();
// inline differences
// TODO -- capture output and put in log
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final PrintStream ps = new PrintStream(baos);
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE);
DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false);
boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params);
if ( success ) {
final String content = baos.toString();

View File

@ -40,10 +40,13 @@ import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContextTestProvider;
import java.io.*;
import org.testng.Assert;
import org.testng.annotations.AfterSuite;
import org.testng.annotations.BeforeMethod;
import java.io.File;
import java.text.SimpleDateFormat;
import java.util.*;
@ -52,13 +55,26 @@ public class WalkerTest extends BaseTest {
private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false;
private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false;
private static MD5DB md5DB = new MD5DB();
@BeforeMethod
public void initializeRandomGenerator() {
public void initializeWalkerTests() {
logger.debug("Initializing walker tests");
GenomeAnalysisEngine.resetRandomGenerator();
}
@AfterSuite
public void finalizeWalkerTests() {
logger.debug("Finalizing walker tests");
md5DB.close();
}
public static MD5DB getMd5DB() {
return md5DB;
}
public MD5DB.MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) {
return MD5DB.assertMatchingMD5(name, resultsFile, expectedMD5, parameterize());
return getMd5DB().assertMatchingMD5(name, resultsFile, expectedMD5, parameterize());
}
public void validateOutputBCFIfPossible(final String name, final File resultFile) {
@ -67,6 +83,7 @@ public class WalkerTest extends BaseTest {
logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile);
try {
VariantContextTestProvider.assertVCFandBCFFilesAreTheSame(resultFile, bcfFile);
logger.warn(" Shadow BCF PASSED!");
} catch ( Exception e ) {
Assert.fail("Exception received reading shadow BCFFile " + bcfFile + " for test " + name, e);
}
@ -103,9 +120,9 @@ public class WalkerTest extends BaseTest {
for (int i = 0; i < resultFiles.size(); i++) {
MD5DB.MD5Match result = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i));
validateOutputBCFIfPossible(name, resultFiles.get(i));
if ( ! result.failed ) {
validateOutputIndex(name, resultFiles.get(i));
validateOutputBCFIfPossible(name, resultFiles.get(i));
md5s.add(result.expectedMD5);
} else {
fails.add(result);
@ -256,8 +273,6 @@ public class WalkerTest extends BaseTest {
}
protected Pair<List<File>, List<String>> executeTest(final String name, WalkerTestSpec spec) {
MD5DB.ensureMd5DbDirectory(); // ensure the md5 directory exists
List<File> tmpFiles = new ArrayList<File>();
for (int i = 0; i < spec.nOutputFiles; i++) {
String ext = spec.exts == null ? ".tmp" : "." + spec.exts.get(i);
@ -337,8 +352,11 @@ public class WalkerTest extends BaseTest {
boolean gotAnException = false;
try {
final String now = new SimpleDateFormat("HH:mm:ss").format(new Date());
System.out.println(String.format("[%s] Executing test %s with GATK arguments: %s",
now, name, Utils.join(" ",command)));
final String cmdline = Utils.join(" ",command);
System.out.println(String.format("[%s] Executing test %s with GATK arguments: %s", now, name, cmdline));
// also write the command line to the HTML log for convenient follow-up
// do the replaceAll so paths become relative to the current
BaseTest.log(cmdline.replaceAll(testDirRoot, ""));
CommandLineExecutable.start(instance, command);
} catch (Exception e) {
gotAnException = true;

View File

@ -764,23 +764,6 @@ public class ParsingEngineUnitTest extends BaseTest {
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
}
@Test
public void variantContextBindingArgumentTestVCF3() {
final String[] commandLine = new String[] {"-V:vcf3",NON_EXISTANT_FILENAME_VCF};
parsingEngine.addArgumentSource( VariantContextRodBindingArgProvider.class );
parsingEngine.parse( commandLine );
parsingEngine.validate();
VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider();
parsingEngine.loadArgumentsIntoObject( argProvider );
Assert.assertEquals(argProvider.binding.getName(), "binding", "Name isn't set properly");
Assert.assertEquals(argProvider.binding.getSource(), NON_EXISTANT_FILENAME_VCF, "Source isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getType(), VariantContext.class, "Type isn't set to its expected value");
Assert.assertEquals(argProvider.binding.getTags().getPositionalTags().size(), 1, "Tags aren't correctly set");
}
private class ListRodBindingArgProvider {
@Input(fullName = "binding", shortName="V", required=false)
public List<RodBinding<Feature>> bindings;

View File

@ -47,10 +47,6 @@ public class EngineFeaturesIntegrationTest extends WalkerTest {
testBadRODBindingInput("beagle", "BEAGLE input to VCF expecting walker", UserException.BadArgumentValue.class);
}
@Test() private void testBadRODBindingInputType2() {
testBadRODBindingInput("vcf3", "VCF3 input to VCF expecting walker", UserException.class);
}
@Test() private void testBadRODBindingInputType3() {
testBadRODBindingInput("bed", "Bed input to VCF expecting walker", UserException.BadArgumentValue.class);
}

View File

@ -24,7 +24,7 @@ public class SymbolicAllelesIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(b36KGReference, "symbolic_alleles_1.vcf"),
1,
Arrays.asList("444a20659f67592a8284e0b7849e4302"));
Arrays.asList("c79137da24ad4dc15cedc742de39247f"));
executeTest("Test symbolic alleles", spec);
}
@ -33,7 +33,7 @@ public class SymbolicAllelesIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(b36KGReference, "symbolic_alleles_2.vcf"),
1,
Arrays.asList("93a24c019663a6011b4d6de12538df11"));
Arrays.asList("3f6cbbd5fdf164d87081a3af19eeeba7"));
executeTest("Test symbolic alleles mixed in with non-symbolic alleles", spec);
}
}

View File

@ -16,7 +16,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsNotAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --variant:VCF3 " + testDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("5720826c2bf6cbc762e4a888ef58c3f2"));
Arrays.asList("dfa5dff09fa964b06da19c0f4aff6928"));
executeTest("test file has annotations, not asking for annotations, #1", spec);
}
@ -24,7 +24,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsNotAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --variant:VCF3 " + testDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("088e5db7d8de6606cd562885fa47f3b2"));
Arrays.asList("9914bd19f6235c550e5182e0f4591da6"));
executeTest("test file has annotations, not asking for annotations, #2", spec);
}
@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + testDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("37fd6826db907f80d4631bae1b629da4"));
Arrays.asList("6a52ef10bb10d72cdd82a8f7afc2dd09"));
executeTest("test file has annotations, asking for annotations, #1", spec);
}
@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testHasAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + testDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("8a85c20b219a8bb286df3c9f4e1cdc8c"));
Arrays.asList("74d894fd31b449deffca88d0e465f01b"));
executeTest("test file has annotations, asking for annotations, #2", spec);
}
@ -48,7 +48,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsNotAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --variant:VCF3 " + testDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("da446d3a3e9aefa7537b65b5adc3609b"));
Arrays.asList("dd89dfa22f0e1d6760095e04f528d62a"));
executeTest("test file doesn't have annotations, not asking for annotations, #1", spec);
}
@ -58,7 +58,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
// they don't get reordered. It's a good test of the genotype ordering system.
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --variant:VCF3 " + testDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("04c71d90e3df9d519160636ceb0f02b9"));
Arrays.asList("542d9ed8290ef7868387af4127e0b5fa"));
executeTest("test file doesn't have annotations, not asking for annotations, #2", spec);
}
@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + testDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("6d64723c808a3dd774ed06e228f9c63d"));
Arrays.asList("b1b32ed3b831c92c94258c8e4a60e8c9"));
executeTest("test file doesn't have annotations, asking for annotations, #1", spec);
}
@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoAnnotsAsking2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant:VCF3 " + testDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1,
Arrays.asList("153a23b2fa4eb0ee288e4bb2f0fc4bf8"));
Arrays.asList("a25eacb0ceea2c082af349f8d7776c8a"));
executeTest("test file doesn't have annotations, asking for annotations, #2", spec);
}
@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testExcludeAnnotations() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + testDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1,
Arrays.asList("a28a503ab204474ecee306c9eceb1060"));
Arrays.asList("ef046909a6f6c6cb43653a255a99a014"));
executeTest("test exclude annotations", spec);
}
@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testOverwritingHeader() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant " + testDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1,
Arrays.asList("1d98be77dad9c703402de0315db5176a"));
Arrays.asList("5c2fded3b6a96b0b0788086bbb2409ed"));
executeTest("test overwriting header", spec);
}
@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoReads() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -G Standard --variant " + testDir + "vcfexample3empty.vcf -L " + testDir + "vcfexample3empty.vcf", 1,
Arrays.asList("ea6201db7c1fd5cb9cc3110a3396c646"));
Arrays.asList("c590088d85edce786604fd600f5d5e75"));
executeTest("not passing it any reads", spec);
}
@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testDBTagWithDbsnp() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + testDir + "vcfexample3empty.vcf -L " + testDir + "vcfexample3empty.vcf", 1,
Arrays.asList("5103b9d9857530dc0ccdb8ca0a1db8c3"));
Arrays.asList("ade9354a4cdd6cc92c169f252fb36f3f"));
executeTest("getting DB tag with dbSNP", spec);
}
@ -114,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testMultipleIdsWithDbsnp() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + testDir + "vcfexample3withIDs.vcf -L " + testDir + "vcfexample3withIDs.vcf", 1,
Arrays.asList("d519c21ab0ae901d39856fea7e0e9d83"));
Arrays.asList("f496f40e1e9efa743e3b473f6fe6e6d3"));
executeTest("adding multiple IDs with dbSNP", spec);
}
@ -122,7 +122,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testDBTagWithHapMap() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --comp:H3 " + testDir + "fakeHM3.vcf -G Standard --variant " + testDir + "vcfexample3empty.vcf -L " + testDir + "vcfexample3empty.vcf", 1,
Arrays.asList("746f3a431c6491b85dd6fcf75065550f"));
Arrays.asList("d383fbd741d604625c9507d4da1c5a27"));
executeTest("getting DB tag with HM3", spec);
}
@ -130,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testNoQuals() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --variant " + testDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + testDir + "noQual.vcf -A QualByDepth", 1,
Arrays.asList("7ce09a89e72ee95f21313e496311068a"));
Arrays.asList("4a247f039dfb16ac05b38a0dd5f98da6"));
executeTest("test file doesn't have QUALs", spec);
}
@ -138,7 +138,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testUsingExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + testDir + "targetAnnotations.vcf -G Standard --variant:VCF3 " + testDir + "vcfexample3empty.vcf -E foo.AF -L " + testDir + "vcfexample3empty.vcf", 1,
Arrays.asList("accce2796a967d05d756e1b5adecd6d2"));
Arrays.asList("067792efcffea93ade632e52a80d0d8f"));
executeTest("using expression", spec);
}
@ -146,13 +146,13 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
public void testUsingExpressionWithID() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + testDir + "targetAnnotations.vcf -G Standard --variant:VCF3 " + testDir + "vcfexample3empty.vcf -E foo.ID -L " + testDir + "vcfexample3empty.vcf", 1,
Arrays.asList("9a37502ab929ac3d5a829467f5612853"));
Arrays.asList("66c68deb0508348324eb47d524e756de"));
executeTest("using expression with ID", spec);
}
@Test
public void testTabixAnnotations() {
final String MD5 = "bb9a148716fc69d706c5be146c1afa00";
final String MD5 = "5aebcf8f76c649d645708b1262185c80";
for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1,
@ -168,7 +168,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
validationDataLocation + "1kg_exomes_unfiltered.AFR.unfiltered.vcf --snpEffFile " + validationDataLocation +
"snpEff2.0.5.AFR.unfiltered.vcf -L 1:1-1,500,000 -L 2:232,325,429",
1,
Arrays.asList("bef7201d9c79facbecba15d4abcc684b")
Arrays.asList("0c20cda1cf0b903a287f1807ae5bee02")
);
executeTest("Testing SnpEff annotations", spec);
}
@ -187,7 +187,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
@Test
public void testTDTAnnotation() {
final String MD5 = "900e9d82ea3127aa06e676cf50b341f6";
final String MD5 = "81f85f0ce8cc36df7c717c478e100ba1";
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + b37KGReference + " -A TransmissionDisequilibriumTest --variant:vcf " + testDir + "ug.random50000.subset300bp.chr1.family.vcf" +
" -L " + testDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + testDir + "ug.random50000.family.ped -o %s", 1,
@ -198,7 +198,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
@Test
public void testChromosomeCountsPed() {
final String MD5 = "7fe0e9df2d9fb375beb7cf23afdb4c87";
final String MD5 = "9830fe2247651377e68ad0b0894e9a4e";
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + b37KGReference + " -A ChromosomeCounts --variant:vcf " + testDir + "ug.random50000.subset300bp.chr1.family.vcf" +
" -L " + testDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + testDir + "ug.random50000.family.ped -o %s", 1,
@ -208,7 +208,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
@Test
public void testInbreedingCoeffPed() {
final String MD5 = "7aaf0033a823bbf9066b43764d8dd660";
final String MD5 = "e94d589b5691e3ecfd9cc9475a384890";
WalkerTestSpec spec = new WalkerTestSpec(
"-T VariantAnnotator -R " + b37KGReference + " -A InbreedingCoeff --variant:vcf " + testDir + "ug.random50000.subset300bp.chr1.family.vcf" +
" -L " + testDir + "ug.random50000.subset300bp.chr1.family.vcf --no_cmdline_in_header -ped " + testDir + "ug.random50000.family.ped -o %s", 1,

View File

@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " +
"--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " +
"--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " +
"-o %s --no_cmdline_in_header", 1, Arrays.asList("7fd0d0c2d1af3b16378339c181e40611"));
"-o %s --no_cmdline_in_header", 1, Arrays.asList("cdbf8cc557f5be9ac778e52338c0d906"));
executeTest("test BeagleOutputToVCF", spec);
}
@ -50,7 +50,7 @@ public class BeagleIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ProduceBeagleInput -R " + hg19Reference + " " +
"--variant:VCF3 " + beagleValidationDataLocation + "inttestbgl.input.vcf " +
"-o %s", 1, Arrays.asList("a01c704246f3dd1b9c65774007e51e69"));
"-o %s", 1, Arrays.asList("f301b089d21da259873f04bdc468835d"));
executeTest("test BeagleInput", spec);
}
@ -60,7 +60,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"-T ProduceBeagleInput --variant:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+
"--validation:VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+
"-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta --no_cmdline_in_header ",2,
Arrays.asList("660986891b30cdc937e0f2a3a5743faa","e96ddd51da9f4a797b2aa8c20e404166"));
Arrays.asList("660986891b30cdc937e0f2a3a5743faa","4b6417f892ccfe5c63b8a60cb0ef3740"));
executeTest("test BeagleInputWithBootstrap",spec);
}
@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest {
"--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
"-L 20:1-70000 -o %s --no_cmdline_in_header ",1,Arrays.asList("43865f3f0d975ee2c5912b31393842f8"));
"-L 20:1-70000 -o %s --no_cmdline_in_header ",1,Arrays.asList("8c05bda0630155bcd0ebaf155ed5e491"));
executeTest("testBeagleChangesSitesToRef",spec);
}

View File

@ -35,12 +35,14 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
private class TestParams extends TestDataProvider {
public File master, test;
public String MD5;
public boolean doPairwise;
private TestParams(String master, String test, String MD5) {
private TestParams(String master, String test, final boolean doPairwise, String MD5) {
super(TestParams.class);
this.master = new File(master);
this.test = new File(test);
this.MD5 = MD5;
this.doPairwise = doPairwise;
}
public String toString() {
@ -50,8 +52,10 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
@DataProvider(name = "data")
public Object[][] createData() {
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "daead9bfab1a5df72c5e3a239366118e");
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "3f46f5a964f7c34015d972256fe49a35");
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", true, "bf7ef17436a7eccf27be41a9477904f6");
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", true, "3f46f5a964f7c34015d972256fe49a35");
new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", false, "8ab29169cff232e670db9a4c54fc4358");
new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", false, "47bf16c27c9e2c657a7e1d13f20880c9");
return TestParams.getTests(TestParams.class);
}
@ -61,6 +65,7 @@ public class DiffObjectsIntegrationTest extends WalkerTest {
"-T DiffObjects -R public/testdata/exampleFASTA.fasta "
+ " -m " + params.master
+ " -t " + params.test
+ (params.doPairwise ? " -doPairwise " : "")
+ " -o %s",
Arrays.asList(params.MD5));
executeTest("testDiffObjects:"+params, spec).getFirst();

View File

@ -16,7 +16,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testNoAction() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("5720826c2bf6cbc762e4a888ef58c3f2"));
Arrays.asList("dfa5dff09fa964b06da19c0f4aff6928"));
executeTest("test no action", spec);
}
@ -24,7 +24,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testClusteredSnps() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -window 10 --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("d7c2a4b0c1b2b982847508997ba57ebf"));
Arrays.asList("4a4596929f9fe983d8868ca142567781"));
executeTest("test clustered SNPs", spec);
}
@ -32,7 +32,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testMask1() {
WalkerTestSpec spec1 = new WalkerTestSpec(
baseTestString() + " -maskName foo --mask:VCF3 " + testDir + "vcfexample2.vcf --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("890774962576c407d8a17ed57cf704c1"));
Arrays.asList("1719462cd17986c33e59e45b69df0270"));
executeTest("test mask all", spec1);
}
@ -40,7 +40,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testMask2() {
WalkerTestSpec spec2 = new WalkerTestSpec(
baseTestString() + " -maskName foo --mask:VCF " + testDir + "vcfMask.vcf --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("8864573dbf52908501140e6b0afcbc90"));
Arrays.asList("db19ff7d90c82cda09fb3c3878100eb5"));
executeTest("test mask some", spec2);
}
@ -48,7 +48,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testMask3() {
WalkerTestSpec spec3 = new WalkerTestSpec(
baseTestString() + " -maskName foo -maskExtend 10 --mask:VCF " + testDir + "vcfMask.vcf --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("42a1c08763f151073a49e3c7bb68028b"));
Arrays.asList("a9e417cba21585c786d4b9930265ea31"));
executeTest("test mask extend", spec3);
}
@ -56,7 +56,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilter1() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -filter 'DoC < 20 || FisherStrand > 20.0' -filterName foo --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("ef8100c3b7c67d28571cbda771c414c2"));
Arrays.asList("4160904b180d1f62a6bf50de6728ce00"));
executeTest("test filter #1", spec);
}
@ -64,7 +64,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilter2() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -filter 'AlleleBalance < 70.0 && FisherStrand == 1.4' -filterName bar --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("318ed3874fd42b7da8c59554a25a1fab"));
Arrays.asList("df80db30c7836731ac7c8c3d4fc005b4"));
executeTest("test filter #2", spec);
}
@ -72,7 +72,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testFilterWithSeparateNames() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --filterName ABF -filter 'AlleleBalance < 0.7' --filterName FSF -filter 'FisherStrand == 1.4' --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("9cb398e78a38a7bc5e839e28c8dae2eb"));
Arrays.asList("71ce6c0952831cb68f575aa0173dce2b"));
executeTest("test filter with separate names #2", spec);
}
@ -80,7 +80,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testGenotypeFilters1() {
WalkerTestSpec spec1 = new WalkerTestSpec(
baseTestString() + " -G_filter 'GQ == 0.60' -G_filterName foo --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("b38709f932b969e4267603333863269e"));
Arrays.asList("179f7f2a90c0e6c656109aac9b775476"));
executeTest("test genotype filter #1", spec1);
}
@ -88,7 +88,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testGenotypeFilters2() {
WalkerTestSpec spec2 = new WalkerTestSpec(
baseTestString() + " -G_filter 'AF == 0.04 && isHomVar == 1' -G_filterName foo --variant:VCF3 " + testDir + "vcfexample2.vcf -L 1:10,020,000-10,021,000", 1,
Arrays.asList("0e1457e678326e44e92ee13e84414e0f"));
Arrays.asList("22e07c27feb9017a130dfb045c5b29b9"));
executeTest("test genotype filter #2", spec2);
}
@ -96,7 +96,7 @@ public class VariantFiltrationIntegrationTest extends WalkerTest {
public void testDeletions() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --filterExpression 'QUAL < 100' --filterName foo --variant:VCF " + testDir + "twoDeletions.vcf", 1,
Arrays.asList("569546fd798afa0e65c5b61b440d07ac"));
Arrays.asList("637256ee5348c1c57f1dadf581b06ed9"));
executeTest("test deletions", spec);
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
@ -50,7 +51,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
}
private static Genotype createGenotype(String name, double[] gls) {
return new Genotype(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), Genotype.NO_LOG10_PERROR, gls);
return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make();
}
@DataProvider(name = "getGLs")

View File

@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("bf5c76bec6e00199d441b6175b6b7c39"));
Arrays.asList("b6c677b2375541fd2db775d0029571e6"));
executeTest("test MultiSample Pilot1", spec);
}
@ -37,7 +37,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("9f56f8d62c047213c894c3f250706aea"));
Arrays.asList("3400dfae6db8ed7e1351b1aa52341714"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
}
@ -45,7 +45,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("0a5048062cd9022b761ae87efed5957e"));
Arrays.asList("0bb67b07ee5315d0486f3a0045a03757"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -53,7 +53,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("f50a30bf9bbd4e5dcd5d7d9282b6dadf"));
Arrays.asList("5c5bf3d2676e1a26d521f1f902f73526"));
executeTest("test SingleSample Pilot2", spec);
}
@ -61,7 +61,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + testDir + "multiallelic.snps.bam -o %s -L " + testDir + "multiallelic.snps.intervals", 1,
Arrays.asList("6fb6ea5f2b9da02a0fea7cb2994fb5db"));
Arrays.asList("eb6c8b7680f40b5fdac6e451c623ab81"));
executeTest("test Multiple SNP alleles", spec);
}
@ -69,7 +69,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testBadRead() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH -I " + testDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
Arrays.asList("95158fb50db5d41a678cd331a3ffe5e1"));
Arrays.asList("e2cf97bca4a720ca64ca7f682da6c9f0"));
executeTest("test bad read", spec);
}
@ -77,7 +77,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("c86e05f315a86bc190d72cde911e6fe2"));
Arrays.asList("0c195201574815559757885c693b6640"));
executeTest("test reverse trim", spec);
}
@ -87,7 +87,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "f6d655714706b6e8390037db3fad60ef";
private final static String COMPRESSED_OUTPUT_MD5 = "6209a19a33ac9e187a9074cee549f93b";
@Test
public void testCompressedOutput() {
@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// Note that we need to turn off any randomization for this to work, so no downsampling and no annotations
String md5 = "7bc812cc553b4ab77c08049f0e32d0f6";
String md5 = "34cb7146c037925e8f324cffd986834d";
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1,
@ -140,7 +140,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("dfeaccb68165fdaffafde9150914432d"));
Arrays.asList("f48e4898c741c84354da3a0562cb44e1"));
executeTest("test min_base_quality_score 26", spec);
}
@ -148,7 +148,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("35ef19b4f248969c74da8bd7489385d6"));
Arrays.asList("f4ef85f1ed72e35b91b0469edf5956ad"));
executeTest("test SLOD", spec);
}
@ -156,7 +156,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("aa49989fde8c6378f5c751f8b267c471"));
Arrays.asList("ea219bdce9596e8649ad1d39e24e333a"));
executeTest("test NDA", spec);
}
@ -164,23 +164,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("ffaeb60a5776d85b41c64786ddc4d14d"));
Arrays.asList("9d5c51379e1b1031da5735aa8c965766"));
executeTest("test using comp track", spec);
}
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "f9a4005c53291170800e6023503d5635");
testOutputParameters("-sites_only", "ac8bea16be247d9e39d66a6305409f57");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "e6c63baff51aaeb318c8bebaf2989828");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "1e908a3164adbab10dcb6415e2645954");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "43ffa34646d781a368ea81342c21ae2e");
testOutputParameters("--output_mode EMIT_ALL_SITES", "eee23523912b51b249472e6d5fc0aece");
}
private void testOutputParameters(final String args, final String md5) {
@ -194,7 +194,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("c7cb29121eb30e752ab6652a6d2a62a6"));
Arrays.asList("355bee3d375e994e4a3b07f7a8d267a0"));
executeTest("test confidence 1", spec1);
}
@ -202,7 +202,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1,
Arrays.asList("e7bdb76be82420a03ff28038d283822d"));
Arrays.asList("72d9ea93591b17535b7f5b53e1d064cb"));
executeTest("test confidence 2", spec2);
}
@ -213,12 +213,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "ca65e199e9ff0bc986df3dee74e11eb1" );
testHeterozosity( 0.01, "0ffd19f90b05652e45f58e4a959ae304" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "ddcdfe4a5252da59278a6f1ba6f8a175" );
testHeterozosity( 1.0 / 1850, "b6dbfb567e433273fe90b0d038556a9f" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -242,7 +242,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("c4b3876d76e3d0fb78a1d3ebd674f1a1"));
Arrays.asList("c9675bc1ca6c82cb60d39d9395881c96"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -261,7 +261,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("41445b1cd1a82af71126ff1692f7a5fe"));
Arrays.asList("6e4089986d08d46a8d0b4ddfd611a7c3"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("c9e79470a4ce6eacde366e9fcf4d5b14"));
Arrays.asList("80a5a499cc553ee579ba93dcb967e5ef"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -295,7 +295,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("70f8a17ba68131520db5c764ac5acdd2"));
Arrays.asList("9271105e630ab39cf1c88b338da54594"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -308,7 +308,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("e4316d80fd833886820c8b4e122fbfc4"));
Arrays.asList("d77a379429ca848cea552c4697b86472"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -318,7 +318,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + testDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("c92aba3635f3331ddf8ae7a0382ca594"));
Arrays.asList("f83c4f370ed0a343ca0808e5da3d997d"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -328,7 +328,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ testDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("b87034f349887160ec1124e12863d543"));
Arrays.asList("ca5459e93a9955aec8f93abf7f84e5ed"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -336,13 +336,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("51e6a7868d2ea2daefa411ed82f18be2"));
Arrays.asList("04aaeff1e9f97bbf2dc2d6d754f25a0d"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("954c52be0c6ca9ed5a213a53f4efbc10"));
Arrays.asList("5c7db047ae9417d37c6bbda1d8ea6019"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -352,7 +352,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + testDir + vcf + " -I " + validationDataLocation +
"NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1,
Arrays.asList("ae44230ed54fd8ce63711cae908470cb"));
Arrays.asList("3e3ac23846801c34acbf10a1a527264a"));
executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec);
}
@ -385,7 +385,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("471012c1d3dbec4633710264de5daa24"));
Arrays.asList("90e8140f114e026f2a0e7a881baa3f20"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -393,7 +393,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("9165507fb202d515512a947a8a9db6bb"));
Arrays.asList("db70b7a015fa882c8ce1e4c43f589f22"));
executeTest("test minIndelFraction 0.25", spec);
}
@ -401,7 +401,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinIndelFraction100() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 1", 1,
Arrays.asList("c1bbd4998b7c6dffee1682d3e5c929cc"));
Arrays.asList("50a6774b7d8f71fe0e125c204d50ba84"));
executeTest("test minIndelFraction 1.0", spec);
}
}

View File

@ -29,7 +29,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
2,
Arrays.asList("d54a142d68dca54e478c13f9a0e4c95c","1a37fcc93a73429f9065b942ab771233")
Arrays.asList("cd112ec37a9e28d366aff29a85fdcaa0","313cc749c7ee97713e4551de39e01ac5")
);
executeTest("testTrueNegativeMV", spec);
}
@ -48,7 +48,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
2,
Arrays.asList("883ea7fd2b200c4b7fa95a4f7aa15931","7b1f5309c3d4f4aa7e9061f288dceb68")
Arrays.asList("27ccd6feb51de7e7dcdf35f4697fa4eb","dd90dad9fd11e1b16e6660c3ca0553e7")
);
executeTest("testTruePositiveMV", spec);
}
@ -67,7 +67,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
2,
Arrays.asList("e812d62a3449b74b6948ee7deb8a0790","d00922496759e84c66a4b5e222e36997")
Arrays.asList("719d681bb0a52a40bc854bba107c5c94","b35a86d2cad17f0db7b5e84ddc0e5545")
);
executeTest("testFalsePositiveMV", spec);
}
@ -86,7 +86,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
2,
Arrays.asList("e3c572f933a40e1878a2cfa52049517a","0de6cccfec929caa07cd0eeafacbfffd")
Arrays.asList("7f4a277aee2c7398fcfa84d6c98d5fb3","c53b5fd377bef48e9c6035a94db398db")
);
executeTest("testSpecialCases", spec);
}
@ -108,7 +108,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
2,
Arrays.asList("b42af3b73a2cb38cfc92f8047dd686b3","a69c3f9c005e852b44c29ab25e87ba0d")
Arrays.asList("44e09d2f9e4d8a9488226d03a97fe999","6f596470740e1a57679bbb38c0126364")
);
executeTest("testPriorOption", spec);
}
@ -128,7 +128,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("d00922496759e84c66a4b5e222e36997")
Arrays.asList("b35a86d2cad17f0db7b5e84ddc0e5545")
);
executeTest("testMVFileOption", spec);
}
@ -149,7 +149,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest {
"-fatherAlleleFirst"
),
2,
Arrays.asList("c158a3816357597543ef85c4478c41e8","4f8daca19c8f31bd87850c124f91e330")
Arrays.asList("60ced3d078792a150a03640b62926857","6d550784382aa910f78b533d889c91c0")
);
executeTest("testFatherAlleleFirst", spec);
}

View File

@ -26,7 +26,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:332341-382503",
1,
Arrays.asList("2520f93505fda28d44f618a0123d593b"));
Arrays.asList("0a41b96b04a87fdb99bc3342d48d2eba"));
executeTest("MAX 10 het sites [TEST ONE]; require PQ >= 10", spec);
}
@ -36,7 +36,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:1232503-1332503",
1,
Arrays.asList("965b8f448365b7f4a124d32e809eb048"));
Arrays.asList("f7517896c899a872c24d8e823ac9deae"));
executeTest("MAX 10 het sites [TEST TWO]; require PQ >= 10", spec);
}
@ -46,7 +46,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 2, 30)
+ " -L chr20:332341-382503",
1,
Arrays.asList("60f5bb699335f47cdc505322c5be3803"));
Arrays.asList("cdbdd2f68c232012b6fe9a322b0ea24c"));
executeTest("MAX 2 het sites [TEST THREE]; require PQ >= 30", spec);
}
@ -56,7 +56,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 5, 100)
+ " -L chr20:332341-382503",
1,
Arrays.asList("023c2fb43b50807cfd46841ed6f0d215"));
Arrays.asList("6b70e3e4e28f9583d35d98bf8a7d0d59"));
executeTest("MAX 5 het sites [TEST FOUR]; require PQ >= 100", spec);
}
@ -66,7 +66,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 1000, 7, 10)
+ " -L chr20:332341-482503",
1,
Arrays.asList("e5e6e9f84d108d5b001aa53017d2801e"));
Arrays.asList("6163a1fba27532da77765a7a11c55332"));
executeTest("MAX 7 het sites [TEST FIVE]; require PQ >= 10; cacheWindow = 1000", spec);
}
@ -76,7 +76,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "phasing_test_chr20_332341_1332503.vcf", 20000, 10, 10)
+ " -L chr20:652810-681757",
1,
Arrays.asList("8fc53bfbea2754ff8577460786a3400c"));
Arrays.asList("61a7d05f9eb4317cf0e6937d72e1e7ec"));
executeTest("MAX 10 het sites [TEST SIX]; require PQ >= 10; cacheWindow = 20000; has inconsistent sites", spec);
}
@ -86,7 +86,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10)
+ " -L chr20:332341-802503",
1,
Arrays.asList("c37548b333b65f58d0edfc5c2a62a28a"));
Arrays.asList("44eb225ab3167651ec0a9e1fdcc83d34"));
executeTest("Use trio-phased VCF, but ignore its phasing [TEST SEVEN]", spec);
}
@ -96,7 +96,7 @@ public class ReadBackedPhasingIntegrationTest extends WalkerTest {
baseTestString(hg18Reference, "phasing_test_chr20_332341_1332503.bam", "CEU.trio.2010_03.genotypes.hg18.vcf", 20000, 10, 10)
+ " -L chr20:332341-802503" + " -respectPhaseInInput",
1,
Arrays.asList("dfc7cdddd702e63d46d04f61a3ecd720"));
Arrays.asList("e3549b89d49092e73cc6eb21f233471c"));
executeTest("Use trio-phased VCF, and respect its phasing [TEST EIGHT]", spec);
}

View File

@ -33,7 +33,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleNone + freqUnif + "--variant " + testfile),
1,
Arrays.asList("6a9e990a9252840904b5144213915b32")
Arrays.asList("b8a988757ac1f206d123140da5a3e778")
);
executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec);
@ -45,7 +45,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleNone + freqAF + "--variant " + testfile),
1,
Arrays.asList("eaa2385086cddff68cf4fdb81cbdbbb9")
Arrays.asList("542d5d5ff8c64da7b077bab4b950a9a3")
);
executeTest("testNoSampleSelectionFreqAF--" + testfile, spec);
@ -57,7 +57,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleGT + freqUnif + "--variant " + testfile),
1,
Arrays.asList("24077656f590d6905546f7e019c8dccb")
Arrays.asList("7385b17eed7f4ff0f6e82e60c3334ce7")
);
executeTest("testPolyGTFreqUniform--" + testfile, spec);
@ -69,7 +69,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleGT + freqAF + "--variant " + testfile),
1,
Arrays.asList("3c1180fd9b5e80e540b39c5a95fbe722")
Arrays.asList("0ee4a565a0d4f6b6942abd72a373becd")
);
executeTest("testPolyGTFreqAF--" + testfile, spec);
@ -81,7 +81,7 @@ public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(sampleGL + freqAF + "--variant " + testfile),
1,
Arrays.asList("ad30c028864348204ebe80b9c8c503e8")
Arrays.asList("0ee4a565a0d4f6b6942abd72a373becd")
);
executeTest("testPolyGLFreqAF--" + testfile, spec);

View File

@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
String tests = cmdRoot +
" --dbsnp " + b36dbSNP129 +
" --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
" --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf";
" --comp:comp_genotypes " + testDir + "yri.trio.gatk.ug.head.vcf";
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s",
1, Arrays.asList("4b9dcbce0717285e3c0c736c1bed744c"));
executeTestParallel("testSelect1", spec);

View File

@ -27,8 +27,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
VRTest lowPass = new VRTest("phase1.projectConsensus.chr20.raw.snps.vcf",
"0ddd1e0e483d2eaf56004615cea23ec7", // tranches
"a45a78de049cfe767ce23d3423f80b01", // recal file
"1050c387d170639f8cec221e5dddd626"); // cut VCF
"6e1f98bb819ccf03e17a2288742160d3", // recal file
"c58ff4140e8914f0b656ed625c7f73b9"); // cut VCF
@DataProvider(name = "VRTest")
public Object[][] createData1() {
@ -67,16 +67,16 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" --no_cmdline_in_header" +
" -input " + params.inVCF +
" -o %s" +
" -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null),
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null),
Arrays.asList(params.cutVCFMD5));
executeTest("testApplyRecalibration-"+params.inVCF, spec);
}
VRTest indel = new VRTest("combined.phase1.chr20.raw.indels.sites.vcf",
"da4458d05f6396f5c4ab96f274e5ccdc", // tranches
"918a5ecad5a2a8a46795144366683188", // recal file
"bf0e8ed5e250d52f0545074c61217d16"); // cut VCF
"8e2417336fa62e6c4d9f61b6deebdd82", // recal file
"05e88052e0798f1c1e83f0a8938bce56"); // cut VCF
@DataProvider(name = "VRIndelTest")
public Object[][] createData2() {
@ -115,8 +115,8 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" --no_cmdline_in_header" +
" -input " + params.inVCF +
" -o %s" +
" -tranchesFile " + MD5DB.getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + MD5DB.getMD5FilePath(params.recalMD5, null),
" -tranchesFile " + getMd5DB().getMD5FilePath(params.tranchesMD5, null) +
" -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null),
Arrays.asList(params.cutVCFMD5));
executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec);
}
@ -133,7 +133,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest {
" -o %s" +
" -tranchesFile " + testDir + "VQSR.mixedTest.tranches" +
" -recalFile " + testDir + "VQSR.mixedTest.recal",
Arrays.asList("9039576b63728df7ee2c881817c0e9eb"));
Arrays.asList("1370d7701a6231633d43a8062b7aff7f"));
executeTest("testApplyRecalibrationSnpAndIndelTogether", spec);
}
}

View File

@ -78,26 +78,26 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
executeTest("combine PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec);
}
@Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c1e82f0842ca721d10f21604f26a5248"); }
@Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "b2fcf3983cc9e667b9bbed8372080776", " -setKey foo"); }
@Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "98c0cbb94e5debf7545a656665a1b659", " -setKey null"); }
@Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "10170f9e72cc831a5820bd03e70fe46a"); } // official project VCF files in tabix format
@Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "6469fce8a5cd5a0f77e5ac5d9e9e192b"); }
@Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "a4cedaa83d54e34cafc3ac4b80acf5b4", " -setKey foo"); }
@Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "ac58a5fde17661e2a19004ca954d9781", " -setKey null"); }
@Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "67a8076e30b4bca0ea5acdc9cd26a4e0"); } // official project VCF files in tabix format
@Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "074e909f80ffcc9fddc3fac89ea36bef"); }
@Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "f26980af214011c0452b8ce843f3063b"); }
@Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "ef2d249ea4b25311966e038aac05c661"); }
@Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "cdb448aaa92ca5a9e393d875b42581b3"); }
@Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "5bc1de1197506aced0f9e7a08b572c44"); }
@Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "284083f60792c5f817899445dfa63a42"); }
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "06e86711bcf0efc0f0c4a378f6147cf6"); } // official project VCF files in tabix format
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "03103f6b39e9fb7a396df0013f01fae6"); } // official project VCF files in tabix format
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "12fc1b8145f7884762f0c2cbbd319ae1"); }
@Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "4efdf983918db822e4ac13d911509576"); } // official project VCF files in tabix format
@Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "848d4408ee953053d2307cefebc6bd6d"); } // official project VCF files in tabix format
@Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "91f6087e6e2bf3df4d1c9700eaff958b"); }
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "7e2dba80ba38b2a86713f635d630eb59"); }
@Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a9be239ab5e03e7e97caef58a3841dd2"); }
@Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "3950392e1b8b53ae363e705185ad1da9"); }
@Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "0b1815c699e71e143ed129bfadaffbcb"); }
@Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "5c60eb8d5d4b957a0cf52ca008f021ba"); }
@Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "774b43e69cc7ec93090b4f6e9f4a1079"); }
@Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "def52bcd3942bbe39cd7ebe845c4f206"); }
@Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "5f61145949180bf2a0cd342d8e064860"); }
@Test public void threeWayWithRefs() {
WalkerTestSpec spec = new WalkerTestSpec(
@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
" -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" +
" -genotypeMergeOptions UNIQUIFY -L 1"),
1,
Arrays.asList("948291bbf47d1cec692d0fe4358ff92c"));
Arrays.asList("c0d4d601aa5d2b29927c535868448d2a"));
executeTest("threeWayWithRefs", spec);
}
@ -127,17 +127,17 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec);
}
@Test public void complexTestFull() { combineComplexSites("", "dd805f6edfc3cf724512dfbbe8df5183"); }
@Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "14a205edb022f79abf1863588cfee56b"); }
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "e118d04d1d47c02ad38c046561a9f616"); }
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "e118d04d1d47c02ad38c046561a9f616"); }
@Test public void complexTestFull() { combineComplexSites("", "7d587bf49bbc9f8239476bab84bf9708"); }
@Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "4d1e0c12d95f50e472493fc14af3cc06"); }
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "9a98b01b9b2a28ae6af3125edc131dea"); }
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "9a98b01b9b2a28ae6af3125edc131dea"); }
@Test
public void combineDBSNPDuplicateSites() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineVariants --no_cmdline_in_header -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132,
1,
Arrays.asList("a838dc241cf357466cd4331fd298c73a"));
Arrays.asList("3d2a5a43db86e3f6217ed2a63251285b"));
executeTest("combineDBSNPDuplicateSites:", spec);
}
}

View File

@ -6,7 +6,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.testng.Assert;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderUnitTest;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderUnitTest;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.testng.annotations.Test;

View File

@ -40,7 +40,7 @@ public class LeftAlignVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LeftAlignVariants -o %s -R " + b37KGReference + " --variant:vcf " + testDir + "forLeftAlignVariantsTest.vcf --no_cmdline_in_header",
1,
Arrays.asList("8e0991576518823b339a4e2f83299d4f"));
Arrays.asList("bcf05f56adbb32a47b6d6b27b327d5c2"));
executeTest("test left alignment", spec);
}
}

View File

@ -38,7 +38,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
@Test
public void testb36Tohg19() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LiftoverVariants -o %s -R " + b36KGReference + " --variant:vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict",
"-T LiftoverVariants -o %s -R " + b36KGReference + " --variant " + testDir + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict",
1,
Arrays.asList("70aeaca5b74cc7ba8e2da7b71ff0fbfd"));
executeTest("test b36 to hg19", spec);
@ -47,7 +47,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
@Test
public void testb36Tohg19UnsortedSamples() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LiftoverVariants -o %s -R " + b36KGReference + " --variant:vcf3 " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict",
"-T LiftoverVariants -o %s -R " + b36KGReference + " --variant " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.noheader.unsortedSamples.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict",
1,
Arrays.asList("07d1bf52125d1f9a25e260e13ec7b010"));
executeTest("test b36 to hg19, unsorted samples", spec);

View File

@ -10,6 +10,47 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
return "-T SelectVariants -R " + b36KGReference + " -L 1 -o %s --no_cmdline_in_header" + args;
}
@Test
public void testDiscordanceNoSampleSpecified() {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("133fd0ded0bb213097cbe68995afbb7e")
);
spec.disableShadowBCF();
executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec);
}
@Test
public void testRepeatedLineSelection() {
String testfile = testDir + "test.dup.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -sn B -sn C --variant " + testfile),
1,
Arrays.asList("b2ee12588ebda200727762a903b8c972")
);
executeTest("testRepeatedLineSelection--" + testfile, spec);
}
@Test
public void testDiscordance() {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("f64c90c4cca470f1095d9fa2062eac3e")
);
spec.disableShadowBCF();
executeTest("testDiscordance--" + testFile, spec);
}
@Test
public void testComplexSelection() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
@ -18,7 +59,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile),
1,
Arrays.asList("6cd82274335eeb0b449e571f38d54d3a")
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
);
spec.disableShadowBCF();
executeTest("testComplexSelection--" + testfile, spec);
@ -32,53 +73,13 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile,
1,
Arrays.asList("bbd7b28d1c5701e17b395d64f8b6f13d")
Arrays.asList("b24f31db48d254d8fe15295955173486")
);
spec.disableShadowBCF();
executeTest("testSampleExclusion--" + testfile, spec);
}
@Test
public void testRepeatedLineSelection() {
String testfile = testDir + "test.dup.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -sn A -sn B -sn C --variant " + testfile),
1,
Arrays.asList("77579c53dbde4e8171f3cee83b98351b")
);
executeTest("testRepeatedLineSelection--" + testfile, spec);
}
@Test
public void testDiscordance() {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("03abdc27bfd7aa36d57bba0325b31e0d")
);
spec.disableShadowBCF();
executeTest("testDiscordance--" + testFile, spec);
}
@Test
public void testDiscordanceNoSampleSpecified() {
String testFile = testDir + "NA12878.hg19.example1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("9fb54ed003234a5847c565ffb6767b95")
);
spec.disableShadowBCF();
executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec);
}
@Test
public void testConcordance() {
@ -87,7 +88,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("76857b016198c3e08a2e27bbdb49f3f0")
Arrays.asList("9da5dab3d344c1c0a5987b15e60fa082")
);
spec.disableShadowBCF();
@ -101,7 +102,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("6c0b0c5f03d26f4a7a1438a2afc9fb6b")
Arrays.asList("30b89b3a6706f7f46b23bfb3be69cc8e")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
@ -114,7 +115,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("a8a26c621018142c9cba1080cbe687a8")
Arrays.asList("8bf557aaa07eccb294c81f491225bf9e")
);
executeTest("testUsingDbsnpName--" + testFile, spec);
@ -127,7 +128,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("6bee6dc2316aa539560a6d9d8adbc4ff")
Arrays.asList("5bf9663274ceb552f5469f8c1dfc22ed")
);
executeTest("testRegenotype--" + testFile, spec);
@ -140,10 +141,10 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -select 'KG_FREQ < 0.5' --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("6ff686a64e98fc1be2cde9b034d4a43a")
Arrays.asList("cb9932f9a7aa2e53af605b30d88ad43f")
);
executeTest("testMultipleRecordsAtOnePositionFirstIsFiltered--" + testFile, spec);
executeTest("testMultipleRecordsAtOnePosition--" + testFile, spec);
}
@Test
@ -153,13 +154,13 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " --variant " + testFile + " -o %s --no_cmdline_in_header",
1,
Arrays.asList("95c4d43b11c3d0dd3ab19941c474269b")
Arrays.asList("920605cc2182026e3f54c009f6a04141")
);
executeTest("testMultipleRecordsAtOnePositionFirstIsFiltered--" + testFile, spec);
executeTest("testNoGTs--" + testFile, spec);
}
@Test
@Test(enabled = false)
public void testParallelization2() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
@ -168,13 +169,13 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"),
1,
Arrays.asList("6cd82274335eeb0b449e571f38d54d3a")
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
);
spec.disableShadowBCF();
executeTest("testParallelization (2 threads)--" + testfile, spec);
}
@Test
@Test(enabled = false)
public void testParallelization4() {
String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
@ -182,7 +183,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
spec = new WalkerTestSpec(
baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"),
1,
Arrays.asList("6cd82274335eeb0b449e571f38d54d3a")
Arrays.asList("446eea62630bc5325ffab30b9b9fbfe4")
);
spec.disableShadowBCF();
@ -196,7 +197,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile,
1,
Arrays.asList("fa92b3b41f1c04f685be8de32afc9706")
Arrays.asList("2f2a342812ba914bcce666e42ef761d7")
);
executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec);
}

View File

@ -46,7 +46,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
// Copy VCF data from the test file into the FIFO.
String testFile = validationDataLocation + "yri.trio.gatk.ug.head.vcf";
String testFile = testDir + "yri.trio.gatk.ug.head.vcf";
FileInputStream inputStream = new FileInputStream(testFile);
FileOutputStream outputStream = new FileOutputStream(tmpFifo);
outputStream.getChannel().transferFrom(inputStream.getChannel(),0,inputStream.getChannel().size());
@ -56,11 +56,11 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants" +
" -R " + b36KGReference +
" --variant:vcf3,storage=STREAM " + tmpFifo.getAbsolutePath() +
" --variant:VCF,storage=STREAM " + tmpFifo.getAbsolutePath() +
" --no_cmdline_in_header " +
" -o %s",
1,
Arrays.asList("c5e93b0e2e8610785d43e5d9e7fb5a7b")
Arrays.asList("b532a20b5af4e8ea7a073888976c71ba")
);
executeTest("testSimpleVCFStreaming", spec);
@ -74,13 +74,13 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
File tmpFifo = File.createTempFile("vcfstreaming","");
Runtime.getRuntime().exec(new String[] {"mkfifo",tmpFifo.getAbsolutePath()});
String testFile = validationDataLocation + "yri.trio.gatk.ug.head.vcf";
String testFile = testDir + "yri.trio.gatk.ug.head.vcf";
// Output select to FIFO
WalkerTestSpec selectTestSpec = new WalkerTestSpec(
"-T SelectVariants" +
" -R " + b36KGReference +
" --variant:vcf3,storage=STREAM " + testFile +
" --variant:VCF,storage=STREAM " + testFile +
" --no_cmdline_in_header" +
" -select 'QD > 2.0'" +
" -o " + tmpFifo.getAbsolutePath(),
@ -93,7 +93,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest {
selectTestSpec = new WalkerTestSpec(
"-T VariantEval" +
" -R " + b36KGReference +
" --eval:vcf3 " + testFile +
" --eval " + testFile +
" --comp:vcf,storage=STREAM " + tmpFifo.getAbsolutePath() +
" -EV CompOverlap -noEV -noST" +
" -o %s",

View File

@ -83,10 +83,23 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
" -GF RD" +
" -o %s",
1,
Arrays.asList("f80c4714d83226b6a6db8bf281b3bcba"));
Arrays.asList("d43562e9b94f0e8e337d38a6829671ee"));
executeTest("testGenotypeFields", spec);
}
@Test(enabled = true)
public void testGenotypeFieldsWithInline() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
" --variant " + testDir + "vcfexample2.vcf" +
" -T VariantsToTable" +
" -GF RD -GF GT -GF GQ" +
" -o %s",
1,
Arrays.asList("29744059742ae71fd6aabd29e5c391fb"));
executeTest("testGenotypeFieldsWithInline", spec);
}
@Test(enabled = true)
public void testMoltenOutput() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -111,7 +124,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest {
" --moltenize" +
" -o %s",
1,
Arrays.asList("132890fd33d16946e04b41cfd7453c0e"));
Arrays.asList("1d97fe63c249a995df4ce666382872d8"));
executeTest("testMoltenOutputWithGenotypeFields", spec);
}

View File

@ -19,7 +19,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingDbsnpInput() {
List<String> md5 = new ArrayList<String>();
md5.add("a26afcce2a89f905a49c3d09719586b2");
md5.add("268c116f825c2a4b5200a416ca587adc");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -36,7 +36,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testVariantsToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("4accae035d271b35ee2ec58f403c68c6");
md5.add("82ca5ecef2df5d64dee9ef5a4b14ef2f");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -54,7 +54,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingGeliInput() {
List<String> md5 = new ArrayList<String>();
md5.add("2413f036ec4100b8d5db179946159a82");
md5.add("90bc2e21d633fa6c3c47c6bd86c134a0");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -72,7 +72,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingHapMapInput() {
List<String> md5 = new ArrayList<String>();
md5.add("f343085305e80c7a2493422e4eaad983");
md5.add("bb71dabd072a679cc85fe8d3e130fb2b");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +
@ -89,7 +89,7 @@ public class VariantsToVCFIntegrationTest extends WalkerTest {
@Test
public void testGenotypesToVCFUsingVCFInput() {
List<String> md5 = new ArrayList<String>();
md5.add("b1ddde7efff9c405f8f92f0a636cd919");
md5.add("ae39e2249bc20fcd0a668a7fe5fb84b0");
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-R " + b36KGReference +

View File

@ -31,7 +31,7 @@ package org.broadinstitute.sting.utils.codecs.bcf2;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.variantcontext.writer.BCF2Encoder;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@ -41,7 +41,10 @@ import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class BCF2EncoderDecoderUnitTest extends BaseTest {
@ -172,13 +175,11 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
}
}
@DataProvider(name = "BCF2EncodingTestProviderSingletons")
public Object[][] BCF2EncodingTestProviderSingletons() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( BCF2TypedValue tv : primitives )
tests.add(new Object[]{Arrays.asList(tv)});
return tests.toArray(new Object[][]{});
}
// -----------------------------------------------------------------
//
// Test encoding of basic types
//
// -----------------------------------------------------------------
@DataProvider(name = "BCF2EncodingTestProviderBasicTypes")
public Object[][] BCF2EncodingTestProviderBasicTypes() {
@ -188,36 +189,68 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "BCF2EncodingTestProviderSequences")
public Object[][] BCF2EncodingTestProviderSequences() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( BCF2TypedValue tv1 : forCombinations )
for ( BCF2TypedValue tv2 : forCombinations )
for ( BCF2TypedValue tv3 : forCombinations )
tests.add(new Object[]{Arrays.asList(tv1, tv2, tv3)});
return tests.toArray(new Object[][]{});
private interface EncodeMe {
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException;
}
@Test(dataProvider = "BCF2EncodingTestProviderSingletons")
public void testBCF2EncodingSingletons(final List<BCF2TypedValue> toEncode) throws IOException {
final byte[] record = encodeRecord(toEncode);
decodeRecord(toEncode, record);
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
public void testBCF2BasicTypesWithStaticCalls(final List<BCF2TypedValue> toEncode) throws IOException {
testBCF2BasicTypesWithEncodeMe(toEncode,
new EncodeMe() {
@Override
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException {
switch ( tv.type ) {
case INT8:
case INT16:
case INT32:
encoder.encodeTypedInt((Integer)tv.value, tv.type);
break;
case FLOAT:
encoder.encodeTypedFloat((Double)tv.value);
break;
case CHAR:
encoder.encodeTypedString((String)tv.value);
break;
}
}
});
}
@DataProvider(name = "ListOfStrings")
public Object[][] listOfStringsProvider() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"});
tests.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"});
tests.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"});
return tests.toArray(new Object[][]{});
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
public void testBCF2BasicTypesWithObjectType(final List<BCF2TypedValue> toEncode) throws IOException {
testBCF2BasicTypesWithEncodeMe(toEncode,
new EncodeMe() {
@Override
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException {
encoder.encodeTyped(tv.value, tv.type);
}
});
}
@Test(dataProvider = "ListOfStrings")
public void testEncodingListOfString(List<String> strings, String expected) throws IOException {
final String collapsed = BCF2Utils.collapseStringList(strings);
Assert.assertEquals(collapsed, expected);
Assert.assertEquals(BCF2Utils.exploreStringList(collapsed), strings);
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
public void testBCF2BasicTypesWithObjectNoType(final List<BCF2TypedValue> toEncode) throws IOException {
testBCF2BasicTypesWithEncodeMe(toEncode,
new EncodeMe() {
@Override
public void encode(final BCF2Encoder encoder, final BCF2TypedValue tv) throws IOException {
encoder.encode(tv.value);
}
});
}
public void testBCF2BasicTypesWithEncodeMe(final List<BCF2TypedValue> toEncode, final EncodeMe func) throws IOException {
for ( final BCF2TypedValue tv : toEncode ) {
BCF2Encoder encoder = new BCF2Encoder();
func.encode(encoder, tv);
BCF2Decoder decoder = new BCF2Decoder(encoder.getRecordBytes());
final Object decoded = decoder.decodeTypedValue();
Assert.assertNotNull(decoded);
Assert.assertFalse(decoded instanceof List);
myAssertEquals(tv, decoded);
}
}
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
@ -240,30 +273,34 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
}
}
@DataProvider(name = "BestIntTypeTests")
public Object[][] BestIntTypeTests() {
@DataProvider(name = "BCF2EncodingTestProviderSingletons")
public Object[][] BCF2EncodingTestProviderSingletons() {
List<Object[]> tests = new ArrayList<Object[]>();
tests.add(new Object[]{Arrays.asList(1), BCF2Type.INT8});
tests.add(new Object[]{Arrays.asList(1, 10), BCF2Type.INT8});
tests.add(new Object[]{Arrays.asList(1, 10, 100), BCF2Type.INT8});
tests.add(new Object[]{Arrays.asList(1, -1), BCF2Type.INT8});
tests.add(new Object[]{Arrays.asList(1, 1000), BCF2Type.INT16});
tests.add(new Object[]{Arrays.asList(1, 1000, 10), BCF2Type.INT16});
tests.add(new Object[]{Arrays.asList(1, 1000, 100), BCF2Type.INT16});
tests.add(new Object[]{Arrays.asList(1000), BCF2Type.INT16});
tests.add(new Object[]{Arrays.asList(100000), BCF2Type.INT32});
tests.add(new Object[]{Arrays.asList(100000, 10), BCF2Type.INT32});
tests.add(new Object[]{Arrays.asList(100000, 100), BCF2Type.INT32});
tests.add(new Object[]{Arrays.asList(100000, 1, -10), BCF2Type.INT32});
tests.add(new Object[]{Arrays.asList(-100000, 1, -10), BCF2Type.INT32});
for ( BCF2TypedValue tv : primitives )
tests.add(new Object[]{Arrays.asList(tv)});
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BestIntTypeTests")
public void determineBestEncoding(final List<Integer> ints, final BCF2Type expectedType) throws IOException {
BCF2Encoder encoder = new BCF2Encoder();
Assert.assertEquals(encoder.determineIntegerType(ints), expectedType);
Assert.assertEquals(encoder.determineIntegerType(ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))), expectedType);
/**
 * Encodes a singleton list of typed values into a record and verifies it
 * decodes back to the same values.
 */
@Test(dataProvider = "BCF2EncodingTestProviderSingletons")
public void testBCF2EncodingSingletons(final List<BCF2TypedValue> toEncode) throws IOException {
    // encode and immediately verify the round trip
    decodeRecord(toEncode, encodeRecord(toEncode));
}
// -----------------------------------------------------------------
//
// Test encoding of vectors
//
// -----------------------------------------------------------------
/**
 * Data provider yielding every ordered triple drawn from forCombinations,
 * so sequence encoding is exercised across all value-type combinations.
 */
@DataProvider(name = "BCF2EncodingTestProviderSequences")
public Object[][] BCF2EncodingTestProviderSequences() {
    final List<Object[]> cases = new ArrayList<Object[]>();
    for ( final BCF2TypedValue first : forCombinations ) {
        for ( final BCF2TypedValue second : forCombinations ) {
            for ( final BCF2TypedValue third : forCombinations ) {
                cases.add(new Object[]{Arrays.asList(first, second, third)});
            }
        }
    }
    return cases.toArray(new Object[][]{});
}
@Test(dataProvider = "BCF2EncodingTestProviderBasicTypes")
@ -289,13 +326,72 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
}
}
/**
 * Encodes a three-element sequence of typed values into a single record and
 * verifies the decoded record matches.  Runs only after the singleton
 * round-trip tests have passed.
 */
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingSingletons")
public void testBCF2EncodingTestProviderSequences(final List<BCF2TypedValue> toEncode) throws IOException {
    // single round trip: encode then decode with verification
    decodeRecord(toEncode, encodeRecord(toEncode));
}
// -----------------------------------------------------------------
//
// Test strings and lists of strings
//
// -----------------------------------------------------------------
/**
 * Data provider pairing a list of strings with its expected collapsed form
 * (leading comma, comma-separated) as produced by BCF2Utils.collapseStringList.
 */
@DataProvider(name = "ListOfStrings")
public Object[][] listOfStringsProvider() {
    final List<Object[]> cases = new ArrayList<Object[]>();
    cases.add(new Object[]{Arrays.asList("s1", "s2"), ",s1,s2"});
    cases.add(new Object[]{Arrays.asList("s1", "s2", "s3"), ",s1,s2,s3"});
    cases.add(new Object[]{Arrays.asList("s1", "s2", "s3", "s4"), ",s1,s2,s3,s4"});
    return cases.toArray(new Object[][]{});
}
// Verifies that collapsing a list of strings produces the expected
// comma-prefixed form, and that the collapse/expand pair is a lossless
// round trip back to the original list.
// NOTE(review): the method name "exploreStringList" reads like a typo of
// "explodeStringList" (the inverse of collapseStringList) -- confirm against
// the BCF2Utils declaration.
@Test(dataProvider = "ListOfStrings")
public void testEncodingListOfString(List<String> strings, String expected) throws IOException {
final String collapsed = BCF2Utils.collapseStringList(strings);
Assert.assertEquals(collapsed, expected);
Assert.assertEquals(BCF2Utils.exploreStringList(collapsed), strings);
}
// -----------------------------------------------------------------
//
// Tests to determine the best type of arrays of integers
//
// -----------------------------------------------------------------
/**
 * Data provider mapping lists of integers to the narrowest BCF2 integer type
 * capable of representing every value in the list:
 * INT8 for values within a signed byte, INT16 within a signed short, and
 * INT32 otherwise.
 */
@DataProvider(name = "BestIntTypeTests")
public Object[][] BestIntTypeTests() {
    final List<Object[]> cases = new ArrayList<Object[]>();

    // all values fit in a signed byte -> INT8
    cases.add(new Object[]{Arrays.asList(1), BCF2Type.INT8});
    cases.add(new Object[]{Arrays.asList(1, 10), BCF2Type.INT8});
    cases.add(new Object[]{Arrays.asList(1, 10, 100), BCF2Type.INT8});
    cases.add(new Object[]{Arrays.asList(1, -1), BCF2Type.INT8});

    // at least one value needs 16 bits -> INT16
    cases.add(new Object[]{Arrays.asList(1, 1000), BCF2Type.INT16});
    cases.add(new Object[]{Arrays.asList(1, 1000, 10), BCF2Type.INT16});
    cases.add(new Object[]{Arrays.asList(1, 1000, 100), BCF2Type.INT16});
    cases.add(new Object[]{Arrays.asList(1000), BCF2Type.INT16});

    // at least one value needs 32 bits -> INT32
    cases.add(new Object[]{Arrays.asList(100000), BCF2Type.INT32});
    cases.add(new Object[]{Arrays.asList(100000, 10), BCF2Type.INT32});
    cases.add(new Object[]{Arrays.asList(100000, 100), BCF2Type.INT32});
    cases.add(new Object[]{Arrays.asList(100000, 1, -10), BCF2Type.INT32});
    cases.add(new Object[]{Arrays.asList(-100000, 1, -10), BCF2Type.INT32});

    return cases.toArray(new Object[][]{});
}
/**
 * Checks that BCF2Utils selects the expected (narrowest sufficient) integer
 * encoding type for a collection of values, via both the List<Integer> and
 * the primitive int[] overloads of determineIntegerType.
 *
 * Note: the previous version constructed a BCF2Encoder that was never used
 * after the type-determination logic moved to static BCF2Utils methods; the
 * dead local has been removed.
 */
@Test(dataProvider = "BestIntTypeTests")
public void determineBestEncoding(final List<Integer> ints, final BCF2Type expectedType) throws IOException {
    Assert.assertEquals(BCF2Utils.determineIntegerType(ints), expectedType);
    // same check through the primitive-array overload
    Assert.assertEquals(BCF2Utils.determineIntegerType(ArrayUtils.toPrimitive(ints.toArray(new Integer[0]))), expectedType);
}
// -----------------------------------------------------------------
//
// Tests managing and skipping multiple blocks
//
// -----------------------------------------------------------------
@Test(dataProvider = "BCF2EncodingTestProviderSequences", dependsOnMethods = "testBCF2EncodingTestProviderSequences")
public void testReadAndSkipWithMultipleBlocks(final List<BCF2TypedValue> block) throws IOException {
testReadAndSkipWithMultipleBlocks(block, forCombinations);
@ -337,6 +433,82 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
decodeRecord(block2, decoder);
}
// -----------------------------------------------------------------
//
// Test encoding / decoding arrays of ints
//
// This checks that we can encode and decode correctly with the
// low-level decodeIntArray function on arrays of values. This
// has to be pretty comprehensive as decodeIntArray is a highly optimized
// piece of code with lots of edge cases. The values we are encoding
// don't really matter -- just that the values come back as expected.
//
// -----------------------------------------------------------------
/**
 * Data provider generating integer lists of every combination of value count
 * and trailing-null (missing) padding count drawn from {0, 1, 2, 5, 10, 100}.
 * Each list holds the values 0..nValues-1 followed by nPad nulls, exercising
 * the truncation behavior of the optimized int-array decoder.
 */
@DataProvider(name = "IntArrays")
public Object[][] makeIntArrays() {
    final List<Object[]> cases = new ArrayList<Object[]>();
    final List<Integer> sizes = Arrays.asList(0, 1, 2, 5, 10, 100);

    for ( final int nValues : sizes ) {
        for ( final int nPad : sizes ) {
            final List<Integer> values = new ArrayList<Integer>(nValues + nPad);

            // leading concrete values 0 .. nValues-1
            for ( int v = 0; v < nValues; v++ )
                values.add(v);

            // trailing run of nPad missing (null) entries
            for ( int p = 0; p < nPad; p++ )
                values.add(null);

            cases.add(new Object[]{values});
        }
    }

    return cases.toArray(new Object[][]{});
}
/**
 * Round-trips a (possibly null-padded) list of Integers through the encoder
 * and the low-level optimized decodeIntArray path, verifying:
 *   - an all-missing input decodes to a null array,
 *   - otherwise each concrete value is recovered in position, and
 *   - the decoded array is truncated at the first missing (null) entry.
 */
@Test(dataProvider = "IntArrays")
public void testIntArrays(final List<Integer> ints) throws IOException {
    final BCF2Encoder intEncoder = new BCF2Encoder();
    intEncoder.encodeTyped(ints, BCF2Type.INT16);

    final BCF2Decoder intDecoder = new BCF2Decoder(intEncoder.getRecordBytes());
    // read the int[] with the low-level, optimized decoder entry point
    final int[] decoded = intDecoder.decodeIntArray(intDecoder.readTypeDescriptor());

    if ( isMissing(ints) ) {
        // an input made entirely of missing values must decode to null
        Assert.assertNull(decoded, "Encoded all missing values -- expected null");
        return;
    }

    // at least one concrete value was encoded, so something must come back
    Assert.assertTrue(decoded.length > 0, "Must have at least 1 element for non-null encoded data");

    for ( int i = 0; i < ints.size(); i++ ) {
        final Integer expected = ints.get(i);
        if ( expected != null ) {
            Assert.assertTrue(decoded.length > i, "we expected at least " + i + " values in decoded array");
            Assert.assertEquals(decoded[i], (int)expected);
        } else {
            // missing entries may not appear: the array is truncated before them
            Assert.assertTrue(decoded.length <= i, "we expect decoded to be truncated for missing values");
        }
    }
}
// -----------------------------------------------------------------
//
// Helper routines
//
// -----------------------------------------------------------------
private final byte[] combineRecords(final byte[] record1, final byte[] record2) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
baos.write(record1);
@ -392,4 +564,12 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest {
} else
Assert.assertEquals(decoded, tv.value);
}
/**
 * Returns true iff the list contains no concrete values: it is null, empty,
 * or made up entirely of null (missing) entries.
 */
private final boolean isMissing(final List<Integer> values) {
    if ( values == null )
        return true;

    for ( final Integer value : values ) {
        if ( value != null )
            return false; // found a real value, so not all-missing
    }

    return true;
}
}

Some files were not shown because too many files have changed in this diff Show More