diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index 3cbffe577..0a2e2af17 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -51,6 +51,8 @@ import java.util.List; * @version 0.1 */ public class VariantContextWriterStub implements Stub, VariantContextWriter { + public final static boolean UPDATE_CONTIG_HEADERS = false; + /** * The engine, central to the GATK's processing. */ @@ -215,7 +217,8 @@ public class VariantContextWriterStub implements Stub, Var vcfHeader.addMetaDataLine(commandLineArgHeaderLine); } - vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine); + if ( UPDATE_CONTIG_HEADERS ) + vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine); } outputTracker.getStorage(this).writeHeader(vcfHeader); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 4f4608a36..aa3243b74 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -145,7 +146,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa } // public String getIndelBases() - public List getKeyNames() { return Arrays.asList("AD"); } + public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } public List getDescriptions() { return Arrays.asList( diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 806a9c2a5..98ed629b7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -2144,7 +2144,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { class VCFIndelAttributes { - public static String ALLELIC_DEPTH_KEY = "AD"; + public static String ALLELIC_DEPTH_KEY = VCFConstants.GENOTYPE_ALLELE_DEPTHS; public static String DEPTH_TOTAL_KEY = VCFConstants.DEPTH_KEY; public static String MAPQ_KEY = "MQS"; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 6fd2b39ed..dd0d85367 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -798,8 +798,8 @@ public class PhaseByTransmission extends RodWalker, HashMa mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t%s:%s:%s:%s\t%s:%s:%s:%s", vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(), phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(), - phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString()); + phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS),phasedFather.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS),phasedChild.getLikelihoodsString()); if(!(phasedMother.getType()==mother.getType() && phasedFather.getType()==father.getType() && phasedChild.getType()==child.getType())) metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); } @@ -809,8 +809,8 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t%s:%s:%s:%s\t.:.:.:.\t%s:%s:%s:%s", vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(), - phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute("AD"),phasedMother.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString()); + phasedMother.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedMother.getGenotypeString(),phasedMother.getAttribute(VCFConstants.DEPTH_KEY),phasedMother.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS),phasedMother.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS),phasedChild.getLikelihoodsString()); } } else{ @@ -820,8 +820,8 @@ public class PhaseByTransmission extends RodWalker, HashMa metricsCounters.put(NUM_GENOTYPES_MODIFIED,metricsCounters.get(NUM_GENOTYPES_MODIFIED)+1); mvfLine = String.format("%s\t%d\t%s\t%s\t%s\t%s\t.:.:.:.\t%s:%s:%s:%s\t%s:%s:%s:%s", vc.getChr(),vc.getStart(),vc.getFilters(),vc.getAttribute(VCFConstants.ALLELE_COUNT_KEY),sample.toString(), - phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute("AD"),phasedFather.getLikelihoodsString(), - phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute("AD"),phasedChild.getLikelihoodsString()); + phasedFather.getAttribute(TRANSMISSION_PROBABILITY_TAG_NAME),phasedFather.getGenotypeString(),phasedFather.getAttribute(VCFConstants.DEPTH_KEY),phasedFather.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS),phasedFather.getLikelihoodsString(), + phasedChild.getGenotypeString(),phasedChild.getAttribute(VCFConstants.DEPTH_KEY),phasedChild.getAttribute(VCFConstants.GENOTYPE_ALLELE_DEPTHS),phasedChild.getLikelihoodsString()); } //Report violation if set so diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 84018b652..3da645cbd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -34,7 +34,6 @@ import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -313,7 +312,7 @@ public class BCF2Codec implements FeatureCodec, ReferenceDepende final VariantContextBuilder builder ) { if (siteInfo.nSamples > 0) { final LazyGenotypesContext.LazyParser lazyParser = - new LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields); + new BCF2LazyGenotypesDecoder(this, siteInfo.alleles, siteInfo.nSamples, siteInfo.nFormatFields); final int nGenotypes = header.getGenotypeSamples().size(); LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, decoder.getRecordBytes(), nGenotypes); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/LazyGenotypesDecoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java similarity index 64% rename from public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/LazyGenotypesDecoder.java rename to public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java index 6d716756f..a628be53b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/LazyGenotypesDecoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2LazyGenotypesDecoder.java @@ -28,10 +28,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.LazyGenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -41,8 +38,8 @@ import java.util.*; * @author Mark DePristo * @since 5/12 */ -class LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { - final protected static Logger logger = Logger.getLogger(LazyGenotypesDecoder.class); +class BCF2LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { + final protected static Logger logger = Logger.getLogger(BCF2LazyGenotypesDecoder.class); // the essential information for us to use to decode the genotypes data // initialized when this lazy decoder is created, as we know all of this from the BCF2Codec @@ -52,7 +49,7 @@ class LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { private final int nSamples; private final int nFields; - LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, final int nFields) { + BCF2LazyGenotypesDecoder(final BCF2Codec codec, final ArrayList alleles, final int nSamples, final int nFields) { this.codec = codec; this.siteAlleles = alleles; this.nSamples = nSamples; @@ -131,6 +128,84 @@ class LazyGenotypesDecoder implements LazyGenotypesContext.LazyParser { return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset()); } +// public LazyGenotypesContext.LazyData parse2(final Object data) { +// logger.info("Decoding BCF genotypes for " + nSamples + " samples with " + nFields + " fields each"); +// +// // load our bytep[] data into the decoder +// final BCF2Decoder decoder = new BCF2Decoder((byte[])data); +// +// // go ahead and decode everyone +// final List samples = new ArrayList(codec.getHeader().getGenotypeSamples()); +// +// if ( samples.size() != nSamples ) +// throw new UserException.MalformedBCF2("GATK currently doesn't support reading BCF2 files with " + +// "different numbers of samples per record. Saw " + samples.size() + +// " samples in header but have a record with " + nSamples + " samples"); +// +// final Map> fieldValues = decodeGenotypeFieldValues(decoder, nFields, nSamples); +// final ArrayList genotypes = new ArrayList(nSamples); +// final GenotypeBuilder gb = new GenotypeBuilder(); +// for ( int i = 0; i < nSamples; i++ ) { +// // all of the information we need for each genotype, with default values +// gb.reset(); +// gb.name(samples.get(i)); +// +// for ( final Map.Entry> entry : fieldValues.entrySet() ) { +// final String field = entry.getKey(); +// Object value = entry.getValue().get(i); +// try { +// if ( field.equals(VCFConstants.GENOTYPE_KEY) ) { +// gb.alleles(decodeGenotypeAlleles(siteAlleles, (List)value)); +// } else if ( field.equals(VCFConstants.DEPTH_KEY) ) { +// if ( value != BCF2Type.INT8.getMissingJavaValue() ) +// gb.DP((Integer)value); +// } else if ( field.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) { +// if ( value != BCF2Type.INT8.getMissingJavaValue() ) +// gb.GQ((Integer)value); +// } else if ( field.equals(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) ) { +// final int[] PLs = decodeIntArray(value); +// if ( PLs != null ) +// gb.PL(PLs); +// } else if ( field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS) ) { +// final int[] AD = decodeIntArray(value); +// if ( AD != null ) +// gb.AD(AD); +// } else if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) ) { +// throw new ReviewedStingException("Genotype filters not implemented in GATK BCF2"); +// //filters = new HashSet(values.get(i)); +// } else { // add to attributes +// if ( value != null ) { // don't add missing values +// if ( value instanceof List && ((List)value).size() == 1) +// value = ((List)value).get(0); +// gb.attribute(field, value); +// } +// } +// } catch ( ClassCastException e ) { +// throw new UserException.MalformedBCF2("BUG: expected encoding of field " + field +// + " inconsistent with the value observed in the decoded value in the " +// + " BCF file. Value was " + value); +// } +// } +// +// final Genotype g = gb.make(); +// genotypes.add(g); +// } +// +// return new LazyGenotypesContext.LazyData(genotypes, codec.getHeader().getSampleNamesInOrder(), codec.getHeader().getSampleNameToOffset()); +// } + + private final int[] decodeIntArray(final Object value) { + // todo -- decode directly into int[] + final List pls = (List)value; + if ( pls != null ) { // we have a PL field + final int[] x = new int[pls.size()]; + for ( int j = 0; j < x.length; j++ ) + x[j] = pls.get(j); + return x; + } else + return null; + } + private final List decodeGenotypeAlleles(final ArrayList siteAlleles, final List encoded) { if ( encoded == null ) // no called sample GT = . diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index 4593023fc..fd762342e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -48,6 +48,7 @@ public final class VCFConstants { public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods public static final String GENOTYPE_POSTERIORS_KEY = "GP"; public static final String GENOTYPE_QUALITY_KEY = "GQ"; + public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; public static final String HAPMAP2_KEY = "H2"; public static final String HAPMAP3_KEY = "H3"; public static final String HAPLOTYPE_QUALITY_KEY = "HQ"; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java new file mode 100755 index 000000000..fe92f7c7c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/FastGenotype.java @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broad.tribble.util.ParsingUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * This class encompasses all the basic information about a genotype. It is immutable. + * + * A genotype has several key fields + * + * -- a sample name, must be a non-null string + * + * -- an ordered list of alleles, intrepreted as the genotype of the sample, + * each allele for each chromosome given in order. If alleles = [a*, t] + * then the sample is a/t, with a (the reference from the *) the first + * chromosome and t on the second chromosome + * + * -- a isPhased marker indicting where the alleles are phased with respect to some global + * coordinate system. See VCF4.1 spec for a detailed discussion + * + * -- Inline, optimized ints and int[] values for: + * -- GQ: the phred-scaled genotype quality, of -1 if it's missing + * + * -- DP: the count of reads at this locus for this sample, of -1 if missing + * + * -- AD: an array of counts of reads at this locus, one for each Allele at the site. + * that is, for each allele in the surrounding VariantContext. Null if missing. + * + * -- PL: phred-scaled genotype likelihoods in standard VCF4.1 order for + * all combinations of the alleles in the surrounding VariantContext, given + * the ploidy of the sample (from the alleles vector). Null if missing. + * + * -- A general map from String keys to -> Object values for all other attributes in + * this genotype. Note that this map should not contain duplicate values for the + * standard bindings for GQ, DP, AD, and PL. Genotype filters can be put into + * this genotype, but it isn't respected by the GATK in analyses + * + * The only way to build a Genotype object is with a GenotypeBuilder, which permits values + * to be set in any order, which means that GenotypeBuilder may at some in the chain of + * sets pass through invalid states that are not permitted in a fully formed immutable + * Genotype. + * + * Note this is a simplified, refactored Genotype object based on the original + * generic (and slow) implementation from the original VariantContext + Genotype + * codebase. + * + * @author Mark DePristo + * @since 05/12 + */ +public final class FastGenotype implements Comparable { + private final String sampleName; + private final List alleles; + private final boolean isPhased; + private final int GQ; + private final int DP; + private final int[] AD; + private final int[] PL; + private final Map extendedAttributes; + + private Type type = null; + + /** + * The only way to make one of these, for use by GenotypeBuilder only + * + * @param sampleName + * @param alleles + * @param isPhased + * @param GQ + * @param DP + * @param AD + * @param PL + * @param extendedAttributes + */ + @Requires({ + "sampleName != null", + "alleles != null", + "GQ >= -1", + "DP >= -1", + "validADorPLField(AD)", + "validADorPLField(PL)", + "extendedAttributes != null", + "! hasForbiddenKey(extendedAttributes)"}) + protected FastGenotype(final String sampleName, + final List alleles, + final boolean isPhased, + final int GQ, + final int DP, + final int[] AD, + final int[] PL, + final Map extendedAttributes) { + this.sampleName = sampleName; + this.alleles = alleles; + this.isPhased = isPhased; + this.GQ = GQ; + this.DP = DP; + this.AD = AD; + this.PL = PL; + this.extendedAttributes = extendedAttributes; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Basic getters + // + // --------------------------------------------------------------------------------------------------------- + + /** + * @return the alleles for this genotype. Cannot be null. May be empty + */ + @Ensures("result != null") + public List getAlleles() { + return alleles; + } + + /** + * Returns how many times allele appears in this genotype object? + * + * @param allele + * @return a value >= 0 indicating how many times the allele occurred in this sample's genotype + */ + @Requires("allele != null") + @Ensures("result >= 0") + public int countAllele(final Allele allele) { + int c = 0; + for ( final Allele a : alleles ) + if ( a.equals(allele) ) + c++; + + return c; + } + + /** + * Get the ith allele in this genotype + * + * @param i the ith allele, must be < the ploidy, starting with 0 + * @return the allele at position i, which cannot be null + */ + @Requires("i >=0 && i < getPloidy()") + @Ensures("result != null") + public Allele getAllele(int i) { + if ( getType() == Type.UNAVAILABLE ) + throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype"); + return alleles.get(i); + } + + /** + * Are the alleles phased w.r.t. the global phasing system? + * + * @return true if yes + */ + public boolean isPhased() { + return isPhased; + } + + /** + * What is the ploidy of this sample? + * + * @return the ploidy of this genotype. 0 if the site is no-called. + */ + @Ensures("result >= 0") + public int getPloidy() { + return alleles.size(); + } + + /** + * @return the sequencing depth of this sample, or -1 if this value is missing + */ + @Ensures("result >= -1") + public int getDP() { + return DP; + } + + /** + * @return the count of reads, one for each allele in the surrounding Variant context, + * matching the corresponding allele, or null if this value is missing. MUST + * NOT BE MODIFIED! + */ + public int[] getAD() { + return AD; + } + + @Ensures("result != null") + public String getSampleName() { + return sampleName; + } + + /** + * Returns a phred-scaled quality score, or -1 if none is available + * @return + */ + @Ensures("result >= -1") + public int getGQ() { + return GQ; + } + + // --------------------------------------------------------------------------------------------------------- + // + // The type of this genotype + // + // --------------------------------------------------------------------------------------------------------- + + public enum Type { + /** The sample is no-called (all alleles are NO_CALL */ + NO_CALL, + /** The sample is homozygous reference */ + HOM_REF, + /** The sample is heterozygous, with at least one ref and at least one one alt in any order */ + HET, + /** All alleles are non-reference */ + HOM_VAR, + /** There is no allele data availble for this sample (alleles.isEmpty) */ + UNAVAILABLE, + /** Some chromosomes are NO_CALL and others are called */ + MIXED // no-call and call in the same genotype + } + + /** + * @return the high-level type of this sample's genotype + */ + @Ensures({"type != null", "result != null"}) + public Type getType() { + if ( type == null ) { + type = determineType(); + } + return type; + } + + /** + * Internal code to determine the type of the genotype from the alleles vector + * @return the type + */ + protected Type determineType() { + // TODO -- this code is slow and could be optimized for the diploid case + if ( alleles.isEmpty() ) + return Type.UNAVAILABLE; + + boolean sawNoCall = false, sawMultipleAlleles = false; + Allele observedAllele = null; + + for ( Allele allele : alleles ) { + if ( allele.isNoCall() ) + sawNoCall = true; + else if ( observedAllele == null ) + observedAllele = allele; + else if ( !allele.equals(observedAllele) ) + sawMultipleAlleles = true; + } + + if ( sawNoCall ) { + if ( observedAllele == null ) + return Type.NO_CALL; + return Type.MIXED; + } + + if ( observedAllele == null ) + throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null"); + + return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR; + } + + /** + * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false. + */ + public boolean isHom() { return isHomRef() || isHomVar(); } + + /** + * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false. + */ + public boolean isHomRef() { return getType() == Type.HOM_REF; } + + /** + * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false. + */ + public boolean isHomVar() { return getType() == Type.HOM_VAR; } + + /** + * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false. + */ + public boolean isHet() { return getType() == Type.HET; } + + /** + * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false. + */ + public boolean isNoCall() { return getType() == Type.NO_CALL; } + + /** + * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are). + */ + public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; } + + /** + * @return true if this genotype is comprised of both calls and no-calls. + */ + public boolean isMixed() { return getType() == Type.MIXED; } + + /** + * @return true if the type of this genotype is set. + */ + public boolean isAvailable() { return getType() != Type.UNAVAILABLE; } + + // ------------------------------------------------------------------------------ + // + // methods for getting genotype likelihoods for a genotype object, if present + // + // ------------------------------------------------------------------------------ + + /** + * @return Returns true if this Genotype has PL field values + */ + public boolean hasLikelihoods() { + return PL != null; + } + + /** + * Convenience function that returns a string representation of the PL field of this + * genotype, or . if none is available. + * + * @return a non-null String representation for the PL of this sample + */ + @Ensures("result != null") + public String getLikelihoodsString() { + return hasLikelihoods() ? getLikelihoods().toString() : VCFConstants.MISSING_VALUE_v4; + } + + /** + * Returns the GenotypesLikelihoods data associated with this Genotype, or null if missing + * @return null or a GenotypesLikelihood object for this sample's PL field + */ + @Ensures({"hasLikelihoods() && result != null", "! hasLikelihoods() && result == null"}) + public GenotypeLikelihoods getLikelihoods() { + return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null; + } + + /** + * Unsafe low-level accessor the PL field itself, may be null. + * + * @return a pointer to the underlying PL data. MUST NOT BE MODIFIED! + */ + public int[] getPL() { + return PL; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Many different string representations + // + // --------------------------------------------------------------------------------------------------------- + + /** + * Return a VCF-like string representation for the alleles of this genotype. + * + * Does not append the reference * marker on the alleles. + * + * @return a string representing the genotypes, or null if the type is unavailable. + */ + @Ensures("result != null || ! isAvailable()") + public String getGenotypeString() { + return getGenotypeString(true); + } + + private final static String PHASED_ALLELE_SEPARATOR = "|"; + private final static String UNPHASED_ALLELE_SEPARATOR = "/"; + + /** + * Return a VCF-like string representation for the alleles of this genotype. + * + * If ignoreRefState is true, will not append the reference * marker on the alleles. + * + * @return a string representing the genotypes, or null if the type is unavailable. + */ + @Ensures("result != null || ! isAvailable()") + public String getGenotypeString(boolean ignoreRefState) { + if ( alleles.size() == 0 ) + return null; + + // Notes: + // 1. Make sure to use the appropriate separator depending on whether the genotype is phased + // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele) + // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course) + return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR, + ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles()))); + } + + /** + * Utility that returns a list of allele strings corresponding to the alleles in this sample + * @return + */ + private List getAlleleStrings() { + List al = new ArrayList(); + for ( Allele a : alleles ) + al.add(a.getBaseString()); + + return al; + } + + public String toString() { + return String.format("[%s %s%s%s%s%s%s]", + getSampleName(), + getGenotypeString(false), + toStringIfExists(VCFConstants.GENOTYPE_QUALITY_KEY, GQ), + toStringIfExists(VCFConstants.DEPTH_KEY, DP), + toStringIfExists(VCFConstants.GENOTYPE_ALLELE_DEPTHS, AD), + toStringIfExists(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, PL), + sortedString(getExtendedAttributes())); + } + + public String toBriefString() { + return String.format("%s:Q%d", getGenotypeString(false), getGQ()); + } + + // --------------------------------------------------------------------------------------------------------- + // + // Comparison operations + // + // --------------------------------------------------------------------------------------------------------- + + /** + * comparable genotypes -> compareTo on the sample names + * @param genotype + * @return + */ + @Override + public int compareTo(final FastGenotype genotype) { + return getSampleName().compareTo(genotype.getSampleName()); + } + + public boolean sameGenotype(final FastGenotype other) { + return sameGenotype(other, true); + } + + public boolean sameGenotype(final FastGenotype other, boolean ignorePhase) { + if (getPloidy() != other.getPloidy()) + return false; // gotta have the same number of allele to be equal + + // By default, compare the elements in the lists of alleles, element-by-element + Collection thisAlleles = this.getAlleles(); + Collection otherAlleles = other.getAlleles(); + + if (ignorePhase) { // do not care about order, only identity of Alleles + thisAlleles = new TreeSet(thisAlleles); //implemented Allele.compareTo() + otherAlleles = new TreeSet(otherAlleles); + } + + return thisAlleles.equals(otherAlleles); + } + + // --------------------------------------------------------------------------------------------------------- + // + // get routines for extended attributes + // + // --------------------------------------------------------------------------------------------------------- + + /** + * Returns the extended attributes for this object + * @return is never null, but is often isEmpty() + */ + @Ensures("result != null") + public Map getExtendedAttributes() { + return extendedAttributes; + } + + /** + * Is key associated with a value (even a null one) in the extended attributes? + * + * Note this will not return true for the inline attributes DP, GQ, AD, or PL + * + * @param key a non-null string key to check for an association + * @return true if key has a value in the extendedAttributes + */ + @Requires("key != null") + public boolean hasAttribute(final String key) { + return extendedAttributes.containsKey(key); + } + + /** + * Get the extended attribute value associated with key, if possible + * + * @param key a non-null string key to fetch a value for + * @param defaultValue the value to return if key isn't in the extended attributes + * @return a value (potentially) null associated with key, or defaultValue if no association exists + */ + @Requires("key != null") + @Ensures("hasAttribute(key) || result == defaultValue") + public Object getAttribute(final String key, final Object defaultValue) { + return hasAttribute(key) ? extendedAttributes.get(key) : defaultValue; + } + + // TODO -- add getAttributesAsX interface here + + // ------------------------------------------------------------------------------ + // + // private utilities + // + // ------------------------------------------------------------------------------ + + /** + * a utility method for generating sorted strings from a map key set. + * @param c the map + * @param the key type + * @param the value type + * @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys + */ + @Requires("c != null") + private static , V> String sortedString(Map c) { + + // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS + final List t = new ArrayList(c.keySet()); + Collections.sort(t); + + final List pairs = new ArrayList(); + for (final T k : t) { + pairs.add(k + "=" + c.get(k)); + } + + return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}"; + } + + /** + * Returns a display name for field name with value v if this isn't -1. Otherwise returns "" + * @param name of the field ("AD") + * @param v the value of the field, or -1 if missing + * @return a non-null string for display if the field is not missing + */ + @Requires("name != null") + @Ensures("result != null") + private final static String toStringIfExists(final String name, final int v) { + return v == -1 ? "" : " " + name + " " + v; + } + + /** + * Returns a display name for field name with values vs if this isn't null. Otherwise returns "" + * @param name of the field ("AD") + * @param vs the value of the field, or null if missing + * @return a non-null string for display if the field is not missing + */ + @Requires("name != null") + @Ensures("result != null") + private final static String toStringIfExists(final String name, final int[] vs) { + if ( vs == null ) + return ""; + else { + StringBuilder b = new StringBuilder(); + b.append(" ").append(name).append(" "); + for ( int i = 0; i < vs.length; i++ ) { + if ( i != 0 ) b.append(","); + b.append(vs[i]); + } + return b.toString(); + } + } + + /** + * A list of genotype field keys corresponding to values we + * manage inline in the Genotype object. They must not appear in the + * extended attributes map + */ + private final static Collection FORBIDDEN_KEYS = Arrays.asList( + VCFConstants.GENOTYPE_KEY, + VCFConstants.GENOTYPE_QUALITY_KEY, + VCFConstants.DEPTH_KEY, + VCFConstants.GENOTYPE_ALLELE_DEPTHS, + VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + + /** + * Does the attribute map have a mapping involving a forbidden key (i.e., + * one that's managed inline by this Genotypes object? + * + * @param attributes the extended attributes key + * @return + */ + private final static boolean hasForbiddenKey(final Map attributes) { + for ( final String forbidden : FORBIDDEN_KEYS ) + if ( attributes.containsKey(forbidden) ) + return true; + return false; + } + + /** + * Is values a valid AD or PL field + * @param values + * @return + */ + private final static boolean validADorPLField(final int[] values) { + if ( values != null ) + for ( int v : values ) + if ( v < 0 ) + return false; + return true; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java new file mode 100755 index 000000000..39852c723 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.variantcontext; + + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.*; + +/** + * A builder class for genotypes + * + * @author Mark DePristo + */ +public final class GenotypeBuilder { + private String sampleName = null; + private List alleles = null; + + private boolean isPhased = false; + private int GQ = -1; + private int DP = -1; + private int[] AD = null; + private int[] PL = null; + private Map extendedAttributes = null; + + private final static Map NO_ATTRIBUTES = + new HashMap(0); + + /** + * Create a empty builder. Both a sampleName and alleles must be provided + * before trying to make a Genotype from this builder. + */ + public GenotypeBuilder() {} + + /** + * Create a builder using sampleName. Alleles must be provided + * before trying to make a Genotype from this builder. + * @param sampleName + */ + public GenotypeBuilder(final String sampleName) { + name(sampleName); + } + + /** + * Make a builder using sampleName and alleles for starting values + * @param sampleName + * @param alleles + */ + public GenotypeBuilder(final String sampleName, final List alleles) { + name(sampleName); + alleles(alleles); + } + + /** + * Create a new builder starting with the values in Genotype g + * @param g + */ + public GenotypeBuilder(final FastGenotype g) { + copy(g); + } + + /** + * Copy all of the values for this builder from Genotype g + * @param g + * @return + */ + public GenotypeBuilder copy(final FastGenotype g) { + name(g.getSampleName()); + alleles(g.getAlleles()); + phased(g.isPhased()); + GQ(g.getGQ()); + DP(g.getDP()); + AD(g.getAD()); + PL(g.getPL()); + attributes(g.getExtendedAttributes()); + return this; + } + + /** + * Reset all of the builder attributes to their defaults. After this + * function you must provide sampleName and alleles before trying to + * make more Genotypes. + */ + public final void reset() { + sampleName = null; + alleles = null; + isPhased = false; + GQ = -1; + DP = -1; + AD = null; + PL = null; + extendedAttributes = null; + } + + /** + * Create a new Genotype object using the values set in this builder. + * + * After creation the values in this builder can be modified and more Genotypes + * created, althrough the contents of array values like PL should never be modified + * inline as they are not copied for efficiency reasons. + * + * @return a newly minted Genotype object with values provided from this builder + */ + @Ensures({"result != null"}) + public FastGenotype make() { + final Map ea = extendedAttributes == null ? NO_ATTRIBUTES : extendedAttributes; + return new FastGenotype(sampleName, alleles, isPhased, GQ, DP, AD, PL, ea); + } + + @Requires({"sampleName != null"}) + @Ensures({"this.sampleName != null"}) + public GenotypeBuilder name(final String sampleName) { + this.sampleName = sampleName; + return this; + } + + @Ensures({"this.alleles != null"}) + public GenotypeBuilder alleles(final List alleles) { + if ( alleles == null ) + this.alleles = Collections.emptyList(); + else + this.alleles = alleles; + return this; + } + + public GenotypeBuilder phased(final boolean phased) { + isPhased = phased; + return this; + } + + @Requires({"GQ >= -1"}) + @Ensures({"this.GQ == GQ"}) + public GenotypeBuilder GQ(final int GQ) { + this.GQ = GQ; + return this; + } + + @Requires({"DP >= -1"}) + @Ensures({"this.DP == DP"}) + public GenotypeBuilder DP(final int DP) { + this.DP = DP; + return this; + } + + @Requires({"AD == null || AD.length > 0"}) + @Ensures({"this.AD == AD"}) + public GenotypeBuilder AD(final int[] AD) { + this.AD = AD; + return this; + } + + @Requires("PL == null || PL.length > 0") + @Ensures({"this.PL == PL"}) + public GenotypeBuilder PL(final int[] PL) { + this.PL = PL; + return this; + } + + @Requires("attributes != null") + @Ensures("extendedAttributes != null") + public GenotypeBuilder attributes(final Map attributes) { + for ( Map.Entry pair : attributes.entrySet() ) + attribute(pair.getKey(), pair.getValue()); + return this; + } + + @Requires({"key != null"}) + @Ensures({"extendedAttributes != null", "extendedAttributes.containsKey(key)"}) + public GenotypeBuilder attribute(final String key, final Object value) { + if ( extendedAttributes == null ) + extendedAttributes = new HashMap(5); + extendedAttributes.put(key, value); + return this; + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 81ec322d7..1995736f7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -48,7 +48,7 @@ import java.text.SimpleDateFormat; import java.util.*; public class WalkerTest extends BaseTest { - private static final boolean GENERATE_SHADOW_BCF = true; + private static final boolean GENERATE_SHADOW_BCF = false; private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false;