diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 2b163ecbd..d70c63bd2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -711,6 +711,8 @@ public class SAMDataSource { * @param validationStringency validation stringency. */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { + int totalNumberOfFiles = readerIDs.size(); + int readerNumber = 1; for(SAMReaderID readerID: readerIDs) { File indexFile = findIndexFile(readerID.samFile); @@ -728,8 +730,7 @@ public class SAMDataSource { reader.enableFileSource(true); reader.setValidationStringency(validationStringency); - final SAMFileHeader header = reader.getFileHeader(); - logger.debug(String.format("Sort order is: " + header.getSortOrder())); + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); readers.put(readerID,reader); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index ee0873c6e..a6f6b3481 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -142,20 +142,75 @@ public class SampleDB { * @return */ public final Map> getFamilies() { + return getFamilies(null); + } + + /** + * Returns a map from family ID -> set of family members for all samples in sampleIds with + * non-null family ids + * + * @param sampleIds - all samples to include. If null is passed then all samples are returned. + * @return + */ + public final Map> getFamilies(Collection sampleIds) { final Map> families = new TreeMap>(); for ( final Sample sample : samples.values() ) { - final String famID = sample.getFamilyID(); - if ( famID != null ) { - if ( ! families.containsKey(famID) ) - families.put(famID, new TreeSet()); - families.get(famID).add(sample); + if(sampleIds == null || sampleIds.contains(sample.getID())){ + final String famID = sample.getFamilyID(); + if ( famID != null ) { + if ( ! families.containsKey(famID) ) + families.put(famID, new TreeSet()); + families.get(famID).add(sample); + } } } - return families; } + + /** + * Returns the set of all children that have both of their parents. + * Note that if a family is composed of more than 1 child, each child is + * returned. + * @return - all the children that have both of their parents + */ + public final Set getChildrenWithParents(){ + return getChildrenWithParents(false); + } + + /** + * Returns the set of all children that have both of their parents. + * Note that if triosOnly = false, a family is composed of more than 1 child, each child is + * returned. + * + * This method can be used wherever trios are needed + * + * @param triosOnly - if set to true, only strict trios are returned + * @return - all the children that have both of their parents + */ + public final Set getChildrenWithParents(boolean triosOnly) { + + Map> families = getFamilies(); + final Set childrenWithParents = new HashSet(); + Iterator sampleIterator; + + for ( Set familyMembers: families.values() ) { + if(triosOnly && familyMembers.size() != 3) + continue; + + sampleIterator = familyMembers.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() == 2 && familyMembers.containsAll(sample.getParents())) + childrenWithParents.add(sample); + } + + } + return childrenWithParents; + } + /** * Return all samples with a given family ID * @param familyId diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java index 792fef9c3..6264808f4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Walker.java @@ -88,7 +88,7 @@ public abstract class Walker { return getToolkit().getMasterSequenceDictionary(); } - protected SampleDB getSampleDB() { + public SampleDB getSampleDB() { return getToolkit().getSampleDB(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index bd0d4e3fb..b9e6a5b2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -3,22 +3,18 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Created by IntelliJ IDEA. @@ -30,23 +26,26 @@ import java.util.Map; public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { private MendelianViolation mendelianViolation = null; + private String motherId; + private String fatherId; + private String childId; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( mendelianViolation == null ) { - if ( walker instanceof VariantAnnotator && ((VariantAnnotator) walker).familyStr != null) { - mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP ); + if (checkAndSetSamples(((VariantAnnotator) walker).getSampleDB())) { + mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid Family String file (-family) on the command line."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) && vc.getGenotype(mendelianViolation.getSampleChild()).hasLikelihoods() && - vc.hasGenotype(mendelianViolation.getSampleDad()) && vc.getGenotype(mendelianViolation.getSampleDad()).hasLikelihoods() && - vc.hasGenotype(mendelianViolation.getSampleMom()) && vc.getGenotype(mendelianViolation.getSampleMom()).hasLikelihoods(); + boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && + vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && + vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc)); + toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); return toRet; } @@ -55,4 +54,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getKeyNames() { return Arrays.asList("MVLR"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + + private boolean checkAndSetSamples(SampleDB db){ + Set families = db.getFamilyIDs(); + if(families.size() != 1) + return false; + + Set family = db.getFamily(families.iterator().next()); + if(family.size() != 3) + return false; + + Iterator sampleIter = family.iterator(); + Sample sample; + for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ + if(sample.getParents().size()==2){ + motherId = sample.getMaternalID(); + fatherId = sample.getPaternalID(); + childId = sample.getID(); + return true; + } + } + return false; + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index d555463bc..6638fc7a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -38,7 +38,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati for ( final Genotype genotype : genotypes ) { // we care only about variant calls with likelihoods - if ( genotype.isHomRef() ) + if ( !genotype.isHet() && !genotype.isHomVar() ) continue; AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 3de179365..6cc8923e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -12,10 +12,8 @@ import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.FileNotFoundException; import java.util.*; /** @@ -26,42 +24,33 @@ import java.util.*; public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation { - private Set fullMVSet = null; + private Set trios = null; private final static int REF = 0; private final static int HET = 1; private final static int HOM = 2; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( fullMVSet == null ) { - fullMVSet = new HashSet(); - + if ( trios == null ) { if ( walker instanceof VariantAnnotator ) { - final Map> families = ((VariantAnnotator) walker).getSampleDB().getFamilies(); - for( final Set family : families.values() ) { - for( final Sample sample : family ) { - if( sample.getParents().size() == 2 && family.containsAll(sample.getParents()) ) { // only works with trios for now - fullMVSet.add( new MendelianViolation(sample, 0.0) ); - } - } - } + trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); } else { throw new UserException("Transmission disequilibrium test annotation can only be used from the Variant Annotator and requires a valid ped file be passed in."); } } final Map toRet = new HashMap(1); - final HashSet mvsToTest = new HashSet(); + final HashSet triosToTest = new HashSet(); - for( final MendelianViolation mv : fullMVSet ) { - final boolean hasAppropriateGenotypes = vc.hasGenotype(mv.getSampleChild()) && vc.getGenotype(mv.getSampleChild()).hasLikelihoods() && - vc.hasGenotype(mv.getSampleDad()) && vc.getGenotype(mv.getSampleDad()).hasLikelihoods() && - vc.hasGenotype(mv.getSampleMom()) && vc.getGenotype(mv.getSampleMom()).hasLikelihoods(); + for( final Sample child : trios) { + final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() && + vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() && + vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods(); if ( hasAppropriateGenotypes ) { - mvsToTest.add(mv); + triosToTest.add(child); } } - toRet.put("TDT", calculateTDT( vc, mvsToTest )); + toRet.put("TDT", calculateTDT( vc, triosToTest )); return toRet; } @@ -72,27 +61,27 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("TDT", 1, VCFHeaderLineType.Float, "Test statistic from Wittkowski transmission disequilibrium test.")); } // Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT - private double calculateTDT( final VariantContext vc, final Set mvsToTest ) { + private double calculateTDT( final VariantContext vc, final Set triosToTest ) { - final double nABGivenABandBB = calculateNChildren(vc, mvsToTest, HET, HET, HOM); - final double nBBGivenABandBB = calculateNChildren(vc, mvsToTest, HOM, HET, HOM); - final double nAAGivenABandAB = calculateNChildren(vc, mvsToTest, REF, HET, HET); - final double nBBGivenABandAB = calculateNChildren(vc, mvsToTest, HOM, HET, HET); - final double nAAGivenAAandAB = calculateNChildren(vc, mvsToTest, REF, REF, HET); - final double nABGivenAAandAB = calculateNChildren(vc, mvsToTest, HET, REF, HET); + final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM); + final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM); + final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET); + final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET); + final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET); + final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET); final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB); final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB); return (numer * numer) / denom; } - private double calculateNChildren( final VariantContext vc, final Set mvsToTest, final int childIdx, final int momIdx, final int dadIdx ) { - final double likelihoodVector[] = new double[mvsToTest.size() * 2]; + private double calculateNChildren( final VariantContext vc, final Set triosToTest, final int childIdx, final int momIdx, final int dadIdx ) { + final double likelihoodVector[] = new double[triosToTest.size() * 2]; int iii = 0; - for( final MendelianViolation mv : mvsToTest ) { - final double[] momGL = vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsVector(); - final double[] dadGL = vc.getGenotype(mv.getSampleDad()).getLikelihoods().getAsVector(); - final double[] childGL = vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsVector(); + for( final Sample child : triosToTest ) { + final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector(); + final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector(); + final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector(); likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx]; likelihoodVector[iii++] = momGL[dadIdx] + dadGL[momIdx] + childGL[childIdx]; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 143f2eb2e..94902e828 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -167,9 +167,6 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; - @Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation") - public String familyStr = null; - @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio") public double minGenotypeQualityP = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index a8ce98945..7d3e7047d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -52,7 +52,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { protected enum GenotypeType { AA, AB, BB } - protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE; + protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; protected AlleleFrequencyCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { this.N = N; @@ -62,24 +62,14 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { /** * Must be overridden by concrete subclasses - * @param GLs genotype likelihoods - * @param Alleles Alleles corresponding to GLs - * @param log10AlleleFrequencyPriors priors - * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results + * @param GLs genotype likelihoods + * @param Alleles Alleles corresponding to GLs + * @param log10AlleleFrequencyPriors priors + * @param log10AlleleFrequencyLikelihoods array (pre-allocated) to store likelihoods results + * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store posteriors results */ protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors); - - /** - * Can be overridden by concrete subclasses - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyPosteriors allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - protected abstract GenotypesContext assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood); + double[][] log10AlleleFrequencyPriors, + double[][] log10AlleleFrequencyLikelihoods, + double[][] log10AlleleFrequencyPosteriors); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java deleted file mode 100644 index fbd9c1dbf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.variantcontext.Allele; - -public class BiallelicGenotypeLikelihoods { - - private String sample; - private double[] GLs; - private Allele A, B; - private int depth; - - /** - * Create a new object for sample with given alleles and genotype likelihoods - * - * @param sample sample name - * @param A allele A - * @param B allele B - * @param log10AALikelihoods AA likelihoods - * @param log10ABLikelihoods AB likelihoods - * @param log10BBLikelihoods BB likelihoods - * @param depth the read depth used in creating the likelihoods - */ - public BiallelicGenotypeLikelihoods(String sample, - Allele A, - Allele B, - double log10AALikelihoods, - double log10ABLikelihoods, - double log10BBLikelihoods, - int depth) { - this.sample = sample; - this.A = A; - this.B = B; - this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods}; - this.depth = depth; - } - - public String getSample() { - return sample; - } - - public double getAALikelihoods() { - return GLs[0]; - } - - public double getABLikelihoods() { - return GLs[1]; - } - - public double getBBLikelihoods() { - return GLs[2]; - } - - public double[] getLikelihoods() { - return GLs; - } - - public Allele getAlleleA() { - return A; - } - - public Allele getAlleleB() { - return B; - } - - public int getDepth() { - return depth; - } -} - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 106bb1982..09936c112 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -27,13 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.BaseUtils; -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Aug 4, 2009 - * Time: 6:46:09 PM - * To change this template use File | Settings | File Templates. - */ public enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), @@ -110,6 +103,20 @@ public enum DiploidGenotype { return conversionMatrix[index1][index2]; } + /** + * create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly + * @param baseIndex1 base1 + * @param baseIndex2 base2 + * @return the diploid genotype + */ + public static DiploidGenotype createDiploidGenotype(int baseIndex1, int baseIndex2) { + if ( baseIndex1 == -1 ) + throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character"); + if ( baseIndex2 == -1 ) + throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character"); + return conversionMatrix[baseIndex1][baseIndex2]; + } + private static final DiploidGenotype[][] conversionMatrix = { { DiploidGenotype.AA, DiploidGenotype.AC, DiploidGenotype.AG, DiploidGenotype.AT }, { DiploidGenotype.AC, DiploidGenotype.CC, DiploidGenotype.CG, DiploidGenotype.CT }, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 5d0b6f0a7..ed86897f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,80 +27,43 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - // - // code for testing purposes - // + private final static boolean DEBUG = false; + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - private final boolean SIMPLE_GREEDY_GENOTYPER = false; - private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - private final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); } - public void getLog10PNonRef(GenotypesContext GLs, List alleles, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { + public void getLog10PNonRef(final GenotypesContext GLs, + final List alleles, + final double[][] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyLikelihoods, + final double[][] log10AlleleFrequencyPosteriors) { final int numAlleles = alleles.size(); - final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null; - final double[] bestAFguess = numAlleles > 2 ? new double[numAlleles-1] : null; - int idxDiag = numAlleles; - int incr = numAlleles - 1; - for (int k=1; k < numAlleles; k++) { - // multi-allelic approximation, part 1: Ideally - // for each alt allele compute marginal (suboptimal) posteriors - - // compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of - // as a row-wise upper triangular matrix of likelihoods. - // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. - // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD - - final int idxAA = 0; - final int idxAB = k; - // yy is always element on the diagonal. - // 2 alleles: BBelement 2 - // 3 alleles: BB element 3. CC element 5 - // 4 alleles: - final int idxBB = idxDiag; - idxDiag += incr--; - - final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); - - if (numAlleles > 2) { - posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone(); - bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors); - } - } - - if (numAlleles > 2) { - // multiallelic approximation, part 2: - // report posteriors for allele that has highest estimated AC - int mostLikelyAlleleIdx = MathUtils.maxElementIndex(bestAFguess); - for (int k=0; k < log10AlleleFrequencyPosteriors.length-1; k++) - log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]); - - } + //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); } private static final ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(); + ArrayList genotypeLikelihoods = new ArrayList(); // TODO -- initialize with size of GLs genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { if ( sample.hasLikelihoods() ) { double[] gls = sample.getLikelihoods().getAsVector(); - if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) + if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) genotypeLikelihoods.add(gls); } } @@ -109,9 +72,401 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } + final static double approximateLog10SumLog10(double[] vals) { + if ( vals.length < 2 ) + throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10"); + + double approx = approximateLog10SumLog10(vals[0], vals[1]); + for ( int i = 2; i < vals.length; i++ ) + approx = approximateLog10SumLog10(approx, vals[i]); + return approx; + } + + final static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if ( small > big ) { + final double t = big; + big = small; + small = t; + } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) + return big; + + if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup + // with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding + int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding + + //double z =Math.log10(1+Math.pow(10.0,-diff)); + //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); + return big + MathUtils.jacobianLogTable[ind]; + } + + // ------------------------------------------------------------------------------------- // - // Linearized, ~O(N), implementation. + // Multi-allelic implementation. + // + // ------------------------------------------------------------------------------------- + + private static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + + // a wrapper around the int array so that we can make it hashable + private static final class ExactACcounts { + + private final int[] counts; + private int hashcode = -1; + + public ExactACcounts(final int[] counts) { + this.counts = counts; + } + + public int[] getCounts() { + return counts; + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof ExactACcounts) ? Arrays.equals(counts, ((ExactACcounts)obj).counts) : false; + } + + @Override + public int hashCode() { + if ( hashcode == -1 ) + hashcode = Arrays.hashCode(counts); + return hashcode; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append(counts[0]); + for ( int i = 1; i < counts.length; i++ ) { + sb.append("/"); + sb.append(counts[i]); + } + return sb.toString(); + } + } + + // This class represents a column in the Exact AC calculation matrix + private static final class ExactACset { + + // the counts of the various alternate alleles which this column represents + final ExactACcounts ACcounts; + + // the column of the matrix + final double[] log10Likelihoods; + + // mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column; + // for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB. + final HashMap ACsetIndexToPLIndex = new HashMap(); + + // to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them + final ArrayList dependentACsetsToDelete = new ArrayList(); + + + public ExactACset(final int size, final ExactACcounts ACcounts) { + this.ACcounts = ACcounts; + log10Likelihoods = new double[size]; + } + + // sum of all the non-reference alleles + public int getACsum() { + int sum = 0; + for ( int count : ACcounts.getCounts() ) + sum += count; + return sum; + } + + public boolean equals(Object obj) { + return (obj instanceof ExactACset) ? ACcounts.equals(((ExactACset)obj).ACcounts) : false; + } + } + + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[][] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyLikelihoods, + final double[][] log10AlleleFrequencyPosteriors, + final boolean preserveData) { + + final ArrayList genotypeLikelihoods = getGLs(GLs); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final Queue ACqueue = new LinkedList(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(numChr+1); + + // add AC=0 to the queue + int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.ACcounts, zeroSet); + + // keep processing while we have AC conformations that need to be calculated + double maxLog10L = Double.NEGATIVE_INFINITY; + while ( !ACqueue.isEmpty() ) { + // compute log10Likelihoods + final ExactACset set = ACqueue.remove(); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + + // adjust max likelihood seen if needed + maxLog10L = Math.max(maxLog10L, log10LofKs); + } + } + + private static double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double maxLog10L, + final int numChr, + final boolean preserveData, + final Queue ACqueue, + final HashMap indexesToACset, + final double[][] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyLikelihoods, + final double[][] log10AlleleFrequencyPosteriors) { + + if ( DEBUG ) + System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + + // clean up memory + if ( !preserveData ) { + for ( ExactACcounts index : set.dependentACsetsToDelete ) { + indexesToACset.put(index, null); + if ( DEBUG ) + System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); + } + } + + final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // can we abort early because the log10Likelihoods are so small? + if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + if ( DEBUG ) + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + + // no reason to keep this data around because nothing depends on it + if ( !preserveData ) + indexesToACset.put(set.ACcounts, null); + + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing + final int numAltAlleles = set.ACcounts.getCounts().length; + + // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. + // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. + + // add conformations for the k+1 case + int PLindex = 0; + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele]++; + lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + lastSet = updateACset(ACcountsClone, numChr, set, ++PLindex , ACqueue, indexesToACset); + } + } + } + + // if the last dependent set was not at the back of the queue (i.e. not just added), then we need to iterate + // over all the dependent sets to find the last one in the queue (otherwise it will be cleaned up too early) + if ( !preserveData && lastSet == null ) { + if ( DEBUG ) + System.out.printf(" *** iterating over dependent sets for set=%s%n", set.ACcounts); + lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); + } + if ( lastSet != null ) + lastSet.dependentACsetsToDelete.add(set.ACcounts); + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also adds it as a dependency to the given callingSetIndex. + // returns the ExactACset if that set was not already in the queue and null otherwise. + private static ExactACset updateACset(final int[] ACcounts, + final int numChr, + final ExactACset callingSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset) { + final ExactACcounts index = new ExactACcounts(ACcounts); + boolean wasInQueue = true; + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + wasInQueue = false; + } + + // add the given dependency to the set + final ExactACset set = indexesToACset.get(index); + set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); + return wasInQueue ? null : set; + } + + private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final Queue ACqueue) { + ExactACset set = null; + for ( ExactACset queued : ACqueue ) { + if ( queued.dependentACsetsToDelete.contains(callingSetIndex) ) + set = queued; + } + return set; + } + + private static void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final HashMap indexesToACset, + final double[][] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyLikelihoods, + final double[][] log10AlleleFrequencyPosteriors) { + + set.log10Likelihoods[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) + set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + } + // k > 0 for at least one k + else { + // all possible likelihoods for a given cell from which to choose the max + final int numPaths = set.ACsetIndexToPLIndex.size() + 1; + final double[] log10ConformationLikelihoods = new double[numPaths]; // TODO can be created just once, since you initialize it + + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + final double[] gl = genotypeLikelihoods.get(j); + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + // initialize + for ( int i = 0; i < numPaths; i++ ) + // TODO -- Arrays.fill? + // todo -- is this even necessary? Why not have as else below? + log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY; + + // deal with the AA case first + if ( totalK < 2*j-1 ) + log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + + // deal with the other possible conformations now + if ( totalK <= 2*j ) { // skip impossible conformations + int conformationIndex = 1; + for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { + if ( DEBUG ) + System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); + log10ConformationLikelihoods[conformationIndex++] = + determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()]; + } + } + + final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods); + + // finally, update the L(j,k) value + set.log10Likelihoods[j] = log10Max - logDenominator; + } + } + + final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + + // determine the power of theta to use + int nonRefAlleles = 0; + for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { + if ( set.ACcounts.getCounts()[i] > 0 ) + nonRefAlleles++; + } + + // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs + for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { + int AC = set.ACcounts.getCounts()[i]; + log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyLikelihoods[i][AC], log10LofK); + + // for k=0 we still want to use theta + final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; + log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); + } + } + + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + // todo -- arent' there a small number of fixed values that this function can adopt? + // todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance + // todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient + // todo -- this can be computed once at the start of the all operations + + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + final int numAltAlleles = ACcounts.length; + + // the AX het case + if ( PLindex <= numAltAlleles ) + return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; + + int subtractor = numAltAlleles+1; + int subtractions = 0; + do { + PLindex -= subtractor; + subtractor--; + subtractions++; + } + while ( PLindex >= subtractor ); + + final int k_i = ACcounts[subtractions-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( PLindex == 0 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[subtractions+PLindex-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + + // ------------------------------------------------------------------------------------- + // + // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity. // // ------------------------------------------------------------------------------------- @@ -119,6 +474,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors * for the exact model calculation */ +/* private final static class ExactACCache { double[] kMinus2, kMinus1, kMinus0; @@ -154,7 +510,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public int linearExact(GenotypesContext GLs, double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { + double[][] log10AlleleFrequencyLikelihoods, + double[][] log10AlleleFrequencyPosteriors) { final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -171,7 +528,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( k == 0 ) { // special case for k = 0 for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[idxAA]; + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; } } else { // k > 0 final double[] kMinus1 = logY.getkMinus1(); @@ -184,14 +541,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double aa = Double.NEGATIVE_INFINITY; double ab = Double.NEGATIVE_INFINITY; if (k < 2*j-1) - aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[idxAA]; + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; if (k < 2*j) - ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[idxAB]; + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; double log10Max; if (k > 1) { - final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[idxBB]; + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; log10Max = approximateLog10SumLog10(aa, ab, bb); } else { // we know we aren't considering the BB case, so we can use an optimized log10 function @@ -205,7 +562,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // update the posteriors vector final double log10LofK = kMinus0[numSamples]; - log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; + log10AlleleFrequencyLikelihoods[0][k] = log10LofK; + log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k]; // can we abort early? lastK = k; @@ -222,202 +580,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } final static double approximateLog10SumLog10(double a, double b, double c) { - //return softMax(new double[]{a, b, c}); return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); } +*/ - final static double approximateLog10SumLog10(double small, double big) { - // make sure small is really the smaller value - if ( small > big ) { - final double t = big; - big = small; - small = t; - } - - if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY ) - return big; - - if (big >= small + MathUtils.MAX_JACOBIAN_TOLERANCE) - return big; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - //final int ind = (int)(((big-small)/JACOBIAN_LOG_TABLE_STEP)); // hard rounding - int ind = (int)(Math.round((big-small)/MathUtils.JACOBIAN_LOG_TABLE_STEP)); // hard rounding - - //double z =Math.log10(1+Math.pow(10.0,-diff)); - //System.out.format("x: %f, y:%f, app: %f, true: %f ind:%d\n",x,y,t2,z,ind); - return big + MathUtils.jacobianLogTable[ind]; - } - - - - /** - * Can be overridden by concrete subclasses - * @param vc variant context with genotype likelihoods - * @param log10AlleleFrequencyPosteriors allele frequency results - * @param AFofMaxLikelihood allele frequency of max likelihood - * - * @return calls - */ - public GenotypesContext assignGenotypes(VariantContext vc, - double[] log10AlleleFrequencyPosteriors, - int AFofMaxLikelihood) { - if ( !vc.isVariant() ) - throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - - - GenotypesContext GLs = vc.getGenotypes(); - double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; - int[][] tracebackArray = new int[GLs.size()+1][AFofMaxLikelihood+1]; - - ArrayList sampleIndices = new ArrayList(); - int sampleIdx = 0; - - // todo - optimize initialization - for (int k=0; k <= AFofMaxLikelihood; k++) - for (int j=0; j <= GLs.size(); j++) - pathMetricArray[j][k] = -1e30; - - pathMetricArray[0][0] = 0.0; - - // todo = can't deal with optimal dynamic programming solution with multiallelic records - if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - sampleIndices.addAll(GLs.getSampleNamesOrderedByName()); - sampleIdx = GLs.size(); - } - else { - - for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { - if ( !genotype.hasLikelihoods() ) - continue; - - double[] likelihoods = genotype.getLikelihoods().getAsVector(); - - if (MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL) { - //System.out.print(sample.getKey()+":"); - //for (int k=0; k < likelihoods.length; k++) - // System.out.format("%4.2f ",likelihoods[k]); - //System.out.println(); - // all likelihoods are essentially the same: skip this sample and will later on force no call. - //sampleIdx++; - continue; - } - - sampleIndices.add(genotype.getSampleName()); - - for (int k=0; k <= AFofMaxLikelihood; k++) { - - double bestMetric = pathMetricArray[sampleIdx][k] + likelihoods[0]; - int bestIndex = k; - - if (k>0) { - double m2 = pathMetricArray[sampleIdx][k-1] + likelihoods[1]; - if (m2 > bestMetric) { - bestMetric = m2; - bestIndex = k-1; - } - } - - if (k>1) { - double m2 = pathMetricArray[sampleIdx][k-2] + likelihoods[2]; - if (m2 > bestMetric) { - bestMetric = m2; - bestIndex = k-2; - } - } - - pathMetricArray[sampleIdx+1][k] = bestMetric; - tracebackArray[sampleIdx+1][k] = bestIndex; - } - sampleIdx++; - } - } - - GenotypesContext calls = GenotypesContext.create(); - - int startIdx = AFofMaxLikelihood; - for (int k = sampleIdx; k > 0; k--) { - int bestGTguess; - String sample = sampleIndices.get(k-1); - Genotype g = GLs.get(sample); - if ( !g.hasLikelihoods() ) - continue; - // if all likelihoods are essentially the same: we want to force no-call. In this case, we skip this sample for now, - // and will add no-call genotype to GL's in a second pass - ArrayList myAlleles = new ArrayList(); - - double[] likelihoods = g.getLikelihoods().getAsVector(); - - if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - bestGTguess = Utils.findIndexOfMaxEntry(likelihoods); - } - else { - int newIdx = tracebackArray[k][startIdx];; - bestGTguess = startIdx - newIdx; - startIdx = newIdx; - } - - // likelihoods are stored row-wise in lower triangular matrix. IE - // for 2 alleles they have ordering AA,AB,BB - // for 3 alleles they are ordered AA,AB,BB,AC,BC,CC - // Get now alleles corresponding to best index - int kk=0; - boolean done = false; - for (int j=0; j < vc.getNAlleles(); j++) { - for (int i=0; i <= j; i++){ - if (kk++ == bestGTguess) { - if (i==0) - myAlleles.add(vc.getReference()); - else - myAlleles.add(vc.getAlternateAllele(i-1)); - - if (j==0) - myAlleles.add(vc.getReference()); - else - myAlleles.add(vc.getAlternateAllele(j-1)); - done = true; - break; - } - - } - if (done) - break; - } - - final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods); - //System.out.println(myAlleles.toString()); - calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); - } - - for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { - if ( !genotype.hasLikelihoods() ) - continue; - - final Genotype g = GLs.get(genotype.getSampleName()); - final double[] likelihoods = genotype.getLikelihoods().getAsVector(); - - if (MathUtils.sum(likelihoods) <= SUM_GL_THRESH_NOCALL) - continue; // regular likelihoods - - final double qual = Genotype.NO_LOG10_PERROR; - calls.replace(new Genotype(g.getSampleName(), NO_CALL_ALLELES, qual, null, g.getAttributes(), false)); - } - - return calls; - } - - private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { - int j = logYMatrix.length - 1; - System.out.printf("-----------------------------------%n"); - for (int k=0; k <= numChr; k++) { - double posterior = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 74c55dbfe..b30a25414 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Map; @@ -79,19 +80,17 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { * @param contexts stratified alignment contexts * @param contextType stratified context type * @param priors priors to use for GLs - * @param GLs hash of sample->GL to fill in * @param alternateAlleleToUse the alternate allele to use, null if not set * @param useBAQedPileup should we use the BAQed pileup or the raw one? - * @return genotype likelihoods per sample for AA, AB, BB + * @return variant context where genotypes are no-called but with GLs */ - public abstract Allele getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Map GLs, - Allele alternateAlleleToUse, - boolean useBAQedPileup); + public abstract VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { int count = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 14d647b6d..653a6f6e7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -41,8 +42,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -243,7 +243,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // get deletion length int dLen = Integer.valueOf(bestAltAllele.substring(1)); // get ref bases of accurate deletion - int startIdxInReference = (int)(1+loc.getStart()-ref.getWindow().getStart()); + int startIdxInReference = 1+loc.getStart()-ref.getWindow().getStart(); //System.out.println(new String(ref.getBases())); byte[] refBases = Arrays.copyOfRange(ref.getBases(),startIdxInReference,startIdxInReference+dLen); @@ -270,19 +270,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private final static EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED); - public Allele getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Map GLs, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup) { if ( tracker == null ) return null; - GenomeLoc loc = ref.getLocus(); Allele refAllele, altAllele; VariantContext vc = null; @@ -368,10 +366,17 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), ref, hsize, numPrefBases); + // start making the VariantContext + final int endLoc = calculateEndPos(alleleList, refAllele, loc); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleleList).referenceBaseForIndel(ref.getBase()); + + // create the genotypes; no-call everyone for now + GenotypesContext genotypes = GenotypesContext.create(); + final List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); + // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. - // initialize the GenotypeLikelihoods - GLs.clear(); for ( Map.Entry sample : contexts.entrySet() ) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); @@ -384,11 +389,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (pileup != null ) { final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(genotypeLikelihoods); - GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), - alleleList, - genotypeLikelihoods, - getFilteredDepth(pileup))); + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); + genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); if (DEBUG) { System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString()); @@ -399,9 +405,25 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } - return refAllele; + return builder.genotypes(genotypes).make(); } + private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { + // for indels, stop location is one more than ref allele length + boolean hasNullAltAllele = false; + for ( Allele a : alleles ) { + if ( a.isNull() ) { + hasNullAltAllele = true; + break; + } + } + + int endLoc = loc.getStart() + refAllele.length(); + if( !hasNullAltAllele ) + endLoc--; + + return endLoc; + } public static HashMap> getIndelLikelihoodMap() { return indelLikelihoodMap.get(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java deleted file mode 100755 index 4f378b24a..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; - -import java.util.ArrayList; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: 6/1/11 - * Time: 10:38 AM - * To change this template use File | Settings | File Templates. - */ -public class MultiallelicGenotypeLikelihoods { - private String sample; - private double[] GLs; - private List alleleList; - private int depth; - - public MultiallelicGenotypeLikelihoods(String sample, - List A, - double[] log10Likelihoods, int depth) { - /* Check for consistency between likelihood vector and number of alleles */ - int numAlleles = A.size(); - if (log10Likelihoods.length != numAlleles*(numAlleles+1)/2) - throw new StingException(("BUG: Incorrect length of GL vector when creating MultiallelicGenotypeLikelihoods object!")); - - this.sample = sample; - this.alleleList = A; - this.GLs = log10Likelihoods; - this.depth = depth; - } - - public String getSample() { - return sample; - } - - public double[] getLikelihoods() { - return GLs; - } - - public List getAlleles() { - return alleleList; - } - - public int getDepth() { - return depth; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 9bdc754e9..4087443f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -31,107 +31,147 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - // the alternate allele with the largest sum of quality scores - protected Byte bestAlternateAllele = null; + private static final int MIN_QUAL_SUM_FOR_ALT_ALLELE = 50; + + private boolean ALLOW_MULTIPLE_ALLELES; private final boolean useAlleleFromVCF; protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); + ALLOW_MULTIPLE_ALLELES = UAC.MULTI_ALLELIC; useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; } - public Allele getLikelihoods(RefMetaDataTracker tracker, - ReferenceContext ref, - Map contexts, - AlignmentContextUtils.ReadOrientation contextType, - GenotypePriors priors, - Map GLs, - Allele alternateAlleleToUse, - boolean useBAQedPileup) { + public VariantContext getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + AlignmentContextUtils.ReadOrientation contextType, + GenotypePriors priors, + Allele alternateAlleleToUse, + boolean useBAQedPileup) { if ( !(priors instanceof DiploidSNPGenotypePriors) ) throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); - byte refBase = ref.getBase(); - Allele refAllele = Allele.create(refBase, true); + final boolean[] basesToUse = new boolean[4]; + final byte refBase = ref.getBase(); + final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); - // find the alternate allele with the largest sum of quality scores + // start making the VariantContext + final GenomeLoc loc = ref.getLocus(); + final List alleles = new ArrayList(); + alleles.add(Allele.create(refBase, true)); + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles); + + // find the alternate allele(s) that we should be using if ( alternateAlleleToUse != null ) { - bestAlternateAllele = alternateAlleleToUse.getBases()[0]; + basesToUse[BaseUtils.simpleBaseToBaseIndex(alternateAlleleToUse.getBases()[0])] = true; } else if ( useAlleleFromVCF ) { - VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); + final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); - // ignore places where we don't have a variant - if ( vc == null ) + // ignore places where we don't have a SNP + if ( vc == null || !vc.isSNP() ) return null; - if ( !vc.isBiallelic() ) { - // for multi-allelic sites go back to the reads and find the most likely alternate allele - initializeBestAlternateAllele(refBase, contexts, useBAQedPileup); - } else { - bestAlternateAllele = vc.getAlternateAllele(0).getBases()[0]; - } + for ( Allele allele : vc.getAlternateAlleles() ) + basesToUse[BaseUtils.simpleBaseToBaseIndex(allele.getBases()[0])] = true; } else { - initializeBestAlternateAllele(refBase, contexts, useBAQedPileup); + + determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup); + + // how many alternate alleles are we using? + int alleleCounter = countSetBits(basesToUse); + + // if there are no non-ref alleles... + if ( alleleCounter == 0 ) { + // if we only want variants, then we don't need to calculate genotype likelihoods + if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) + return builder.make(); + + // otherwise, choose any alternate allele (it doesn't really matter) + basesToUse[indexOfRefBase == 0 ? 1 : 0] = true; + } } - // if there are no non-ref bases... - if ( bestAlternateAllele == null ) { - // if we only want variants, then we don't need to calculate genotype likelihoods - if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) - return refAllele; - - // otherwise, choose any alternate allele (it doesn't really matter) - bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C'); + // create the alternate alleles and the allele ordering (the ordering is crucial for the GLs) + final int numAltAlleles = countSetBits(basesToUse); + final int[] alleleOrdering = new int[numAltAlleles + 1]; + alleleOrdering[0] = indexOfRefBase; + int alleleOrderingIndex = 1; + int numLikelihoods = 1; + for ( int i = 0; i < 4; i++ ) { + if ( i != indexOfRefBase && basesToUse[i] ) { + alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(i), false)); + alleleOrdering[alleleOrderingIndex++] = i; + numLikelihoods += alleleOrderingIndex; + } } + builder.alleles(alleles); - Allele altAllele = Allele.create(bestAlternateAllele, false); + // create the genotypes; no-call everyone for now + GenotypesContext genotypes = GenotypesContext.create(); + final List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); - if( useBAQedPileup ) { pileup = createBAQedPileup( pileup ); } + if ( useBAQedPileup ) + pileup = createBAQedPileup( pileup ); // create the GenotypeLikelihoods object - DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); - int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); + final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error); + final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); if ( nGoodBases == 0 ) continue; - double[] likelihoods = GL.getLikelihoods(); + final double[] allLikelihoods = GL.getLikelihoods(); + final double[] myLikelihoods = new double[numLikelihoods]; - DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase); - DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele); - DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele); - ArrayList aList = new ArrayList(); - aList.add(refAllele); - aList.add(altAllele); - double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ; + int myLikelihoodsIndex = 0; + for ( int i = 0; i <= numAltAlleles; i++ ) { + for ( int j = i; j <= numAltAlleles; j++ ) { + myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()]; + } + } // normalize in log space so that max element is zero. - GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), - aList, MathUtils.normalizeFromLog10(dlike, false, true), getFilteredDepth(pileup))); + GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); + + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, getFilteredDepth(pileup)); + attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); + genotypes.add(new Genotype(sample.getKey(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); } - return refAllele; + return builder.genotypes(genotypes).make(); } - protected void initializeBestAlternateAllele(byte ref, Map contexts, boolean useBAQedPileup) { + private int countSetBits(boolean[] array) { + int counter = 0; + for ( int i = 0; i < array.length; i++ ) { + if ( array[i] ) + counter++; + } + return counter; + } + + // fills in the allelesToUse array + protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map contexts, boolean useBAQedPileup) { int[] qualCounts = new int[4]; for ( Map.Entry sample : contexts.entrySet() ) { @@ -139,7 +179,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ReadBackedPileup pileup = useBAQedPileup ? createBAQedPileup( sample.getValue().getBasePileup() ) : sample.getValue().getBasePileup(); for ( PileupElement p : pileup ) { // ignore deletions - if ( p.isDeletion() || (! p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE )) + if ( p.isDeletion() || (!p.isReducedRead() && p.getQual() < UAC.MIN_BASE_QUALTY_SCORE) ) continue; final int index = BaseUtils.simpleBaseToBaseIndex(p.getBase()); @@ -149,17 +189,31 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC } } - // set the non-ref base with maximum quality score sum - int maxCount = 0; - bestAlternateAllele = null; - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] > maxCount ) { - maxCount = qualCounts[index]; - bestAlternateAllele = altAllele; + if ( ALLOW_MULTIPLE_ALLELES ) { + for ( byte altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + int index = BaseUtils.simpleBaseToBaseIndex(altAllele); + if ( qualCounts[index] >= MIN_QUAL_SUM_FOR_ALT_ALLELE ) { + allelesToUse[index] = true; + } } + } else { + // set the non-ref base which has the maximum quality score sum + int maxCount = 0; + int indexOfMax = 0; + for ( byte altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + int index = BaseUtils.simpleBaseToBaseIndex(altAllele); + if ( qualCounts[index] > maxCount ) { + maxCount = qualCounts[index]; + indexOfMax = index; + } + } + + if ( maxCount > 0 ) + allelesToUse[indexOfMax] = true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 62218416d..53600b145 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -153,6 +153,18 @@ public class UnifiedArgumentCollection { @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; + @Hidden + @Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow multiple alleles in discovery", required = false) + public boolean MULTI_ALLELIC = false; + + /** + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * then this site will be skipped and a warning printed. Note that genotyping sites with many alternate alleles is both CPU and memory intensive. + */ + @Hidden + @Argument(fullName = "max_alternate_alleles", shortName = "maxAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) + public int MAX_ALTERNATE_ALLELES = 5; + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! public UnifiedArgumentCollection clone() { @@ -176,10 +188,12 @@ public class UnifiedArgumentCollection { uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; uac.alleles = alleles; + uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION; + uac.MULTI_ALLELIC = MULTI_ALLELIC; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index c861af1a2..21aaeffba 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -59,6 +59,9 @@ public class UnifiedGenotyperEngine { EMIT_ALL_SITES } + protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + // the unified argument collection private final UnifiedArgumentCollection UAC; public UnifiedArgumentCollection getUAC() { return UAC; } @@ -73,11 +76,12 @@ public class UnifiedGenotyperEngine { private ThreadLocal afcm = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[] log10AlleleFrequencyPriorsSNPs; - private final double[] log10AlleleFrequencyPriorsIndels; + private final double[][] log10AlleleFrequencyPriorsSNPs; + private final double[][] log10AlleleFrequencyPriorsIndels; // the allele frequency likelihoods (allocated once as an optimization) - private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); + private ThreadLocal log10AlleleFrequencyLikelihoods = new ThreadLocal(); + private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); // the priors object private final GenotypePriors genotypePriorsSNPs; @@ -96,6 +100,7 @@ public class UnifiedGenotyperEngine { // the standard filter to use for calls below the confidence threshold but above the emit threshold private static final Set filter = new HashSet(1); + private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; @@ -113,6 +118,7 @@ public class UnifiedGenotyperEngine { @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples) { this.BAQEnabledOnCMDLine = toolkit.getArguments().BAQMode != BAQ.CalculationMode.OFF; + genomeLocParser = toolkit.getGenomeLocParser(); this.samples = new TreeSet(samples); // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ this.UAC = UAC.clone(); @@ -122,10 +128,10 @@ public class UnifiedGenotyperEngine { this.annotationEngine = engine; N = 2 * this.samples.size(); - log10AlleleFrequencyPriorsSNPs = new double[N+1]; - log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, GenotypeLikelihoodsCalculationModel.Model.INDEL); + log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); @@ -152,7 +158,6 @@ public class UnifiedGenotyperEngine { } VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); - if ( vc == null ) return null; @@ -214,14 +219,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - Map GLs = new HashMap(); - - Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); - - if ( refAllele != null ) - return createVariantContextFromLikelihoods(refContext, refAllele, GLs); - else - return null; + return glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -256,136 +254,170 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, false); } - private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { - // no-call everyone for now - List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); - - Set alleles = new LinkedHashSet(); - alleles.add(refAllele); - boolean addedAltAlleles = false; - - GenotypesContext genotypes = GenotypesContext.create(); - for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) { - if ( !addedAltAlleles ) { - addedAltAlleles = true; - // ordering important to maintain consistency - for (Allele a: GL.getAlleles()) { - alleles.add(a); - } - } - - HashMap attributes = new HashMap(); - //GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); - GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(GL.getLikelihoods()); - attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); - attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, likelihoods); - - genotypes.add(new Genotype(GL.getSample(), noCall, Genotype.NO_LOG10_PERROR, null, attributes, false)); - } - - GenomeLoc loc = refContext.getLocus(); - int endLoc = calculateEndPos(alleles, refAllele, loc); - - return new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles).genotypes(genotypes).referenceBaseForIndel(refContext.getBase()).make(); + public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(null, null, null, null, vc, model); } - // private method called by both UnifiedGenotyper and UGCallVariants entry points into the engine - private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + + boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[N+1]); + log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]); + log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); } + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > UAC.MAX_ALTERNATE_ALLELES ) { + logger.warn("the Unified Genotyper is currently set to genotype at most " + UAC.MAX_ALTERNATE_ALLELES + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + vc.getAlternateAlleles().size() + " alternate alleles; see the --max_alternate_alleles argument"); + return null; + } + // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) + if ( vc.getNSamples() == 0 ) { + if ( limitedContext ) + return null; return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ? estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), false, 1.0) : generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); + } // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) + clearAFarray(log10AlleleFrequencyLikelihoods.get()); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); - // find the most likely frequency - int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); + // is the most likely frequency conformation AC=0 for all alternate alleles? + boolean bestGuessIsRef = true; + + // which alternate allele has the highest MLE AC? + int indexOfHighestAlt = -1; + int alleleCountOfHighestAlt = -1; + + // determine which alternate alleles have AF>0 + boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()]; + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]); + + // if the most likely AC is not 0, then this is a good alternate allele to use + if ( indexOfBestAC != 0 ) { + altAllelesToUse[i] = true; + bestGuessIsRef = false; + } + // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele + else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + altAllelesToUse[i] = true; + } + + // keep track of the "best" alternate allele to use + if ( indexOfBestAC > alleleCountOfHighestAlt) { + alleleCountOfHighestAlt = indexOfBestAC; + indexOfHighestAlt = i; + } + } // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); + // TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]); double sum = 0.0; for (int i = 1; i <= N; i++) sum += normalizedPosteriors[i]; double PofF = Math.min(sum, 1.0); // deal with precision errors double phredScaledConfidence; - if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0]; } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { sum = 0.0; for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) break; - sum += log10AlleleFrequencyPosteriors.get()[i]; + sum += log10AlleleFrequencyPosteriors.get()[0][i]; } phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); + return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF); } + // strip out any alleles that aren't going to be used in the VariantContext + List myAlleles; + if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { + myAlleles = new ArrayList(vc.getAlleles().size()); + myAlleles.add(vc.getReference()); + for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { + if ( altAllelesToUse[i] ) + myAlleles.add(vc.getAlternateAllele(i)); + } + } else { + // use all of the alleles if we are given them by the user + myAlleles = vc.getAlleles(); + } + + // start constructing the resulting VC + GenomeLoc loc = genomeLocParser.createGenomeLoc(vc); + VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles); + builder.log10PError(phredScaledConfidence/-10.0); + if ( ! passesCallThreshold(phredScaledConfidence) ) + builder.filters(filter); + if ( !limitedContext ) + builder.referenceBaseForIndel(refContext.getBase()); + // create the genotypes - GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse); // print out stats if we have a writer - if ( verboseWriter != null ) + if ( verboseWriter != null && !limitedContext ) printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last HashMap attributes = new HashMap(); // if the site was downsampled, record that fact - if ( rawContext.hasPileupBeenDownsampled() ) + if ( !limitedContext && rawContext.hasPileupBeenDownsampled() ) attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - - if ( UAC.COMPUTE_SLOD && bestAFguess != 0 ) { + if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); + clearAFarray(log10AlleleFrequencyLikelihoods.get()); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); + clearAFarray(log10AlleleFrequencyLikelihoods.get()); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0]; + double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); + clearAFarray(log10AlleleFrequencyLikelihoods.get()); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get()); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0]; + double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -401,26 +433,12 @@ public class UnifiedGenotyperEngine { attributes.put("SB", strandScore); } - GenomeLoc loc = refContext.getLocus(); - - int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); - - Set myAlleles = new HashSet(vc.getAlleles()); - // strip out the alternate allele if it's a ref call - if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new HashSet(1); - myAlleles.add(vc.getReference()); - } - - VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles); + // finish constructing the resulting VC builder.genotypes(genotypes); - builder.log10PError(phredScaledConfidence/-10.0); - if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter); builder.attributes(attributes); - builder.referenceBaseForIndel(refContext.getBase()); VariantContext vcCall = builder.make(); - if ( annotationEngine != null ) { + if ( annotationEngine != null && !limitedContext ) { // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations ReadBackedPileup pileup = null; if (rawContext.hasExtendedEventPileup()) @@ -435,119 +453,6 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - // A barebones entry point to the exact model when there is no tracker or stratified contexts available -- only GLs - public VariantCallContext calculateGenotypes(final VariantContext vc, final GenomeLoc loc, final GenotypeLikelihoodsCalculationModel.Model model) { - - // initialize the data for this thread if that hasn't been done yet - if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[N+1]); - afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - } - - // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) - return null; - - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); - - // find the most likely frequency - int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); - - // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); - double sum = 0.0; - for (int i = 1; i <= N; i++) - sum += normalizedPosteriors[i]; - double PofF = Math.min(sum, 1.0); // deal with precision errors - - double phredScaledConfidence; - if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); - if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); - if ( Double.isInfinite(phredScaledConfidence) ) { - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += log10AlleleFrequencyPosteriors.get()[i]; - } - phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); - } - } - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { - // technically, at this point our confidence in a reference call isn't accurately estimated - // because it didn't take into account samples with no data, so let's get a better estimate - return null; - } - - // create the genotypes - GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - HashMap attributes = new HashMap(); - - int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); - - Set myAlleles = new HashSet(vc.getAlleles()); - // strip out the alternate allele if it's a ref call - if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { - myAlleles = new HashSet(1); - myAlleles.add(vc.getReference()); - } - - VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, myAlleles); - builder.genotypes(genotypes); - builder.log10PError(phredScaledConfidence/-10.0); - if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter); - builder.attributes(attributes); - builder.referenceBaseForIndel(vc.getReferenceBaseForIndel()); - - return new VariantCallContext(builder.make(), confidentlyCalled(phredScaledConfidence, PofF)); - } - - private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { - // TODO - temp fix until we can deal with extended events properly - // for indels, stop location is one more than ref allele length - boolean isSNP = true, hasNullAltAllele = false; - for (Allele a : alleles){ - if (a.length() != 1) { - isSNP = false; - break; - } - } - for (Allele a : alleles){ - if (a.isNull()) { - hasNullAltAllele = true; - break; - } - } - // standard deletion: ref allele length = del length. endLoc = startLoc + refAllele.length(), alt allele = null - // standard insertion: ref allele length = 0, endLos = startLoc - // mixed: want end loc = start Loc for case {A*,AT,T} but say {ATG*,A,T} : want then end loc = start loc + refAllele.length - // So, in general, end loc = startLoc + refAllele.length, except in complex substitutions where it's one less - // - // todo - this is unnecessarily complicated and is so just because of Tribble's arbitrary vc conventions, should be cleaner/simpler, - // the whole vc processing infrastructure seems too brittle and riddled with special case handling - - - int endLoc = loc.getStart(); - if ( !isSNP) { - endLoc += refAllele.length(); - if(!hasNullAltAllele) - endLoc--; - - } - - return endLoc; - } - private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { Map stratifiedContexts = null; @@ -604,9 +509,12 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - protected static void clearAFarray(double[] AFs) { - for ( int i = 0; i < AFs.length; i++ ) - AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + protected static void clearAFarray(double[][] AFs) { + for ( int i = 0; i < AFs.length; i++ ) { + for ( int j = 0; j < AFs[i].length; j++ ) { + AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + } + } } private final static double[] binomialProbabilityDepthCache = new double[10000]; @@ -676,7 +584,7 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) + if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) AFline.append("0.00000000\t"); else AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); @@ -689,8 +597,8 @@ public class UnifiedGenotyperEngine { verboseWriter.println(); } - protected boolean passesEmitThreshold(double conf, int bestAFguess) { - return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) { + return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); } protected boolean passesCallThreshold(double conf) { @@ -744,27 +652,25 @@ public class UnifiedGenotyperEngine { return null; } - protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) { - // calculate the allele frequency priors for 1-N - double sum = 0.0; - double heterozygosity; + protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { - if (model == GenotypeLikelihoodsCalculationModel.Model.INDEL) - heterozygosity = UAC.INDEL_HETEROZYGOSITY; - else - heterozygosity = UAC.heterozygosity; - - for (int i = 1; i <= N; i++) { - double value = heterozygosity / (double)i; - priors[i] = Math.log10(value); - sum += value; + // the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i + for (int alleles = 1; alleles <= priors.length; alleles++) { + double sum = 0.0; + + // for each i + for (int i = 1; i <= N; i++) { + double value = Math.pow(theta, alleles) / (double)i; + priors[alleles-1][i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[alleles-1][0] = Math.log10(1.0 - sum); } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[0] = Math.log10(1.0 - sum); } - protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { switch( model ) { case SNP: return log10AlleleFrequencyPriorsSNPs; @@ -837,4 +743,88 @@ public class UnifiedGenotyperEngine { return vc; } + + /** + * @param vc variant context with genotype likelihoods + * @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use + * + * @return genotypes + */ + public GenotypesContext assignGenotypes(VariantContext vc, + boolean[] allelesToUse) { + + final GenotypesContext GLs = vc.getGenotypes(); + + final List sampleIndices = GLs.getSampleNamesOrderedByName(); + + final GenotypesContext calls = GenotypesContext.create(); + + for ( int k = GLs.size() - 1; k >= 0; k-- ) { + final String sample = sampleIndices.get(k); + final Genotype g = GLs.get(sample); + if ( !g.hasLikelihoods() ) + continue; + + final double[] likelihoods = g.getLikelihoods().getAsVector(); + + // if there is no mass on the likelihoods, then just no-call the sample + if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) { + calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false)); + continue; + } + + // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. + // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. + + final int numAltAlleles = allelesToUse.length; + + // start with the assumption that the ideal genotype is homozygous reference + Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference(); + double maxLikelihoodSeen = likelihoods[0]; + int indexOfMax = 0; + + // keep track of some state + Allele firstAllele = vc.getReference(); + int subtractor = numAltAlleles + 1; + int subtractionsMade = 0; + + for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) { + if ( PLindex == subtractor ) { + firstAllele = vc.getAlternateAllele(subtractionsMade); + PLindex -= subtractor; + subtractor--; + subtractionsMade++; + + // we can skip this allele if it's not usable + if ( !allelesToUse[subtractionsMade-1] ) { + i += subtractor - 1; + PLindex += subtractor - 1; + continue; + } + } + + // we don't care about the entry if we've already seen better + if ( likelihoods[i] <= maxLikelihoodSeen ) + continue; + + // if it's usable then update the alleles + int alleleIndex = subtractionsMade + PLindex - 1; + if ( allelesToUse[alleleIndex] ) { + maxAllele1 = firstAllele; + maxAllele2 = vc.getAlternateAllele(alleleIndex); + maxLikelihoodSeen = likelihoods[i]; + indexOfMax = i; + } + } + + ArrayList myAlleles = new ArrayList(); + myAlleles.add(maxAllele1); + myAlleles.add(maxAllele2); + + final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods); + calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); + } + + return calls; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index fee87b21f..cea7dd007 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -320,17 +320,17 @@ public class PhaseByTransmission extends RodWalker, HashMa if(transmissionProb != NO_TRANSMISSION_PROB) phredScoreTransmission = MathUtils.probabilityToPhredScale(1-(transmissionProb)); - //Handle null, missing and unavailable genotypes - //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable - //genotype so it is safe to return the original genotype in this case. - //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. - if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) - return genotype; + //Handle null, missing and unavailable genotypes + //Note that only cases where a null/missing/unavailable genotype was passed in the first place can lead to a null/missing/unavailable + //genotype so it is safe to return the original genotype in this case. + //In addition, if the phasing confidence is 0, then return the unphased, original genotypes. + if(phredScoreTransmission ==0 || genotype == null || !isPhasable(genotype.getType())) + return genotype; - //Add the transmission probability - Map genotypeAttributes = new HashMap(); - genotypeAttributes.putAll(genotype.getAttributes()); - if(transmissionProb>NO_TRANSMISSION_PROB) + //Add the transmission probability + Map genotypeAttributes = new HashMap(); + genotypeAttributes.putAll(genotype.getAttributes()); + if(transmissionProb>NO_TRANSMISSION_PROB) genotypeAttributes.put(TRANSMISSION_PROBABILITY_TAG_NAME, phredScoreTransmission); ArrayList phasedAlleles = new ArrayList(2); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java index 1504f5baf..74291e025 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java @@ -164,13 +164,7 @@ public class VariantEvalWalker extends RodWalker implements Tr @Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false) protected double MIN_PHASE_QUALITY = 10.0; - /** - * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined. - */ - @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false) - protected String FAMILY_STRUCTURE; - - @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) + @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) @@ -561,8 +555,6 @@ public class VariantEvalWalker extends RodWalker implements Tr public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } - public String getFamilyStructure() { return FAMILY_STRUCTURE; } - public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } public TreeSet getStratificationObjects() { return stratificationObjects; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index 0cadf6c0d..363f5665f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -7,9 +9,11 @@ import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker; import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Map; +import java.util.Set; /** * Mendelian violation detection and counting @@ -40,12 +44,25 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; @Analysis(name = "Mendelian Violation Evaluator", description = "Mendelian Violation Evaluator") public class MendelianViolationEvaluator extends VariantEvaluator { - @DataPoint(description = "Number of mendelian variants found") + @DataPoint(description = "Number of variants found with at least one family having genotypes") long nVariants; + @DataPoint(description = "Number of variants found with no family having genotypes -- these sites do not count in the nNoCall") + long nSkipped; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual)") + long nFamCalled; + @DataPoint(description="Number of variants x families called (no missing genotype or lowqual) that contain at least one var allele.") + long nVarFamCalled; + @DataPoint(description="Number of variants x families discarded as low quality") + long nLowQual; + @DataPoint(description="Number of variants x families discarded as no call") + long nNoCall; + @DataPoint(description="Number of loci with mendelian violations") + long nLociViolations; @DataPoint(description = "Number of mendelian violations found") long nViolations; - @DataPoint(description = "number of child hom ref calls where the parent was hom variant") + + /*@DataPoint(description = "number of child hom ref calls where the parent was hom variant") long KidHomRef_ParentHomVar; @DataPoint(description = "number of child het calls where the parent was hom ref") long KidHet_ParentsHomRef; @@ -53,11 +70,65 @@ public class MendelianViolationEvaluator extends VariantEvaluator { long KidHet_ParentsHomVar; @DataPoint(description = "number of child hom variant calls where the parent was hom ref") long KidHomVar_ParentHomRef; + */ + + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HOM_VAR") + long mvRefRef_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_REF -> HET") + long mvRefRef_Het; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HET -> HOM_VAR") + long mvRefHet_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_VAR") + long mvRefVar_Var; + @DataPoint(description="Number of mendelian violations of the type HOM_REF/HOM_VAR -> HOM_REF") + long mvRefVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HET -> HOM_REF") + long mvVarHet_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HOM_REF") + long mvVarVar_Ref; + @DataPoint(description="Number of mendelian violations of the type HOM_VAR/HOM_VAR -> HET") + long mvVarVar_Het; + + + /*@DataPoint(description ="Number of inherited var alleles from het parents") + long nInheritedVar; + @DataPoint(description ="Number of inherited ref alleles from het parents") + long nInheritedRef;*/ + + @DataPoint(description="Number of HomRef/HomRef/HomRef trios") + long HomRefHomRef_HomRef; + @DataPoint(description="Number of Het/Het/Het trios") + long HetHet_Het; + @DataPoint(description="Number of Het/Het/HomRef trios") + long HetHet_HomRef; + @DataPoint(description="Number of Het/Het/HomVar trios") + long HetHet_HomVar; + @DataPoint(description="Number of HomVar/HomVar/HomVar trios") + long HomVarHomVar_HomVar; + @DataPoint(description="Number of HomRef/HomVar/Het trios") + long HomRefHomVAR_Het; + @DataPoint(description="Number of ref alleles inherited from het/het parents") + long HetHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from het/het parents") + long HetHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homRef/het parents") + long HomRefHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homRef/het parents") + long HomRefHet_inheritedVar; + @DataPoint(description="Number of ref alleles inherited from homVar/het parents") + long HomVarHet_inheritedRef; + @DataPoint(description="Number of var alleles inherited from homVar/het parents") + long HomVarHet_inheritedVar; MendelianViolation mv; + PrintStream mvFile; + Map> families; public void initialize(VariantEvalWalker walker) { - mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + //Changed by Laurent Francioli - 2011-06-07 + //mv = new MendelianViolation(walker.getFamilyStructure(), walker.getMendelianViolationQualThreshold()); + mv = new MendelianViolation(walker.getMendelianViolationQualThreshold(),false); + families = walker.getSampleDB().getFamilies(); } public boolean enabled() { @@ -75,110 +146,48 @@ public class MendelianViolationEvaluator extends VariantEvaluator { public String update1(VariantContext vc, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if (vc.isBiallelic() && vc.hasGenotypes()) { // todo -- currently limited to biallelic loci - if (mv.setAlleles(vc)) { + + if(mv.countViolations(families,vc)>0){ + nLociViolations++; + nViolations += mv.getViolationsCount(); + mvRefRef_Var += mv.getParentsRefRefChildVar(); + mvRefRef_Het += mv.getParentsRefRefChildHet(); + mvRefHet_Var += mv.getParentsRefHetChildVar(); + mvRefVar_Var += mv.getParentsRefVarChildVar(); + mvRefVar_Ref += mv.getParentsRefVarChildRef(); + mvVarHet_Ref += mv.getParentsVarHetChildRef(); + mvVarVar_Ref += mv.getParentsVarVarChildRef(); + mvVarVar_Het += mv.getParentsVarVarChildHet(); + + } + HomRefHomRef_HomRef += mv.getRefRefRef(); + HetHet_Het += mv.getHetHetHet(); + HetHet_HomRef += mv.getHetHetHomRef(); + HetHet_HomVar += mv.getHetHetHomVar(); + HomVarHomVar_HomVar += mv.getVarVarVar(); + HomRefHomVAR_Het += mv.getRefVarHet(); + HetHet_inheritedRef += mv.getParentsHetHetInheritedRef(); + HetHet_inheritedVar += mv.getParentsHetHetInheritedVar(); + HomRefHet_inheritedRef += mv.getParentsRefHetInheritedRef(); + HomRefHet_inheritedVar += mv.getParentsRefHetInheritedVar(); + HomVarHet_inheritedRef += mv.getParentsVarHetInheritedRef(); + HomVarHet_inheritedVar += mv.getParentsVarHetInheritedVar(); + + if(mv.getFamilyCalledCount()>0){ nVariants++; - - Genotype momG = vc.getGenotype(mv.getSampleMom()); - Genotype dadG = vc.getGenotype(mv.getSampleDad()); - Genotype childG = vc.getGenotype(mv.getSampleChild()); - - if (mv.isViolation()) { - nViolations++; - - String label; - if (childG.isHomRef() && (momG.isHomVar() || dadG.isHomVar())) { - label = "KidHomRef_ParentHomVar"; - KidHomRef_ParentHomVar++; - } else if (childG.isHet() && (momG.isHomRef() && dadG.isHomRef())) { - label = "KidHet_ParentsHomRef"; - KidHet_ParentsHomRef++; - } else if (childG.isHet() && (momG.isHomVar() && dadG.isHomVar())) { - label = "KidHet_ParentsHomVar"; - KidHet_ParentsHomVar++; - } else if (childG.isHomVar() && (momG.isHomRef() || dadG.isHomRef())) { - label = "KidHomVar_ParentHomRef"; - KidHomVar_ParentHomRef++; - } else { - throw new ReviewedStingException("BUG: unexpected child genotype class " + childG); - } - - return "MendelViolation=" + label; - } + nFamCalled += mv.getFamilyCalledCount(); + nLowQual += mv.getFamilyLowQualsCount(); + nNoCall += mv.getFamilyNoCallCount(); + nVarFamCalled += mv.getVarFamilyCalledCount(); } - } - - return null; // we don't capture any intersting sites - } - - -/* - private double getQThreshold() { - //return getVEWalker().MENDELIAN_VIOLATION_QUAL_THRESHOLD / 10; // we aren't 10x scaled in the GATK a la phred - return mendelianViolationQualThreshold / 10; // we aren't 10x scaled in the GATK a la phred - //return 0.0; - } - - TrioStructure trio; - double mendelianViolationQualThreshold; - - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - - public static class TrioStructure { - public String mom, dad, child; - } - - public static TrioStructure parseTrioDescription(String family) { - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - TrioStructure trio = new TrioStructure(); - //System.out.printf("Found a family pattern: %s%n", parent.FAMILY_STRUCTURE); - trio.mom = m.group(1); - trio.dad = m.group(2); - trio.child = m.group(3); - return trio; - } else { - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); - } - } - - public void initialize(VariantEvalWalker walker) { - trio = parseTrioDescription(walker.getFamilyStructure()); - mendelianViolationQualThreshold = walker.getMendelianViolationQualThreshold(); - } - - private boolean includeGenotype(Genotype g) { - return g.getLog10PError() > getQThreshold() && g.isCalled(); - } - - public static boolean isViolation(VariantContext vc, Genotype momG, Genotype dadG, Genotype childG) { - return isViolation(vc, momG.getAlleles(), dadG.getAlleles(), childG.getAlleles()); - } - - public static boolean isViolation(VariantContext vc, TrioStructure trio ) { - return isViolation(vc, vc.getGenotype(trio.mom), vc.getGenotype(trio.dad), vc.getGenotype(trio.child) ); - } - - public static boolean isViolation(VariantContext vc, List momA, List dadA, List childA) { - //VariantContext momVC = vc.subContextFromGenotypes(momG); - //VariantContext dadVC = vc.subContextFromGenotypes(dadG); - int i = 0; - Genotype childG = new Genotype("kidG", childA); - for (Allele momAllele : momA) { - for (Allele dadAllele : dadA) { - if (momAllele.isCalled() && dadAllele.isCalled()) { - Genotype possibleChild = new Genotype("possibleGenotype" + i, Arrays.asList(momAllele, dadAllele)); - if (childG.sameGenotype(possibleChild)) { - return false; - } - } + else{ + nSkipped++; } + + + return null; } - return true; + return null; // we don't capture any interesting sites } - - -*/ - - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index b0016ff4b..d20fb54aa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -26,9 +26,10 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.variantcontext.*; @@ -41,7 +42,6 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import java.io.File; -import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.*; @@ -180,7 +180,7 @@ import java.util.*; * * */ -public class SelectVariants extends RodWalker { +public class SelectVariants extends RodWalker implements TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); /** @@ -282,6 +282,9 @@ public class SelectVariants extends RodWalker { @Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false) private double fractionRandom = 0; + @Argument(fullName="remove_fraction_genotypes", shortName="fractionGenotypes", doc="Selects a fraction (a number between 0 and 1) of the total genotypes at random from the variant track and sets them to nocall", required=false) + private double fractionGenotypes = 0; + /** * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. * When specified one or more times, a particular type of variant is selected. @@ -325,7 +328,7 @@ public class SelectVariants extends RodWalker { private boolean DISCORDANCE_ONLY = false; private boolean CONCORDANCE_ONLY = false; - private Set mvSet = new HashSet(); + private MendelianViolation mv; /* variables used by the SELECT RANDOM modules */ @@ -344,6 +347,8 @@ public class SelectVariants extends RodWalker { private PrintStream outMVFileStream = null; + //Random number generator for the genotypes to remove + private Random randomGenotypes = new Random(); /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher @@ -380,8 +385,6 @@ public class SelectVariants extends RodWalker { for ( String sample : samples ) logger.info("Including sample '" + sample + "'"); - - // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include if (TYPES_TO_INCLUDE.isEmpty()) { @@ -421,29 +424,7 @@ public class SelectVariants extends RodWalker { if (CONCORDANCE_ONLY) logger.info("Selecting only variants concordant with the track: " + concordanceTrack.getName()); if (MENDELIAN_VIOLATIONS) { - if ( FAMILY_STRUCTURE_FILE != null) { - try { - for ( final String line : new XReadLines( FAMILY_STRUCTURE_FILE ) ) { - MendelianViolation mv = new MendelianViolation(line, MENDELIAN_VIOLATION_QUAL_THRESHOLD); - if (samples.contains(mv.getSampleChild()) && samples.contains(mv.getSampleDad()) && samples.contains(mv.getSampleMom())) - mvSet.add(mv); - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); - } - if (outMVFile != null) - try { - outMVFileStream = new PrintStream(outMVFile); - } - catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); } - } - else - mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); - } - else if (!FAMILY_STRUCTURE.isEmpty()) { - mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); - MENDELIAN_VIOLATIONS = true; + mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); } SELECT_RANDOM_NUMBER = numRandom > 0; @@ -479,26 +460,26 @@ public class SelectVariants extends RodWalker { } for (VariantContext vc : vcs) { - if (MENDELIAN_VIOLATIONS) { - boolean foundMV = false; - for (MendelianViolation mv : mvSet) { - if (mv.isViolation(vc)) { - foundMV = true; - //System.out.println(vc.toString()); - if (outMVFile != null) - outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + + if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1) + break; + + if (outMVFile != null){ + for( String familyId : mv.getViolationFamilies()){ + for(Sample sample : this.getSampleDB().getFamily(familyId)){ + if(sample.getParents().size() > 0){ + outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)), - mv.getSampleMom(), mv.getSampleDad(), mv.getSampleChild(), - vc.getGenotype(mv.getSampleMom()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), - vc.getGenotype(mv.getSampleDad()).toBriefString(), vc.getGenotype(mv.getSampleMom()).getLikelihoods().getAsString(), - vc.getGenotype(mv.getSampleChild()).toBriefString(),vc.getGenotype(mv.getSampleChild()).getLikelihoods().getAsString() ); + sample.getMaternalID(), sample.getPaternalID(), sample.getID(), + vc.getGenotype(sample.getMaternalID()).toBriefString(), vc.getGenotype(sample.getMaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getPaternalID()).toBriefString(), vc.getGenotype(sample.getPaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getID()).toBriefString(),vc.getGenotype(sample.getID()).getLikelihoods().getAsString() ); + + } } } - - if (!foundMV) - break; } + if (DISCORDANCE_ONLY) { Collection compVCs = tracker.getValues(discordanceTrack, context.getLocation()); if (!isDiscordant(vc, compVCs)) @@ -629,6 +610,11 @@ public class SelectVariants extends RodWalker { @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; + } + public void onTraversalDone(Integer result) { logger.info(result + " records processed."); @@ -657,9 +643,31 @@ public class SelectVariants extends RodWalker { final VariantContext sub = vc.subContextFromSamples(samples, vc.getAlleles()); VariantContextBuilder builder = new VariantContextBuilder(sub); + GenotypesContext newGC = sub.getGenotypes(); + // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - builder.genotypes(VariantContextUtils.stripPLs(vc.getGenotypes())); + newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + + //Remove a fraction of the genotypes if needed + if(fractionGenotypes>0){ + ArrayList genotypes = new ArrayList(); + for ( Genotype genotype : newGC ) { + //Set genotype to no call if it falls in the fraction. + if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = new ArrayList(2); + alleles.add(Allele.create((byte)'.')); + alleles.add(Allele.create((byte)'.')); + genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap(),false)); + } + else{ + genotypes.add(genotype); + } + } + newGC = GenotypesContext.create(genotypes); + } + + builder.genotypes(newGC); int depth = 0; for (String sample : sub.getSampleNames()) { diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java index d0579fc25..b9c209e69 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java @@ -1,147 +1,394 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Collection; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.*; /** - * User: carneiro + * User: carneiro / lfran * Date: 3/9/11 * Time: 12:38 PM + * + * Class for the identification and tracking of mendelian violation. It can be used in 2 distinct ways: + * - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while + * walking over the variants + * - Or using the static methods to directly get information about mendelian violation in a family at a given locus + * */ public class MendelianViolation { - String sampleMom; - String sampleDad; - String sampleChild; + //List of families with violations + private List violationFamilies; - List allelesMom; - List allelesDad; - List allelesChild; + //Call information + private int nocall = 0; + private int familyCalled = 0; + private int varFamilyCalled = 0; + private int lowQual = 0; - double minGenotypeQuality; + private boolean allCalledOnly = true; + + //Stores occurrences of inheritance + private EnumMap>> inheritance; + + private int violations_total=0; + + private double minGenotypeQuality; + + private boolean abortOnSampleNotFound; + + //Number of families with genotype information for all members + public int getFamilyCalledCount(){ + return familyCalled; + } + + //Number of families with genotype information for all members + public int getVarFamilyCalledCount(){ + return varFamilyCalled; + } + + //Number of families missing genotypes for one or more of their members + public int getFamilyNoCallCount(){ + return nocall; + } + + //Number of families with genotypes below the set quality threshold + public int getFamilyLowQualsCount(){ + return lowQual; + } + + public int getViolationsCount(){ + return violations_total; + } + + //Count of alt alleles inherited from het parents (no violation) + public int getParentHetInheritedVar(){ + return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar(); + } + + //Count of ref alleles inherited from het parents (no violation) + public int getParentHetInheritedRef(){ + return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef(); + } + + //Count of HomRef/HomRef/HomRef trios + public int getRefRefRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + } + + //Count of HomVar/HomVar/HomVar trios + public int getVarVarVar(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + } + + //Count of HomRef/HomVar/Het trios + public int getRefVarHet(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + } + + //Count of Het/Het/Het trios + public int getHetHetHet(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET); + } + + //Count of Het/Het/HomRef trios + public int getHetHetHomRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + } + + //Count of Het/Het/HomVar trios + public int getHetHetHomVar(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + } + + //Count of ref alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) + + 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + //return parentsHetHet_childRef; + } + + //Count of var alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedVar(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HET) + + 2*inheritance.get(Genotype.Type.HET).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR); + //return parentsHetHet_childVar; + } + + //Count of ref alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + //return parentsHomRefHet_childRef; + } + + //Count of var alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + //return parentsHomRefHet_childVar; + } + + //Count of ref alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedRef(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HET) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + //return parentsHomVarHet_childRef; + } + + //Count of var alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedVar(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR); + //return parentsHomVarHet_childVar; + } + + //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR + public int getParentsRefRefChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_REF -> HET + public int getParentsRefRefChildHet(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF).get(Genotype.Type.HET); + } + + //Count of violations of the type HOM_REF/HET -> HOM_VAR + public int getParentsRefHetChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR + public int getParentsRefVarChildVar(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF + public int getParentsRefVarChildRef(){ + return inheritance.get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF).get(Genotype.Type.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HET -> HOM_REF + public int getParentsVarHetChildRef(){ + return inheritance.get(Genotype.Type.HET).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF) + + inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET).get(Genotype.Type.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF + public int getParentsVarVarChildRef(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HET + public int getParentsVarVarChildHet(){ + return inheritance.get(Genotype.Type.HOM_VAR).get(Genotype.Type.HOM_VAR).get(Genotype.Type.HET); + } + + + //Count of violations of the type HOM_VAR/? -> HOM_REF + public int getParentVarChildRef(){ + return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef(); + } + + //Count of violations of the type HOM_REF/? -> HOM_VAR + public int getParentRefChildVar(){ + return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar(); + } + + //Returns a String containing all trios where a Mendelian violation was observed. + //The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..." + public String getViolationFamiliesString(){ + if(violationFamilies.isEmpty()) + return ""; + + Iterator it = violationFamilies.iterator(); + String violationFams = it.next(); + while(it.hasNext()){ + violationFams += ","+it.next(); + } + return violationFams; + } + + public List getViolationFamilies(){ + return violationFamilies; + } static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; - private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)"); - - public String getSampleMom() { - return sampleMom; - } - public String getSampleDad() { - return sampleDad; - } - public String getSampleChild() { - return sampleChild; - } public double getMinGenotypeQuality() { return minGenotypeQuality; } - /** - * - * @param sampleMomP - sample name of mom - * @param sampleDadP - sample name of dad - * @param sampleChildP - sample name of child - */ - public MendelianViolation (String sampleMomP, String sampleDadP, String sampleChildP) { - sampleMom = sampleMomP; - sampleDad = sampleDadP; - sampleChild = sampleChildP; - } - - /** - * - * @param family - the sample names string "mom+dad=child" + /** + * Constructor * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - */ - public MendelianViolation(String family, double minGenotypeQualityP) { - minGenotypeQuality = minGenotypeQualityP; - - Matcher m = FAMILY_PATTERN.matcher(family); - if (m.matches()) { - sampleMom = m.group(1); - sampleDad = m.group(2); - sampleChild = m.group(3); - } - else - throw new IllegalArgumentException("Malformatted family structure string: " + family + " required format is mom+dad=child"); - } - - /** - * An alternative to the more general constructor if you want to get the Sample information from the engine yourself. - * @param sample - the sample object extracted from the sample metadata YAML file given to the engine. - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to assess mendelian violation - */ - public MendelianViolation(Sample sample, double minGenotypeQualityP) { - sampleMom = sample.getMother().getID(); - sampleDad = sample.getFather().getID(); - sampleChild = sample.getID(); - minGenotypeQuality = minGenotypeQualityP; - } - - /** - * This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to - * isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site - * before evaluating it for mendelian violation, you can call setAlleles and then isViolation(). - * @param vc - the variant context to extract the genotypes and alleles for mom, dad and child. - * @return false if couldn't find the genotypes or context has empty alleles. True otherwise. - */ - public boolean setAlleles (VariantContext vc) - { - Genotype gMom = vc.getGenotypes(sampleMom).get(sampleMom); - Genotype gDad = vc.getGenotypes(sampleDad).get(sampleDad); - Genotype gChild = vc.getGenotypes(sampleChild).get(sampleChild); - - if (gMom == null || gDad == null || gChild == null) - throw new IllegalArgumentException(String.format("Variant %s:%d didn't contain genotypes for all family members: mom=%s dad=%s child=%s", vc.getChr(), vc.getStart(), sampleMom, sampleDad, sampleChild)); - - if (gMom.isNoCall() || gDad.isNoCall() || gChild.isNoCall() || - gMom.getPhredScaledQual() < minGenotypeQuality || - gDad.getPhredScaledQual() < minGenotypeQuality || - gChild.getPhredScaledQual() < minGenotypeQuality ) { - - return false; - } - - allelesMom = gMom.getAlleles(); - allelesDad = gDad.getAlleles(); - allelesChild = gChild.getAlleles(); - return !allelesMom.isEmpty() && !allelesDad.isEmpty() && !allelesChild.isEmpty(); - } - - - /** * + */ + public MendelianViolation(double minGenotypeQualityP) { + this(minGenotypeQualityP,true); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + * @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + allCalledOnly = completeTriosOnly; + } + + /** + * @param families the families to be checked for Mendelian violations * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. - * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation. - * - */ - public boolean isViolation(VariantContext vc) - { - return setAlleles(vc) && isViolation(); - } - - /** * @return whether or not there is a mendelian violation at the site. */ - public boolean isViolation() { - if (allelesMom.contains(allelesChild.get(0)) && allelesDad.contains(allelesChild.get(1)) || - allelesMom.contains(allelesChild.get(1)) && allelesDad.contains(allelesChild.get(0))) - return false; - return true; + public int countViolations(Map> families, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + + for(Set family : families.values()){ + Iterator sampleIterator = family.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() > 0) + updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc); + } + } + return violations_total; + } + + public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc); + return violations_total>0; + } + + + private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){ + + int count; + Genotype gMom = vc.getGenotype(motherId); + Genotype gDad = vc.getGenotype(fatherId); + Genotype gChild = vc.getGenotype(childId); + + if (gMom == null || gDad == null || gChild == null){ + if(abortOnSampleNotFound) + throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s family=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId)); + else + return; + } + //Count No calls + if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){ + nocall++; + } + else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){ + nocall++; + } + //Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned + else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality || + gDad.getPhredScaledQual() < minGenotypeQuality || + gChild.getPhredScaledQual() < minGenotypeQuality )) { + lowQual++; + } + else{ + //Count all families per loci called + familyCalled++; + //If the family is all homref, not too interesting + if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef())) + { + varFamilyCalled++; + if(isViolation(gMom, gDad, gChild)){ + violationFamilies.add(familyId); + violations_total++; + } + } + count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType()); + inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1); + + } + } + + private boolean isViolation(Genotype gMom, Genotype gDad, Genotype gChild) { + //1 parent is no "call + if(!gMom.isCalled()){ + return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef()); + } + else if(!gDad.isCalled()){ + return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef()); + } + //Both parents have genotype information + return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && gDad.getAlleles().contains(gChild.getAlleles().get(1)) || + gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0))); + } + + private void createInheritanceMap(){ + + inheritance = new EnumMap>>(Genotype.Type.class); + for(Genotype.Type mType : Genotype.Type.values()){ + inheritance.put(mType, new EnumMap>(Genotype.Type.class)); + for(Genotype.Type dType : Genotype.Type.values()){ + inheritance.get(mType).put(dType, new EnumMap(Genotype.Type.class)); + for(Genotype.Type cType : Genotype.Type.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + + } + + private void clearInheritanceMap(){ + for(Genotype.Type mType : Genotype.Type.values()){ + for(Genotype.Type dType : Genotype.Type.values()){ + for(Genotype.Type cType : Genotype.Type.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } } /** * @return the likelihood ratio for a mendelian violation */ - public double violationLikelihoodRatio(VariantContext vc) { + public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) { double[] logLikAssignments = new double[27]; // the matrix to set up is // MOM DAD CHILD @@ -152,9 +399,9 @@ public class MendelianViolation { // AA AB | AB // |- BB // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs - double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector(); - double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector(); - double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector(); + double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector(); + double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector(); + double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector(); int offset = 0; for ( int oMom = 0; oMom < 3; oMom++ ) { for ( int oDad = 0; oDad < 3; oDad++ ) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java index d498ee61a..7f21da4f4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -27,11 +27,42 @@ public class SampleDBUnitTest extends BaseTest { new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED))); + private static final Set testPEDFamilyF2 = new HashSet(Arrays.asList( + new Sample("s2", "fam2", "d2", "m2", Gender.FEMALE, Affection.AFFECTED), + new Sample("d2", "fam2", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("m2", "fam2", null, null, Gender.FEMALE, Affection.UNKNOWN) + )); + + private static final Set testPEDFamilyF3 = new HashSet(Arrays.asList( + new Sample("s1", "fam3", "d1", "m1", Gender.FEMALE, Affection.AFFECTED), + new Sample("d1", "fam3", null, null, Gender.FEMALE, Affection.UNKNOWN), + new Sample("m1", "fam3", null, null, Gender.FEMALE, Affection.UNKNOWN) + )); + private static final Set testSAMSamples = new HashSet(Arrays.asList( new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN))); + private static final HashMap> testGetFamilies = new HashMap>(); + static { + testGetFamilies.put("fam1", testPEDSamples); + testGetFamilies.put("fam2", testPEDFamilyF2); + testGetFamilies.put("fam3", testPEDFamilyF3); + } + + private static final Set testKidsWithParentsFamilies2 = new HashSet(Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("kid3", "fam5", "dad2", "mom2", Gender.MALE, Affection.AFFECTED), + new Sample("kid2", "fam5", "dad2", "mom2", Gender.MALE, Affection.AFFECTED))); + + private static final HashSet testGetPartialFamiliesIds = new HashSet(Arrays.asList("kid","s1")); + private static final HashMap> testGetPartialFamilies = new HashMap>(); + static { + testGetPartialFamilies.put("fam1", new HashSet(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED)))); + testGetPartialFamilies.put("fam3", new HashSet(Arrays.asList(new Sample("s1", "fam3", "d1", "m1", Gender.FEMALE, Affection.AFFECTED)))); + } + private static final String testPEDString = String.format("%s%n%s%n%s", "fam1 kid dad mom 1 2", @@ -46,6 +77,18 @@ public class SampleDBUnitTest extends BaseTest { "fam3 s1 d1 m1 2 2", "fam2 s2 d2 m2 2 2"); + private static final String testPEDMultipleFamilies2 = + String.format("%s%n%s%n%s%n%s%n%s%n%s%n%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2", + "fam4 kid4 dad4 0 1 2", + "fam4 dad4 0 0 1 1", + "fam5 kid2 dad2 mom2 1 2", + "fam5 kid3 dad2 mom2 1 2", + "fam5 dad2 0 0 1 1", + "fam5 mom2 0 0 2 2"); + private static final String testPEDStringInconsistentGender = "fam1 kid 0 0 2 2"; @@ -138,6 +181,25 @@ public class SampleDBUnitTest extends BaseTest { Assert.assertEquals(db.getFamily("fam1"), testPEDSamplesAsSet); } + @Test() + public void getFamilies(){ + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamilies(),testGetFamilies); + Assert.assertEquals(db.getFamilies(null),testGetFamilies); + Assert.assertEquals(db.getFamilies(testGetPartialFamiliesIds),testGetPartialFamilies); + } + + @Test() + public void testGetChildrenWithParents() + { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies2)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getChildrenWithParents(), testKidsWithParentsFamilies2); + Assert.assertEquals(db.getChildrenWithParents(false), testKidsWithParentsFamilies2); + Assert.assertEquals(db.getChildrenWithParents(true), new HashSet(Arrays.asList(new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED)))); + } + @Test() public void loadFamilyIDs() { builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java new file mode 100644 index 000000000..9640a8963 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -0,0 +1,104 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + + +public class ExactAFCalculationModelUnitTest extends BaseTest { + + static double[] AA1, AB1, BB1; + static double[] AA2, AB2, AC2, BB2, BC2, CC2; + static final int numSamples = 3; + static double[][] priors = new double[2][2*numSamples+1]; // flat priors + + @BeforeSuite + public void before() { + AA1 = new double[]{0.0, -20.0, -20.0}; + AB1 = new double[]{-20.0, 0.0, -20.0}; + BB1 = new double[]{-20.0, -20.0, 0.0}; + AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; + AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; + BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; + BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; + CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; + } + + private class GetGLsTest extends TestDataProvider { + GenotypesContext GLs; + int numAltAlleles; + String name; + + private GetGLsTest(String name, int numAltAlleles, Genotype... arg) { + super(GetGLsTest.class, name); + GLs = GenotypesContext.create(arg); + this.name = name; + this.numAltAlleles = numAltAlleles; + } + + public String toString() { + return String.format("%s input=%s", super.toString(), GLs); + } + } + + private static Genotype createGenotype(String name, double[] gls) { + return new Genotype(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), Genotype.NO_LOG10_PERROR, gls); + } + + @DataProvider(name = "getGLs") + public Object[][] createGLsData() { + + // bi-allelic case + new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1)); + new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1)); + new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1)); + new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1)); + new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1)); + new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1)); + new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1)); + new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1)); + + // tri-allelic case + new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2)); + new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2)); + new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2)); + new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2)); + new GetGLsTest("B2C1", 2, createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2)); + new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2)); + new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2)); + + return GetGLsTest.getTests(GetGLsTest.class); + } + + + @Test(dataProvider = "getGLs") + public void testGLs(GetGLsTest cfg) { + + final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1]; + final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1]; + for ( int i = 0; i < 2; i++ ) { + for ( int j = 0; j < 2*numSamples+1; j++ ) { + log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + } + } + + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false); + + int nameIndex = 1; + for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { + int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); + int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]); + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 34e1ad30e..4ae00431c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,7 +7,6 @@ import org.testng.annotations.Test; import java.util.Arrays; import java.util.HashMap; -import java.util.List; import java.util.Map; // ********************************************************************************** // @@ -29,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("286f0de92e4ce57986ba861390c6019d")); + Arrays.asList("a2d3839c4ebb390b0012d495e4e53b3a")); executeTest("test MultiSample Pilot1", spec); } @@ -45,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d0593483e85a7d815f4c5ee6db284d2a")); + Arrays.asList("43e7a17d95b1a0cf72e669657794d802")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -53,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("3ccce5d909f8f128e496f6841836e5f7")); + Arrays.asList("ae29b9c9aacce8046dc780430540cd62")); executeTest("test SingleSample Pilot2", spec); } @@ -63,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864"; + private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65"; @Test public void testCompressedOutput() { @@ -84,7 +83,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "95614280c565ad90f8c000376fef822c"; + String md5 = "32a34362dff51d8b73a3335048516d82"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -165,8 +164,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" ); - e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" ); + e.put( 0.01, "2cb2544739e01f6c08fd820112914317" ); + e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -276,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd")); + Arrays.asList("45633d905136c86e9d3f90ce613255e5")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -286,7 +285,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("f93f8a35b47bcf96594ada55e2312c73")); + Arrays.asList("98a4d1e1e0a363ba37518563ac6cbead")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -295,8 +294,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("9be28cb208d8b0314d2bc2696e2fd8d4")); - executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); + Arrays.asList("915e7a3e7cbfd995dbc41fdd382d0d51")); + executeTest("test MultiSample Phase1 indels with complicated records", spec4); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 17a813a70..4e3d38c4f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -291,6 +291,17 @@ public class VariantEvalIntegrationTest extends WalkerTest { executeTestParallel("testVEGenotypeConcordance" + vcfFile, spec); } + @Test + public void testVEMendelianViolationEvaluator() { + String vcfFile = "/MendelianViolationEval.vcf"; + String pedFile = "/MendelianViolationEval.ped"; + + WalkerTestSpec spec = new WalkerTestSpec("-T VariantEval -R "+b37KGReference+" --eval " + variantEvalTestDataRoot + vcfFile + " -ped "+ variantEvalTestDataRoot + pedFile +" -noEV -EV MendelianViolationEvaluator -L 1:10109-10315 -o %s -mvq 0 -noST", + 1, + Arrays.asList("66e72c887124f40933d32254b2dd44a3")); + executeTestParallel("testVEMendelianViolationEvaluator" + vcfFile, spec); + } + @Test public void testCompVsEvalAC() { String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 6e994be3a..72a07bd0e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -115,4 +115,26 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testUsingDbsnpName--" + testFile, spec); } + + @Test + public void testParallelization() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + WalkerTestSpec spec; + + spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), + 1, + Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") + ); + executeTest("testParallelization (2 threads)--" + testfile, spec); + + spec = new WalkerTestSpec( + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), + 1, + Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") + ); + + executeTest("testParallelization (4 threads)--" + testfile, spec); + } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index ccbe648d6..621afe817 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -83,6 +83,10 @@ class DataProcessingPipeline extends QScript { @Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) var defaultPlatform: String = "" + @Hidden + @Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) + var testMode: Boolean = false + /**************************************************************************** * Global Variables @@ -335,6 +339,7 @@ class DataProcessingPipeline extends QScript { this.known ++= qscript.indels this.consensusDeterminationModel = cleanModelEnum this.compress = 0 + this.noPGTag = qscript.testMode; this.scatterCount = nContigs this.analysisName = queueLogDir + outBam + ".clean" this.jobName = queueLogDir + outBam + ".clean" @@ -360,6 +365,7 @@ class DataProcessingPipeline extends QScript { this.out = outBam if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString) else if (qscript.intervals != null) this.intervals :+= qscript.intervals + this.no_pg_tag = qscript.testMode this.scatterCount = nContigs this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index 6f5dae2f8..4896eaed3 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -47,6 +47,10 @@ class PacbioProcessingPipeline extends QScript { @Input(shortName="bwastring", required=false) var bwastring: String = "" + @Hidden + @Input(shortName = "test", fullName = "test_mode", required = false) + var testMode: Boolean = false + val queueLogDir: String = ".qlog/" def script = { @@ -170,6 +174,7 @@ class PacbioProcessingPipeline extends QScript { this.input_file :+= inBam this.recal_file = inRecalFile this.out = outBam + this.no_pg_tag = testMode this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" @@ -177,7 +182,6 @@ class PacbioProcessingPipeline extends QScript { } case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { - this.resources = R this.recal_file = inRecalFile this.output_dir = outPath this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala new file mode 100644 index 000000000..7e1d09b70 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -0,0 +1,69 @@ +package org.broadinstitute.sting.queue.pipeline + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.BaseTest + +class DataProcessingPipelineTest { + @Test + def testSimpleBAM { + val projectName = "test1" + val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" + val spec = new PipelineTestSpec + spec.name = "DataProcessingPipeline" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -i " + BaseTest.testDir + "exampleBAM.bam", + " -D " + BaseTest.testDir + "exampleDBSNP.vcf", + " -nv ", + " -test ", + " -p " + projectName).mkString + spec.fileMD5s += testOut -> "1f85e76de760167a77ed1d9ab4da2936" + PipelineTest.executeTest(spec) + } + + @Test + def testBWAPEBAM { + val projectName = "test2" + val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" + val spec = new PipelineTestSpec + spec.name = "DataProcessingPipeline" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -i " + BaseTest.testDir + "exampleBAM.bam", + " -D " + BaseTest.testDir + "exampleDBSNP.vcf", + " -nv ", + " -test ", + " -bwa /home/unix/carneiro/bin/bwa", + " -bwape ", + " -p " + projectName).mkString + spec.fileMD5s += testOut -> "57416a0abdf9524bc92834d466529708" + PipelineTest.executeTest(spec) + } + +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala new file mode 100644 index 000000000..50aa66367 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.queue.pipeline + +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +import org.testng.annotations.Test +import org.broadinstitute.sting.BaseTest + +class PacbioProcessingPipelineTest { + @Test + def testBAM { + val testOut = "exampleBAM.recal.bam" + val spec = new PipelineTestSpec + spec.name = "pacbioProcessingPipeline" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -i " + BaseTest.testDir + "exampleBAM.bam", + " -blasr ", + " -test ", + " -D " + BaseTest.testDir + "exampleDBSNP.vcf").mkString + spec.fileMD5s += testOut -> "f0adce660b55cb91d5f987f9a145471e" + PipelineTest.executeTest(spec) + } +} diff --git a/public/testdata/exampleBAM.bam b/public/testdata/exampleBAM.bam index a6ebb6fd1..319dd1a72 100644 Binary files a/public/testdata/exampleBAM.bam and b/public/testdata/exampleBAM.bam differ diff --git a/public/testdata/exampleBAM.bam.bai b/public/testdata/exampleBAM.bam.bai index cc6e1a145..052ac614b 100644 Binary files a/public/testdata/exampleBAM.bam.bai and b/public/testdata/exampleBAM.bam.bai differ diff --git a/public/testdata/exampleDBSNP.vcf b/public/testdata/exampleDBSNP.vcf new file mode 100644 index 000000000..9e7e96f51 --- /dev/null +++ b/public/testdata/exampleDBSNP.vcf @@ -0,0 +1,282 @@ +##fileformat=VCFv4.1 +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO=5% minor allele frequency in 1+ populations"> +##INFO=5% minor allele frequency in each and all populations"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO=SubSNP->Batch.link_out"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##LeftAlignVariants="analysis_type=LeftAlignVariants input_file=[] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false disable_experimental_low_memory_sharding=false logging_level=INFO log_to_file=null help=false variant=(RodBinding name=variant source=00-All.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub filter_mismatching_base_and_quals=false" +##contig= +##phasing=partial +##reference=GRCh37.3 +##reference=file:///humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37.fasta +##source=dbSNP +##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 10144 rs144773400 TA T . PASS ASP;RSPOS=10145;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10234 rs145599635 C T . PASS ASP;RSPOS=10234;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 10248 rs148908337 A T . PASS ASP;RSPOS=10248;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 10254 rs140194106 TA T . PASS ASP;RSPOS=10255;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10291 rs145427775 C T . PASS ASP;RSPOS=10291;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 10327 rs112750067 T C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10327;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132 +chr1 10329 rs150969722 AC A . PASS ASP;RSPOS=10330;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10351 rs145072688 CTA C,CA . PASS ASP;RSPOS=10352;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10382 rs147093981 AAC A,AC . PASS ASP;RSPOS=10383;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10433 rs56289060 A AC . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10433;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 10439 rs112766696 AC A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10440;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=132 +chr1 10439 rs138941843 AC A . PASS ASP;RSPOS=10440;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=134 +chr1 10440 rs112155239 C A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10440;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132 +chr1 10492 rs55998931 C T . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.0617001828153565;RSPOS=10492;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040000000100;WGT=0;dbSNPBuildID=129 +chr1 10519 rs62636508 G C . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10519;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129 +chr1 10583 rs58108140 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GMAF=0.270566727605119;KGPilot123;RSPOS=10583;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=129 +chr1 10611 rs189107123 C G . PASS KGPilot123;RSPOS=10611;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 10828 rs10218492 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10828;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119 +chr1 10904 rs10218493 G A . PASS ASP;GENEINFO=LOC100652771:100652771;GNO;RSPOS=10904;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=119 +chr1 10927 rs10218527 A G . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10927;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=119 +chr1 10938 rs28853987 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=10938;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125 +chr1 11014 rs28484712 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11014;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125 +chr1 11022 rs28775022 G A . PASS ASP;GENEINFO=LOC100652771:100652771;RSPOS=11022;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=125 +chr1 11081 rs10218495 G T . PASS CFL;GENEINFO=LOC100652771:100652771;GNO;RSPOS=11081;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=119 +chr1 11863 rs187669455 C A . PASS RSPOS=11863;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135 +chr1 13302 rs180734498 C T . PASS KGPilot123;RSPOS=13302;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 13327 rs144762171 G C . PASS ASP;KGPilot123;RSPOS=13327;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 13684 rs71260404 C T . PASS GENEINFO=LOC100652771:100652771;GNO;RSPOS=13684;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130 +chr1 13980 rs151276478 T C . PASS ASP;KGPilot123;RSPOS=13980;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 14889 rs142444908 G A . PASS ASP;RSPOS=14889;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 14907 rs79585140 A G . PASS GNO;RSPOS=14907;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131 +chr1 14930 rs75454623 A G . PASS GNO;RSPOS=14930;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131 +chr1 14976 rs71252251 G A . PASS ASP;GNO;RSPOS=14976;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=130 +chr1 15061 rs71268703 T TG . PASS ASP;GNO;RSPOS=15061;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130 +chr1 15118 rs71252250 A G . PASS ASP;GNO;RSPOS=15118;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=130 +chr1 15211 rs144718396 T G . PASS ASP;RSPOS=15211;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 15211 rs78601809 T G . PASS ASP;GNO;RSPOS=15211;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131 +chr1 16257 rs78588380 G C . PASS ASP;GNO;RSPOS=16257;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131 +chr1 16378 rs148220436 T C . PASS ASP;RSPOS=16378;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 16495 rs141130360 G C . PASS ASP;RSPOS=16495;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 16497 rs150723783 A G . PASS ASP;RSPOS=16497;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 17519 rs192890528 G T . PASS RSPOS=17519;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=135 +chr1 19226 rs138930629 T A . PASS ASP;RSPOS=19226;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 20141 rs56336884 G A . PASS HD;RSPOS=20141;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000400000100;WGT=0;dbSNPBuildID=129 +chr1 20144 rs143346096 G A . PASS ASP;RSPOS=20144;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 20206 rs71262675 C T . PASS GNO;RSPOS=20206;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130 +chr1 20245 rs71262674 G A . PASS GMAF=0.256398537477148;GNO;RSPOS=20245;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130 +chr1 20304 rs71262673 G C . PASS GMAF=0.338208409506399;GNO;RSPOS=20304;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000000100000100;WGT=0;dbSNPBuildID=130 +chr1 26999 rs147506580 A G . PASS ASP;RSPOS=26999;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 29436 rs2462493 G A . PASS GNO;RSPOS=29436;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=100 +chr1 30923 rs140337953 G T . PASS ASP;KGPilot123;RSPOS=30923;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 33487 rs77459554 C T . PASS ASP;GNO;RSPOS=33487;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131 +chr1 33495 rs75468675 C T . PASS ASP;GNO;RSPOS=33495;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131 +chr1 33505 rs75627161 T C . PASS ASP;GNO;RSPOS=33505;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131 +chr1 33508 rs75609629 A T . PASS ASP;GNO;RSPOS=33508;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040100000100;WGT=0;dbSNPBuildID=131 +chr1 33521 rs76098219 T A . PASS GNO;RSPOS=33521;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040100000100;WGT=0;dbSNPBuildID=131 +chr1 33593 rs557585 G A . PASS RSPOS=33593;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=83 +chr1 33648 rs62028204 G T . PASS RSPOS=33648;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=129 +chr1 33656 rs113821789 T C . PASS RSPOS=33656;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=132 +chr1 51476 rs187298206 T C . PASS KGPilot123;RSPOS=51476;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 51479 rs116400033 T A . PASS ASP;G5;G5A;GMAF=0.113802559414991;KGPilot123;RSPOS=51479;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004070010000100;WGT=0;dbSNPBuildID=132 +chr1 51803 rs62637812 T C . PASS GMAF=0.468921389396709;RSPOS=51803;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040000000100;WGT=0;dbSNPBuildID=129 +chr1 51898 rs76402894 C A . PASS GMAF=0.0731261425959781;GNO;RSPOS=51898;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=131 +chr1 51914 rs190452223 T G . PASS KGPilot123;RSPOS=51914;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 51928 rs78732933 G A . PASS GNO;RSPOS=51928;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=131 +chr1 51935 rs181754315 C T . PASS KGPilot123;RSPOS=51935;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 51954 rs185832753 G C . PASS KGPilot123;RSPOS=51954;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 52058 rs62637813 G C . PASS GMAF=0.0342778793418647;KGPilot123;RSPOS=52058;SAO=0;SSR=1;VC=SNV;VLD;VP=050000000000040010000140;WGT=0;dbSNPBuildID=129 +chr1 52144 rs190291950 T A . PASS KGPilot123;RSPOS=52144;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 52238 rs150021059 T G . PASS ASP;KGPilot123;RSPOS=52238;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 54353 rs140052487 C A . PASS ASP;KGPilot123;RSPOS=54353;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 54421 rs146477069 A G . PASS ASP;KGPilot123;RSPOS=54421;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 54490 rs141149254 G A . PASS ASP;KGPilot123;RSPOS=54490;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 54676 rs2462492 C T . PASS ASP;GMAF=0.191956124314442;GNO;HD;KGPilot123;RSPOS=54676;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040510000100;WGT=0;dbSNPBuildID=100 +chr1 54753 rs143174675 T G . PASS ASP;KGPilot123;RSPOS=54753;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 54788 rs59861892 CC C,CCT . PASS ASP;RSPOS=54789;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 54795 rs58014817 T A . PASS ASP;RSPOS=54795;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=129 +chr1 55164 rs3091274 C A . PASS G5;G5A;GMAF=0.145338208409506;GNO;KGPilot123;RSPOS=55164;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000030110000100;WGT=0;dbSNPBuildID=103 +chr1 55299 rs10399749 C T . PASS G5;G5A;GMAF=0.278793418647166;GNO;KGPilot123;PH2;RSPOS=55299;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000000030112000100;WGT=0;dbSNPBuildID=119 +chr1 55302 rs3091273 C T . PASS RSPOS=55302;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=103 +chr1 55313 rs182462964 A T . PASS KGPilot123;RSPOS=55313;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55322 rs3107974 C T . PASS RSPOS=55322;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=103 +chr1 55326 rs3107975 T C . PASS GNO;HD;KGPilot123;RSPOS=55326;SAO=0;SSR=0;VC=SNV;VP=050000000000000510000100;WGT=0;dbSNPBuildID=103 +chr1 55330 rs185215913 G A . PASS KGPilot123;RSPOS=55330;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55367 rs190850374 G A . PASS KGPilot123;RSPOS=55367;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55388 rs182711216 C T . PASS KGPilot123;RSPOS=55388;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55394 rs2949420 T A . PASS GNO;KGPilot123;PH2;RSPOS=55394;SAO=0;SSR=0;VC=SNV;VP=050000000000000112000100;WGT=0;dbSNPBuildID=101 +chr1 55416 rs193242050 G A . PASS KGPilot123;RSPOS=55416;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55427 rs183189405 T C . PASS KGPilot123;RSPOS=55427;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55545 rs28396308 C T . PASS GNO;RSPOS=55545;SAO=0;SSR=0;VC=SNV;VP=050000000000000100000100;WGT=0;dbSNPBuildID=125 +chr1 55816 rs187434873 G A . PASS KGPilot123;RSPOS=55816;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55850 rs191890754 C G . PASS KGPilot123;RSPOS=55850;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 55852 rs184233019 G C . PASS KGPilot123;RSPOS=55852;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 56644 rs143342222 A C . PASS ASP;KGPilot123;RSPOS=56644;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 57952 rs189727433 A C . PASS KGPilot123;RSPOS=57952;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 58771 rs140128481 T C . PASS ASP;RSPOS=58771;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 58814 rs114420996 G A . PASS ASP;G5;GMAF=0.0982632541133455;KGPilot123;RSPOS=58814;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004050010000100;WGT=0;dbSNPBuildID=132 +chr1 59040 rs149755937 T C . PASS ASP;KGPilot123;RSPOS=59040;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 60718 rs78395614 G A . PASS CFL;GNO;RSPOS=60718;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131 +chr1 60726 rs192328835 C A . PASS KGPilot123;RSPOS=60726;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 60791 rs76199781 A G . PASS CFL;GNO;RSPOS=60791;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131 +chr1 61442 rs74970982 A G . PASS CFL;GMAF=0.076782449725777;GNO;KGPilot123;RSPOS=61442;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131 +chr1 61462 rs56992750 T A . PASS CFL;G5;G5A;GMAF=0.0383912248628885;GNO;KGPilot123;RSPOS=61462;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008030110000100;WGT=0;dbSNPBuildID=129 +chr1 61480 rs75526266 G C . PASS CFL;GNO;RSPOS=61480;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131 +chr1 61499 rs75719746 G A . PASS CFL;GNO;RSPOS=61499;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131 +chr1 61743 rs184286948 G C . PASS KGPilot123;RSPOS=61743;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 61920 rs62637820 G A . PASS CFL;GMAF=0.0255941499085923;RSPOS=61920;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=129 +chr1 61987 rs76735897 A G . PASS CFL;GMAF=0.292961608775137;GNO;KGPilot123;RSPOS=61987;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131 +chr1 61989 rs77573425 G C . PASS CFL;GMAF=0.309414990859232;GNO;KGPilot123;RSPOS=61989;SAO=0;SSR=0;VC=SNV;VP=050000000008000110000100;WGT=0;dbSNPBuildID=131 +chr1 61993 rs190553843 C T . PASS KGPilot123;RSPOS=61993;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 62156 rs181864839 C T . PASS KGPilot123;RSPOS=62156;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 62157 rs10399597 G A . PASS CFL;GMAF=0.00228519195612431;KGPilot123;RSPOS=62157;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=119 +chr1 62162 rs140556834 G A . PASS ASP;KGPilot123;RSPOS=62162;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 62203 rs28402963 T C . PASS CFL;KGPilot123;RSPOS=62203;SAO=0;SSR=0;VC=SNV;VP=050000000008000010000100;WGT=0;dbSNPBuildID=125 +chr1 62271 rs28599927 A G . PASS CFL;GMAF=0.138482632541133;RSPOS=62271;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=125 +chr1 63268 rs75478250 T C . PASS CFL;GNO;RSPOS=63268;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=131 +chr1 63276 rs185977555 G A . PASS KGPilot123;RSPOS=63276;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 63297 rs188886746 G A . PASS KGPilot123;RSPOS=63297;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 63671 rs116440577 G A . PASS ASP;G5;GMAF=0.170018281535649;KGPilot123;RSPOS=63671;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004050010000100;WGT=0;dbSNPBuildID=132 +chr1 63737 rs77426996 TACT T,TCTA . PASS CFL;RSPOS=63738;SAO=0;SSR=0;VC=DIV;VP=050000000008000000000200;WGT=0;dbSNPBuildID=131 +chr1 64649 rs181431124 A C . PASS KGPilot123;RSPOS=64649;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 66008 rs2691286 C G . PASS CFL;GNO;RSPOS=66008;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=100 +chr1 66162 rs62639105 A T . PASS CFL;GMAF=0.320383912248629;GNO;KGPilot123;RSPOS=66162;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000110000100;WGT=0;dbSNPBuildID=129 +chr1 66176 rs28552463 T A . PASS CFL;GMAF=0.0484460694698355;KGPilot123;RSPOS=66176;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=125 +chr1 66219 rs181028663 A T . PASS KGPilot123;RSPOS=66219;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 66238 rs113961546 T A . PASS CFL;GNO;RSPOS=66238;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=132 +chr1 66314 rs28534012 T A . PASS CFL;RSPOS=66314;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=125 +chr1 66331 rs186063952 A C . PASS KGPilot123;RSPOS=66331;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 66334 rs28464214 T A . PASS CFL;RSPOS=66334;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=125 +chr1 66442 rs192044252 T A . PASS KGPilot123;RSPOS=66442;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 66457 rs13328655 T A . PASS CFL;GMAF=0.0795246800731261;KGPilot123;RSPOS=66457;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=121 +chr1 66503 rs112350669 T A . PASS CFL;RSPOS=66503;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=132 +chr1 66507 rs12401368 T A . PASS CFL;GMAF=0.479890310786106;KGPilot123;RSPOS=66507;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040010000100;WGT=0;dbSNPBuildID=120 +chr1 66651 rs2257270 A T . PASS CFL;GNO;RSPOS=66651;SAO=0;SSR=0;VC=SNV;VP=050000000008000100000100;WGT=0;dbSNPBuildID=100 +chr1 67179 rs149952626 C G . PASS ASP;KGPilot123;RSPOS=67179;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 67181 rs77662731 A G . PASS ASP;G5;G5A;GENEINFO=OR4F5:79501;GMAF=0.0470749542961609;GNO;KGPilot123;RSPOS=67181;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004070110000100;WGT=0;dbSNPBuildID=131 +chr1 67223 rs78676975 C A . PASS ASP;GENEINFO=OR4F5:79501;GNO;RSPOS=67223;SAO=0;SSR=0;VC=SNV;VP=050000000004000100000100;WGT=0;dbSNPBuildID=131 +chr1 69428 rs140739101 T G . PASS ASP;RSPOS=69428;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134 +chr1 69453 rs142004627 G A . PASS ASP;RSPOS=69453;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 69476 rs148502021 T C . PASS ASP;RSPOS=69476;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134 +chr1 69496 rs150690004 G A . PASS ASP;RSPOS=69496;S3D;SAO=0;SSR=0;VC=SNV;VLD;VP=050200000004040000000100;WGT=0;dbSNPBuildID=134 +chr1 69511 rs75062661 A G . PASS GENEINFO=OR4F5:79501;GMAF=0.193784277879342;GNO;KGPilot123;RSPOS=69511;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000000000110000100;WGT=0;dbSNPBuildID=131 +chr1 69534 rs190717287 T C . PASS KGPilot123;RSPOS=69534;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 69552 rs55874132 G C . PASS GENEINFO=OR4F5:79501;HD;RSPOS=69552;S3D;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050300000000040400000100;WGT=0;dbSNPBuildID=129 +chr1 69590 rs141776804 T A . PASS ASP;RSPOS=69590;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 69594 rs144967600 T C . PASS ASP;RSPOS=69594;S3D;SAO=0;SSR=0;VC=SNV;VP=050200000004000000000100;WGT=0;dbSNPBuildID=134 +chr1 72148 rs182862337 C T . PASS KGPilot123;RSPOS=72148;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 73841 rs143773730 C T . PASS ASP;KGPilot123;RSPOS=73841;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 74651 rs62641291 G A . PASS RSPOS=74651;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=129 +chr1 74681 rs13328683 G T . PASS CFL;GMAF=0.286106032906764;RSPOS=74681;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000008040000000100;WGT=0;dbSNPBuildID=121 +chr1 74709 rs62641292 T A . PASS CFL;RSPOS=74709;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=129 +chr1 74771 rs13328675 A G . PASS CFL;RSPOS=74771;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121 +chr1 74790 rs13328700 C G . PASS CFL;RSPOS=74790;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121 +chr1 74792 rs13328684 G A . PASS CFL;RSPOS=74792;SAO=0;SSR=0;VC=SNV;VP=050000000008000000000100;WGT=0;dbSNPBuildID=121 +chr1 77462 rs188023513 G A . PASS KGPilot123;RSPOS=77462;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 77470 rs192898053 T C . PASS KGPilot123;RSPOS=77470;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 77874 rs184538873 G A . PASS KGPilot123;RSPOS=77874;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 77961 rs78385339 G A . PASS GMAF=0.125685557586837;KGPilot123;RSPOS=77961;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000000040010000100;WGT=0;dbSNPBuildID=131 +chr1 79033 rs62641298 A G . PASS GMAF=0.438299817184644;GNO;HD;KGPilot123;RSPOS=79033;SAO=0;SSR=0;VC=SNV;VP=050000000000000510000100;WGT=0;dbSNPBuildID=129 +chr1 79050 rs62641299 G T . PASS GMAF=0.224405850091408;GNO;KGPilot123;RSPOS=79050;SAO=0;SSR=0;VC=SNV;VP=050000000000000110000100;WGT=0;dbSNPBuildID=129 +chr1 79137 rs143777184 A T . PASS ASP;KGPilot123;RSPOS=79137;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 79417 rs184768190 C T . PASS KGPilot123;RSPOS=79417;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 79418 rs2691296 G C . PASS GMAF=0.0178244972577697;RSPOS=79418;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000000040000000100;WGT=0;dbSNPBuildID=100 +chr1 79538 rs2691295 C T . PASS RSPOS=79538;RV;SAO=0;SSR=0;VC=SNV;VP=050000000000000000000100;WGT=0;dbSNPBuildID=100 +chr1 79772 rs147215883 C G . PASS ASP;KGPilot123;RSPOS=79772;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 79872 rs189224661 T G . PASS KGPilot123;RSPOS=79872;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 80323 rs3942603 G C . PASS CFL;GNO;RSPOS=80323;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000008000100000100;WGT=0;dbSNPBuildID=108 +chr1 80386 rs3878915 C A . PASS GMAF=0.0118829981718464;RSPOS=80386;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000000040000000100;WGT=0;dbSNPBuildID=108 +chr1 80454 rs144226842 G C . PASS ASP;KGPilot123;RSPOS=80454;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 81836 rs2259560 A T . PASS ASP;GNO;RSPOS=81836;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=100 +chr1 81949 rs181567186 T C . PASS KGPilot123;RSPOS=81949;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 81962 rs4030308 T TAA . PASS ASP;RSPOS=81962;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108 +chr1 82102 rs4030307 C T . PASS ASP;RSPOS=82102;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108 +chr1 82103 rs2020400 T C . PASS ASP;RSPOS=82103;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=92 +chr1 82126 rs1815133 C T . PASS ASP;RSPOS=82126;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=92 +chr1 82133 rs4030306 CA C,CAAAAAAAAAAAAAAA . PASS ASP;RSPOS=82136;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108 +chr1 82154 rs4477212 A G . PASS ASP;HD;RSPOS=82154;SAO=0;SSR=0;VC=SNV;VP=050000000004000400000100;WGT=0;dbSNPBuildID=111 +chr1 82162 rs1815132 C A . PASS ASP;GMAF=0.0351919561243144;GNO;RSPOS=82162;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=92 +chr1 82163 rs139113303 G A . PASS ASP;KGPilot123;RSPOS=82163;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 82196 rs112844054 A T . PASS ASP;RSPOS=82196;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=132 +chr1 82249 rs1851945 A G . PASS ASP;GMAF=0.0452468007312614;KGPilot123;RSPOS=82249;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000004040010000100;WGT=0;dbSNPBuildID=92 +chr1 82282 rs3871775 G A . PASS ASP;RSPOS=82282;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108 +chr1 82303 rs3871776 T C . PASS ASP;RSPOS=82303;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000000000100;WGT=0;dbSNPBuildID=108 +chr1 82316 rs4030305 A C . PASS ASP;GNO;RSPOS=82316;RV;SAO=0;SLO;SSR=0;VC=SNV;VP=050100000004000100000100;WGT=0;dbSNPBuildID=108 +chr1 82609 rs149189449 C G . PASS ASP;KGPilot123;RSPOS=82609;SAO=0;SSR=0;VC=SNV;VP=050000000004000010000100;WGT=0;dbSNPBuildID=134 +chr1 82676 rs185237834 T G . PASS KGPilot123;RSPOS=82676;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 82734 rs4030331 T C . PASS ASP;GMAF=0.261882998171846;KGPilot123;RSPOS=82734;RV;SAO=0;SLO;SSR=0;VC=SNV;VLD;VP=050100000004040010000100;WGT=0;dbSNPBuildID=108 +chr1 82957 rs189774606 C T . PASS KGPilot123;RSPOS=82957;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 83084 rs181193408 T A . PASS KGPilot123;RSPOS=83084;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 83088 rs186081601 G C . PASS KGPilot123;RSPOS=83088;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 83107 rs4405097 G C . PASS ASP;RSPOS=83107;SAO=0;SSR=0;VC=SNV;VP=050000000004000000000100;WGT=0;dbSNPBuildID=111 +chr1 83119 rs4030324 AA A,ATAAC . PASS ASP;RSPOS=83120;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000000000200;WGT=0;dbSNPBuildID=108 +chr1 83771 rs189906733 T G . PASS KGPilot123;RSPOS=83771;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 83786 rs58520670 T TA . PASS ASP;RSPOS=83794;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83815 rs58857344 GAGAA G . PASS ASP;RSPOS=83827;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83826 rs71281475 AAAGA A,AAA . PASS ASP;GNO;RSPOS=83827;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130 +chr1 83855 rs59596480 GAA G . PASS ASP;RSPOS=83857;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83872 rs59556914 AA A,AAGA . PASS ASP;RSPOS=83873;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83884 rs59586754 GAAA G . PASS ASP;RSPOS=83885;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83897 rs61330047 GAA G . PASS ASP;RSPOS=83899;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83901 rs58254183 GAAAGAA G . PASS ASP;RSPOS=83903;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83921 rs61338823 GAA G . PASS ASP;RSPOS=83923;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83930 rs71281474 AG A,AGA . PASS ASP;GNO;RSPOS=83931;RV;SAO=0;SLO;SSR=0;VC=DIV;VP=050100000004000100000200;WGT=0;dbSNPBuildID=130 +chr1 83934 rs59235392 AG A,AGAAA . PASS ASP;RSPOS=83935;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 83977 rs180759811 A G . PASS KGPilot123;RSPOS=83977;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84002 rs28850140 G A . PASS ASP;GMAF=0.138939670932358;KGPilot123;RSPOS=84002;SAO=0;SSR=0;VC=SNV;VLD;VP=050000000004040010000100;WGT=0;dbSNPBuildID=125 +chr1 84010 rs186443818 G A . PASS KGPilot123;RSPOS=84010;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84018 rs61352176 GAA G . PASS ASP;RSPOS=84020;SAO=0;SSR=0;VC=DIV;VP=050000000004000000000200;WGT=0;dbSNPBuildID=129 +chr1 84079 rs190867312 T C . PASS KGPilot123;RSPOS=84079;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84139 rs183605470 A T . PASS KGPilot123;RSPOS=84139;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84156 rs188652299 A C . PASS KGPilot123;RSPOS=84156;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84244 rs191297051 A C . PASS KGPilot123;RSPOS=84244;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84295 rs183209871 G A . PASS KGPilot123;RSPOS=84295;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84346 rs187855973 T C . PASS KGPilot123;RSPOS=84346;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84453 rs191379015 C G . PASS KGPilot123;RSPOS=84453;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 +chr1 84705 rs183470350 T G . PASS KGPilot123;RSPOS=84705;SAO=0;SSR=0;VC=SNV;VP=050000000000000010000100;WGT=0;dbSNPBuildID=135 diff --git a/public/testdata/exampleFASTA.fasta.amb b/public/testdata/exampleFASTA.fasta.amb new file mode 100644 index 000000000..986e6d603 --- /dev/null +++ b/public/testdata/exampleFASTA.fasta.amb @@ -0,0 +1 @@ +100000 1 0 diff --git a/public/testdata/exampleFASTA.fasta.ann b/public/testdata/exampleFASTA.fasta.ann new file mode 100644 index 000000000..642ddb6d7 --- /dev/null +++ b/public/testdata/exampleFASTA.fasta.ann @@ -0,0 +1,3 @@ +100000 1 11 +0 chr1 (null) +0 100000 0 diff --git a/public/testdata/exampleFASTA.fasta.bwt b/public/testdata/exampleFASTA.fasta.bwt new file mode 100644 index 000000000..fe7422280 Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.bwt differ diff --git a/public/testdata/exampleFASTA.fasta.pac b/public/testdata/exampleFASTA.fasta.pac new file mode 100644 index 000000000..b0f55c0c4 Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.pac differ diff --git a/public/testdata/exampleFASTA.fasta.rbwt b/public/testdata/exampleFASTA.fasta.rbwt new file mode 100644 index 000000000..f623b8c39 Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.rbwt differ diff --git a/public/testdata/exampleFASTA.fasta.rpac b/public/testdata/exampleFASTA.fasta.rpac new file mode 100644 index 000000000..b88ff49eb Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.rpac differ diff --git a/public/testdata/exampleFASTA.fasta.rsa b/public/testdata/exampleFASTA.fasta.rsa new file mode 100644 index 000000000..6e7e213df Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.rsa differ diff --git a/public/testdata/exampleFASTA.fasta.sa b/public/testdata/exampleFASTA.fasta.sa new file mode 100644 index 000000000..d6db971b7 Binary files /dev/null and b/public/testdata/exampleFASTA.fasta.sa differ