diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 5016526c0..70ee049f3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -85,9 +85,6 @@ public class StandardCallerArgumentCollection { @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) - public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - /** * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this @@ -199,7 +196,6 @@ public class StandardCallerArgumentCollection { this.heterozygosity = SCAC.heterozygosity; this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY; this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.OutputMode = SCAC.OutputMode; this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index f156468cc..4fae3d6e3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -214,6 +214,9 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) + public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + /** * Create a new UAC with defaults for all UAC arguments */ @@ -262,6 +265,8 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; this.pairHMM = uac.pairHMM; + this.OutputMode = uac.OutputMode; + this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs; // todo- arguments to remove this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java new file mode 100644 index 000000000..f07dbb392 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java @@ -0,0 +1,89 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; + +/** + * Result of assembling, with the resulting graph and status + * + * User: depristo + * Date: 7/1/13 + * Time: 5:35 PM + */ +public class AssemblyResult { + private final Status status; + private final SeqGraph graph; + + /** + * Create a new assembly result + * @param status the status, cannot be null + * @param graph the resulting graph of the assembly, can only be null if result is failed + */ + public AssemblyResult(final Status status, final SeqGraph graph) { + if ( status == null ) throw new IllegalArgumentException("status cannot be null"); + if ( status != Status.FAILED && graph == null ) throw new IllegalArgumentException("graph is null but status is " + status); + + this.status = status; + this.graph = graph; + } + + public Status getStatus() { return status; } + public SeqGraph getGraph() { return graph; } + + /** + * Status of the assembly result + */ + public enum Status { + /** Something went wrong, and we couldn't produce a meaningful graph */ + FAILED, + /** Assembly succeeded, but graph degenerated into just the reference sequence */ + JUST_ASSEMBLED_REFERENCE, + /** Assembly succeeded, and the graph has some meaningful structure */ + ASSEMBLED_SOME_VARIATION + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java index d876a403b..1a59cdb63 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java @@ -49,7 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -93,8 +92,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { } @Override - protected List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { - final List graphs = new LinkedList<>(); + protected List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { + final List results = new LinkedList<>(); final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; if( maxKmer < minKmer) { @@ -118,19 +117,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine { " future subsystem will actually go and error correct the reads"); } - final SeqGraph seqGraph = cleanupSeqGraph(graph.convertToSequenceGraph()); + results.add(cleanupSeqGraph(graph.convertToSequenceGraph())); - if ( seqGraph != null ) { // if the graph contains interesting variation from the reference - graphs.add(seqGraph); - - if ( debugGraphTransformations ) // we only want to use one graph size - break; - } + if ( debugGraphTransformations ) // we only want to use one graph size + break; } - } - return graphs; + return results; } @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index db1ca552a..18c93f2fb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,10 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -60,7 +57,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; -import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -81,13 +78,12 @@ import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.gvcf.GVCFWriter; import org.broadinstitute.sting.utils.haplotype.*; import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.pairhmm.PairHMM; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; @@ -297,6 +293,61 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // general advanced arguments to control haplotype caller behavior // ----------------------------------------------------------------------------------------------- + @Advanced + @Argument(fullName="emitRefConfidence", shortName="ERC", doc="Emit experimental reference confidence scores", required = false) + protected ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; + + public enum ReferenceConfidenceMode { + NONE, + BP_RESOLUTION, + GVCF + } + + /** + * The GQ partition intervals + * + * Should be a non-empty list of boundaries. For example, suppose this variable is + * + * [A, B, C] + * + * We would partition our hom-ref sites into the following bands: + * + * X < A + * A <= X < B + * B <= X < C + * X >= C + * + * The default bands give the following GQ blocks: + * + * [0, 0] + * (0, 10] + * (10, 20] + * (20, 30] + * (30, 40] + * (40, 50] + * (50, 99] + * + * Note that in the GATK GQ values are capped at 99. + */ + @Advanced + @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) + protected List GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50); + + /** + * This parameter determines the maximum size of an indel considered as potentially segregating in the + * reference model. It is used to eliminate reads from being indel informative at a site, and determines + * by that mechanism the certainty in the reference base. Conceptually, setting this parameter to + * X means that each informative read is consistent with any indel of size < X being present at a specific + * position in the genome, given its alignment to the reference. + */ + @Advanced + @Argument(fullName="indelSizeToEliminateInRefModel", shortName="ERCIS", doc="The size of an indel to check for in the reference model", required = false) + protected int indelSizeToEliminateInRefModel = 10; + + // ----------------------------------------------------------------------------------------------- + // general advanced arguments to control haplotype caller behavior + // ----------------------------------------------------------------------------------------------- + @Advanced @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 2; @@ -466,6 +517,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file + ReferenceConfidenceModel referenceConfidenceModel = null; + //--------------------------------------------------------------------------------------------------------------- // // initialize @@ -484,6 +537,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final int nSamples = samples.size(); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user + // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this? + UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested @@ -528,6 +584,19 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // where the filters are used. For example, in emitting all sites the lowQual field is used headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); + if ( emitReferenceConfidence() ) { + if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); + headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + try { + vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); + } + } + } + vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); try { @@ -598,7 +667,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Override public EnumSet desiredReadStates() { if ( includeUnmappedReads ) { - throw new UserException.BadArgumentValue("includeUmappedReads", "is not yet functional"); + throw new UserException.BadArgumentValue("includeUnmappedReads", "is not yet functional"); // return EnumSet.of( // ActiveRegionReadState.PRIMARY, // ActiveRegionReadState.NONPRIMARY, @@ -632,38 +701,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // if we don't have any data, just abort early return new ActivityProfileState(ref.getLocus(), 0.0); - final List noCall = new ArrayList<>(); // used to noCall all genotypes until the exact model is applied - noCall.add(Allele.NO_CALL); - + final List noCall = Collections.singletonList(Allele.NO_CALL); // used to noCall all genotypes until the exact model is applied final Map splitContexts = AlignmentContextUtils.splitContextBySampleName(context); final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size()); final MathUtils.RunningAverage averageHQSoftClips = new MathUtils.RunningAverage(); for( final Map.Entry sample : splitContexts.entrySet() ) { - final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event) - Arrays.fill(genotypeLikelihoods, 0.0); - - for( final PileupElement p : sample.getValue().getBasePileup() ) { - final byte qual = p.getQual(); - if( p.isDeletion() || qual > (byte) 18) { - int AA = 0; final int AB = 1; int BB = 2; - if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { - AA = 2; - BB = 0; - if( p.isNextToSoftClip() ) { - averageHQSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); - } - } - genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); - genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); - genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; - } - } + final double[] genotypeLikelihoods = referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(sample.getValue().getBasePileup(), ref.getBase(), (byte) 18, averageHQSoftClips).genotypeLikelihoods; genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); } - final List alleles = new ArrayList<>(); - alleles.add( FAKE_REF_ALLELE ); - alleles.add( FAKE_ALT_ALLELE ); + final List alleles = Arrays.asList(FAKE_REF_ALLELE , FAKE_ALT_ALLELE); final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); @@ -683,7 +730,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return NO_CALLS; - if( !originalActiveRegion.isActive() ) { return NO_CALLS; } // Not active so nothing to do! + if( !originalActiveRegion.isActive() ) { + // Not active so nothing to do! + return referenceModelForNoVariation(originalActiveRegion, true); + } final List activeAllelesToGenotype = new ArrayList<>(); if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { @@ -693,23 +743,30 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } // No alleles found in this region so nothing to do! - if ( activeAllelesToGenotype.isEmpty() ) { return NO_CALLS; } + if ( activeAllelesToGenotype.isEmpty() ) { return referenceModelForNoVariation(originalActiveRegion, true); } } else { - if( originalActiveRegion.size() == 0 ) { return NO_CALLS; } // No reads here so nothing to do! + // No reads here so nothing to do! + if( originalActiveRegion.size() == 0 ) { return referenceModelForNoVariation(originalActiveRegion, true); } } // run the local assembler, getting back a collection of information on how we should proceed final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); // abort early if something is out of the acceptable range - if( ! assemblyResult.isVariationPresent() ) { return NO_CALLS; } // only the reference haplotype remains so nothing else to do! + if( ! assemblyResult.isVariationPresent() ) { + return referenceModelForNoVariation(originalActiveRegion, false); + } // only the reference haplotype remains so nothing else to do! + if (dontGenotype) return NO_CALLS; // user requested we not proceed // filter out reads from genotyping which fail mapping quality based criteria final Collection filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - if( assemblyResult.regionForGenotyping.size() == 0 ) { return NO_CALLS; } // no reads remain after filtering so nothing else to do! + if( assemblyResult.regionForGenotyping.size() == 0 ) { + // no reads remain after filtering so nothing else to do! + return referenceModelForNoVariation(originalActiveRegion, false); + } // evaluate each sample's reads against all haplotypes //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); @@ -734,7 +791,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // TODO -- must disable if we are doing NCT, or set the output type of ! presorted if ( bamWriter != null ) { - haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc, + haplotypeBAMWriter.writeReadsAlignedToHaplotypes( + assemblyResult.haplotypes, + assemblyResult.paddedReferenceLoc, assemblyResult.haplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); @@ -742,7 +801,13 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } - return calledHaplotypes.getCalls(); + if ( emitReferenceConfidence() ) { + return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(), + calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping, + stratifiedReadMap, calledHaplotypes.getCalls()); + } else { + return calledHaplotypes.getCalls(); + } } private final static class AssemblyResult { @@ -751,6 +816,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final byte[] fullReferenceWithPadding; final GenomeLoc paddedReferenceLoc; final boolean variationPresent; + final Haplotype refHaplotype; private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) { this.haplotypes = haplotypes; @@ -758,6 +824,21 @@ public class HaplotypeCaller extends ActiveRegionWalker, In this.fullReferenceWithPadding = fullReferenceWithPadding; this.paddedReferenceLoc = paddedReferenceLoc; this.variationPresent = variationPresent; + + Haplotype firstRefHaplotype = null; + for ( final Haplotype h : haplotypes ) { + if ( h.isReference() ) { + if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h); + firstRefHaplotype = h; + } + } + + if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes); + this.refHaplotype = firstRefHaplotype; + } + + public Haplotype getRefHaplotype() { + return refHaplotype; } public boolean isVariationPresent() { @@ -789,7 +870,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In try { final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); - if ( ! dontTrimActiveRegions ) { + if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) { return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); } else { // we don't want to trim active regions, so go ahead and use the old one @@ -815,12 +896,54 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * @return a non-null haplotype */ private Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final GenomeLoc paddedReferenceLoc) { - final Haplotype refHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); - refHaplotype.setAlignmentStartHapwrtRef(activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart()); - final Cigar c = new Cigar(); - c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); - refHaplotype.setCigar(c); - return refHaplotype; + return ReferenceConfidenceModel.createReferenceHaplotype(activeRegion, activeRegion.getActiveRegionReference(referenceReader), paddedReferenceLoc); + } + + /** + * Create an ref model result (ref model or no calls depending on mode) for an active region without any variation + * (not is active, or assembled to just ref) + * + * @param region the region to return a no-variation result + * @param needsToBeFinalized should the region be finalized before computing the ref model (should be false if already done) + * @return a list of variant contexts (can be empty) to emit for this ref region + */ + private List referenceModelForNoVariation(final ActiveRegion region, final boolean needsToBeFinalized) { + if ( emitReferenceConfidence() ) { + if ( needsToBeFinalized ) finalizeActiveRegion(region); + filterNonPassingReads(region); // TODO -- remove when filtering is done in finalizeActiveRegion + final GenomeLoc paddedLoc = region.getExtendedLoc(); + final Haplotype refHaplotype = createReferenceHaplotype(region, paddedLoc); + final List haplotypes = Collections.singletonList(refHaplotype); + return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, + paddedLoc, region, createDummyStratifiedReadMap(refHaplotype, samplesList, region), + Collections.emptyList()); + } else { + return NO_CALLS; + } + } + + /** + * Create a context that maps each read to the reference haplotype with log10 L of 0 + * @param refHaplotype a non-null reference haplotype + * @param samples a list of all samples + * @param region the active region containing reads + * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref + */ + public static Map createDummyStratifiedReadMap(final Haplotype refHaplotype, + final List samples, + final ActiveRegion region) { + final Allele refAllele = Allele.create(refHaplotype, true); + + final Map map = new LinkedHashMap<>(1); + for ( final Map.Entry> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) { + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for ( final GATKSAMRecord read : entry.getValue() ) { + likelihoodMap.add(read, refAllele, 0.0); + } + map.put(entry.getKey(), likelihoodMap); + } + + return map; } /** @@ -913,6 +1036,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Override public void onTraversalDone(Integer result) { + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it + referenceConfidenceModel.close(); likelihoodCalculationEngine.close(); logger.info("Ran local assembly on " + result + " active regions"); } @@ -929,28 +1054,28 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Loop through the reads hard clipping the adaptor and low quality tails final List readsToUse = new ArrayList<>(activeRegion.getReads().size()); for( final GATKSAMRecord myRead : activeRegion.getReads() ) { - final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); - if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { - GATKSAMRecord clippedRead; - if (errorCorrectReads) - clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION ); - else if (useLowQualityBasesForAssembly) - clippedRead = postAdapterRead; - else // default case: clip low qual ends of reads - clippedRead= ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); + GATKSAMRecord clippedRead; + if (errorCorrectReads) + clippedRead = ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION ); + else if (useLowQualityBasesForAssembly) + clippedRead = myRead; + else // default case: clip low qual ends of reads + clippedRead= ReadClipper.hardClipLowQualEnds( myRead, MIN_TAIL_QUALITY ); - if ( dontUseSoftClippedBases ) { - // uncomment to remove hard clips from consideration at all - clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); - } else { - // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches - // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't - // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion - // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the - // TODO -- reference haplotype start must be removed - clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); - } + if ( dontUseSoftClippedBases || ! ReadUtils.hasWellDefinedFragmentSize(clippedRead) ) { + // remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all + clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead); + } else { + // revert soft clips so that we see the alignment start and end assuming the soft clips are all matches + // TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't + // TODO -- truly in the extended region, as the unclipped bases might actually include a deletion + // TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the + // TODO -- reference haplotype start must be removed + clippedRead = ReadClipper.revertSoftClippedBases(clippedRead); + } + clippedRead = ( clippedRead.getReadUnmappedFlag() ? clippedRead : ReadClipper.hardClipAdaptorSequence( clippedRead ) ); + if( !clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0 ) { clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() ); if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { //logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd()); @@ -981,6 +1106,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } private Map> splitReadsBySample( final Collection reads ) { + return splitReadsBySample(samplesList, reads); + } + + public static Map> splitReadsBySample( final List samplesList, final Collection reads ) { final Map> returnMap = new HashMap<>(); for( final String sample : samplesList) { List readList = returnMap.get( sample ); @@ -997,4 +1126,11 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } + /** + * Are we emitting a reference confidence in some form, or not? + * @return true if we are + */ + private boolean emitReferenceConfidence(){ + return emitReferenceConfidence != ReferenceConfidenceMode.NONE; + } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index c889d7995..27178c78f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -51,17 +51,12 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -115,9 +110,9 @@ public abstract class LocalAssemblyEngine { * @param refHaplotype the reference haplotype * @return a non-null list of reads */ - protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes); + protected abstract List assemble(List reads, Haplotype refHaplotype, List activeAlleleHaplotypes); - protected List assemble(List reads, Haplotype refHaplotype) { + protected List assemble(List reads, Haplotype refHaplotype) { return assemble(reads, refHaplotype, Collections.emptyList()); } @@ -145,7 +140,6 @@ public abstract class LocalAssemblyEngine { // create the list of artificial haplotypes that should be added to the graph for GGA mode final List activeAlleleHaplotypes = createActiveAlleleHaplotypes(refHaplotype, activeAllelesToGenotype, activeRegion.getExtendedLoc()); - // error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them final List correctedReads; if (readErrorCorrector != null) { @@ -154,20 +148,31 @@ public abstract class LocalAssemblyEngine { // and we only want the read-error corrected reads for graph building. readErrorCorrector.addReadsToKmers(activeRegion.getReads()); correctedReads = new ArrayList<>(readErrorCorrector.correctReads(activeRegion.getReads())); + } else { + correctedReads = activeRegion.getReads(); } - else correctedReads = activeRegion.getReads(); + final List nonRefGraphs = new LinkedList<>(); // create the graphs by calling our subclass assemble method - final List graphs = assemble(correctedReads, refHaplotype, activeAlleleHaplotypes); - - // do some QC on the graphs - for ( final SeqGraph graph : graphs ) { sanityCheckGraph(graph, refHaplotype); } + for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) { + if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) { + // do some QC on the graph + sanityCheckGraph(result.getGraph(), refHaplotype); + // add it to graphs with meaningful non-reference features + nonRefGraphs.add(result.getGraph()); + } + } // print the graphs if the appropriate debug option has been turned on - if ( graphWriter != null ) { printGraphs(graphs); } + if ( graphWriter != null ) { printGraphs(nonRefGraphs); } - // find the best paths in the graphs and return them as haplotypes - return findBestPaths( graphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() ); + if ( nonRefGraphs.isEmpty() ) { + // we couldn't assemble any meaningful graphs, so return just the reference haplotype + return Collections.singletonList(refHaplotype); + } else { + // find the best paths in the graphs and return them as haplotypes + return findBestPaths( nonRefGraphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() ); + } } /** @@ -288,7 +293,7 @@ public abstract class LocalAssemblyEngine { } } - protected SeqGraph cleanupSeqGraph(final SeqGraph seqGraph) { + protected AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { printDebugGraphTransform(seqGraph, new File("sequenceGraph.1.dot")); // the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive @@ -309,7 +314,7 @@ public abstract class LocalAssemblyEngine { // happen in cases where for example the reference somehow manages to acquire a cycle, or // where the entire assembly collapses back into the reference sequence. if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null ) - return null; + return new AssemblyResult(AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE, seqGraph); seqGraph.removePathsNotConnectedToRef(); seqGraph.simplifyGraph(); @@ -324,7 +329,7 @@ public abstract class LocalAssemblyEngine { } printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot")); - return seqGraph; + return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java new file mode 100644 index 000000000..ee7565282 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java @@ -0,0 +1,72 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +/** + * Holds information about a genotype call of a single sample reference vs. any non-ref event + * + * User: depristo + * Date: 6/21/13 + * Time: 12:58 PM + * To change this template use File | Settings | File Templates. + */ +final class RefVsAnyResult { + /** + * The genotype likelihoods for ref/ref ref/non-ref non-ref/non-ref + */ + final double[] genotypeLikelihoods = new double[3]; + + /** + * AD field value for ref / non-ref + */ + final int[] AD_Ref_Any = new int[2]; + + /** + * @return Get the DP (sum of AD values) + */ + public int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java new file mode 100644 index 000000000..98264d4c2 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -0,0 +1,476 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter; +import org.broadinstitute.sting.utils.haplotypeBAMWriter.ReadDestination; +import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine; + +import java.io.File; +import java.util.*; + +/** + * Code for estimating the reference confidence + * + * This code can estimate the probability that the data for a single sample is consistent with a + * well-determined REF/REF diploid genotype. + * + * User: depristo + * Date: 6/21/13 + * Time: 12:52 PM + */ +public class ReferenceConfidenceModel { + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + + public final static String INDEL_INFORMATIVE_DEPTH = "CD"; + + private final GenomeLocParser genomeLocParser; + private final Set samples; + private final SAMFileHeader header; // TODO -- really shouldn't depend on this + private final int indelInformativeDepthIndelSize; + + private final static boolean WRITE_DEBUGGING_BAM = false; + private final SAMFileWriter debuggingWriter; + + /** + * Create a new ReferenceConfidenceModel + * + * @param genomeLocParser how we create genome locs + * @param samples the list of all samples we'll be considering with this model + * @param header the SAMFileHeader describing the read information (used for debugging) + * @param indelInformativeDepthIndelSize the max size of indels to consider when calculating indel informative depths + */ + public ReferenceConfidenceModel(final GenomeLocParser genomeLocParser, + final Set samples, + final SAMFileHeader header, + final int indelInformativeDepthIndelSize) { + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + if ( samples == null ) throw new IllegalArgumentException("samples cannot be null"); + if ( samples.isEmpty() ) throw new IllegalArgumentException("samples cannot be empty"); + if ( header == null ) throw new IllegalArgumentException("header cannot be empty"); + if ( indelInformativeDepthIndelSize < 0) throw new IllegalArgumentException("indelInformativeDepthIndelSize must be >= 1 but got " + indelInformativeDepthIndelSize); + + this.genomeLocParser = genomeLocParser; + this.samples = samples; + this.header = header; + this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; + + if ( WRITE_DEBUGGING_BAM ) { + final SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(true); + debuggingWriter = factory.makeBAMWriter(header, false, new File("refCalc.bam")); + } else { + debuggingWriter = null; + } + } + + /** + * Get the VCF header lines to include when emitting reference confidence values via calculateRefConfidence + * @return a non-null set of VCFHeaderLines + */ + public Set getVCFHeaderLines() { + final Set headerLines = new LinkedHashSet<>(); + headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); + headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + return headerLines; + } + + /** + * Close down this reference model, closing down any debugging information opened during execution + */ + public void close() { + if ( debuggingWriter != null ) debuggingWriter.close(); + } + + + /** + * Calculate the reference confidence for a single sample given the its read data + * + * Returns a list of variant contexts, one for each position in the activeregion.getLoc(), each containing + * detailed information about the certainty that the sample is hom-ref for each base in the region. + * + * + * + * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() + * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the + * stratifiedReadMap, corresponding to each reads best haplotype. Must contain the refHaplotype. + * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) + * @param activeRegion the active region we want to get the reference confidence over + * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes + * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the + * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence + * under any position is covers (for snps that 1 bp, but for deletion its the entire ref span) + * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence + * contexts as well as calls from variantCalls if any were provided + */ + public List calculateRefConfidence(final Haplotype refHaplotype, + final Collection calledHaplotypes, + final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, + final Map stratifiedReadMap, + final List variantCalls) { + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); + + final GenomeLoc refSpan = activeRegion.getLocation(); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap); + final byte[] ref = refHaplotype.getBases(); + final List results = new ArrayList<>(refSpan.size()); + final String sampleName = stratifiedReadMap.keySet().iterator().next(); + + final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); + for ( final ReadBackedPileup pileup : refPileups ) { + final GenomeLoc curPos = pileup.getLocation(); + final int offset = curPos.getStart() - refSpan.getStart(); + + final VariantContext overlappingSite = getOverlappingVariantContext(curPos, variantCalls); + if ( overlappingSite != null ) { + // we have some overlapping site, add it to the list of positions + if ( overlappingSite.getStart() == curPos.getStart() ) + results.add(overlappingSite); + } else { + // otherwise emit a reference confidence variant context + final int refOffset = offset + globalRefOffset; + final byte refBase = ref[refOffset]; + final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + + final Allele refAllele = Allele.create(refBase, true); + final List refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE); + final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles); + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); + gb.AD(homRefCalc.AD_Ref_Any); + gb.DP(homRefCalc.getDP()); + + // genotype likelihood calculation + final GenotypeLikelihoods snpGLs = GenotypeLikelihoods.fromLog10Likelihoods(homRefCalc.genotypeLikelihoods); + final int nIndelInformativeReads = calcNIndelInformativeReads(pileup, refOffset, ref, indelInformativeDepthIndelSize); + final GenotypeLikelihoods indelGLs = getIndelPLs(nIndelInformativeReads); + + // now that we have the SNP and indel GLs, we take the one with the least confidence, + // as this is the most conservative estimate of our certainty that we are hom-ref. + // For example, if the SNP PLs are 0,10,100 and the indel PLs are 0,100,1000 + // we are very certain that there's no indel here, but the SNP confidence imply that we are + // far less confident that the ref base is actually the only thing here. So we take 0,10,100 + // as our GLs for the site. + final GenotypeLikelihoods leastConfidenceGLs = getGLwithWorstGQ(indelGLs, snpGLs); + + gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); + gb.PL(leastConfidenceGLs.getAsPLs()); + gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); + + vcb.genotypes(gb.make()); + results.add(vcb.make()); +// logger.info(" => VariantContext " + vcb.make()); + } + } + + return results; + } + + /** + * Get the GenotypeLikelihoods with the least strong corresponding GQ value + * @param gl1 first to consider (cannot be null) + * @param gl2 second to consider (cannot be null) + * @return gl1 or gl2, whichever has the worst GQ + */ + protected final GenotypeLikelihoods getGLwithWorstGQ(final GenotypeLikelihoods gl1, final GenotypeLikelihoods gl2) { + return gl1.getLog10GQ(GenotypeType.HOM_REF) > gl2.getLog10GQ(GenotypeType.HOM_REF) ? gl1 : gl2; + } + + /** + * Get indel PLs corresponding to seeing N nIndelInformativeReads at this site + * + * @param nInformativeReads the number of reads that inform us about being ref without an indel at this site + * @return non-null GenotypeLikelihoods given N + */ + protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { + // TODO -- optimization -- this could easily be optimized with some caching + final double homRef = 0.0; + final double het = - LOG10_2 * nInformativeReads; + final double homVar = INDEL_ERROR_RATE * nInformativeReads; + return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + } + private final static double LOG10_2 = Math.log10(2); + private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + + /** + * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt + * + * @param pileup the read backed pileup containing the data we want to evaluate + * @param refBase the reference base at this pileup position + * @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation + * @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips + * @return a RefVsAnyResult genotype call + */ + public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) { + final RefVsAnyResult result = new RefVsAnyResult(); + + for( final PileupElement p : pileup ) { + final byte qual = p.getQual(); + if( p.isDeletion() || qual > minBaseQual) { + int AA = 0; final int AB = 1; int BB = 2; + if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { + AA = 2; + BB = 0; + if( hqSoftClips != null && p.isNextToSoftClip() ) { + hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); + } + result.AD_Ref_Any[1]++; + } else { + result.AD_Ref_Any[0]++; + } + result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); + result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); + result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; + } + } + + return result; + } + + /** + * Get a list of pileups that span the entire active region span, in order, one for each position + */ + private List getPileupsOverReference(final Haplotype refHaplotype, + final Collection calledHaplotypes, + final GenomeLoc paddedReferenceLoc, + final GenomeLoc activeRegionSpan, + final Map stratifiedReadMap) { + final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); + final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); + writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves + writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); + final List realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + + if ( debuggingWriter != null ) + for ( final GATKSAMRecord read : realignedReads ) + debuggingWriter.addAlignment(read); + + final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, + false, genomeLocParser, samples, false); + + final List pileups = new LinkedList<>(); + final int startPos = activeRegionSpan.getStart(); + AlignmentContext next = libs.advanceToLocus(startPos, true); + for ( int curPos = startPos; curPos <= activeRegionSpan.getStop(); curPos++ ) { + if ( next != null && next.getLocation().getStart() == curPos ) { + pileups.add(next.getBasePileup()); + next = libs.hasNext() ? libs.next() : null; + } else { + // no data, so we create empty pileups + pileups.add(new ReadBackedPileupImpl(genomeLocParser.createGenomeLoc(activeRegionSpan.getContig(), curPos))); + } + } + + return pileups; + } + + /** + * Return the rightmost variant context in maybeOverlapping that overlaps curPos + * + * @param curPos non-null genome loc + * @param maybeOverlapping a collection of variant contexts that might overlap curPos + * @return a VariantContext, or null if none overlaps + */ + protected final VariantContext getOverlappingVariantContext(final GenomeLoc curPos, final Collection maybeOverlapping) { + VariantContext overlaps = null; + for ( final VariantContext vc : maybeOverlapping ) { + if ( genomeLocParser.createGenomeLoc(vc).overlapsP(curPos) ) { + if ( overlaps == null || vc.getStart() > overlaps.getStart() ) { + overlaps = vc; + } + } + } + return overlaps; + } + + /** + * Compute the sum of mismatching base qualities for readBases aligned to refBases at readStart / refStart + * assuming no insertions or deletions in the read w.r.t. the reference + * + * @param readBases non-null bases of the read + * @param readQuals non-null quals of the read + * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) + * @param refBases the reference bases + * @param refStart the offset into refBases that aligns to the readStart position in readBases + * @param maxSum if the sum goes over this value, return immediately + * @return the sum of quality scores for readBases that mismatch their corresponding ref bases + */ + protected final int sumMismatchingQualities(final byte[] readBases, + final byte[] readQuals, + final int readStart, + final byte[] refBases, + final int refStart, + final int maxSum) { + final int n = Math.min(readBases.length - readStart, refBases.length - refStart); + int sum = 0; + + for ( int i = 0; i < n; i++ ) { + final byte readBase = readBases[readStart + i]; + final byte refBase = refBases[refStart + i]; + if ( readBase != refBase ) { + sum += readQuals[readStart + i]; + if ( sum > maxSum ) + return sum; + } + } + + return sum; + } + + /** + * Compute whether a read is informative to eliminate an indel of size <= maxIndelSize segregating at readStart/refStart + * + * @param readBases non-null bases of the read + * @param readQuals non-null quals of the read + * @param readStart the starting position of the read (i.e., that aligns it to a position in the reference) + * @param refBases the reference bases + * @param refStart the offset into refBases that aligns to the readStart position in readBases + * @param maxIndelSize the max indel size to consider for the read to be informative + * @return true if read can eliminate the possibility that there's an indel of size <= maxIndelSize segregating at refStart + */ + protected boolean isReadInformativeAboutIndelsOfSize(final byte[] readBases, + final byte[] readQuals, + final int readStart, + final byte[] refBases, + final int refStart, + final int maxIndelSize) { + // todo -- fast exit when n bases left < maxIndelSize + + final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); + + // consider each indel size up to max in term, checking if an indel that deletes either the ref bases (deletion + // or read bases (insertion) would fit as well as the origin baseline sum of mismatching quality scores + for ( int indelSize = 1; indelSize <= maxIndelSize; indelSize++ ) { + for ( final boolean checkInsertion : Arrays.asList(true, false) ) { + final int readI, refI; + if ( checkInsertion ) { + readI = readStart + indelSize; + refI = refStart; + } else { + readI = readStart; + refI = refStart + indelSize; + } + + final int score = sumMismatchingQualities(readBases, readQuals, readI, refBases, refI, baselineMMSum); + if ( score <= baselineMMSum ) + return false; + } + } + + return true; + } + + /** + * Calculate the number of indel informative reads at pileup + * + * @param pileup a pileup + * @param pileupOffsetIntoRef the position of the pileup in the reference + * @param ref the ref bases + * @param maxIndelSize maximum indel size to consider in the informativeness calculation + * @return an integer >= 0 + */ + protected final int calcNIndelInformativeReads(final ReadBackedPileup pileup, final int pileupOffsetIntoRef, final byte[] ref, final int maxIndelSize) { + int nInformative = 0; + for ( final PileupElement p : pileup ) { + final GATKSAMRecord read = p.getRead(); + final int offset = p.getOffset(); + + // doesn't count as evidence + if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() ) + continue; + + // todo -- this code really should handle CIGARs directly instead of relying on the above tests + if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize)) + nInformative++; + } + return nInformative; + } + + /** + * Create a reference haplotype for an active region + * + * @param activeRegion the active region + * @param refBases the ref bases + * @param paddedReferenceLoc the location spanning of the refBases -- can be longer than activeRegion.getLocation() + * @return a reference haplotype + */ + public static Haplotype createReferenceHaplotype(final ActiveRegion activeRegion, final byte[] refBases, final GenomeLoc paddedReferenceLoc) { + final Haplotype refHaplotype = new Haplotype(refBases, true); + final int alignmentStart = activeRegion.getExtendedLoc().getStart() - paddedReferenceLoc.getStart(); + if ( alignmentStart < 0 ) throw new IllegalStateException("Bad alignment start in createReferenceHaplotype " + alignmentStart); + refHaplotype.setAlignmentStartHapwrtRef(alignmentStart); + final Cigar c = new Cigar(); + c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); + refHaplotype.setCigar(c); + return refHaplotype; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 672c61c0f..d575f14a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LocalAssemblyEngine; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.MathUtils; @@ -98,32 +99,33 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { this.justReturnRawGraph = justReturnRawGraph; } + private void addResult(final List results, final AssemblyResult maybeNullResult) { + if ( maybeNullResult != null ) + results.add(maybeNullResult); + } + @Override - public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { - final List graphs = new LinkedList<>(); + public List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes) { + final List results = new LinkedList<>(); // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles); - if ( graph != null ) - graphs.add(graph); + addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, dontIncreaseKmerSizesForCycles)); } // if none of those worked, iterate over larger sizes if allowed to do so - if ( graphs.isEmpty() && !dontIncreaseKmerSizesForCycles ) { + if ( results.isEmpty() && !dontIncreaseKmerSizesForCycles ) { int kmerSize = MathUtils.arrayMaxInt(kmerSizes) + KMER_SIZE_ITERATION_INCREASE; int numIterations = 1; - while ( graphs.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { + while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { // on the last attempt we will allow low complexity graphs - final SeqGraph graph = createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT); - if ( graph != null ) - graphs.add(graph); + addResult(results, createGraph(reads, refHaplotype, kmerSize, activeAlleleHaplotypes, numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT)); kmerSize += KMER_SIZE_ITERATION_INCREASE; numIterations++; } } - return graphs; + return results; } /** @@ -136,11 +138,16 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity) */ - protected SeqGraph createGraph(final List reads, - final Haplotype refHaplotype, - final int kmerSize, - final List activeAlleleHaplotypes, - final boolean allowLowComplexityGraphs) { + protected AssemblyResult createGraph(final List reads, + final Haplotype refHaplotype, + final int kmerSize, + final List activeAlleleHaplotypes, + final boolean allowLowComplexityGraphs) { + if ( refHaplotype.length() < kmerSize ) { + // happens in cases where the assembled region is just too small + return new AssemblyResult(AssemblyResult.Status.FAILED, null); + } + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples); // add the reference sequence to the graph @@ -193,14 +200,15 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph - if ( justReturnRawGraph ) return initialSeqGraph; + if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph); if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction - final SeqGraph seqGraph = cleanupSeqGraph(initialSeqGraph); - return ( seqGraph != null && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(seqGraph) ) ? null : seqGraph; + final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); + final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus(); + return new AssemblyResult(status, cleaned.getGraph()); } /** diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java new file mode 100644 index 000000000..8f509b36b --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -0,0 +1,298 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Genome-wide VCF writer + * + * User: depristo + * Date: 6/24/13 + * Time: 2:51 PM + */ +public class GVCFWriter implements VariantContextWriter { + // + // static VCF field names + // + protected final static String BLOCK_SIZE_INFO_FIELD = "BLOCK_SIZE"; + protected final static String MIN_DP_FORMAT_FIELD = "MIN_DP"; + protected final static String MIN_GQ_FORMAT_FIELD = "MIN_GQ"; + + // + // Final fields initialized in constructor + // + /** Where we'll ultimately write our VCF records */ + final private VariantContextWriter underlyingWriter; + + final private List GQPartitions; + + /** fields updated on the fly during GVCFWriter operation */ + int nextAvailableStart = -1; + private String sampleName = null; + private HomRefBlock currentBlock = null; + + /** + * Is the proposed GQ partitions well-formed? + * + * @param GQPartitions proposed GQ partitions + * @return a non-null string if something is wrong (string explains issue) + */ + protected static List parsePartitions(final List GQPartitions) { + if ( GQPartitions == null ) throw new IllegalArgumentException("GQpartitions cannot be null"); + if ( GQPartitions.isEmpty() ) throw new IllegalArgumentException("GQpartitions cannot be empty"); + + final List result = new LinkedList<>(); + int lastThreshold = 0; + for ( final Integer value : GQPartitions ) { + if ( value == null ) throw new IllegalArgumentException("GQPartitions contains a null integer"); + if ( value < lastThreshold ) throw new IllegalArgumentException("GQPartitions is out of order. Last is " + lastThreshold + " but next is " + value); + if ( value == lastThreshold ) throw new IllegalArgumentException("GQPartitions is equal elements: Last is " + lastThreshold + " but next is " + value); + result.add(new HomRefBlock(lastThreshold, value)); + lastThreshold = value; + } + result.add(new HomRefBlock(lastThreshold, Integer.MAX_VALUE)); + + return result; + } + + /** + * Create a new GVCF writer + * + * Should be a non-empty list of boundaries. For example, suppose this variable is + * + * [A, B, C] + * + * We would partition our hom-ref sites into the following bands: + * + * X < A + * A <= X < B + * B <= X < C + * X >= C + * + * @param underlyingWriter the ultimate destination of the GVCF records + * @param GQPartitions a well-formed list of GQ partitions + */ + public GVCFWriter(final VariantContextWriter underlyingWriter, final List GQPartitions) { + if ( underlyingWriter == null ) throw new IllegalArgumentException("underlyingWriter cannot be null"); + this.underlyingWriter = underlyingWriter; + this.GQPartitions = parsePartitions(GQPartitions); + } + + /** + * Write the VCF header + * + * Adds standard GVCF fields to the header + * + * @param header a non-null header + */ + @Override + public void writeHeader(VCFHeader header) { + if ( header == null ) throw new IllegalArgumentException("header cannot be null"); + header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + header.addMetaDataLine(new VCFInfoHeaderLine(BLOCK_SIZE_INFO_FIELD, 1, VCFHeaderLineType.Integer, "Size of the homozygous reference GVCF block")); + header.addMetaDataLine(new VCFFormatHeaderLine(MIN_DP_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum DP observed within the GVCF block")); + header.addMetaDataLine(new VCFFormatHeaderLine(MIN_GQ_FORMAT_FIELD, 1, VCFHeaderLineType.Integer, "Minimum GQ observed within the GVCF block")); + + for ( final HomRefBlock partition : GQPartitions ) { + header.addMetaDataLine(partition.toVCFHeaderLine()); + } + + underlyingWriter.writeHeader(header); + } + + /** + * Close this GVCF writer. Finalizes any pending hom-ref blocks and emits those to the underlyingWriter as well + */ + @Override + public void close() { + close(true); + } + + /** + * Horrible work around because there's no clean way to get our VCFWriter closed by the GATK + * + * If closeUnderlyingWriter is true, then we'll close the underlying writer, otherwise we'll leave it open + * so the GATK closes it later + * + * @param closeUnderlyingWriter should we leave the underlying writer open or closed? + */ + public void close(final boolean closeUnderlyingWriter) { + emitCurrentBlock(); + if ( closeUnderlyingWriter ) underlyingWriter.close(); + } + + /** + * Add hom-ref site from vc to this gVCF hom-ref state tracking, emitting any pending states if appropriate + * + * @param vc a non-null VariantContext + * @param g a non-null genotype from VariantContext + * @return a VariantContext to be emitted, or null if non is appropriate + */ + protected VariantContext addHomRefSite(final VariantContext vc, final Genotype g) { + if ( nextAvailableStart != -1 && vc.getStart() <= nextAvailableStart ) { + // don't create blocks while the hom-ref site falls before nextAvailableStart (for deletions) + return null; + } else if ( currentBlock == null ) { + currentBlock = createNewBlock(vc, g); + return null; + } else if ( currentBlock.withinBounds(g.getGQ()) ) { + currentBlock.add(vc.getStart(), g); + return null; + } else { + final VariantContext result = blockToVCF(currentBlock); + currentBlock = createNewBlock(vc, g); + return result; + } + } + + /** + * Flush the current hom-ref block, if necessary, to the underlying writer, and reset the currentBlock to null + */ + private void emitCurrentBlock() { + if ( currentBlock != null ) { + // there's actually some work to do + underlyingWriter.add(blockToVCF(currentBlock)); + currentBlock = null; + } + } + + /** + * Convert a HomRefBlock into a VariantContext + * + * @param block the block to convert + * @return a VariantContext representing the gVCF encoding for this block + */ + private VariantContext blockToVCF(final HomRefBlock block) { + if ( block == null ) throw new IllegalArgumentException("block cannot be null"); + + final VariantContextBuilder vcb = new VariantContextBuilder(block.getStartingVC()); + vcb.attributes(new HashMap(2)); // clear the attributes + vcb.stop(block.getStop()); + vcb.attribute(VCFConstants.END_KEY, block.getStop()); + vcb.attribute(BLOCK_SIZE_INFO_FIELD, block.getSize()); + + // create the single Genotype with GQ and DP annotations + final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Collections.nCopies(2, block.getRef())); + gb.noAD().noPL().noAttributes(); // clear all attributes + gb.GQ(block.getMedianGQ()); + gb.DP(block.getMedianDP()); + gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP()); + gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ()); + + return vcb.genotypes(gb.make()).make(); + } + + /** + * Helper function to create a new HomRefBlock from a variant context and current genotype + * + * @param vc the VariantContext at the site where want to start the band + * @param g the genotype of the sample from vc that should be used to initialize the block + * @return a newly allocated and initialized block containing g already + */ + private HomRefBlock createNewBlock(final VariantContext vc, final Genotype g) { + // figure out the GQ limits to use based on the GQ of g + HomRefBlock partition = null; + for ( final HomRefBlock maybePartition : GQPartitions ) { + if ( maybePartition.withinBounds(g.getGQ()) ) { + partition = maybePartition; + break; + } + } + if ( partition == null ) throw new IllegalStateException("GQ " + g + " from " + vc + " didn't fit into any partition " + partition); + + // create the block, add g to it, and return it for use + final HomRefBlock block = new HomRefBlock(vc, partition.getGQLowerBound(), partition.getGQUpperBound()); + block.add(vc.getStart(), g); + return block; + } + + /** + * Add a VariantContext to this writer for emission + * + * Requires that the VC have exactly one genotype + * + * @param vc a non-null VariantContext + */ + @Override + public void add(VariantContext vc) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + + if ( sampleName == null ) + sampleName = vc.getGenotype(0).getSampleName(); + + if ( ! vc.hasGenotypes() ) { + throw new IllegalArgumentException("GVCF assumes that the VariantContext has genotypes"); + } else if ( vc.getGenotypes().size() != 1 ) { + throw new IllegalArgumentException("GVCF assumes that the VariantContext has exactly one genotype but saw " + vc.getGenotypes().size()); + } else { + if ( currentBlock != null && ! currentBlock.isContiguous(vc) ) { + // we've made a non-contiguous step (across interval, onto another chr), so finalize + emitCurrentBlock(); + } + + final Genotype g = vc.getGenotype(0); + if ( g.isHomRef() ) { + // create bands + final VariantContext maybeCompletedBand = addHomRefSite(vc, g); + if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); + } else { + // g is variant, so flush the bands and emit vc + emitCurrentBlock(); + nextAvailableStart = vc.getEnd(); + underlyingWriter.add(vc); + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java new file mode 100644 index 000000000..282e49217 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -0,0 +1,169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFHeaderLine; + +import java.util.ArrayList; +import java.util.List; + +/** + * Helper class for calculating a GQ band in the GVCF writer + * + * A band contains GQ and DP values for a contiguous stretch of hom-ref genotypes, + * and provides summary information about the entire block of genotypes. + * + * Genotypes within the HomRefBlock are restricted to hom-ref genotypes within a band of GQ scores + * + * User: depristo + * Date: 6/25/13 + * Time: 9:41 AM + */ +final class HomRefBlock { + private final VariantContext startingVC; + int stop; + private final int minGQ, maxGQ; + private List GQs = new ArrayList<>(100); + private List DPs = new ArrayList<>(100); + private final Allele ref; + + /** + * Create a new HomRefBlock + * + * @param startingVC the VariantContext that starts this band (for starting position information) + * @param minGQ the minGQ (inclusive) to use in this band + * @param maxGQ the maxGQ (exclusive) to use in this band + */ + public HomRefBlock(final VariantContext startingVC, int minGQ, int maxGQ) { + if ( startingVC == null ) throw new IllegalArgumentException("startingVC cannot be null"); + if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); + + this.startingVC = startingVC; + this.stop = getStart() - 1; + this.ref = startingVC.getReference(); + this.minGQ = minGQ; + this.maxGQ = maxGQ; + } + + /** + * Create a new HomRefBlock only for doing bounds checking + * + * @param minGQ the minGQ (inclusive) to use in this band + * @param maxGQ the maxGQ (exclusive) to use in this band + */ + public HomRefBlock(int minGQ, int maxGQ) { + if ( minGQ > maxGQ ) throw new IllegalArgumentException("bad minGQ " + minGQ + " as its > maxGQ " + maxGQ); + + this.startingVC = null; + this.stop = -1; + this.ref = null; + this.minGQ = minGQ; + this.maxGQ = maxGQ; + } + + /** + * Add information from this Genotype to this band + * @param g a non-null Genotype with GQ and DP attributes + */ + public void add(final int pos, final Genotype g) { + if ( g == null ) throw new IllegalArgumentException("g cannot be null"); + if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); + if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); + if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); + + stop = pos; + GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission + DPs.add(g.getDP()); + } + + /** + * Is the GQ value within the bounds of this GQ (GQ >= minGQ && GQ < maxGQ) + * @param GQ the GQ value to test + * @return true if within bounds, false otherwise + */ + public boolean withinBounds(final int GQ) { + return GQ >= minGQ && GQ < maxGQ; + } + + /** Get the min GQ observed within this band */ + public int getMinGQ() { return MathUtils.arrayMin(GQs); } + /** Get the median GQ observed within this band */ + public int getMedianGQ() { return MathUtils.median(GQs); } + /** Get the min DP observed within this band */ + public int getMinDP() { return MathUtils.arrayMin(DPs); } + /** Get the median DP observed within this band */ + public int getMedianDP() { return MathUtils.median(DPs); } + + protected int getGQUpperBound() { return maxGQ; } + protected int getGQLowerBound() { return minGQ; } + + public boolean isContiguous(final VariantContext vc) { + return vc.getEnd() == getStop() + 1 && startingVC.getChr().equals(vc.getChr()); + } + + public VariantContext getStartingVC() { return startingVC; } + public int getStart() { return startingVC.getStart(); } + public int getStop() { return stop; } + public Allele getRef() { return ref; } + public int getSize() { return getStop() - getStart() + 1; } + + @Override + public String toString() { + return "HomRefBlock{" + + "minGQ=" + minGQ + + ", maxGQ=" + maxGQ + + '}'; + } + + public VCFHeaderLine toVCFHeaderLine() { + return new VCFHeaderLine("GVCFBlock", "minGQ=" + getGQLowerBound() + "(inclusive),maxGQ=" + getGQUpperBound() + "(exclusive)"); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java index e7e5cf0e1..2a435f10d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/AllHaplotypeBAMWriter.java @@ -46,11 +46,10 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; -import net.sf.samtools.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; @@ -67,17 +66,17 @@ import java.util.*; * Time: 1:50 PM */ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { - public AllHaplotypeBAMWriter(final SAMFileWriter bamWriter) { - super(bamWriter); + public AllHaplotypeBAMWriter(final ReadDestination destination) { + super(destination); } /** * {@inheritDoc} */ @Override - public void writeReadsAlignedToHaplotypes(final List haplotypes, + public void writeReadsAlignedToHaplotypes(final Collection haplotypes, final GenomeLoc paddedReferenceLoc, - final List bestHaplotypes, + final Collection bestHaplotypes, final Set calledHaplotypes, final Map stratifiedReadMap) { writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc); diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index 7206dd674..c298485f6 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -68,17 +68,17 @@ import java.util.*; * Time: 1:50 PM */ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { - public CalledHaplotypeBAMWriter(final SAMFileWriter bamWriter) { - super(bamWriter); + public CalledHaplotypeBAMWriter(final ReadDestination destination) { + super(destination); } /** * {@inheritDoc} */ @Override - public void writeReadsAlignedToHaplotypes(final List haplotypes, + public void writeReadsAlignedToHaplotypes(final Collection haplotypes, final GenomeLoc paddedReferenceLoc, - final List bestHaplotypes, + final Collection bestHaplotypes, final Set calledHaplotypes, final Map stratifiedReadMap) { if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes @@ -98,10 +98,8 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { // next, output the interesting reads for each sample aligned against one of the called haplotypes for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( final Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { - if ( entry.getKey().getMappingQuality() > 0 ) { - final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative()); - } + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); + writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative()); } } } diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 1afbeed63..509399fd9 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -46,16 +46,18 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; -import net.sf.samtools.*; +import net.sf.samtools.Cigar; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMTag; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; import java.util.*; @@ -75,8 +77,8 @@ public abstract class HaplotypeBAMWriter { protected final static String READ_GROUP_ID = "ArtificialHaplotype"; protected final static String HAPLOTYPE_TAG = "HC"; - final SAMFileWriter bamWriter; - final SAMFileHeader bamHeader; + final ReadDestination output; + boolean writeHaplotypesAsWell = true; /** * Possible modes for writing haplotypes to BAMs @@ -104,27 +106,10 @@ public abstract class HaplotypeBAMWriter { * @return a new HaplotypeBAMWriter */ public static HaplotypeBAMWriter create(final Type type, final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header) { - if ( header == null ) throw new IllegalArgumentException("header cannot be null"); - if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null"); if ( type == null ) throw new IllegalArgumentException("type cannot be null"); - // prepare the bam header - final SAMFileHeader bamHeader = new SAMFileHeader(); - bamHeader.setSequenceDictionary(header.getSequenceDictionary()); - bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); - - // include the original read groups plus a new artificial one for the haplotypes - final List readGroups = new ArrayList(header.getReadGroups()); - final SAMReadGroupRecord rg = new SAMReadGroupRecord(READ_GROUP_ID); - rg.setSample("HC"); - rg.setSequencingCenter("BI"); - readGroups.add(rg); - bamHeader.setReadGroups(readGroups); - - // TODO -- this will be a performance problem at high-scale - stingSAMWriter.setPresorted(false); - stingSAMWriter.writeHeader(bamHeader); - return create(type, stingSAMWriter); + final ReadDestination toBam = new ReadDestination.ToBAM(stingSAMWriter, header, READ_GROUP_ID); + return create(type, toBam); } /** @@ -134,16 +119,16 @@ public abstract class HaplotypeBAMWriter { * may come in out of order during writing * * @param type the type of the writer we want to create - * @param writer the destination, must not be null + * @param destination the destination, must not be null * @return a new HaplotypeBAMWriter */ - public static HaplotypeBAMWriter create(final Type type, final SAMFileWriter writer) { - if ( writer == null ) throw new IllegalArgumentException("writer cannot be null"); + public static HaplotypeBAMWriter create(final Type type, final ReadDestination destination) { + if ( destination == null ) throw new IllegalArgumentException("writer cannot be null"); if ( type == null ) throw new IllegalArgumentException("type cannot be null"); switch ( type ) { - case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(writer); - case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(writer); + case ALL_POSSIBLE_HAPLOTYPES: return new AllHaplotypeBAMWriter(destination); + case CALLED_HAPLOTYPES: return new CalledHaplotypeBAMWriter(destination); default: throw new IllegalArgumentException("Unknown type " + type); } } @@ -154,11 +139,10 @@ public abstract class HaplotypeBAMWriter { * Assumes that the header has been fully initialized with a single * read group READ_GROUP_ID * - * @param bamWriter our output destination + * @param output our output destination */ - protected HaplotypeBAMWriter(SAMFileWriter bamWriter) { - this.bamWriter = bamWriter; - this.bamHeader = bamWriter.getFileHeader(); + protected HaplotypeBAMWriter(final ReadDestination output) { + this.output = output; } /** @@ -170,12 +154,18 @@ public abstract class HaplotypeBAMWriter { * @param calledHaplotypes a list of the haplotypes at where actually called as non-reference * @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes */ - public abstract void writeReadsAlignedToHaplotypes(final List haplotypes, + public abstract void writeReadsAlignedToHaplotypes(final Collection haplotypes, final GenomeLoc paddedReferenceLoc, - final List bestHaplotypes, + final Collection bestHaplotypes, final Set calledHaplotypes, final Map stratifiedReadMap); + public void writeReadsAlignedToHaplotypes(final Collection haplotypes, + final GenomeLoc paddedReferenceLoc, + final Map stratifiedReadMap) { + writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap); + } + /** * Write out read aligned to haplotype to the BAM file * @@ -193,7 +183,7 @@ public abstract class HaplotypeBAMWriter { final boolean isInformative) { final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); if ( alignedToRef != null ) - bamWriter.addAlignment(alignedToRef); + output.add(alignedToRef); } /** @@ -281,8 +271,9 @@ public abstract class HaplotypeBAMWriter { protected void writeHaplotypesAsReads(final Collection haplotypes, final Set bestHaplotypes, final GenomeLoc paddedReferenceLoc) { - for ( final Haplotype haplotype : haplotypes ) - writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); + if ( isWriteHaplotypesAsWell() ) + for ( final Haplotype haplotype : haplotypes ) + writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype)); } /** @@ -295,7 +286,7 @@ public abstract class HaplotypeBAMWriter { private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) { - final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + final GATKSAMRecord record = new GATKSAMRecord(output.getHeader()); record.setReadBases(haplotype.getBases()); record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); @@ -307,6 +298,14 @@ public abstract class HaplotypeBAMWriter { record.setReferenceIndex(paddedRefLoc.getContigIndex()); record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); record.setFlags(16); - bamWriter.addAlignment(record); + output.add(record); + } + + public boolean isWriteHaplotypesAsWell() { + return writeHaplotypesAsWell; + } + + public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) { + this.writeHaplotypesAsWell = writeHaplotypesAsWell; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java new file mode 100644 index 000000000..007b5402e --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/ReadDestination.java @@ -0,0 +1,135 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotypeBAMWriter; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * Utility class that allows us to easily create destinations for the HaplotypeBAMWriters + * + * User: depristo + * Date: 6/19/13 + * Time: 10:19 AM + */ +public abstract class ReadDestination { + public abstract void add(final GATKSAMRecord read); + + private final SAMFileHeader bamHeader; + + public SAMFileHeader getHeader() { + return bamHeader; + } + + protected ReadDestination(final SAMFileHeader header, final String readGroupID) { + // prepare the bam header + if ( header == null ) throw new IllegalArgumentException("header cannot be null"); + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(header.getSequenceDictionary()); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + + // include the original read groups plus a new artificial one for the haplotypes + final List readGroups = new ArrayList(header.getReadGroups()); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupID); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + } + + public static class ToBAM extends ReadDestination { + final SAMFileWriter bamWriter; + + /** + * Create a ReadDestination that writes to a BAM file + */ + public ToBAM(final StingSAMFileWriter stingSAMWriter, final SAMFileHeader header, final String readGroupID) { + super(header, readGroupID); + if ( stingSAMWriter == null ) throw new IllegalArgumentException("writer cannot be null"); + + bamWriter = stingSAMWriter; + stingSAMWriter.setPresorted(false); + stingSAMWriter.writeHeader(getHeader()); + } + + @Override + public void add(GATKSAMRecord read) { + bamWriter.addAlignment(read); + } + } + + public static class ToList extends ReadDestination { + final List reads = new LinkedList<>(); + + /** + * Create a ReadDestination that captures the output reads in a list of reads + */ + public ToList(SAMFileHeader header, String readGroupID) { + super(header, readGroupID); + } + + @Override + public void add(GATKSAMRecord read) { + reads.add(read); + } + + /** + * Get the reads that have been written to this destination + * @return a non-null list of reads + */ + public List getReads() { + return reads; + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 0636d7c1b..1f7236c39 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4a3479fc4ad387d381593b328f737a1b"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "12ed9d67139e7a94d67e9e6c06ac6e16"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java new file mode 100644 index 000000000..8ab2c0779 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -0,0 +1,88 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { + @DataProvider(name = "MyDataProvider") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList(); + + final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; + final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "2b54e4e948144030a829175bcd295e47"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ba1bb72caa06c1962a202b2012c266cb"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a841d9e94fb832066a04f13bdc62b101"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "6cc95c47368a568fb9e1eb8578f96b0b"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2703f1c0c27b3c977689604b5f78b61f"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b54e36bbb4dc6c3b786349fa267d1f6c"}); + + + return tests.toArray(new Object[][]{}); + } + + /** + * Example testng test using MyDataProvider + */ + @Test(dataProvider = "MyDataProvider") + public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { + final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header", + b37KGReference, bam, intervals, mode); + final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); + executeTest(name, spec); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index aca1172d4..43095dcf3 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -93,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "130d36448faeb7b8d4bce4be12dacd3a"); } @@ -208,7 +208,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("cc6f2a76ee97ecc14a5f956ffbb21d88")); + Arrays.asList("c5c63d03e1c4bbe32f06902acd4a10f9")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @@ -217,7 +217,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("51e91c8af61a6b47807165906baefb00")); + Arrays.asList("f0b2a96040429908cce17327442eec29")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index d009550f4..3b17725f9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -61,7 +61,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { List tests = new ArrayList(); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "9da4cc89590c4c64a36f4a9c820f8609"}); + tests.add(new Object[]{nct, "e800f6bb3a820da5c6b29f0195480796"}); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java new file mode 100644 index 000000000..3a4ed7e59 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -0,0 +1,408 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMFileHeader; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.variant.variantcontext.GenotypeType; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class ReferenceConfidenceModelUnitTest extends BaseTest { + GenomeLocParser parser; + final String RGID = "ID1"; + GATKSAMReadGroupRecord rg; + final String sample = "NA12878"; + final Set samples = Collections.singleton(sample); + SAMFileHeader header; + ReferenceConfidenceModel model; + + @BeforeClass + public void setUp() throws Exception { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + rg = new GATKSAMReadGroupRecord(RGID); + rg.setSample(sample); + header.addReadGroup(rg); + parser = new GenomeLocParser(header.getSequenceDictionary()); + } + + @BeforeMethod + public void setupModel() { + model = new ReferenceConfidenceModel(parser, samples, header, 10); + } + + @DataProvider(name = "CalcNIndelInformativeReadsData") + public Object[][] makeMyDataProvider() { + List tests = new ArrayList(); + + { // very basic testing + final String ref = "ACGT"; + final String read = "ACGT"; + tests.add(new Object[]{read, ref, 1, Arrays.asList(1, 1, 1, 0)}); + tests.add(new Object[]{read, ref, 2, Arrays.asList(1, 1, 0, 0)}); + tests.add(new Object[]{read, ref, 3, Arrays.asList(1, 0, 0, 0)}); + tests.add(new Object[]{read, ref, 4, Arrays.asList(0, 0, 0, 0)}); + } + + { // actually interesting case where some sites aren't informative + final String ref = "NNAAAANN"; + final String read1 = "NNA"; + final String read2 = "NNAA"; + final String read3 = "NNAAA"; + final String read4 = "NNAAAA"; + final String read5 = "NNAAAAN"; + tests.add(new Object[]{read1, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read2, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read3, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read4, ref, 1, Arrays.asList(1, 1, 0, 0, 0, 0, 0, 0)}); + tests.add(new Object[]{read5, ref, 1, Arrays.asList(1, 1, 1, 1, 1, 1, 0, 0)}); + } + + { + for ( final String repeatUnit : Arrays.asList("A", "CA", "TAG", "TAGC", "TCAGA")) { + final String anchor = Utils.dupString("N", repeatUnit.length()); + for ( int nUnits = 1; nUnits < 10; nUnits++ ) { + final String repeat = Utils.dupString(repeatUnit, nUnits); + final String ref = anchor + repeat + anchor; + for ( int readLen = repeatUnit.length(); readLen < repeat.length(); readLen++ ) { + final String read = anchor + repeat.substring(0, readLen); + final List expected = new LinkedList<>(); + for ( int i = 0; i < anchor.length(); i++ ) expected.add(1); + for ( int i = 0; i < repeat.length(); i++ ) expected.add(readLen == repeat.length() ? 1 : 0); + for ( int i = 0; i < anchor.length(); i++ ) expected.add(0); + tests.add(new Object[]{read, ref, repeatUnit.length(), expected}); + + final List result = new ArrayList<>(Collections.nCopies(ref.length() - anchor.length(), 1)); + result.addAll(Collections.nCopies(anchor.length(), 0)); + tests.add(new Object[]{ref, ref, repeatUnit.length(), result}); + } + } + + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CalcNIndelInformativeReadsData") + public void testCalcNIndelInformativeReads(final String readBases, final String ref, final int maxIndelSize, final List expected ) { + final byte qual = (byte)30; + final byte[] quals = Utils.dupBytes(qual, readBases.length()); + + for ( int i = 0; i < readBases.getBytes().length; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), quals, readBases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, i, i); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), i); + final int actual = model.calcNIndelInformativeReads(pileup, i, ref.getBytes(), maxIndelSize); + Assert.assertEquals(actual, (int)expected.get(i), "failed at position " + i); + } + } + + @Test + public void testClose() { + model.close(); + } + + @Test + public void testWorstGL() { + final GenotypeLikelihoods gq10 = GenotypeLikelihoods.fromPLField("0,10,100"); + final GenotypeLikelihoods gq20 = GenotypeLikelihoods.fromPLField("0,20,200"); + final GenotypeLikelihoods gq0 = GenotypeLikelihoods.fromPLField("20,0,200"); + + Assert.assertSame(model.getGLwithWorstGQ(gq10, gq20), gq10); + Assert.assertSame(model.getGLwithWorstGQ(gq20, gq10), gq10); + Assert.assertSame(model.getGLwithWorstGQ(gq10, gq0), gq0); + Assert.assertSame(model.getGLwithWorstGQ(gq0, gq10), gq0); + } + + @Test + public void testIndelLikelihoods() { + GenotypeLikelihoods prev = model.getIndelPLs(0); + Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); + Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); + + for ( int i = 1; i < 10000; i++ ) { + final GenotypeLikelihoods current = model.getIndelPLs(i); + final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); + final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); + Assert.assertTrue(prevGQ < currGQ, "GQ Failed with prev " + prev + " curr " + current + " at " + i); + Assert.assertTrue(prev.getAsPLs()[1] < current.getAsPLs()[1], "het PL failed with prev " + prev + " curr " + current + " at " + i); + Assert.assertTrue(prev.getAsPLs()[2] < current.getAsPLs()[2], "hom-var PL Failed with prev " + prev + " curr " + current + " at " + i); +// logger.warn("result at " + i + " is " + current); + prev = current; + } + } + + @Test + public void testOverlappingVariantContext() { + final VariantContext vc10 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 10, Arrays.asList("A", "C")); + final VariantContext vc13 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 13, Arrays.asList("A", "C")); + final VariantContext vc12_15 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 12, Arrays.asList("ACAT", "A")); + final VariantContext vc18 = GATKVariantContextUtils.makeFromAlleles("test", "chr1", 18, Arrays.asList("A", "ACAT")); + + final List calls = Arrays.asList(vc13, vc12_15, vc18, vc10); + + checkOverlapping(8, calls, null); + checkOverlapping(9, calls, null); + checkOverlapping(10, calls, vc10); + checkOverlapping(11, calls, null); + checkOverlapping(12, calls, vc12_15); + checkOverlapping(13, calls, vc13); + checkOverlapping(14, calls, vc12_15); + checkOverlapping(15, calls, vc12_15); + checkOverlapping(16, calls, null); + checkOverlapping(17, calls, null); + checkOverlapping(18, calls, vc18); + checkOverlapping(19, calls, null); + checkOverlapping(20, calls, null); + } + + private void checkOverlapping(final int pos, Collection calls, final VariantContext expected) { + final GenomeLoc loc = parser.createGenomeLoc(parser.getContigs().getSequences().get(0).getSequenceName(), pos, pos); + final VariantContext actual = model.getOverlappingVariantContext(loc, calls); + Assert.assertEquals(actual, expected); + } + + // + // test reference calculation + // + private class RefConfData { + final String ref; + final int extension; + final Haplotype refHap; + final GenomeLoc refLoc, paddedRefLoc; + final ActiveRegion region; + int readCounter = 0; + + private RefConfData(String ref, int extension) { + this.ref = ref; + this.extension = extension; + + refLoc = parser.createGenomeLoc("chr1", getStart(), getEnd()); + paddedRefLoc = parser.createGenomeLoc("chr1", getStart() - extension, getEnd() + extension); + region = new ActiveRegion(getRefLoc(), parser, extension); + final String pad = Utils.dupString("N", extension); + refHap = ReferenceConfidenceModel.createReferenceHaplotype(getActiveRegion(), (pad + ref + pad).getBytes(), getPaddedRefLoc()); + } + + public GenomeLoc getRefLoc() { return refLoc; } + public GenomeLoc getPaddedRefLoc() { return paddedRefLoc; } + public ActiveRegion getActiveRegion() { return region; } + public Haplotype getRefHap() { return refHap; } + public int getStart() { return 100; } + public int getEnd() { return getStart() + getRefLength() - 1; } + public byte[] getRefBases() { return ref.getBytes(); } + public int getRefLength() { return ref.length(); } + + public GATKSAMRecord makeRead(final int start, final int length) { + final byte[] quals = Utils.dupBytes((byte)30, length); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read " + readCounter++, 0, start + getStart(), ref.substring(start, start + length).getBytes(), quals, length + "M"); + read.setReadGroup(rg); + return read; + } + } + + + @DataProvider(name = "RefConfidenceData") + public Object[][] makeRefConfidenceData() { + List tests = new ArrayList<>(); + + for ( int i = 0; i < 10; i++ ) { + for ( final int extension : Arrays.asList(0, 10) ) { + tests.add(new Object[]{i, extension}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "RefConfidenceData") + public void testRefConfidenceBasic(final int nReads, final int extension) { + final RefConfData data = new RefConfData("ACGTAACCGGTT", extension); + final List haplotypes = Arrays.asList(data.getRefHap()); + final List calls = Collections.emptyList(); + + for ( int i = 0; i < nReads; i++ ) { + data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); + } + + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + + @Test + public void testRefConfidencePartialReads() { + final String ref = "ACGTAACCGGTT"; + for ( int readLen = 3; readLen < ref.length(); readLen++ ) { + for ( int start = 0; start < ref.length() - readLen; start++ ) { + final RefConfData data = new RefConfData(ref, 0); + final List haplotypes = Arrays.asList(data.getRefHap()); + final List calls = Collections.emptyList(); + + data.getActiveRegion().add(data.makeRead(start, readLen)); + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); + for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + } + } + + @Test + public void testRefConfidenceWithCalls() { + final RefConfData xxxdata = new RefConfData("ACGTAACCGGTT", 0); + final int start = xxxdata.getStart(); + final int stop = xxxdata.getEnd(); + + for ( int nReads = 0; nReads < 2; nReads++ ) { + + final VariantContext vcStart = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start, Arrays.asList("A", "C")); + final VariantContext vcEnd = GATKVariantContextUtils.makeFromAlleles("test", "chr1", stop, Arrays.asList("A", "C")); + final VariantContext vcMiddle = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 2, Arrays.asList("A", "C")); + final VariantContext vcDel = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 4, Arrays.asList("ACG", "A")); + final VariantContext vcIns = GATKVariantContextUtils.makeFromAlleles("test", "chr1", start + 8, Arrays.asList("A", "ACG")); + + final List allCalls = Arrays.asList(vcStart, vcEnd, vcMiddle, vcDel, vcIns); + + for ( int n = 1; n <= allCalls.size(); n++ ) { + for ( final List calls : Utils.makePermutations(allCalls, n, false) ) { +// logger.warn("Executing " + n + " " + calls.size()); + final RefConfData data = new RefConfData("ACGTAACCGGTT", 0); + final List haplotypes = Arrays.asList(data.getRefHap()); + for ( int i = 0; i < nReads; i++ ) { + data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); + } + + final Map likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); + + final List expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); + final List contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); + checkReferenceModelResult(data, contexts, expectedDPs, calls); + } + } + } + } + + private void checkReferenceModelResult(final RefConfData data, final List contexts, final List expectedDPs, final List calls) { + Assert.assertNotNull(contexts); + + final GenomeLoc loc = data.getActiveRegion().getExtendedLoc(); + final List seenBP = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), false)); + + for ( int i = 0; i < loc.size(); i++ ) { + final GenomeLoc curPos = parser.createGenomeLoc(loc.getContig(), loc.getStart() + i); + final VariantContext call = model.getOverlappingVariantContext(curPos, calls); + final VariantContext refModel = model.getOverlappingVariantContext(curPos, contexts); + + if ( ! data.getActiveRegion().getLocation().containsP(curPos) ) { + // part of the extended interval, but not the full interval + Assert.assertNull(refModel); + continue; + } + + if ( call != null ) { + Assert.assertEquals(refModel, call, "Should have found call " + call + " but found " + refModel + " instead"); + } else { + final int expectedDP = expectedDPs.get(curPos.getStart() - data.getActiveRegion().getLocation().getStart()); + Assert.assertEquals(refModel.getStart(), loc.getStart() + i); + Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); + Assert.assertFalse(refModel.hasLog10PError()); + Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); + Assert.assertEquals(refModel.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertTrue(refModel.hasGenotype(sample)); + + final Genotype g = refModel.getGenotype(sample); + Assert.assertTrue(g.hasAD()); + Assert.assertTrue(g.hasDP()); + Assert.assertEquals(g.getDP(), expectedDP); + Assert.assertTrue(g.hasGQ()); + Assert.assertTrue(g.hasPL()); + Assert.assertTrue(g.hasExtendedAttribute(ReferenceConfidenceModel.INDEL_INFORMATIVE_DEPTH)); + } + + final VariantContext vc = call == null ? refModel : call; + if ( curPos.getStart() == vc.getStart() ) { + for ( int pos = vc.getStart(); pos <= vc.getEnd(); pos++ ) { + final int j = pos - data.getActiveRegion().getLocation().getStart(); + Assert.assertFalse(seenBP.get(j)); + seenBP.set(j, true); + } + } + } + + for ( int i = 0; i < seenBP.size(); i++ ) { + Assert.assertEquals((boolean)seenBP.get(i), true); + } + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java index 8269b9c20..9172b6454 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -86,7 +86,7 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { assembler.removePathsNotConnectedToRef = false; // needed to pass some of the tests assembler.setRecoverDanglingTails(false); // needed to pass some of the tests assembler.setDebugGraphTransformations(true); - final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList()).get(0); + final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList()).get(0).getGraph(); if ( DEBUG ) graph.printGraph(new File("test.dot"), 0); return graph; } diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java new file mode 100644 index 000000000..ffbc3c43f --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -0,0 +1,333 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class GVCFWriterUnitTest extends BaseTest { + private static class MockWriter implements VariantContextWriter { + final List emitted = new ArrayList<>(); + boolean headerWritten = false; + boolean closed = false; + + @Override + public void writeHeader(VCFHeader header) { + headerWritten = true; + } + + @Override + public void close() { + closed = true; + } + + @Override + public void add(VariantContext vc) { + emitted.add(vc); + } + } + + private MockWriter mockWriter; + private List standardPartition = Arrays.asList(1, 10, 20); + private Allele REF = Allele.create("N", true); + private Allele ALT = Allele.create("A"); + private List ALLELES = Arrays.asList(REF, ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + private final String SAMPLE_NAME = "XXYYZZ"; + + @BeforeMethod + public void setUp() throws Exception { + mockWriter = new MockWriter(); + } + + @Test + public void testHeaderWriting() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.writeHeader(new VCFHeader()); + Assert.assertTrue(mockWriter.headerWritten); + } + + @Test + public void testClose() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.close(); + Assert.assertTrue(mockWriter.closed); + } + + @Test + public void testCloseWithoutClosingUnderlyingWriter() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + writer.close(false); + Assert.assertFalse(mockWriter.closed); + } + + private VariantContext makeHomRef(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, ALLELES); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeNonRef(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + private VariantContext makeDeletion(final String contig, final int start, final int size) { + final String del = Utils.dupString("A", size); + final String alt = del.substring(0, 1); + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", contig, start, Arrays.asList(del, alt)); + final VariantContextBuilder vcb = new VariantContextBuilder(vc); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(vc.getReference(), vc.getAlternateAllele(0))); + gb.GQ(50); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + + @Test + public void testCloseEmitsLastVariant() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + + writer.close(); + Assert.assertTrue(mockWriter.closed); + Assert.assertEquals(mockWriter.emitted.size(), 1); + } + + @Test + public void testCloseDoesntEmitsLastVariantWhenNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + + writer.close(); + Assert.assertTrue(mockWriter.closed); + Assert.assertEquals(mockWriter.emitted.size(), 1); + } + + @Test + public void testCrossingContigBoundaryRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("21", 3, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, false); + } + + @Test + public void testCrossingContigBoundaryNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeNonRef("21", 3, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "21", 3, 3, true); + } + + @Test + public void testCrossingContigBoundaryNonRefThenNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeNonRef("20", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 1); + writer.add(makeNonRef("21", 1, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 1, true); + assertGoodVC(mockWriter.emitted.get(1), "21", 1, 1, true); + } + + private void assertGoodVC(final VariantContext vc, final String contig, final int start, final int stop, final boolean nonRef) { + Assert.assertEquals(vc.getChr(), contig); + Assert.assertEquals(vc.getStart(), start); + Assert.assertEquals(vc.getEnd(), stop); + if ( nonRef ) { + Assert.assertNotEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + } else { + Assert.assertEquals(vc.getNAlleles(), 2); + Assert.assertEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1); + Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); + Assert.assertTrue(vc.hasGenotypes()); + Assert.assertTrue(vc.hasGenotype(SAMPLE_NAME)); + Assert.assertEquals(vc.getGenotypes().size(), 1); + final Genotype g = vc.getGenotype(SAMPLE_NAME); + Assert.assertEquals(g.hasAD(), false); + Assert.assertEquals(g.hasLikelihoods(), false); + Assert.assertEquals(g.hasPL(), false); + Assert.assertEquals(g.hasDP(), true); + Assert.assertEquals(g.hasGQ(), true); + } + } + + @Test + public void testVariantForcesNonRef() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 30)); + writer.add(makeHomRef("20", 2, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeNonRef("20", 3, 30)); + writer.add(makeHomRef("20", 4, 30)); + writer.add(makeHomRef("20", 5, 30)); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 3, true); + writer.close(); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 5, false); + } + + @Test + public void testEmittingTwoBands() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + Assert.assertEquals(mockWriter.emitted.size(), 0); + writer.add(makeHomRef("20", 3, 50)); + writer.add(makeHomRef("20", 4, 50)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 4, false); + } + + @Test + public void testNonContiguousBlocks() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRef("20", 10, 0)); + writer.add(makeHomRef("20", 11, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 2); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 10, 11, false); + } + + @Test + public void testDeletion() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeDeletion("20", 3, 3)); + writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + assertGoodVC(mockWriter.emitted.get(1), "20", 3, 5, true); + assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); + } + + @DataProvider(name = "BandPartitionData") + public Object[][] makeBandPartitionData() { + List tests = new ArrayList(); + + tests.add(new Object[]{null, false}); + tests.add(new Object[]{Collections.emptyList(), false}); + tests.add(new Object[]{Arrays.asList(1), true}); + tests.add(new Object[]{Arrays.asList(1, 10), true}); + tests.add(new Object[]{Arrays.asList(1, 10, 30), true}); + tests.add(new Object[]{Arrays.asList(10, 1, 30), false}); + tests.add(new Object[]{Arrays.asList(-1, 1), false}); + tests.add(new Object[]{Arrays.asList(1, null, 10), false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BandPartitionData") + public void testMyData(final List partitions, final boolean expectedGood) { + try { + GVCFWriter.parsePartitions(partitions); + Assert.assertTrue(expectedGood, "Expected to fail but didn't"); + } catch ( Exception e ) { + Assert.assertTrue(! expectedGood, "Expected to succeed but failed with message " + e.getMessage()); + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java new file mode 100644 index 000000000..239aa93b5 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java @@ -0,0 +1,161 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.gvcf; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HomRefBlockUnitTest extends BaseTest { + VariantContext vc; + + @BeforeMethod + public void setUp() throws Exception { + vc = new VariantContextBuilder("foo", "20", 1, 1, Arrays.asList(Allele.create("A", true), Allele.create("C"))).make(); + } + + @Test + public void testBasicConstruction() { + final HomRefBlock band = new HomRefBlock(vc, 10, 20); + Assert.assertSame(band.getStartingVC(), vc); + Assert.assertEquals(band.getRef(), vc.getReference()); + Assert.assertEquals(band.getGQLowerBound(), 10); + Assert.assertEquals(band.getGQUpperBound(), 20); + Assert.assertEquals(band.withinBounds(1), false); + Assert.assertEquals(band.withinBounds(10), true); + Assert.assertEquals(band.withinBounds(11), true); + Assert.assertEquals(band.withinBounds(20), false); + Assert.assertEquals(band.withinBounds(21), false); + } + + @Test + public void testMinMedian() { + final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); + + int pos = vc.getStart(); + band.add(pos++, gb.DP(10).GQ(11).make()); + Assert.assertEquals(band.getStop(), pos - 1); + assertValues(band, 10, 10, 11, 11); + + band.add(pos++, gb.DP(11).GQ(10).make()); + Assert.assertEquals(band.getStop(), pos - 1); + assertValues(band, 10, 11, 10, 11); + + band.add(pos++, gb.DP(12).GQ(12).make()); + Assert.assertEquals(band.getStop(), pos - 1); + assertValues(band, 10, 11, 10, 11); + + band.add(pos++, gb.DP(13).GQ(15).make()); + Assert.assertEquals(band.getStop(), pos - 1); + band.add(pos++, gb.DP(14).GQ(16).make()); + Assert.assertEquals(band.getStop(), pos - 1); + band.add(pos++, gb.DP(15).GQ(17).make()); + Assert.assertEquals(band.getStop(), pos - 1); + band.add(pos++, gb.DP(16).GQ(18).make()); + Assert.assertEquals(band.getStop(), pos - 1); + assertValues(band, 10, 13, 10, 15); + Assert.assertEquals(band.getSize(), pos - vc.getStart()); + } + + @Test + public void testBigGQIsCapped() { + final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); + + band.add(vc.getStart(), gb.DP(1000).GQ(1000).make()); + assertValues(band, 1000, 1000, 99, 99); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadAdd() { + final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); + + band.add(vc.getStart() + 10, gb.DP(10).GQ(11).make()); + } + + private void assertValues(final HomRefBlock band, final int minDP, final int medianDP, final int minGQ, final int medianGQ) { + Assert.assertEquals(band.getMinDP(), minDP); + Assert.assertEquals(band.getMedianDP(), medianDP); + Assert.assertEquals(band.getMinGQ(), minGQ); + Assert.assertEquals(band.getMedianGQ(), medianGQ); + } + + + @DataProvider(name = "ContiguousData") + public Object[][] makeContiguousData() { + List tests = new ArrayList(); + + for ( final String chrMod : Arrays.asList("", ".mismatch") ) { + for ( final int offset : Arrays.asList(-10, -1, 0, 1, 10) ) { + final boolean equals = chrMod.equals("") && offset == 0; + tests.add(new Object[]{vc.getChr() + chrMod, vc.getStart() + offset, equals}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ContiguousData") + public void testIsContiguous(final String contig, final int pos, final boolean expected) { + final HomRefBlock band = new HomRefBlock(vc, 10, 20); + final VariantContext testVC = new VariantContextBuilder(vc).chr(contig).start(pos).stop(pos).make(); + Assert.assertEquals(band.isContiguous(testVC), expected); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java index 0c76ad338..1f66abe28 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriterUnitTest.java @@ -81,26 +81,22 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { return hap; } - private static class MockBAMWriter implements SAMFileWriter { - @Override - public void addAlignment(SAMRecord alignment) { - //To change body of implemented methods use File | Settings | File Templates. + private static class MockDestination extends ReadDestination { + private final static SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + + private MockDestination() { + super(header, "foo"); } @Override - public SAMFileHeader getFileHeader() { - return null; //To change body of implemented methods use File | Settings | File Templates. - } - - @Override - public void close() { + public void add(GATKSAMRecord read) { //To change body of implemented methods use File | Settings | File Templates. } } @Test public void testCreate() throws Exception { - final SAMFileWriter writer = new MockBAMWriter(); + final MockDestination writer = new MockDestination(); Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, writer) instanceof CalledHaplotypeBAMWriter); Assert.assertTrue(HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.ALL_POSSIBLE_HAPLOTYPES, writer) instanceof AllHaplotypeBAMWriter); } @@ -173,7 +169,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { @Test(dataProvider = "ReadAlignedToRefData", enabled = true) public void testReadAlignedToRef(final GATKSAMRecord read, final Haplotype haplotype, final int refStart, final int expectedReadStart, final String expectedReadCigar) throws Exception { - final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter()); + final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination()); final GATKSAMRecord originalReadCopy = (GATKSAMRecord)read.clone(); if ( expectedReadCigar == null ) { @@ -289,7 +285,7 @@ public class HaplotypeBAMWriterUnitTest extends BaseTest { @Test(dataProvider = "ComplexReadAlignedToRef", enabled = !DEBUG) public void testReadAlignedToRefComplexAlignment(final int testIndex, final GATKSAMRecord read, final String reference, final Haplotype haplotype, final int expectedMaxMismatches) throws Exception { - final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockBAMWriter()); + final HaplotypeBAMWriter writer = new CalledHaplotypeBAMWriter(new MockDestination()); final GATKSAMRecord alignedRead = writer.createReadAlignedToRef(read, haplotype, 1, true); if ( alignedRead != null ) { final int mismatches = AlignmentUtils.getMismatchCount(alignedRead, reference.getBytes(), alignedRead.getAlignmentStart() - 1).numMismatches; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 2a8a271e7..4e8daa0e3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -560,24 +560,21 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome /** * return a new genome loc, with an incremented position * - * @param loc the old location - * * @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + 1 */ - public GenomeLoc incPos(GenomeLoc loc) { - return incPos(loc, 1); + public GenomeLoc incPos() { + return incPos(1); } /** * return a new genome loc, with an incremented position * - * @param loc the old location * @param by how much to move the start and stop by * * @return a newly allocated GenomeLoc as loc but with start == loc.getStart() + by */ - public GenomeLoc incPos(GenomeLoc loc, int by) { - return new GenomeLoc(loc.getContig(), loc.getContigIndex(), loc.start + by, loc.stop + by); + public GenomeLoc incPos(int by) { + return new GenomeLoc(getContig(), getContigIndex(), start + by, stop + by); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 07aff5983..3af71eabb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -831,6 +831,36 @@ public class MathUtils { return array[minElementIndex(array)]; } + /** + * Compute the min element of a List + * @param array a non-empty list of integer + * @return the min + */ + public static int arrayMin(final List array) { + if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); + int min = array.get(0); + for ( final int i : array ) + if ( i < min ) min = i; + return min; + } + + /** + * Compute the median element of the array of integers + * @param array a list of integers + * @return the median element + */ + public static int median(final List array) { + if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); + final int size = array.size(); + if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); + else if ( size == 1 ) return array.get(0); + else { + final ArrayList sorted = new ArrayList<>(array); + Collections.sort(sorted); + return sorted.get(size / 2); + } + } + public static int minElementIndex(final double[] array) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 7f2fe6833..bb74619a7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -147,6 +147,13 @@ public class ActiveRegion implements HasGenomeLocation { } } + /** + * Simple interface to create an active region that isActive without any profile state + */ + public ActiveRegion( final GenomeLoc activeRegionLoc, final GenomeLocParser genomeLocParser, final int extension ) { + this(activeRegionLoc, Collections.emptyList(), true, genomeLocParser, extension); + } + @Override public String toString() { return "ActiveRegion " + activeRegionLoc.toString() + " active?=" + isActive() + " nReads=" + reads.size(); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java index c7875354f..d2a53f657 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LIBSDownsamplingInfo.java @@ -33,8 +33,6 @@ package org.broadinstitute.sting.utils.locusiterator; * Time: 1:26 PM */ class LIBSDownsamplingInfo { - public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1); - final private boolean performDownsampling; final private int toCoverage; diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index eed29feca..6f588ac0e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -71,6 +71,9 @@ import java.util.*; * a stream of unique, sorted reads */ public final class LocusIteratorByState extends LocusIterator { + /** Indicates that we shouldn't do any downsampling */ + public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1); + /** * our log, which we want to capture anything from this class */ @@ -174,12 +177,12 @@ public final class LocusIteratorByState extends LocusIterator { * @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them * available via the transferReadsFromAllPreviousPileups interface */ - protected LocusIteratorByState(final Iterator samIterator, - final LIBSDownsamplingInfo downsamplingInfo, - final boolean includeReadsWithDeletionAtLoci, - final GenomeLocParser genomeLocParser, - final Collection samples, - final boolean maintainUniqueReadsList) { + public LocusIteratorByState(final Iterator samIterator, + final LIBSDownsamplingInfo downsamplingInfo, + final boolean includeReadsWithDeletionAtLoci, + final GenomeLocParser genomeLocParser, + final Collection samples, + final boolean maintainUniqueReadsList) { if ( samIterator == null ) throw new IllegalArgumentException("samIterator cannot be null"); if ( downsamplingInfo == null ) throw new IllegalArgumentException("downsamplingInfo cannot be null"); if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); @@ -252,9 +255,15 @@ public final class LocusIteratorByState extends LocusIterator { * Will return null if cannot reach position (because we run out of data in the locus) * * @param position the start position of the AlignmentContext we want back + * @param stopAtFirstNonEmptySiteAfterPosition if true, we will stop as soon as we find a context with data with + * position >= position, otherwise we will return a null value + * and consume the data for the next position. This means that without + * specifying this value the LIBS will be in an indeterminate state + * after calling this function, and should be reconstructed from scratch + * for subsequent use * @return a AlignmentContext at position, or null if this isn't possible */ - public AlignmentContext advanceToLocus(final int position) { + public AlignmentContext advanceToLocus(final int position, final boolean stopAtFirstNonEmptySiteAfterPosition) { while ( hasNext() ) { final AlignmentContext context = next(); @@ -262,8 +271,11 @@ public final class LocusIteratorByState extends LocusIterator { // we ran out of data return null; - if ( context.getPosition() == position) + if ( context.getPosition() == position ) return context; + + if ( context.getPosition() > position) + return stopAtFirstNonEmptySiteAfterPosition ? context : null; } return null; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index f9393cc4b..ab866013f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -30,10 +30,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.NGSPlatform; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -214,34 +211,52 @@ public class ReadUtils { * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. */ public static int getAdaptorBoundary(final SAMRecord read) { - final int MAXIMUM_ADAPTOR_LENGTH = 8; - final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) - - if (insertSize == 0 || read.getReadUnmappedFlag()) // no adaptors in reads with mates in another chromosome or unmapped pairs - return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; - - if ( read.getReadPairedFlag() && read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) { - // note that the read.getProperPairFlag() is not reliably set, so many reads may have this tag but still be overlapping -// logger.info(String.format("Read %s start=%d end=%d insert=%d mateStart=%d readNeg=%b mateNeg=%b not properly paired, returning CANNOT_COMPUTE_ADAPTOR_BOUNDARY", -// read.getReadName(), read.getAlignmentStart(), read.getAlignmentEnd(), insertSize, read.getMateAlignmentStart(), -// read.getReadNegativeStrandFlag(), read.getMateNegativeStrandFlag())); + if ( ! hasWellDefinedFragmentSize(read) ) { return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; + } else if ( read.getReadNegativeStrandFlag() ) { + return read.getMateAlignmentStart() - 1; // case 1 (see header) + } else { + final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) + return read.getAlignmentStart() + insertSize + 1; // case 2 (see header) } - - - int adaptorBoundary; // the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read) - if (read.getReadNegativeStrandFlag()) - adaptorBoundary = read.getMateAlignmentStart() - 1; // case 1 (see header) - else - adaptorBoundary = read.getAlignmentStart() + insertSize + 1; // case 2 (see header) - - if ( (adaptorBoundary < read.getAlignmentStart() - MAXIMUM_ADAPTOR_LENGTH) || (adaptorBoundary > read.getAlignmentEnd() + MAXIMUM_ADAPTOR_LENGTH) ) - adaptorBoundary = CANNOT_COMPUTE_ADAPTOR_BOUNDARY; // we are being conservative by not allowing the adaptor boundary to go beyond what we belive is the maximum size of an adaptor - - return adaptorBoundary; } + public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; + /** + * Can the adaptor sequence of read be reliably removed from the read based on the alignment of + * read and its mate? + * + * @param read the read to check + * @return true if it can, false otherwise + */ + public static boolean hasWellDefinedFragmentSize(final SAMRecord read) { + if ( read.getInferredInsertSize() == 0 ) + // no adaptors in reads with mates in another chromosome or unmapped pairs + return false; + if ( ! read.getReadPairedFlag() ) + // only reads that are paired can be adaptor trimmed + return false; + if ( read.getReadUnmappedFlag() || read.getMateUnmappedFlag() ) + // only reads when both reads are mapped can be trimmed + return false; +// if ( ! read.getProperPairFlag() ) +// // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs +// // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed +// return false; + if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) + // sanity check on getProperPairFlag to ensure that read1 and read2 aren't on the same strand + return false; + + if ( read.getReadNegativeStrandFlag() ) { + // we're on the negative strand, so our read runs right to left + return read.getAlignmentEnd() > read.getMateAlignmentStart(); + } else { + // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start) + return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize(); + } + } + /** * is the read a 454 read? * diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index f86a9b513..5efe2dfc3 100644 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -131,6 +131,9 @@ public abstract class BaseTest { public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; + public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; + public static final boolean pipelineTestRunModeIsSet = System.getProperty("pipeline.run").equals("run"); /** before the class starts up */ diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 3933b3830..f2718fb8c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -26,9 +26,11 @@ package org.broadinstitute.sting.utils; import cern.jet.random.Normal; +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; @@ -47,7 +49,7 @@ public class MathUtilsUnitTest extends BaseTest { @Test public void testGenerateUniqueHashFromThreePositiveIntegers() { logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers"); - + final Set observedLongs = new HashSet(); for (short i = 0; i < Byte.MAX_VALUE; i++) { for (short j = 0; j < Byte.MAX_VALUE; j++) { @@ -97,7 +99,7 @@ public class MathUtilsUnitTest extends BaseTest { final int numTrials = 10; for ( int i = 0; i < numTrials; i++ ) Assert.assertEquals(MathUtils.binomialCumulativeProbability(numTrials, i, i), MathUtils.binomialProbability(numTrials, i), 1e-10, String.format("k=%d, n=%d", i, numTrials)); - + Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 2), 0.05468750, 1e-7); Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 5), 0.62304687, 1e-7); Assert.assertEquals(MathUtils.binomialCumulativeProbability(10, 0, 10), 1.0, 1e-7); @@ -271,7 +273,7 @@ public class MathUtilsUnitTest extends BaseTest { @Test public void testApproximateLog10SumLog10() { - + final double requiredPrecision = 1E-4; Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision); @@ -446,4 +448,74 @@ public class MathUtilsUnitTest extends BaseTest { } } } + + @DataProvider(name = "ArrayMinData") + public Object[][] makeArrayMinData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList(10), 10}); + tests.add(new Object[]{Arrays.asList(-10), -10}); + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,3), 3, false) ) { + tests.add(new Object[]{values, 1}); + } + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { + tests.add(new Object[]{values, -3}); + } + + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinList(final List values, final int expected) { + final int actual = MathUtils.arrayMin(values); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinIntArray(final List values, final int expected) { + final int[] asArray = ArrayUtils.toPrimitive(values.toArray(new Integer[values.size()])); + final int actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, expected, "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinByteArray(final List values, final int expected) { + final byte[] asArray = new byte[values.size()]; + for ( int i = 0; i < values.size(); i++ ) asArray[i] = (byte)(values.get(i) & 0xFF); + final byte actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, (byte)(expected & 0xFF), "Failed with " + values); + } + + @Test(dataProvider = "ArrayMinData") + public void testArrayMinDoubleArray(final List values, final int expected) { + final double[] asArray = new double[values.size()]; + for ( int i = 0; i < values.size(); i++ ) asArray[i] = (double)(values.get(i)); + final double actual = MathUtils.arrayMin(asArray); + Assert.assertEquals(actual, (double)expected, "Failed with " + values); + } + + @DataProvider(name = "MedianData") + public Object[][] makeMedianData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList(10), 10}); + tests.add(new Object[]{Arrays.asList(1, 10), 10}); + + for ( final List values : Utils.makePermutations(Arrays.asList(1,2,-3), 3, false) ) { + tests.add(new Object[]{values, 1}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MedianData") + public void testMedian(final List values, final int expected) { + final int actual = MathUtils.median(values); + Assert.assertEquals(actual, expected, "Failed with " + values); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index 0886427ca..e2e253d0f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -253,17 +253,22 @@ public class FragmentUtilsUnitTest extends BaseTest { final GATKSAMRecord expectedMerged = makeOverlappingRead("", 30, common, commonQuals, "", 30, 10); read1.setCigarString("4S" + common.length() + "M"); read1.setProperPairFlag(true); + read1.setReadPairedFlag(true); read1.setFirstOfPairFlag(true); read1.setReadNegativeStrandFlag(true); - read1.setMateAlignmentStart(10); + read1.setMateNegativeStrandFlag(false); + read1.setMateAlignmentStart(read2.getAlignmentStart()); read2.setCigarString(common.length() + "M4S"); read2.setProperPairFlag(true); + read2.setReadPairedFlag(true); read2.setFirstOfPairFlag(false); read2.setReadNegativeStrandFlag(false); + read2.setMateNegativeStrandFlag(true); + read2.setMateAlignmentStart(read1.getAlignmentStart()); final int insertSize = common.length() - 1; - read1.setInferredInsertSize(insertSize); - read2.setInferredInsertSize(-insertSize); + read1.setInferredInsertSize(-insertSize); + read2.setInferredInsertSize(insertSize); final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); Assert.assertEquals(actual.getCigarString(), expectedMerged.getCigarString()); diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java index d2f29ee7a..54554ee64 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -54,7 +54,7 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { private static final boolean DEBUG = false; protected LocusIteratorByState li; - @Test(enabled = true) + @Test(enabled = !DEBUG) public void testUnmappedAndAllIReadsPassThrough() { final int readLength = 10; GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); @@ -697,24 +697,28 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { final List tests = new LinkedList(); final int start = 10; -// for ( final int goodBases : Arrays.asList(10) ) { -// for ( final int nClipsOnTheRight : Arrays.asList(0)) { for ( final int goodBases : Arrays.asList(10, 20, 30) ) { for ( final int nClips : Arrays.asList(0, 1, 2, 10)) { for ( final boolean onLeft : Arrays.asList(true, false) ) { final int readLength = nClips + goodBases; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength); + read.setProperPairFlag(true); + read.setReadPairedFlag(true); + read.setReadUnmappedFlag(false); + read.setMateUnmappedFlag(false); read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); read.setBaseQualities(Utils.dupBytes((byte) '@', readLength)); read.setCigarString(readLength + "M"); if ( onLeft ) { read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); read.setMateAlignmentStart(start + nClips); read.setInferredInsertSize(readLength); tests.add(new Object[]{nClips, goodBases, 0, read}); } else { read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); read.setMateAlignmentStart(start - 1); read.setInferredInsertSize(goodBases - 1); tests.add(new Object[]{0, goodBases, nClips, read}); @@ -723,29 +727,13 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { } } -// for ( final int nClipsOnTheLeft : Arrays.asList(0, 1, 2, 10)) { -// final int readLength = nClipsOnTheLeft + goodBases; -// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength); -// read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); -// read.setBaseQualities(Utils.dupBytes((byte) '@', readLength)); -// read.setCigarString(readLength + "M"); -// read.setReadNegativeStrandFlag(true); -// -// read.setMateAlignmentStart(start + nClipsOnTheLeft); -// read.setInferredInsertSize(readLength); -// -// tests.add(new Object[]{nClipsOnTheLeft, goodBases, 0, read}); -// } -// } - return tests.toArray(new Object[][]{}); } @Test(enabled = true, dataProvider = "AdapterClippingTest") -// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000) public void testAdapterClipping(final int nClipsOnLeft, final int nReadContainingPileups, final int nClipsOnRight, final GATKSAMRecord read) { - li = new LocusIteratorByState(new FakeCloseableIterator(Collections.singletonList(read).iterator()), + li = new LocusIteratorByState(new FakeCloseableIterator<>(Collections.singletonList(read).iterator()), createTestReadProperties(DownsamplingMethod.NONE, false), genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); @@ -755,10 +743,6 @@ public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { while ( li.hasNext() ) { final AlignmentContext next = li.next(); Assert.assertEquals(next.getLocation().getStart(), expectedPos); -// if ( nPileups < nClipsOnLeft || nPileups > (nClipsOnLeft + nReadContainingPileups) ) -// Assert.assertEquals(next.getBasePileup().getNumberOfElements(), 0, "Expected empty pileups when the read is in the adapter clipping zone at " + nPileups); -// else -// Assert.assertEquals(next.getBasePileup().getNumberOfElements(), 1, "Expected a pileups with 1 element when the read is within the good part of the read at " + nPileups); nPileups++; expectedPos++; } diff --git a/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java index 77dd29e60..eb759fc1c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/locusiterator/PerSampleReadStateManagerUnitTest.java @@ -59,7 +59,7 @@ public class PerSampleReadStateManagerUnitTest extends LocusIteratorByStateBaseT } public void run() { - PerSampleReadStateManager perSampleReadStateManager = new PerSampleReadStateManager(LIBSDownsamplingInfo.NO_DOWNSAMPLING); + PerSampleReadStateManager perSampleReadStateManager = new PerSampleReadStateManager(LocusIteratorByState.NO_DOWNSAMPLING); makeReads(); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index abe0c394b..7e085547f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -66,6 +66,8 @@ public class ReadUtilsUnitTest extends BaseTest { final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30}; final String cigar = "8M"; GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); + read.setProperPairFlag(true); + read.setReadPairedFlag(true); read.setMateAlignmentStart(mateStart); read.setInferredInsertSize(fragmentSize); return read; @@ -85,6 +87,7 @@ public class ReadUtilsUnitTest extends BaseTest { myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, myStart + fragmentSize + 1); @@ -93,6 +96,7 @@ public class ReadUtilsUnitTest extends BaseTest { myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, myStart + fragmentSize + 1); @@ -101,6 +105,7 @@ public class ReadUtilsUnitTest extends BaseTest { myStart = AFTER; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, mateStart - 1); @@ -109,6 +114,7 @@ public class ReadUtilsUnitTest extends BaseTest { myStart = BEFORE; read.setAlignmentStart(myStart); read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, mateStart - 1); @@ -116,9 +122,11 @@ public class ReadUtilsUnitTest extends BaseTest { read = makeRead(fragmentSize, mateStart); read.setInferredInsertSize(0); read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); boundary = get.getAdaptor(read); Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); read.setInferredInsertSize(10); @@ -226,4 +234,91 @@ public class ReadUtilsUnitTest extends BaseTest { final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); Assert.assertEquals(result, 3); } + + @DataProvider(name = "HasWellDefinedFragmentSizeData") + public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { + final List tests = new LinkedList(); + + // setup a basic read that will work + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadUnmappedFlag(false); + read.setMateUnmappedFlag(false); + read.setAlignmentStart(100); + read.setCigarString("50M"); + read.setMateAlignmentStart(130); + read.setInferredInsertSize(80); + read.setFirstOfPairFlag(true); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + + tests.add( new Object[]{ "basic case", read.clone(), true }); + + { + final GATKSAMRecord bad1 = (GATKSAMRecord)read.clone(); + bad1.setReadPairedFlag(false); + tests.add( new Object[]{ "not paired", bad1, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setProperPairFlag(false); + // we currently don't require the proper pair flag to be set + tests.add( new Object[]{ "not proper pair", bad, true }); +// tests.add( new Object[]{ "not proper pair", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadUnmappedFlag(true); + tests.add( new Object[]{ "read is unmapped", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setMateUnmappedFlag(true); + tests.add( new Object[]{ "mate is unmapped", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setMateNegativeStrandFlag(false); + tests.add( new Object[]{ "read and mate both on positive strand", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadNegativeStrandFlag(true); + tests.add( new Object[]{ "read and mate both on negative strand", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setInferredInsertSize(0); + tests.add( new Object[]{ "insert size is 0", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setAlignmentStart(1000); + tests.add( new Object[]{ "positve read starts after mate end", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadNegativeStrandFlag(true); + bad.setMateNegativeStrandFlag(false); + bad.setMateAlignmentStart(1000); + tests.add( new Object[]{ "negative strand read ends before mate starts", bad, false }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "HasWellDefinedFragmentSizeData") + private void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) { + Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected); + } }