Merge pull request #544 from broadinstitute/eb_archive_reduce_reads
Moving Reduce Reads to the archive.
This commit is contained in:
commit
f7d10b9781
|
|
@ -57,9 +57,6 @@ import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
|||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -98,10 +95,7 @@ public class Coverage extends InfoFieldAnnotation implements StandardAnnotation,
|
|||
return null;
|
||||
|
||||
for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1);
|
||||
}
|
||||
depth += maps.getLikelihoodReadMap().size();
|
||||
}
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -60,7 +60,6 @@ import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
|
|
@ -119,7 +118,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
final ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( alleleCounts.containsKey(p.getBase()) )
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
|
||||
}
|
||||
|
||||
// we need to add counts in the correct order
|
||||
|
|
@ -146,8 +145,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
if (! a.isInformative() ) continue; // read is non-informative
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final int prevCount = alleleCounts.get(a.getMostLikelyAllele());
|
||||
final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
alleleCounts.put(a.getMostLikelyAllele(), prevCount + incCount);
|
||||
alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1);
|
||||
}
|
||||
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
|
|
|
|||
|
|
@ -54,7 +54,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnota
|
|||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
|
|
@ -109,9 +108,7 @@ public class DepthPerSampleHC extends GenotypeAnnotation {
|
|||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
|
||||
if ( a.isInformative() ) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
dp += incCount;
|
||||
dp++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -64,7 +64,6 @@ import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
|||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
|
|
@ -418,8 +417,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
for (final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount);
|
||||
updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt);
|
||||
}
|
||||
if ( passesMinimumThreshold(myTable) )
|
||||
copyToMainTable(myTable, table);
|
||||
|
|
@ -464,7 +462,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider )
|
||||
continue;
|
||||
|
||||
updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount());
|
||||
updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt);
|
||||
}
|
||||
if ( passesMinimumThreshold(myTable) )
|
||||
copyToMainTable(myTable, table);
|
||||
|
|
@ -487,7 +485,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE);
|
||||
}
|
||||
|
||||
private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) {
|
||||
private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) {
|
||||
|
||||
final boolean matchesRef = allele.equals(ref, true);
|
||||
final boolean matchesAlt = allele.equals(alt, true);
|
||||
|
|
@ -496,21 +494,15 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
final int offset = matchesRef ? 0 : 2;
|
||||
|
||||
if ( read.isStrandless() ) {
|
||||
|
||||
// ignore strandless reduced reads because they are always on the forward strand!
|
||||
if ( !read.isReducedRead() ) {
|
||||
|
||||
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
|
||||
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
|
||||
// if the read is only seen once, because it's a merged read or other)
|
||||
final int toAdd = Math.max(representativeCount / 2, 1);
|
||||
table[offset] += toAdd;
|
||||
table[offset + 1] += toAdd;
|
||||
}
|
||||
// a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
|
||||
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
|
||||
// if the read is only seen once, because it's a merged read or other)
|
||||
table[offset]++;
|
||||
table[offset + 1]++;
|
||||
} else {
|
||||
// a normal read with an actual strand
|
||||
final boolean isFW = !read.getReadNegativeStrandFlag();
|
||||
table[offset + (isFW ? 0 : 1)] += representativeCount;
|
||||
table[offset + (isFW ? 0 : 1)]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
|
|||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFStandardHeaderLines;
|
||||
|
|
@ -87,7 +86,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
|
|||
for ( final Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
final AlignmentContext context = sample.getValue();
|
||||
for ( final PileupElement p : context.getBasePileup() )
|
||||
fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), p.getRepresentativeCount(), qualities);
|
||||
fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), qualities);
|
||||
}
|
||||
}
|
||||
else if (perReadAlleleLikelihoodMap != null) {
|
||||
|
|
@ -96,7 +95,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
|
|||
|
||||
for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
|
||||
for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() )
|
||||
fillMappingQualitiesFromPileup(read.getMappingQuality(), (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1), qualities);
|
||||
fillMappingQualitiesFromPileup(read.getMappingQuality(), qualities);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
|
@ -106,12 +105,9 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn
|
|||
return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms));
|
||||
}
|
||||
|
||||
private static void fillMappingQualitiesFromPileup(final int mq, final int representativeCount, final List<Integer> qualities) {
|
||||
private static void fillMappingQualitiesFromPileup(final int mq, final List<Integer> qualities) {
|
||||
if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
|
||||
if ( representativeCount == 1 )
|
||||
qualities.add(mq);
|
||||
else
|
||||
qualities.addAll(Collections.nCopies(representativeCount, mq));
|
||||
qualities.add(mq);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -236,8 +236,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
return !(p.isDeletion() ||
|
||||
p.getMappingQual() == 0 ||
|
||||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE || // need the unBAQed quality score here
|
||||
p.getRead().isReducedRead() );
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -249,8 +248,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
*/
|
||||
protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
|
||||
return !( read.getMappingQuality() == 0 ||
|
||||
read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
read.isReducedRead() );
|
||||
read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -56,7 +56,6 @@ import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -90,10 +89,9 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
|
|||
int depth = 0;
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
for ( final PileupElement p : sample.getValue().getBasePileup() ) {
|
||||
final int actualSampleDepth = p.getRepresentativeCount();
|
||||
depth += actualSampleDepth;
|
||||
depth++;
|
||||
if ( p.isDeletion() )
|
||||
deletions += actualSampleDepth;
|
||||
deletions++;
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
|
|
|
|||
|
|
@ -61,7 +61,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -74,7 +73,6 @@ import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
|
|||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -216,7 +214,6 @@ public class BaseRecalibrator extends ReadWalker<Long, Long> implements NanoSche
|
|||
}
|
||||
|
||||
initializeRecalibrationEngine();
|
||||
RecalUtils.checkForInvalidRecalBams(getToolkit().getSAMFileHeaders(), getToolkit().getArguments().ALLOW_BQSR_ON_REDUCED_BAMS);
|
||||
minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN;
|
||||
referenceReader = getToolkit().getReferenceDataSource().getReference();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,207 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
/**
|
||||
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 6/15/12
|
||||
*/
|
||||
public class BaseAndQualsCounts extends BaseCounts {
|
||||
|
||||
private long sumInsertionQual_A = 0;
|
||||
private long sumDeletionQual_A = 0;
|
||||
private long sumInsertionQual_C = 0;
|
||||
private long sumDeletionQual_C = 0;
|
||||
private long sumInsertionQual_G = 0;
|
||||
private long sumDeletionQual_G = 0;
|
||||
private long sumInsertionQual_T = 0;
|
||||
private long sumDeletionQual_T = 0;
|
||||
private long sumInsertionQual_D = 0;
|
||||
private long sumDeletionQual_D = 0;
|
||||
private long sumInsertionQual_I = 0;
|
||||
private long sumDeletionQual_I = 0;
|
||||
private long sumInsertionQual_N = 0;
|
||||
private long sumDeletionQual_N = 0;
|
||||
|
||||
/*
|
||||
* Increments the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
*/
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
|
||||
incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Increments the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
* @param isSoftClip true if is soft-clipped
|
||||
*/
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) {
|
||||
// if we already have high quality bases, ignore low quality ones
|
||||
if ( isLowQualBase && !isLowQuality() )
|
||||
return;
|
||||
|
||||
// if this is a high quality base then remove any low quality bases and start from scratch
|
||||
if ( !isLowQualBase && isLowQuality() ) {
|
||||
if ( totalCount() > 0 )
|
||||
clear();
|
||||
setLowQuality(false);
|
||||
}
|
||||
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.incr(i, baseQual, baseMappingQual, isSoftClip);
|
||||
switch (i) {
|
||||
case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break;
|
||||
case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break;
|
||||
case G: sumInsertionQual_G += insQual; sumDeletionQual_G += delQual; break;
|
||||
case T: sumInsertionQual_T += insQual; sumDeletionQual_T += delQual; break;
|
||||
case D: sumInsertionQual_D += insQual; sumDeletionQual_D += delQual; break;
|
||||
case I: sumInsertionQual_I += insQual; sumDeletionQual_I += delQual; break;
|
||||
case N: sumInsertionQual_N += insQual; sumDeletionQual_N += delQual; break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrements the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
*/
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
|
||||
decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrements the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
* @param isSoftClip true if is soft-clipped
|
||||
*/
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) {
|
||||
// if this is not the right type of base, ignore it
|
||||
if ( isLowQualBase != isLowQuality() )
|
||||
return;
|
||||
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.decr(i, baseQual, baseMappingQual, isSoftClip);
|
||||
switch (i) {
|
||||
case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break;
|
||||
case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break;
|
||||
case G: sumInsertionQual_G -= insQual; sumDeletionQual_G -= delQual; break;
|
||||
case T: sumInsertionQual_T -= insQual; sumDeletionQual_T -= delQual; break;
|
||||
case D: sumInsertionQual_D -= insQual; sumDeletionQual_D -= delQual; break;
|
||||
case I: sumInsertionQual_I -= insQual; sumDeletionQual_I -= delQual; break;
|
||||
case N: sumInsertionQual_N -= insQual; sumDeletionQual_N -= delQual; break;
|
||||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfBase(final BaseIndex base) {
|
||||
return (byte) (getInsertionQual(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfBase(final BaseIndex base) {
|
||||
return (byte) (getDeletionQual(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
private long getInsertionQual(final BaseIndex base) {
|
||||
switch (base) {
|
||||
case A: return sumInsertionQual_A;
|
||||
case C: return sumInsertionQual_C;
|
||||
case G: return sumInsertionQual_G;
|
||||
case T: return sumInsertionQual_T;
|
||||
case D: return sumInsertionQual_D;
|
||||
case I: return sumInsertionQual_I;
|
||||
case N: return sumInsertionQual_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
private long getDeletionQual(final BaseIndex base) {
|
||||
switch (base) {
|
||||
case A: return sumDeletionQual_A;
|
||||
case C: return sumDeletionQual_C;
|
||||
case G: return sumDeletionQual_G;
|
||||
case T: return sumDeletionQual_T;
|
||||
case D: return sumDeletionQual_D;
|
||||
case I: return sumDeletionQual_I;
|
||||
case N: return sumDeletionQual_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears out all stored data in this object
|
||||
*/
|
||||
public void clear() {
|
||||
super.clear();
|
||||
sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0;
|
||||
sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,411 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
|
||||
/**
|
||||
* An object to keep track of the number of occurrences of each base and it's quality.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
* Time: 2:55 PM
|
||||
*/
|
||||
|
||||
public class BaseCounts {
|
||||
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
|
||||
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
|
||||
|
||||
|
||||
private int count_A = 0; // keeps track of the base counts
|
||||
private int sumQual_A = 0; // keeps track of the quals of each base
|
||||
private int count_C = 0;
|
||||
private int sumQual_C = 0;
|
||||
private int count_G = 0;
|
||||
private int sumQual_G = 0;
|
||||
private int count_T = 0;
|
||||
private int sumQual_T = 0;
|
||||
private int count_D = 0;
|
||||
private int sumQual_D = 0;
|
||||
private int count_I = 0;
|
||||
private int sumQual_I = 0;
|
||||
private int count_N = 0;
|
||||
private int sumQual_N = 0;
|
||||
private int totalCount = 0; // keeps track of total count since this is requested so often
|
||||
private int nSoftClippedBases = 0;
|
||||
private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this
|
||||
private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise
|
||||
|
||||
|
||||
public static BaseCounts createWithCounts(int[] countsACGT) {
|
||||
BaseCounts baseCounts = new BaseCounts();
|
||||
baseCounts.count_A = countsACGT[0];
|
||||
baseCounts.count_C = countsACGT[1];
|
||||
baseCounts.count_G = countsACGT[2];
|
||||
baseCounts.count_T = countsACGT[3];
|
||||
baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3];
|
||||
return baseCounts;
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void add(final BaseCounts other) {
|
||||
this.count_A += other.count_A;
|
||||
this.count_C += other.count_C;
|
||||
this.count_G += other.count_G;
|
||||
this.count_T += other.count_T;
|
||||
this.count_D += other.count_D;
|
||||
this.count_I += other.count_I;
|
||||
this.count_N += other.count_N;
|
||||
this.totalCount += other.totalCount;
|
||||
this.nSoftClippedBases = other.nSoftClippedBases;
|
||||
this.mappingQualities.addAll(other.mappingQualities);
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(final BaseCounts other) {
|
||||
this.count_A -= other.count_A;
|
||||
this.count_C -= other.count_C;
|
||||
this.count_G -= other.count_G;
|
||||
this.count_T -= other.count_T;
|
||||
this.count_D -= other.count_D;
|
||||
this.count_I -= other.count_I;
|
||||
this.count_N -= other.count_N;
|
||||
this.totalCount -= other.totalCount;
|
||||
this.nSoftClippedBases -= other.nSoftClippedBases;
|
||||
this.mappingQualities.removeAll(other.mappingQualities);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final byte base) {
|
||||
add(BaseIndex.byteToBase(base), 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) {
|
||||
switch (base) {
|
||||
case A: ++count_A; sumQual_A += qual; break;
|
||||
case C: ++count_C; sumQual_C += qual; break;
|
||||
case G: ++count_G; sumQual_G += qual; break;
|
||||
case T: ++count_T; sumQual_T += qual; break;
|
||||
case D: ++count_D; sumQual_D += qual; break;
|
||||
case I: ++count_I; sumQual_I += qual; break;
|
||||
case N: ++count_N; sumQual_N += qual; break;
|
||||
}
|
||||
++totalCount;
|
||||
nSoftClippedBases += isSoftclip ? 1 : 0;
|
||||
mappingQualities.add(mappingQuality);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(final byte base) {
|
||||
add(BaseIndex.byteToBase(base), -1);
|
||||
}
|
||||
|
||||
private void add(final BaseIndex base, int amount) {
|
||||
switch(base) {
|
||||
case A: count_A += amount; break;
|
||||
case C: count_C += amount; break;
|
||||
case G: count_G += amount; break;
|
||||
case T: count_T += amount; break;
|
||||
case D: count_D += amount; break;
|
||||
case I: count_I += amount; break;
|
||||
case N: count_N += amount; break;
|
||||
}
|
||||
totalCount += amount;
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) {
|
||||
switch (base) {
|
||||
case A: --count_A; sumQual_A -= qual; break;
|
||||
case C: --count_C; sumQual_C -= qual; break;
|
||||
case G: --count_G; sumQual_G -= qual; break;
|
||||
case T: --count_T; sumQual_T -= qual; break;
|
||||
case D: --count_D; sumQual_D -= qual; break;
|
||||
case I: --count_I; sumQual_I -= qual; break;
|
||||
case N: --count_N; sumQual_N -= qual; break;
|
||||
}
|
||||
--totalCount;
|
||||
nSoftClippedBases -= isSoftclip ? 1 : 0;
|
||||
mappingQualities.remove((Integer) mappingQuality);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(final byte base) {
|
||||
return getSumQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(final BaseIndex base) {
|
||||
switch (base) {
|
||||
case A: return sumQual_A;
|
||||
case C: return sumQual_C;
|
||||
case G: return sumQual_G;
|
||||
case T: return sumQual_T;
|
||||
case D: return sumQual_D;
|
||||
case I: return sumQual_I;
|
||||
case N: return sumQual_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(final byte base) {
|
||||
return averageQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(final BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final byte base) {
|
||||
return countOfBase(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final BaseIndex base) {
|
||||
switch (base) {
|
||||
case A: return count_A;
|
||||
case C: return count_C;
|
||||
case G: return count_G;
|
||||
case T: return count_T;
|
||||
case D: return count_D;
|
||||
case I: return count_I;
|
||||
case N: return count_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfBase(final BaseIndex base) {
|
||||
return getSumQuals(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQualsOfBase(final BaseIndex base) {
|
||||
return (byte) (sumQualsOfBase(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int nSoftclips() {
|
||||
return nSoftClippedBases;
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int totalCount() {
|
||||
return totalCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* The RMS of the mapping qualities of all reads that contributed to this object
|
||||
*
|
||||
* @return the RMS of the mapping qualities of all reads that contributed to this object
|
||||
*/
|
||||
public double getRMS() {
|
||||
return MathUtils.rms(mappingQualities);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param base base
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(final byte base) {
|
||||
return baseCountProportion(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param baseIndex base
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(final BaseIndex baseIndex) {
|
||||
return (totalCount == 0) ? 0.0 : (double)countOfBase(baseIndex) / (double)totalCount;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
b.append(i.toString()).append("=").append(countOfBase(i)).append(",");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the base index for which the count is highest, including indel indexes
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
return baseIndexWithMostCounts(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the base index for which the count is highest, excluding indel indexes
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
return baseIndexWithMostCounts(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the base index with the most counts
|
||||
*
|
||||
* @param allowIndels should we allow base indexes representing indels?
|
||||
* @return non-null base index
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
int maxCount = countOfBase(maxI);
|
||||
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if ( !allowIndels && !i.isNucleotide() )
|
||||
continue;
|
||||
|
||||
final int myCount = countOfBase(i);
|
||||
if (myCount > maxCount) {
|
||||
maxI = i;
|
||||
maxCount = myCount;
|
||||
}
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
public byte baseWithMostProbability() {
|
||||
return baseIndexWithMostProbability().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability() {
|
||||
return baseIndexWithMostProbability(true);
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
|
||||
return baseIndexWithMostProbability(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the base index with the most probability
|
||||
*
|
||||
* @param allowIndels should we allow base indexes representing indels?
|
||||
* @return non-null base index
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
long maxSum = getSumQuals(maxI);
|
||||
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if ( !allowIndels && !i.isNucleotide() )
|
||||
continue;
|
||||
|
||||
final long mySum = getSumQuals(i);
|
||||
if (mySum > maxSum) {
|
||||
maxI = i;
|
||||
maxSum = mySum;
|
||||
}
|
||||
}
|
||||
return (maxSum > 0L ? maxI : baseIndexWithMostCounts(allowIndels));
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
return totalCount - countOfBase(BaseIndex.D) - countOfBase(BaseIndex.I);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the proportional count of a base compared to all other bases except indels (I and D)
|
||||
*
|
||||
* @param base base
|
||||
* @return the proportion of this base over all other bases except indels
|
||||
*/
|
||||
@Requires("base.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex base) {
|
||||
final int total = totalCountWithoutIndels();
|
||||
return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if this instance represents low quality bases
|
||||
*/
|
||||
public boolean isLowQuality() { return isLowQuality; }
|
||||
|
||||
/**
|
||||
* Sets the low quality value
|
||||
*
|
||||
* @param value true if this instance represents low quality bases false otherwise
|
||||
*/
|
||||
public void setLowQuality(final boolean value) { isLowQuality = value; }
|
||||
|
||||
/**
|
||||
* Clears out all stored data in this object
|
||||
*/
|
||||
public void clear() {
|
||||
count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0;
|
||||
sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0;
|
||||
totalCount = 0;
|
||||
nSoftClippedBases = 0;
|
||||
mappingQualities.clear();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,136 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
/**
|
||||
* Simple byte / base index conversions
|
||||
*
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 8/26/11
|
||||
*/
|
||||
public enum BaseIndex {
|
||||
A ( 'A', 0 ),
|
||||
C ( 'C', 1 ),
|
||||
G ( 'G', 2 ),
|
||||
T ( 'T', 3 ),
|
||||
D ( 'D', 4 ),
|
||||
I ( 'I', 5 ), // insertion to the right of the base
|
||||
N ( 'N', 6 );
|
||||
|
||||
final byte b;
|
||||
final int index;
|
||||
|
||||
public byte getByte() { return b; }
|
||||
|
||||
/**
|
||||
* Ordinal is stored in SyntheticRead rather than enum to save object reference, and store as byte for compactness.
|
||||
* It is stored as byte, and this method merely eliminates a cast.
|
||||
*/
|
||||
public byte getOrdinalByte() { return (byte)ordinal(); }
|
||||
|
||||
private BaseIndex(char base, int index) {
|
||||
this.b = (byte)base;
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a byte representation of a base to BaseIndex
|
||||
*
|
||||
* @param base the byte representation of the base
|
||||
* @return the BaseIndex representation of the base;
|
||||
*/
|
||||
public static BaseIndex byteToBase(final byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a':
|
||||
return A;
|
||||
case 'C':
|
||||
case 'c':
|
||||
return C;
|
||||
case 'G':
|
||||
case 'g':
|
||||
return G;
|
||||
case 'T':
|
||||
case 't':
|
||||
return T;
|
||||
case 'D':
|
||||
case 'd':
|
||||
case '-':
|
||||
return D;
|
||||
case 'I':
|
||||
case 'i':
|
||||
return I;
|
||||
case 'N':
|
||||
case 'n':
|
||||
return N;
|
||||
default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Definition of a nucleotide for the BaseIndex is anything that has been read as a base
|
||||
* by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows
|
||||
* there is a base there (N).
|
||||
*
|
||||
* @return whether or not it is a nucleotide, given the definition above
|
||||
*/
|
||||
public final boolean isNucleotide() {
|
||||
return !isIndel();
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not this base is an insertion or a deletion
|
||||
*
|
||||
* @return true for I or D, false otherwise
|
||||
*/
|
||||
public final boolean isIndel() {
|
||||
return this == D || this == I;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
|
||||
/**
|
||||
* A stash of regions that must be kept uncompressed in all samples
|
||||
*
|
||||
* In general, these are regions that were kept uncompressed by a tumor sample and we want to force
|
||||
* all other samples (normals and/or tumors) to also keep these regions uncompressed
|
||||
*
|
||||
* User: carneiro
|
||||
* Date: 10/15/12
|
||||
* Time: 4:08 PM
|
||||
*/
|
||||
public class CompressionStash extends ObjectAVLTreeSet<FinishedGenomeLoc> {
|
||||
public CompressionStash() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a UnvalidatingGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc
|
||||
* in the stash.
|
||||
*
|
||||
* @param insertLoc the new loc to be inserted
|
||||
* @return true if the loc, or it's merged version, wasn't present in the list before.
|
||||
*/
|
||||
@Override
|
||||
public boolean add(final FinishedGenomeLoc insertLoc) {
|
||||
ObjectSortedSet<FinishedGenomeLoc> removedLocs = new ObjectAVLTreeSet<FinishedGenomeLoc>();
|
||||
for (FinishedGenomeLoc existingLoc : this) {
|
||||
if (existingLoc.isPast(insertLoc)) {
|
||||
break; // if we're past the loc we're done looking for overlaps.
|
||||
}
|
||||
if (existingLoc.equals(insertLoc)) {
|
||||
return false; // if this loc was already present in the stash, we don't need to insert it.
|
||||
}
|
||||
if (existingLoc.contiguousP(insertLoc)) {
|
||||
removedLocs.add(existingLoc); // list the original loc for merging
|
||||
}
|
||||
}
|
||||
|
||||
this.removeAll(removedLocs); // remove all locs that will be merged
|
||||
removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged
|
||||
|
||||
return super.add(new FinishedGenomeLoc(GenomeLoc.merge(removedLocs), insertLoc.isFinished()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean addAll(Collection<? extends FinishedGenomeLoc> locs) {
|
||||
boolean result = false;
|
||||
for (final FinishedGenomeLoc loc : locs) {
|
||||
result |= this.add(loc);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/10/11
|
||||
* Time: 8:49 AM
|
||||
*
|
||||
* A general interface for ReadCompressors. Read compressors have the following semantics:
|
||||
*
|
||||
* They accept a stream of reads, in order, and after each added read return a compressed stream
|
||||
* of reads for emission. This stream of reads is a "reduced" representation of the total stream
|
||||
* of reads. The actual compression approach is left up to the implementing class.
|
||||
*/
|
||||
public interface Compressor {
|
||||
/**
|
||||
* Adds the read to the compressor. The returned iteratable collection of
|
||||
* reads represents the incremental compressed output.
|
||||
* @param read the next uncompressed read in the input stream to the compressor
|
||||
* @return an iterator over the incrementally available compressed reads
|
||||
*/
|
||||
@Requires("read != null")
|
||||
@Ensures("result != null")
|
||||
Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read);
|
||||
|
||||
/**
|
||||
* Must be called after the last read has been added to finalize the compressor state
|
||||
* and return the last compressed reads from the compressor.
|
||||
* @return an iterator over the final compressed reads of this compressor
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
Iterable<GATKSAMRecord> close();
|
||||
}
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
|
||||
|
||||
/**
|
||||
* GenomeLocs are very useful objects to keep track of genomic locations and perform set operations
|
||||
* with them.
|
||||
*
|
||||
* However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot
|
||||
* be created easily for small tasks that do not require the rigors of the GenomeLocParser validation
|
||||
*
|
||||
* UnvalidatingGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should
|
||||
* only be used outside of the engine.
|
||||
*
|
||||
* User: carneiro
|
||||
* Date: 10/16/12
|
||||
* Time: 2:07 PM
|
||||
*/
|
||||
public class FinishedGenomeLoc extends UnvalidatingGenomeLoc {
|
||||
private boolean finished;
|
||||
|
||||
public FinishedGenomeLoc(final String contigName, final int contigIndex, final int start, final int stop, final boolean finished) {
|
||||
super(contigName, contigIndex, start, stop);
|
||||
this.finished = finished;
|
||||
}
|
||||
|
||||
public FinishedGenomeLoc(final GenomeLoc loc, final boolean finished) {
|
||||
super(loc.getContig(), loc.getContigIndex(), loc.getStart(), loc.getStop());
|
||||
this.finished = finished;
|
||||
}
|
||||
|
||||
public boolean isFinished() {
|
||||
return finished;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,393 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
||||
/**
|
||||
* The element that describes the header of the sliding window.
|
||||
*
|
||||
* Each site has a header element containing the counts of each base, it's reference based location and whether or
|
||||
* not the site has insertions (to it's right). It also contains information about the bases that have been filtered
|
||||
* out due to mapping or base quality.
|
||||
*/
|
||||
public class HeaderElement {
|
||||
private BaseAndQualsCounts positiveConsensusBaseCounts; // How many A,C,G,T (and D's) are in this site.
|
||||
private BaseAndQualsCounts negativeConsensusBaseCounts; // How many A,C,G,T (and D's) are in this site.
|
||||
private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site.
|
||||
private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right
|
||||
private int location; // Genome location of this site (the sliding window knows which contig we're at
|
||||
|
||||
protected static final int MIN_COUNT_FOR_USING_PVALUE = 2;
|
||||
|
||||
public int getLocation() {
|
||||
return location;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the base counts object for the consensus type
|
||||
*
|
||||
* @param consensusType the type to use
|
||||
* @return non-null base counts
|
||||
*/
|
||||
public BaseAndQualsCounts getBaseCounts(final SlidingWindow.ConsensusType consensusType) {
|
||||
if ( consensusType == SlidingWindow.ConsensusType.POSITIVE_CONSENSUS )
|
||||
return positiveConsensusBaseCounts;
|
||||
if ( consensusType == SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS )
|
||||
return negativeConsensusBaseCounts;
|
||||
return filteredBaseCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty
|
||||
* filteredBaseCounts - 0 insertions to the right - empty mappingQuality list
|
||||
*
|
||||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(final int location) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty
|
||||
* filteredBaseCounts - empty mappingQuality list
|
||||
*
|
||||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(final int location, final int insertionsToTheRight) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new HeaderElement with all given parameters
|
||||
*
|
||||
* @param positiveConsensusBaseCounts the BaseCounts object for the running positive consensus synthetic read
|
||||
* @param negativeConsensusBaseCounts the BaseCounts object for the running negative consensus synthetic read
|
||||
* @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read
|
||||
* @param insertionsToTheRight number of insertions to the right of this HeaderElement
|
||||
* @param location the reference location of this reference element
|
||||
* HeaderElement
|
||||
*/
|
||||
public HeaderElement(final BaseAndQualsCounts positiveConsensusBaseCounts, final BaseAndQualsCounts negativeConsensusBaseCounts, final BaseAndQualsCounts filteredBaseCounts, final int insertionsToTheRight, final int location) {
|
||||
this.positiveConsensusBaseCounts = positiveConsensusBaseCounts;
|
||||
this.negativeConsensusBaseCounts = negativeConsensusBaseCounts;
|
||||
this.filteredBaseCounts = filteredBaseCounts;
|
||||
this.insertionsToTheRight = insertionsToTheRight;
|
||||
this.location = location;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
|
||||
* by insertion, deletion and mismatches.
|
||||
*
|
||||
* @param minVariantPvalue min p-value for deciding that a position is or is not variable due to mismatches
|
||||
* @param minVariantProportion min proportion for deciding that a position is or is not variable due to mismatches
|
||||
* @param minIndelProportion min proportion for deciding that a position is or is not variable due to indels
|
||||
* @return true if site is variant by any definition. False otherwise.
|
||||
*/
|
||||
public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) {
|
||||
return ( hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) )
|
||||
&& (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new base to the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the base insertion quality
|
||||
* @param delQual the base deletion quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
* @param minBaseQual the minimum base qual allowed to be a good base
|
||||
* @param minMappingQual the minimum mapping qual allowed to be a good read
|
||||
* @param isSoftClipped true if the base is soft-clipped in the original read
|
||||
* @param isNegativeStrand true if the base comes from a read on the negative strand
|
||||
*/
|
||||
public void addBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) {
|
||||
// If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
|
||||
if ( baseMappingQuality >= minMappingQual ) {
|
||||
if ( isNegativeStrand )
|
||||
negativeConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
|
||||
else
|
||||
positiveConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
|
||||
} else {
|
||||
filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new base to the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the base insertion quality
|
||||
* @param delQual the base deletion quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
* @param minBaseQual the minimum base qual allowed to be a good base
|
||||
* @param minMappingQual the minimum mapping qual allowed to be a good read
|
||||
* @param isSoftClipped true if the base is soft-clipped in the original read
|
||||
* @param isNegativeStrand true if the base comes from a read on the negative strand
|
||||
*/
|
||||
public void removeBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) {
|
||||
// If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
|
||||
if ( baseMappingQuality >= minMappingQual ) {
|
||||
if ( isNegativeStrand )
|
||||
negativeConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
|
||||
else
|
||||
positiveConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
|
||||
} else {
|
||||
filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions
|
||||
* should be added to the right of the element.
|
||||
*/
|
||||
public void addInsertionToTheRight() {
|
||||
insertionsToTheRight++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this HeaderElement contain consensus data?
|
||||
*
|
||||
* @param consensusType the type to use
|
||||
* @return whether or not this HeaderElement contains consensus data
|
||||
*/
|
||||
public boolean hasConsensusData(final SlidingWindow.ConsensusType consensusType) {
|
||||
return getBaseCounts(consensusType).totalCount() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* A HeaderElement is empty if it has no consensus or filtered data
|
||||
*
|
||||
* @return whether or not this HeaderElement has no data
|
||||
*/
|
||||
public boolean isEmpty() {
|
||||
return !hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.FILTERED);
|
||||
}
|
||||
|
||||
/**
|
||||
* removes an insertion from this element (if you removed a read that had an insertion)
|
||||
*/
|
||||
public void removeInsertionToTheRight() {
|
||||
this.insertionsToTheRight--;
|
||||
if (insertionsToTheRight < 0)
|
||||
throw new ReviewedStingException("Removed too many insertions, header is now negative at position " + location);
|
||||
}
|
||||
|
||||
public boolean hasInsertionToTheRight() {
|
||||
return insertionsToTheRight > 0;
|
||||
}
|
||||
|
||||
public int numInsertionsToTheRight() {
|
||||
return insertionsToTheRight;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess insertions
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromInsertions(double minIndelProportion) {
|
||||
final int numberOfBases = totalCountForBothStrands();
|
||||
if (numberOfBases == 0)
|
||||
return (insertionsToTheRight > 0); // do we only have insertions?
|
||||
|
||||
// if we have bases and insertions, check the ratio
|
||||
return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
|
||||
}
|
||||
|
||||
private int totalCountForBothStrands() {
|
||||
return positiveConsensusBaseCounts.totalCount() + negativeConsensusBaseCounts.totalCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess deletions
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess deletions
|
||||
*/
|
||||
private boolean isVariantFromDeletions(double minIndelProportion) {
|
||||
return positiveConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || positiveConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion
|
||||
|| negativeConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || negativeConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess mismatches
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage).
|
||||
* @param minVariantProportion the minimum proportion to call a site variant (used with high coverage).
|
||||
* @return whether or not the HeaderElement is variant due to excess mismatches
|
||||
*/
|
||||
protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) {
|
||||
return isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) ||
|
||||
isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess mismatches
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage).
|
||||
* @param minVariantProportion the minimum proportion to call a site variant (used with high coverage).
|
||||
* @param consensusType the consensus type to use
|
||||
* @return whether or not the HeaderElement is variant due to excess mismatches
|
||||
*/
|
||||
private boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion, final SlidingWindow.ConsensusType consensusType) {
|
||||
final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType);
|
||||
final int totalCount = baseAndQualsCounts.totalCountWithoutIndels();
|
||||
final BaseIndex mostCommon = baseAndQualsCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
final int countOfOtherBases = totalCount - baseAndQualsCounts.countOfBase(mostCommon);
|
||||
return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion);
|
||||
}
|
||||
|
||||
/**
|
||||
* This handles the special case where we have more bases that came from soft clips than bases that came from
|
||||
* normal bases by forcing it to become a variant region. We don't want a consensus based on too little information.
|
||||
*
|
||||
* @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
|
||||
*/
|
||||
protected boolean isVariantFromSoftClips() {
|
||||
return isVariantFromSoftClips(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || isVariantFromSoftClips(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS);
|
||||
}
|
||||
|
||||
/**
|
||||
* This handles the special case where we have more bases that came from soft clips than bases that came from
|
||||
* normal bases by forcing it to become a variant region. We don't want a consensus based on too little information.
|
||||
*
|
||||
* @param consensusType the consensus type to use
|
||||
* @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
|
||||
*/
|
||||
private boolean isVariantFromSoftClips(final SlidingWindow.ConsensusType consensusType) {
|
||||
final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType);
|
||||
final int nSoftClippedBases = baseAndQualsCounts.nSoftclips();
|
||||
return nSoftClippedBases > 0 && nSoftClippedBases >= (baseAndQualsCounts.totalCount() - nSoftClippedBases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return the number of alleles necessary to represent this site or -1 if there are too many indels
|
||||
*/
|
||||
public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) {
|
||||
final ObjectArrayList<BaseIndex> alleles = getAlleles(minVariantPvalue, minVariantProportion);
|
||||
return alleles == null ? -1 : alleles.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return the list of alleles necessary to represent this site or null if there are too many indels
|
||||
*/
|
||||
public ObjectArrayList<BaseIndex> getAlleles(final double minVariantPvalue, final double minVariantProportion) {
|
||||
// make sure we have bases at all
|
||||
final int totalBaseCount = totalCountForBothStrands();
|
||||
if ( totalBaseCount == 0 )
|
||||
return new ObjectArrayList<>(0);
|
||||
|
||||
// next, check for insertions; technically, the insertion count can be greater than totalBaseCount
|
||||
// (because of the way insertions are counted), so we need to account for that
|
||||
if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) )
|
||||
return null;
|
||||
|
||||
// finally, check for the bases themselves (including deletions)
|
||||
final ObjectArrayList<BaseIndex> alleles = new ObjectArrayList<>(4);
|
||||
for ( final BaseIndex base : BaseIndex.values() ) {
|
||||
final int baseCount = positiveConsensusBaseCounts.countOfBase(base) + negativeConsensusBaseCounts.countOfBase(base);
|
||||
if ( baseCount == 0 )
|
||||
continue;
|
||||
|
||||
if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) {
|
||||
if ( base == BaseIndex.D )
|
||||
return null;
|
||||
alleles.add(base);
|
||||
}
|
||||
}
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether there are a significant number of softclips.
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return true if there are significant softclips, false otherwise
|
||||
*/
|
||||
public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) {
|
||||
return hasSignificantCount(positiveConsensusBaseCounts.nSoftclips() + negativeConsensusBaseCounts.nSoftclips(), totalCountForBothStrands(), minVariantPvalue, minVariantProportion);
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether there are a significant number of count.
|
||||
*
|
||||
* @param count the count (k) to test against
|
||||
* @param total the total (n) to test against
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return true if there is a significant count given the provided pvalue, false otherwise
|
||||
*/
|
||||
private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) {
|
||||
if ( count == 0 || total == 0 )
|
||||
return false;
|
||||
|
||||
// use p-values for low counts of k
|
||||
if ( count <= MIN_COUNT_FOR_USING_PVALUE ) {
|
||||
final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count);
|
||||
return pvalue > minVariantPvalue;
|
||||
}
|
||||
|
||||
// otherwise, use straight proportions
|
||||
final int minBaseCountForSignificance = (int)(minVariantProportion * total);
|
||||
return count >= minBaseCountForSignificance;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,163 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public class MultiSampleCompressor {
|
||||
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
||||
|
||||
protected Object2ObjectMap<String, SingleSampleCompressor> compressorsPerSample = new Object2ObjectOpenHashMap<String, SingleSampleCompressor>();
|
||||
|
||||
public MultiSampleCompressor(SAMFileHeader header,
|
||||
final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltPValueToTriggerVariant,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an alignment to the compressor
|
||||
*
|
||||
* @param read the read to be added
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public ObjectSet<GATKSAMRecord> addAlignment(final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
String sampleName = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sampleName);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions);
|
||||
ObjectSet<GATKSAMRecord> reads = readsAndStash.getFirst();
|
||||
CompressionStash regions = readsAndStash.getSecond();
|
||||
|
||||
reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions));
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Properly closes the compressor.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ObjectSet<GATKSAMRecord> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close(knownSnpPositions);
|
||||
reads.addAll(readsAndStash.getFirst());
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalizes current variant regions.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
private ObjectSet<GATKSAMRecord> closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if (!regions.isEmpty()) {
|
||||
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
|
||||
reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions));
|
||||
}
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,782 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
|
||||
*
|
||||
* <p>
|
||||
* This tool will generate reduced versions of the BAM files that still follow the BAM specification
|
||||
* and contain all the information necessary to call variants according to the GATK Best Practices recommendations.
|
||||
* Some options allow you to tune how much compression you want to achieve. The default values have been
|
||||
* shown to reduce a typical whole exome BAM file by 100x. The higher the coverage, the bigger the
|
||||
* savings in file size and performance of the downstream tools.
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The BAM file to be compressed
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* The compressed (reduced) BAM file.
|
||||
*
|
||||
* </p>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ReduceReads \
|
||||
* -I myData.bam \
|
||||
* -o myData.reduced.bam
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
|
||||
@PartitionBy(PartitionType.CONTIG)
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40)
|
||||
public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
|
||||
@Output(required = false, defaultToStdout = false)
|
||||
private StingSAMFileWriter out = null;
|
||||
private SAMFileWriter writerToUse = null;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "The number of bases to keep around mismatches (potential variation)", required = false)
|
||||
public int contextSize = 10;
|
||||
|
||||
/**
|
||||
* Reads that have
|
||||
* mapping quality below this threshold will not be counted towards consensus, but are still counted
|
||||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "The minimum mapping quality to be considered for the consensus synthetic read", required = false)
|
||||
public int minMappingQuality = 20;
|
||||
|
||||
/**
|
||||
* Reads that have
|
||||
* base quality below this threshold will not be counted towards consensus, but are still counted
|
||||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "The minimum base quality to be considered for the consensus synthetic read", required = false)
|
||||
public byte minBaseQual = 15;
|
||||
|
||||
/**
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with
|
||||
* quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
*/
|
||||
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
|
||||
public byte minTailQuality = 2;
|
||||
|
||||
/**
|
||||
* Any number of VCF files representing known SNPs to be used for the polyploid-based reduction.
|
||||
* Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored.
|
||||
* If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present
|
||||
* in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will
|
||||
* be triggered anywhere there is a single SNP present in a consensus window.
|
||||
*/
|
||||
@Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* This strips away all extra information of the read -- anything other than bases, quals
|
||||
* and read group.
|
||||
*/
|
||||
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "Do not simplify read", required = false)
|
||||
public boolean DONT_SIMPLIFY_READS = false;
|
||||
|
||||
/**
|
||||
* Note that it is not necessary to turn this on for reads that are not mate paired.
|
||||
* The program will behave correctly by default in those cases.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "Do not hard clip adaptor sequences", required = false)
|
||||
public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
|
||||
/**
|
||||
* This option overrides the argument of minimum tail
|
||||
* quality.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "Do not hard clip the low quality tails of the reads", required = false)
|
||||
public boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
|
||||
/**
|
||||
* By default, ReduceReads will hard clip away any low quality soft clipped
|
||||
* base left by the aligner and use the high quality soft clipped bases in its traversal algorithm to identify variant
|
||||
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
|
||||
*/
|
||||
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "Do not use high quality soft-clipped bases", required = false)
|
||||
public boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
|
||||
/**
|
||||
* By default, ReduceReads will compress read names to numbers and guarantee
|
||||
* uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather
|
||||
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
|
||||
*/
|
||||
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "Do not compress read names", required = false)
|
||||
public boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
|
||||
/**
|
||||
* The hard clips will happen exactly at the interval border.
|
||||
*/
|
||||
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "Hard clip all incoming reads to the desired intervals", required = false)
|
||||
public boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
|
||||
/**
|
||||
* Anything below this will be
|
||||
* considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that
|
||||
* this value is used only in regions with high coverage.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "Minimum proportion of mismatches in a site to trigger a variant region", required = false)
|
||||
public double minAltProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to
|
||||
* trigger polyploid compression). Note that this value is used only in regions with low coverage.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region", required = false)
|
||||
public double minAltPValueToTriggerVariant = 0.01;
|
||||
|
||||
/**
|
||||
* Anything below this will be considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "Minimum proportion of indels in a site to trigger a variant region", required = false)
|
||||
public double minIndelProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* This level of downsampling only happens after the region has been evaluated, therefore it can
|
||||
* be combined with the engine level downsampling.
|
||||
* A value of 0 turns downsampling off.
|
||||
*/
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "Downsample the number of reads emitted per sample in a variant region for better compression", required = false)
|
||||
public int downsampleCoverage = 250;
|
||||
|
||||
/**
|
||||
* Generally, this tool is not meant to be run for more than 1 sample at a time. The one valid exception
|
||||
* brought to our attention by colleagues is the specific case of tumor/normal pairs in cancer analysis.
|
||||
* To prevent users from unintentionally running the tool in a less than ideal manner, we require them
|
||||
* to explicitly enable multi-sample analysis with this argument.
|
||||
*/
|
||||
@Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "Enable multi-sample reduction for cancer analysis", required = false)
|
||||
public boolean ALLOW_MULTIPLE_SAMPLES = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "nwayout", shortName = "nw", doc = "Generate separate output files per input file", required = false)
|
||||
public boolean nwayout = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dl", doc = "Debug level", required = false)
|
||||
public int debugLevel = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dr", doc = "Debug read", required = false)
|
||||
public String debugRead = "";
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "Downsampling strategy", required = false)
|
||||
public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags", required = false)
|
||||
public boolean NO_PG_TAG = false;
|
||||
|
||||
/** The downsampling strategies selectable through the hidden {@code downsample_strategy} argument. */
public enum DownsampleStrategy { Normal, Adaptive }
|
||||
|
||||
int nCompressedReads = 0;
|
||||
|
||||
private static int READ_NAME_HASH_DEFAULT_SIZE = 1000;
|
||||
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
|
||||
Object2LongOpenHashMap<String> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
|
||||
ObjectSortedSet<GenomeLoc> intervalList;
|
||||
|
||||
ObjectSortedSet<GenomeLoc> knownSnpPositions;
|
||||
|
||||
// IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
|
||||
public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam";
|
||||
|
||||
/**
|
||||
* Basic generic initialization of the readNameHash and the intervalList. Output initialization
|
||||
* is done at the reduceInit method
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
if ( !nwayout && out == null )
|
||||
throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes");
|
||||
|
||||
if ( nwayout && out != null )
|
||||
throw new UserException.CommandLineException("--out and --nwayout cannot be used simultaneously; please use one or the other");
|
||||
|
||||
if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 )
|
||||
throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
||||
if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
|
||||
throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
||||
if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES )
|
||||
throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis. If that is what you want to do, use the -cancer_mode flag.");
|
||||
|
||||
if ( known.isEmpty() )
|
||||
knownSnpPositions = null;
|
||||
else
|
||||
knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
GenomeAnalysisEngine toolkit = getToolkit();
|
||||
this.resetReadNameHash(); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
intervalList = new ObjectAVLTreeSet<GenomeLoc>(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
|
||||
|
||||
if (toolkit.getIntervals() != null)
|
||||
intervalList.addAll(toolkit.getIntervals());
|
||||
|
||||
final boolean indexOnTheFly = true;
|
||||
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||
if (nwayout) {
|
||||
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
|
||||
writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, false, indexOnTheFly, NO_PG_TAG, programRecord, true);
|
||||
}
|
||||
else {
|
||||
writerToUse = out;
|
||||
out.setPresorted(false);
|
||||
if (!NO_PG_TAG) {
|
||||
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Initializer for {@link #readNameHash}. */
|
||||
private void resetReadNameHash() {
|
||||
// If the hash grows large, subsequent clear operations can be very expensive, so trim the hash down if it grows beyond its default.
|
||||
if (readNameHash == null || readNameHash.size() > READ_NAME_HASH_DEFAULT_SIZE) {
|
||||
readNameHash = new Object2LongOpenHashMap<String>(READ_NAME_HASH_DEFAULT_SIZE);
|
||||
} else {
|
||||
readNameHash.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes in a read and prepares it for the SlidingWindow machinery by performing the
|
||||
* following optional clipping operations:
|
||||
* 1. Hard clip adaptor sequences
|
||||
* 2. Hard clip low quality tails
|
||||
* 3. Hard clip all remaining soft clipped bases
|
||||
* 4. Hard clip read to the intervals in the interval list (this step may produce multiple reads)
|
||||
*
|
||||
* @param ref default map parameter
|
||||
* @param read default map parameter
|
||||
* @param metaDataTracker default map parameter
|
||||
* @return a linked list with all the reads produced by the clipping operations
|
||||
*/
|
||||
@Override
|
||||
public ObjectArrayList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
ObjectArrayList<GATKSAMRecord> mappedReads;
|
||||
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
|
||||
System.out.println("Found debug read!");
|
||||
|
||||
if (debugLevel == 1)
|
||||
System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
|
||||
|
||||
// we write the actual alignment starts to their respective alignment shift tags in the temporary
|
||||
// attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
|
||||
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
|
||||
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
|
||||
|
||||
// Check if the read goes beyond the boundaries of the chromosome, and hard clip those boundaries.
|
||||
int chromosomeLength = ref.getGenomeLocParser().getContigInfo(read.getReferenceName()).getSequenceLength();
|
||||
if (read.getSoftStart() < 0)
|
||||
read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart());
|
||||
if (read.getSoftEnd() > chromosomeLength)
|
||||
read = ReadClipper.hardClipByReadCoordinates(read, chromosomeLength - read.getSoftStart() + 1, read.getReadLength() - 1);
|
||||
|
||||
if (!DONT_SIMPLIFY_READS)
|
||||
read.simplify(); // Clear all unnecessary attributes
|
||||
if (!DONT_CLIP_ADAPTOR_SEQUENCES)
|
||||
read = ReadClipper.hardClipAdaptorSequence(read); // Strip away adaptor sequences, if any.
|
||||
if (!DONT_CLIP_LOW_QUAL_TAILS)
|
||||
read = ReadClipper.hardClipLowQualEnds(read, minTailQuality); // Clip low quality tails
|
||||
if (!isWholeGenome()) {
|
||||
if (HARD_CLIP_TO_INTERVAL)
|
||||
mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval
|
||||
else {
|
||||
mappedReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
mappedReads.add(read);
|
||||
}
|
||||
}
|
||||
else {
|
||||
mappedReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
if (!read.isEmpty())
|
||||
mappedReads.add(read);
|
||||
}
|
||||
|
||||
if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
|
||||
ObjectArrayList<GATKSAMRecord> tempList = new ObjectArrayList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord mRead : mappedReads) {
|
||||
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
|
||||
if (!clippedRead.isEmpty())
|
||||
tempList.add(clippedRead);
|
||||
}
|
||||
mappedReads = tempList;
|
||||
}
|
||||
|
||||
if (debugLevel == 1)
|
||||
for (GATKSAMRecord mappedRead : mappedReads)
|
||||
System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
|
||||
|
||||
// add the SNPs to the list of known positions
|
||||
populateKnownSNPs(metaDataTracker);
|
||||
|
||||
return mappedReads;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add the positions of known SNPs to the set so that we can keep track of it
|
||||
*
|
||||
* @param metaDataTracker the ref meta data tracker
|
||||
*/
|
||||
protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) {
|
||||
for ( final VariantContext vc : metaDataTracker.getValues(known) ) {
|
||||
if ( vc.isSNP() )
|
||||
knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the ReduceReadsStash that keeps track of all reads that are waiting to
|
||||
* enter the SlidingWindow machinery. The stash makes sure reads are served in order
|
||||
* even though map() may generate reads that are only supposed to enter the machinery
|
||||
* in the future.
|
||||
*
|
||||
* @return the empty stash
|
||||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process
|
||||
* all reads that come before the original read (the read that was passed to map) including the original
|
||||
* read. This is where we send reads, in order, to the SlidingWindow machinery.
|
||||
*
|
||||
* @param mappedReads the list of reads sent by map
|
||||
* @param stash the stash that keeps the reads in order for processing
|
||||
* @return the stash with all reads that have not been processed yet
|
||||
*/
|
||||
public ReduceReadsStash reduce(ObjectArrayList<GATKSAMRecord> mappedReads, ReduceReadsStash stash) {
|
||||
if (debugLevel == 1)
|
||||
stash.print();
|
||||
|
||||
boolean firstRead = true;
|
||||
for (GATKSAMRecord read : mappedReads) {
|
||||
boolean originalRead = firstRead && isOriginalRead(mappedReads, read);
|
||||
|
||||
if (read.getReadLength() == 0)
|
||||
throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
|
||||
|
||||
if (originalRead) {
|
||||
ObjectArrayList<GATKSAMRecord> readsReady = new ObjectArrayList<GATKSAMRecord>();
|
||||
readsReady.addAll(stash.getAllReadsBefore(read));
|
||||
readsReady.add(read);
|
||||
|
||||
for (GATKSAMRecord readReady : readsReady) {
|
||||
if (debugLevel == 1)
|
||||
System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
|
||||
|
||||
for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions))
|
||||
outputRead(compressedRead);
|
||||
|
||||
// We only care about maintaining the link between read pairs if they are in the same variant
|
||||
// region. Since an entire variant region's worth of reads is returned in a single call to
|
||||
// stash.compress(), the readNameHash can be cleared after the for() loop above.
|
||||
// The advantage of clearing the hash is that otherwise it holds all reads that have been encountered,
|
||||
// which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory.
|
||||
this.resetReadNameHash();
|
||||
|
||||
}
|
||||
} else
|
||||
stash.add(read);
|
||||
|
||||
firstRead = false;
|
||||
}
|
||||
|
||||
// reduce memory requirements by removing old positions
|
||||
if ( !mappedReads.isEmpty() )
|
||||
clearStaleKnownPositions(mappedReads.get(0));
|
||||
|
||||
return stash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Now that now more reads will come, we process all the remaining reads in the stash, in order.
|
||||
*
|
||||
* @param stash the ReduceReadsStash with all unprocessed reads (from reduce)
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(ReduceReadsStash stash) {
|
||||
|
||||
// output any remaining reads in the compressor
|
||||
for (GATKSAMRecord read : stash.close(knownSnpPositions))
|
||||
outputRead(read);
|
||||
|
||||
if (nwayout)
|
||||
writerToUse.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes known positions that are no longer relevant for use with het compression.
|
||||
*
|
||||
* @param read the current read, used for checking whether there are stale positions we can remove
|
||||
*/
|
||||
protected void clearStaleKnownPositions(final GATKSAMRecord read) {
|
||||
// nothing to clear if not used or empty
|
||||
if ( knownSnpPositions == null || knownSnpPositions.isEmpty() )
|
||||
return;
|
||||
|
||||
// not ready to be cleared until we encounter a read from a different contig
|
||||
final int contigIndexOfRead = read.getReferenceIndex();
|
||||
if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead )
|
||||
return;
|
||||
|
||||
// because we expect most elements to be stale, it's not going to be efficient to remove them one at a time
|
||||
final ObjectAVLTreeSet<GenomeLoc> goodLocs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
for ( final GenomeLoc loc : knownSnpPositions ) {
|
||||
if ( loc.getContigIndex() == contigIndexOfRead )
|
||||
goodLocs.add(loc);
|
||||
}
|
||||
knownSnpPositions.clear();
|
||||
knownSnpPositions.addAll(goodLocs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hard clips away all parts of the read that doesn't agree with the intervals selected.
|
||||
*
|
||||
* Note: If read overlaps more than one interval, it will be hard clipped to all
|
||||
* the intervals it overlaps with
|
||||
*
|
||||
* @param read the read to be hard clipped to the interval.
|
||||
* @return a shallow copy of the read hard clipped to the interval
|
||||
*/
|
||||
private ObjectArrayList<GATKSAMRecord> hardClipReadToInterval(GATKSAMRecord read) {
|
||||
ObjectArrayList<GATKSAMRecord> clippedReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)
|
||||
|
||||
boolean originalRead = true; // false if this is the right tail of the original read
|
||||
boolean overlap; // keeps track of the interval that overlapped the original read
|
||||
boolean doneClipping; // triggers an early exit if we are done clipping this read
|
||||
|
||||
if (isWholeGenome())
|
||||
clippedReads.add(read); // if we don't have intervals (wgs) the read goes in unchanged
|
||||
|
||||
for (GenomeLoc interval : intervalList) {
|
||||
|
||||
if (read.isEmpty()) // nothing to do with an empty read (could have been fully clipped before)
|
||||
break;
|
||||
|
||||
GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch
|
||||
|
||||
switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) {
|
||||
case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read
|
||||
if (!originalRead) // something went wrong if this is the tail of the read
|
||||
throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
|
||||
overlap = false;
|
||||
doneClipping = false;
|
||||
break;
|
||||
|
||||
|
||||
case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore
|
||||
if (originalRead) {
|
||||
overlap = true; // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read).
|
||||
clippedRead = GATKSAMRecord.emptyRead(read);
|
||||
} else
|
||||
overlap = false;
|
||||
|
||||
doneClipping = false;
|
||||
break;
|
||||
|
||||
case NO_OVERLAP_CONTIG: // read is in a different contig
|
||||
if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one.
|
||||
if (read.getReferenceIndex() < interval.getContigIndex())
|
||||
throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
|
||||
else {
|
||||
overlap = false;
|
||||
doneClipping = false;
|
||||
}
|
||||
} // tail read CANNOT be in a different contig.
|
||||
else {
|
||||
if (read.getReferenceIndex() < interval.getContigIndex()) {
|
||||
overlap = false;
|
||||
doneClipping = true;
|
||||
} else
|
||||
throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
|
||||
|
||||
}
|
||||
break;
|
||||
|
||||
case NO_OVERLAP_LEFT:
|
||||
if (originalRead) // if this is the first read this should never happen.
|
||||
throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
|
||||
|
||||
overlap = false;
|
||||
doneClipping = true;
|
||||
break;
|
||||
|
||||
case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore
|
||||
overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here.
|
||||
doneClipping = true;
|
||||
break;
|
||||
|
||||
case OVERLAP_LEFT: // clip the left tail of the read
|
||||
clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1);
|
||||
|
||||
overlap = true;
|
||||
doneClipping = true;
|
||||
break;
|
||||
|
||||
case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval
|
||||
clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1);
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());
|
||||
|
||||
overlap = true;
|
||||
doneClipping = false;
|
||||
break;
|
||||
|
||||
case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read
|
||||
clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1);
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());
|
||||
|
||||
overlap = true;
|
||||
doneClipping = false;
|
||||
break;
|
||||
|
||||
case OVERLAP_CONTAINED: // don't do anything to the read
|
||||
clippedRead = read;
|
||||
|
||||
overlap = true;
|
||||
doneClipping = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval.");
|
||||
}
|
||||
|
||||
if (overlap && originalRead)
|
||||
intervalOverlapped = interval;
|
||||
|
||||
if (clippedRead != null) {
|
||||
originalRead = false;
|
||||
|
||||
if (!clippedRead.isEmpty())
|
||||
clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off
|
||||
}
|
||||
|
||||
if (doneClipping)
|
||||
break;
|
||||
}
|
||||
|
||||
if (intervalOverlapped != null)
|
||||
intervalList = intervalList.tailSet(intervalOverlapped);
|
||||
|
||||
return clippedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compresses the read name and adds it to output BAM file (reduced BAM)
|
||||
* after performing some quality control
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
private void outputRead(GATKSAMRecord read) {
|
||||
if (debugLevel == 2) {
|
||||
checkForHighMismatch(read);
|
||||
checkCigar(read);
|
||||
}
|
||||
|
||||
if (read.isReducedRead())
|
||||
nCompressedReads++;
|
||||
else {
|
||||
int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT);
|
||||
int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT);
|
||||
|
||||
int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression
|
||||
int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression
|
||||
|
||||
if (startShift > 0)
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start)
|
||||
if (endShift > 0)
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end)
|
||||
}
|
||||
|
||||
if (debugLevel == 1)
|
||||
System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());
|
||||
|
||||
if (!DONT_COMPRESS_READ_NAMES)
|
||||
nextReadNumber = compressReadName(readNameHash, read, nextReadNumber);
|
||||
|
||||
writerToUse.addAlignment(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Quality control procedure that checks if the consensus reads contains too many
|
||||
* mismatches with the reference. This should never happen and is a good trigger for
|
||||
* errors with the algorithm.
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
private void checkForHighMismatch(GATKSAMRecord read) {
|
||||
final int start = read.getAlignmentStart();
|
||||
final int stop = read.getAlignmentEnd();
|
||||
final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases();
|
||||
final int nm = SequenceUtil.countMismatches(read, ref, start - 1);
|
||||
final int readLen = read.getReadLength();
|
||||
final double nmFraction = nm / (1.0 * readLen);
|
||||
if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus"))
|
||||
throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
|
||||
}
|
||||
|
||||
private void checkCigar (GATKSAMRecord read) {
|
||||
if (read.getCigar().isValid(null, -1) != null) {
|
||||
throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compresses the read name using the readNameHash if we have already compressed
|
||||
* this read name before.
|
||||
*
|
||||
* @param hash the hash table containing the read name to compressed read name map
|
||||
* @param read any read
|
||||
* @param nextReadNumber the number to use in the compressed read name in case this is a new read name
|
||||
* @return the next number to use in the compressed read name
|
||||
*/
|
||||
protected static long compressReadName(final Object2LongOpenHashMap<String> hash, final GATKSAMRecord read, final long nextReadNumber) {
|
||||
final String name = read.getReadName();
|
||||
final StringBuilder compressedName = new StringBuilder();
|
||||
long result = nextReadNumber;
|
||||
if (read.isReducedRead()) {
|
||||
compressedName.append("C");
|
||||
}
|
||||
final Long readNumber = hash.get(name);
|
||||
if (readNumber != null) {
|
||||
compressedName.append(readNumber);
|
||||
} else {
|
||||
hash.put(name, nextReadNumber);
|
||||
compressedName.append(nextReadNumber);
|
||||
result++;
|
||||
}
|
||||
read.setReadName(compressedName.toString());
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the read is the original read that went through map().
|
||||
*
|
||||
* This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled.
|
||||
*
|
||||
* @param list the list
|
||||
* @param read the read
|
||||
* @return Returns true if the read is the original read that went through map().
|
||||
*/
|
||||
private boolean isOriginalRead(ObjectArrayList<GATKSAMRecord> list, GATKSAMRecord read) {
|
||||
return isWholeGenome() || list.get(0).equals(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether or not the intervalList is empty, meaning we're running in WGS mode.
|
||||
*
|
||||
* @return whether or not we're running in WGS mode.
|
||||
*/
|
||||
private boolean isWholeGenome() {
|
||||
return intervalList.isEmpty();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* This class implements a "read stash" that keeps reads always sorted in alignment order. Useful
|
||||
* for read walkers that alter the alignment information of the incoming reads, but need to
|
||||
* maintain the reads sorted for the reduce step. (e.g. ReduceReads)
|
||||
*/
|
||||
|
||||
public class ReduceReadsStash {
|
||||
protected MultiSampleCompressor compressor;
|
||||
SortedSet<GATKSAMRecord> outOfOrderReads;
|
||||
|
||||
/**
|
||||
* Creates a stash with the default sorting order (read alignment)
|
||||
* @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close())
|
||||
*/
|
||||
public ReduceReadsStash(MultiSampleCompressor compressor) {
|
||||
this.compressor = compressor;
|
||||
this.outOfOrderReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all reads before a given read (for processing)
|
||||
*
|
||||
* @param read the original read
|
||||
* @return all reads that have alignment start before the original read.
|
||||
*/
|
||||
public List<GATKSAMRecord> getAllReadsBefore(GATKSAMRecord read) {
|
||||
List<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
GATKSAMRecord newHead = null;
|
||||
|
||||
for (GATKSAMRecord stashedRead : outOfOrderReads) {
|
||||
if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0)
|
||||
result.add(stashedRead);
|
||||
else {
|
||||
newHead = stashedRead;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (result.size() > 0) {
|
||||
if (result.size() == outOfOrderReads.size())
|
||||
outOfOrderReads.clear();
|
||||
else
|
||||
outOfOrderReads = new TreeSet<GATKSAMRecord>(outOfOrderReads.tailSet(newHead));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* sends the read to the MultiSampleCompressor
|
||||
*
|
||||
* @param read the read to be compressed
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> compress(final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return compressor.addAlignment(read, knownSnpPositions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a read to the stash
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
public void add(GATKSAMRecord read) {
|
||||
outOfOrderReads.add(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the stash, processing all remaining reads in order
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return a list of all the reads produced by the SlidingWindow machinery)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
LinkedList<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// compress all the stashed reads (in order)
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions))
|
||||
result.add(compressedRead);
|
||||
|
||||
// output any remaining reads from the compressor
|
||||
for (GATKSAMRecord read : compressor.close(knownSnpPositions))
|
||||
result.add(read);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful debug functionality, outputs all elements in the stash
|
||||
*/
|
||||
public void print() {
|
||||
int i = 1;
|
||||
System.out.println("Stash Contents:");
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd()));
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,153 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author carneiro, depristo
|
||||
* @version 3.0
|
||||
*/
|
||||
public class SingleSampleCompressor {
|
||||
final private int contextSize;
|
||||
final private int downsampleCoverage;
|
||||
final private int minMappingQuality;
|
||||
final private double minAltPValueToTriggerVariant;
|
||||
final private double minAltProportionToTriggerVariant;
|
||||
final private double minIndelProportionToTriggerVariant;
|
||||
final private int minBaseQual;
|
||||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
|
||||
private SlidingWindow slidingWindow;
|
||||
private int slidingWindowCounter;
|
||||
|
||||
public static Pair<ObjectSet<GATKSAMRecord>, CompressionStash> emptyPair = new Pair<ObjectSet<GATKSAMRecord>,CompressionStash>(new ObjectAVLTreeSet<GATKSAMRecord>(), new CompressionStash());
|
||||
|
||||
public SingleSampleCompressor(final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltPValueToTriggerVariant,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
this.slidingWindowCounter = 0;
|
||||
this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant;
|
||||
this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
|
||||
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
|
||||
this.minBaseQual = minBaseQual;
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an alignment to the compressor
|
||||
*
|
||||
* @param read the read to be added
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions ) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash stash = new CompressionStash();
|
||||
int readOriginalStart = read.getUnclippedStart();
|
||||
|
||||
// create a new window if:
|
||||
if ((slidingWindow != null) &&
|
||||
( ( read.getReferenceIndex() != slidingWindow.getContigIndex() ) || // this is a brand new contig
|
||||
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
||||
|
||||
// close the current sliding window
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions);
|
||||
reads = readsAndStash.getFirst();
|
||||
stash = readsAndStash.getSecond();
|
||||
slidingWindow = null; // so we create a new one on the next if
|
||||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(),
|
||||
slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant,
|
||||
minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
stash.addAll(slidingWindow.addRead(read));
|
||||
return new Pair<ObjectSet<GATKSAMRecord>, CompressionStash>(reads, stash);
|
||||
}
|
||||
|
||||
/**
|
||||
* Properly closes the compressor.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalizes current variant regions.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,369 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
|
||||
/**
|
||||
* Running Consensus is a read that is compressed as a sliding window travels over the reads
|
||||
* and keeps track of all the bases that are outside of variant regions.
|
||||
*
|
||||
* Consensus reads have qual fields that correspond to the number of reads that had the base
|
||||
* and passed the minimum quality threshold.
|
||||
*
|
||||
* The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads
|
||||
* that compose the consensus
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 8/26/11
|
||||
*/
|
||||
public class SyntheticRead {
|
||||
|
||||
/**
 * Strand orientation a synthetic read can be created with.
 */
public enum StrandType {
    /** stranded read that is not flagged as negative-strand when converted to a GATKSAMRecord */
    POSITIVE,
    /** stranded read that is flagged as negative-strand when converted to a GATKSAMRecord */
    NEGATIVE,
    /** read without strand information; no stranded tag is written for it */
    STRANDLESS
}
|
||||
|
||||
// Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint.
|
||||
private static class SingleBaseInfo {
|
||||
byte baseIndexOrdinal; // enum BaseIndex.ordinal
|
||||
int count;
|
||||
byte qual;
|
||||
byte insertionQual;
|
||||
byte deletionQual;
|
||||
|
||||
SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) {
|
||||
this.baseIndexOrdinal = baseIndexOrdinal;
|
||||
this.count = count;
|
||||
this.qual = qual;
|
||||
this.insertionQual = insertionQual;
|
||||
this.deletionQual = deletionQual;
|
||||
}
|
||||
}
|
||||
|
||||
// This class is merely sharing of code for convertVariableGivenBases().
|
||||
private abstract class SingleBaseInfoIterator implements Iterator<Byte> {
|
||||
final Iterator<SingleBaseInfo> it;
|
||||
|
||||
SingleBaseInfoIterator() {
|
||||
this.it = basesCountsQuals.iterator();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return it.hasNext();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Map from ordinal to enum value.
|
||||
private static final BaseIndex[] BaseIndexByOrdinal = new BaseIndex[BaseIndex.values().length];
|
||||
static
|
||||
{
|
||||
for (final BaseIndex baseIndex : BaseIndex.values()) {
|
||||
BaseIndexByOrdinal[baseIndex.ordinal()] = baseIndex;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private final ObjectArrayList<SingleBaseInfo> basesCountsQuals;
|
||||
private double mappingQuality;
|
||||
|
||||
// Information to produce a GATKSAMRecord
|
||||
private SAMFileHeader header;
|
||||
private GATKSAMReadGroupRecord readGroupRecord;
|
||||
private String contig;
|
||||
private int contigIndex;
|
||||
private String readName;
|
||||
private int refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
private StrandType strandType = StrandType.STRANDLESS;
|
||||
|
||||
/**
 * Creates an empty running consensus that is fully described and ready to receive bases
 * through add().
 *
 * @param header            GATKSAMRecord file header
 * @param readGroupRecord   Read Group for the GATKSAMRecord
 * @param contig            the read's contig name
 * @param contigIndex       the read's contig index
 * @param readName          the read's name
 * @param refStart          the alignment start (reference based)
 * @param hasIndelQualities whether insertion/deletion qualities should be written when the read is closed
 * @param strandType        the strandedness to give the read when it is closed
 */
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) {
    // identity of the read to be produced
    this.header = header;
    this.readGroupRecord = readGroupRecord;
    this.contig = contig;
    this.contigIndex = contigIndex;
    this.readName = readName;
    this.refStart = refStart;
    this.hasIndelQualities = hasIndelQualities;
    this.strandType = strandType;

    // running state: no bases yet, nothing accumulated
    final int initialCapacity = 10000;
    this.basesCountsQuals = new ObjectArrayList<SingleBaseInfo>(initialCapacity);
    this.mappingQuality = 0.0;
}
|
||||
|
||||
/**
|
||||
* Easy access to keep adding to a running consensus that has already been
|
||||
* initialized with the correct read name and refStart
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param count number of reads with this base
|
||||
*/
|
||||
@Requires("count <= Byte.MAX_VALUE")
|
||||
public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) {
|
||||
basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual));
|
||||
this.mappingQuality += mappingQuality;
|
||||
}
|
||||
|
||||
public BaseIndex getBase(final int readCoordinate) {
|
||||
return BaseIndexByOrdinal[basesCountsQuals.get(readCoordinate).baseIndexOrdinal];
|
||||
}
|
||||
|
||||
public int getRefStart() {
|
||||
return refStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
|
||||
*
|
||||
* Invalid reads are :
|
||||
* - exclusively composed of deletions
|
||||
*
|
||||
* @return a GATKSAMRecord or null
|
||||
*/
|
||||
public GATKSAMRecord close () {
|
||||
if (isAllDeletions())
|
||||
return null;
|
||||
|
||||
GATKSAMRecord read = new GATKSAMRecord(header);
|
||||
read.setReferenceName(contig);
|
||||
read.setReferenceIndex(contigIndex);
|
||||
read.setReadPairedFlag(false);
|
||||
read.setReadUnmappedFlag(false);
|
||||
if ( strandType != StrandType.STRANDLESS ) {
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line
|
||||
read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE);
|
||||
}
|
||||
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
|
||||
read.setAlignmentStart(refStart);
|
||||
read.setReadName(readName);
|
||||
read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
|
||||
read.setReadBases(convertReadBases());
|
||||
read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size()));
|
||||
read.setReadGroup(readGroupRecord);
|
||||
read.setReducedReadCountsTag(convertBaseCounts());
|
||||
|
||||
if (hasIndelQualities) {
|
||||
read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
|
||||
read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION);
|
||||
}
|
||||
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the synthetic read is composed exclusively of deletions
|
||||
*
|
||||
* @return true if it is, false if it isn't.
|
||||
*/
|
||||
private boolean isAllDeletions() {
|
||||
for (SingleBaseInfo b : basesCountsQuals)
|
||||
if (b.baseIndexOrdinal != BaseIndex.D.getOrdinalByte())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public int size () {
|
||||
return basesCountsQuals.size();
|
||||
}
|
||||
|
||||
private byte [] convertBaseQualities() {
|
||||
return convertVariableGivenBases(new SingleBaseInfoIterator() {
|
||||
public Byte next() {
|
||||
return it.next().qual;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private byte [] convertInsertionQualities() {
|
||||
return convertVariableGivenBases(new SingleBaseInfoIterator() {
|
||||
public Byte next() {
|
||||
return it.next().insertionQual;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private byte [] convertDeletionQualities() {
|
||||
return convertVariableGivenBases(new SingleBaseInfoIterator() {
|
||||
public Byte next() {
|
||||
return it.next().deletionQual;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
protected int[] convertBaseCounts() {
|
||||
int[] variableArray = new int[getReadLengthWithNoDeletions()];
|
||||
int i = 0;
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
|
||||
if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte())
|
||||
variableArray[i++] = singleBaseInfo.count;
|
||||
}
|
||||
return variableArray;
|
||||
}
|
||||
|
||||
private byte [] convertReadBases() {
|
||||
byte [] readArray = new byte[getReadLengthWithNoDeletions()];
|
||||
int i = 0;
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
|
||||
final BaseIndex baseIndex = BaseIndexByOrdinal[singleBaseInfo.baseIndexOrdinal];
|
||||
if (baseIndex != BaseIndex.D)
|
||||
readArray[i++] = baseIndex.getByte();
|
||||
}
|
||||
|
||||
return readArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds the cigar string for the synthetic read
|
||||
*
|
||||
* Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read.
|
||||
*
|
||||
* @return the cigar string for the synthetic read
|
||||
*/
|
||||
private Cigar buildCigar() {
|
||||
ObjectArrayList<CigarElement> cigarElements = new ObjectArrayList<CigarElement>();
|
||||
CigarOperator cigarOperator = null;
|
||||
int length = 0;
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
|
||||
final BaseIndex b = BaseIndexByOrdinal[singleBaseInfo.baseIndexOrdinal];
|
||||
CigarOperator op;
|
||||
switch (b) {
|
||||
case D:
|
||||
op = CigarOperator.DELETION;
|
||||
break;
|
||||
case I:
|
||||
throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported.");
|
||||
default:
|
||||
op = CigarOperator.MATCH_OR_MISMATCH;
|
||||
break;
|
||||
}
|
||||
if (cigarOperator == null) {
|
||||
if (op == CigarOperator.D) // read cannot start with a deletion
|
||||
refStart++; // if it does, we need to move the reference start forward
|
||||
else
|
||||
cigarOperator = op;
|
||||
}
|
||||
else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one
|
||||
cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator
|
||||
cigarOperator = op;
|
||||
length = 0;
|
||||
}
|
||||
|
||||
if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions)
|
||||
length++;
|
||||
}
|
||||
if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion
|
||||
cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element
|
||||
|
||||
return new Cigar(cigarElements);
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared functionality for all conversion utilities
|
||||
*
|
||||
* @param variableIterator the list to convert
|
||||
* @return a converted variable given the bases and skipping deletions
|
||||
*/
|
||||
|
||||
private byte [] convertVariableGivenBases (Iterator<Byte> variableIterator) {
|
||||
byte [] variableArray = new byte[getReadLengthWithNoDeletions()];
|
||||
int i = 0;
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
|
||||
byte count = variableIterator.next();
|
||||
if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte())
|
||||
variableArray[i++] = count;
|
||||
}
|
||||
return variableArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared functionality for all conversion utilities
|
||||
*
|
||||
* @return the length of the read with no deletions
|
||||
*/
|
||||
private int getReadLengthWithNoDeletions() {
|
||||
int readLength = basesCountsQuals.size();
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals)
|
||||
if (singleBaseInfo.baseIndexOrdinal == BaseIndex.D.getOrdinalByte())
|
||||
readLength--;
|
||||
return readLength;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -187,18 +187,6 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable {
|
|||
if ( qual == 0 )
|
||||
return 0;
|
||||
|
||||
if ( elt.getRead().isReducedRead() ) {
|
||||
// reduced read representation
|
||||
if ( BaseUtils.isRegularBase( obsBase )) {
|
||||
int representativeCount = elt.getRepresentativeCount();
|
||||
add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods
|
||||
return representativeCount; // we added nObs bases here
|
||||
}
|
||||
|
||||
// odd bases or deletions => don't use them
|
||||
return 0;
|
||||
}
|
||||
|
||||
return add(obsBase, qual, (byte)0, (byte)0, 1);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -140,11 +140,10 @@ public class ErrorModel {
|
|||
Allele refAllele = refSampleVC.getReference();
|
||||
|
||||
if ( refSampleVC.isIndel()) {
|
||||
final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
|
||||
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
|
||||
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
|
||||
if (!haplotypeMap.isEmpty())
|
||||
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
int idx = 0;
|
||||
for (PileupElement refPileupElement : refSamplePileup) {
|
||||
|
|
|
|||
|
|
@ -193,8 +193,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
|
||||
if (!hasReferenceSampleData) {
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap);
|
||||
n = readHaplotypeLikelihoods.length;
|
||||
} else {
|
||||
Allele refAllele = null;
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
|||
int count = 0;
|
||||
for ( PileupElement p : pileup ) {
|
||||
if ( BaseUtils.isRegularBase( p.getBase() ) )
|
||||
count += p.getRepresentativeCount();
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
|
|||
|
|
@ -253,7 +253,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
int count = 0;
|
||||
for (PileupElement p : pileup) {
|
||||
if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase()))
|
||||
count += p.getRepresentativeCount();
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
|
|||
|
|
@ -608,7 +608,7 @@ public class UnifiedGenotyperEngine {
|
|||
int numDeletions = 0;
|
||||
for ( final PileupElement p : rawContext.getBasePileup() ) {
|
||||
if ( p.isDeletion() )
|
||||
numDeletions += p.getRepresentativeCount();
|
||||
numDeletions++;
|
||||
}
|
||||
if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) {
|
||||
return null;
|
||||
|
|
|
|||
|
|
@ -59,7 +59,6 @@ import org.broadinstitute.sting.utils.pairhmm.*;
|
|||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -419,8 +418,7 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation
|
|||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
|
||||
haplotypeLikelihood += ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
|
|
|
|||
|
|
@ -233,10 +233,6 @@ public class ReadErrorCorrector {
|
|||
*/
|
||||
@Requires("inputRead != null")
|
||||
private GATKSAMRecord correctRead(final GATKSAMRecord inputRead) {
|
||||
// no support for reduced reads (which shouldn't need to be error-corrected anyway!)
|
||||
if (inputRead.isReducedRead())
|
||||
return inputRead;
|
||||
|
||||
// do actual correction
|
||||
boolean corrected = false;
|
||||
final byte[] correctedBases = inputRead.getReadBases();
|
||||
|
|
|
|||
|
|
@ -297,13 +297,13 @@ public class ReferenceConfidenceModel {
|
|||
if( hqSoftClips != null && p.isNextToSoftClip() ) {
|
||||
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
|
||||
}
|
||||
result.AD_Ref_Any[1] += p.getRepresentativeCount();
|
||||
result.AD_Ref_Any[1]++;
|
||||
} else {
|
||||
result.AD_Ref_Any[0] += p.getRepresentativeCount();
|
||||
result.AD_Ref_Any[0]++;
|
||||
}
|
||||
result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
|
||||
result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
|
||||
result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
||||
result.genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
|
||||
result.genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
|
||||
result.genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -484,7 +484,7 @@ public class ReferenceConfidenceModel {
|
|||
|
||||
// todo -- this code really should handle CIGARs directly instead of relying on the above tests
|
||||
if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
|
||||
nInformative += p.getRepresentativeCount();
|
||||
nInformative++;
|
||||
if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) {
|
||||
return MAX_N_INDEL_INFORMATIVE_READS;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -239,14 +239,14 @@ public class HaplotypeGraph extends ReadThreadingGraph {
|
|||
super(kmerSize);
|
||||
referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes);
|
||||
this.haplotypes = new LinkedHashSet<>(haplotypes);
|
||||
addSequence("anonymous", referenceHaplotype.getBases(), null, true);
|
||||
addSequence("anonymous", referenceHaplotype.getBases(), true);
|
||||
for (final Haplotype h : haplotypes) {
|
||||
if (h.isReference())
|
||||
continue;
|
||||
if (h.length() < kmerSize) {
|
||||
Utils.warnUser(logger, "haplotype shorter than kmerSize " + h.length() + " < " + kmerSize + " will be dropped");
|
||||
} else
|
||||
addSequence("anonymous", h.getBases(), null, false);
|
||||
addSequence("anonymous", h.getBases(), false);
|
||||
|
||||
}
|
||||
buildGraphIfNecessary();
|
||||
|
|
|
|||
|
|
@ -151,14 +151,12 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
|
||||
|
||||
// add the reference sequence to the graph
|
||||
rtgraph.addSequence("ref", refHaplotype.getBases(), null, true);
|
||||
rtgraph.addSequence("ref", refHaplotype.getBases(), true);
|
||||
|
||||
// add the artificial GGA haplotypes to the graph
|
||||
int hapCount = 0;
|
||||
for ( final Haplotype h : activeAlleleHaplotypes ) {
|
||||
final int[] counts = new int[h.length()];
|
||||
Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS);
|
||||
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false);
|
||||
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false);
|
||||
}
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
|
|
|
|||
|
|
@ -199,16 +199,25 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
* @param isRef is this the reference sequence?
|
||||
*/
|
||||
protected void addSequence(final byte[] sequence, final boolean isRef) {
|
||||
addSequence("anonymous", sequence, null, isRef);
|
||||
addSequence("anonymous", sequence, isRef);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add all bases in sequence to this graph
|
||||
*
|
||||
* @see #addSequence(String, String, byte[], int, int, int[], boolean) for full information
|
||||
* @see #addSequence(String, String, byte[], int, int, int, boolean) for full information
|
||||
*/
|
||||
public void addSequence(final String seqName, final byte[] sequence, final int[] counts, final boolean isRef) {
|
||||
addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, counts, isRef);
|
||||
public void addSequence(final String seqName, final byte[] sequence, final boolean isRef) {
|
||||
addSequence(seqName, sequence, 1, isRef);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add all bases in sequence to this graph
|
||||
*
|
||||
* @see #addSequence(String, String, byte[], int, int, int, boolean) for full information
|
||||
*/
|
||||
public void addSequence(final String seqName, final byte[] sequence, final int count, final boolean isRef) {
|
||||
addSequence(seqName, ANONYMOUS_SAMPLE, sequence, 0, sequence.length, count, isRef);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -216,14 +225,12 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
*
|
||||
* @param seqName a useful seqName for this read, for debugging purposes
|
||||
* @param sequence non-null sequence of bases
|
||||
* @param counts a vector of counts for each bases, indicating how many times that base was observed in the sequence.
|
||||
* This allows us to support reduced reads in the ReadThreadingAssembler. Can be null, meaning that
|
||||
* each base is only observed once. If not null, must have length == sequence.length.
|
||||
* @param start the first base offset in sequence that we should use for constructing the graph using this sequence, inclusive
|
||||
* @param stop the last base offset in sequence that we should use for constructing the graph using this sequence, exclusive
|
||||
* @param count the representative count of this sequence (to use as the weight)
|
||||
* @param isRef is this the reference sequence.
|
||||
*/
|
||||
public void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int[] counts, final boolean isRef) {
|
||||
public void addSequence(final String seqName, final String sampleName, final byte[] sequence, final int start, final int stop, final int count, final boolean isRef) {
|
||||
// note that argument testing is taken care of in SequenceForKmers
|
||||
if ( alreadyBuilt ) throw new IllegalStateException("Graph already built");
|
||||
|
||||
|
|
@ -235,18 +242,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
}
|
||||
|
||||
// add the new sequence to the list of sequences for sample
|
||||
sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, counts, isRef));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a count appropriate for a kmer starting at kmerStart in sequence for kmers
|
||||
*
|
||||
* @param seqForKmers a non-null sequence for kmers object
|
||||
* @param kmerStart the position where the kmer starts in sequence
|
||||
* @return a count for a kmer from start -> start + kmerSize in seqForKmers
|
||||
*/
|
||||
private int getCountGivenKmerStart(final SequenceForKmers seqForKmers, final int kmerStart) {
|
||||
return seqForKmers.getCount(kmerStart + kmerSize - 1);
|
||||
sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, count, isRef));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -276,9 +272,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
// loop over all of the bases in sequence, extending the graph by one base at each point, as appropriate
|
||||
MultiDeBruijnVertex vertex = startingVertex;
|
||||
for ( int i = uniqueStartPos + 1; i <= seqForKmers.stop - kmerSize; i++ ) {
|
||||
final int count = getCountGivenKmerStart(seqForKmers, i);
|
||||
|
||||
vertex = extendChainByOne(vertex, seqForKmers.sequence, i, count, seqForKmers.isRef);
|
||||
vertex = extendChainByOne(vertex, seqForKmers.sequence, i, seqForKmers.count, seqForKmers.isRef);
|
||||
if ( debugGraphTransformations ) vertex.addRead(seqForKmers.name);
|
||||
}
|
||||
}
|
||||
|
|
@ -683,7 +677,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
// logger.warn(String.format("Increasing counts for %s -> %s via %s at %d with suffix %s vs. %s",
|
||||
// prev, vertex, edge, offset, (char)suffix, (char)seqBase));
|
||||
if ( suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1) ) {
|
||||
edge.incMultiplicity(seqForKmers.getCount(offset));
|
||||
edge.incMultiplicity(seqForKmers.count);
|
||||
increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset-1);
|
||||
}
|
||||
}
|
||||
|
|
@ -780,7 +774,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
* @param prevVertex a non-null vertex where sequence was last anchored in the graph
|
||||
* @param sequence the sequence we're threading through the graph
|
||||
* @param kmerStart the start of the current kmer in graph we'd like to add
|
||||
* @param count the number of observations of this kmer in graph (can be > 1 for reduced reads)
|
||||
* @param count the number of observations of this kmer in graph (can be > 1 for GGA)
|
||||
* @param isRef is this the reference sequence?
|
||||
* @return a non-null vertex connecting prevVertex to in the graph based on sequence
|
||||
*/
|
||||
|
|
@ -819,7 +813,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
protected void addRead(final GATKSAMRecord read) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
|
||||
|
||||
int lastGood = -1; // the index of the last good base we've seen
|
||||
for( int end = 0; end <= sequence.length; end++ ) {
|
||||
|
|
@ -832,7 +825,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
if ( start != -1 && len >= kmerSize ) {
|
||||
// if the sequence is long enough to get some value out of, add it to the graph
|
||||
final String name = read.getReadName() + "_" + start + "_" + end;
|
||||
addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false);
|
||||
addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, 1, false);
|
||||
}
|
||||
|
||||
lastGood = -1; // reset the last good base
|
||||
|
|
|
|||
|
|
@ -58,36 +58,23 @@ final class SequenceForKmers {
|
|||
final String name;
|
||||
final byte[] sequence;
|
||||
final int start, stop;
|
||||
final private int[] counts;
|
||||
final int count;
|
||||
final boolean isRef;
|
||||
|
||||
/**
|
||||
* Create a new sequence for creating kmers
|
||||
*/
|
||||
SequenceForKmers(final String name, byte[] sequence, int start, int stop, int[] counts, boolean ref) {
|
||||
SequenceForKmers(final String name, byte[] sequence, int start, int stop, int count, boolean ref) {
|
||||
if ( start < 0 ) throw new IllegalArgumentException("Invalid start " + start);
|
||||
if ( stop < start ) throw new IllegalArgumentException("Invalid stop " + stop);
|
||||
if ( sequence == null ) throw new IllegalArgumentException("Sequence is null ");
|
||||
if ( counts != null && counts.length != sequence.length ) throw new IllegalArgumentException("Sequence and counts don't have the same length " + sequence.length + " vs " + counts.length);
|
||||
if ( count < 1 ) throw new IllegalArgumentException("Invalid count " + count);
|
||||
|
||||
this.name = name;
|
||||
this.sequence = sequence;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
this.count = count;
|
||||
this.isRef = ref;
|
||||
this.counts = counts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of observations of the kmer starting at i in this sequence
|
||||
*
|
||||
* Can be > 1 because sequence may be a reduced read and therefore count as N observations
|
||||
*
|
||||
* @param i the offset into sequence for the start of the kmer
|
||||
* @return a count >= 1 that indicates the number of observations of kmer starting at i in this sequence.
|
||||
*/
|
||||
public int getCount(final int i) {
|
||||
if ( i < 0 || i > sequence.length ) throw new ArrayIndexOutOfBoundsException("i must be >= 0 and <= " + sequence.length + " but got " + i);
|
||||
return counts == null ? 1 : counts[i];
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -261,10 +261,9 @@ public class PairHMMIndelErrorModel {
|
|||
final double downsamplingFraction) {
|
||||
final int numHaplotypes = haplotypeMap.size();
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
|
||||
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
|
||||
return getDiploidHaplotypeLikelihoods(numHaplotypes, readLikelihoods);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -295,16 +294,13 @@ public class PairHMMIndelErrorModel {
|
|||
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
final ReferenceContext ref,
|
||||
final int eventLength,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
|
||||
final int[] readCounts) {
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
|
||||
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
|
||||
|
||||
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
|
||||
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
|
||||
int readIdx=0;
|
||||
for (PileupElement p: pileup) {
|
||||
// > 1 when the read is a consensus read representing multiple independent observations
|
||||
readCounts[readIdx] = p.getRepresentativeCount();
|
||||
|
||||
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
|
||||
if (perReadAlleleLikelihoodMap.containsPileupElement(p)) {
|
||||
|
|
@ -499,7 +495,7 @@ public class PairHMMIndelErrorModel {
|
|||
// return b1.length;
|
||||
// }
|
||||
|
||||
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
|
||||
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final double readLikelihoods[][]) {
|
||||
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
|
||||
// todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix
|
||||
|
|
@ -515,8 +511,7 @@ public class PairHMMIndelErrorModel {
|
|||
continue;
|
||||
final double li = readLikelihoods[readIdx][i];
|
||||
final double lj = readLikelihoods[readIdx][j];
|
||||
final int readCount = readCounts[readIdx];
|
||||
haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF);
|
||||
haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,175 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Emits intervals present in either the original or reduced bam but not the other.
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The original and reduced BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A list of intervals present in one bam but not the other.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -I:original original.bam \
|
||||
* -I:reduced reduced.bam \
|
||||
* -R ref.fasta \
|
||||
* -T AssessReducedCoverage \
|
||||
* -o output.intervals
|
||||
* </pre>
|
||||
*
|
||||
* @author ebanks
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
@Hidden
|
||||
public class AssessReducedCoverage extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
|
||||
|
||||
private static final String original = "original";
|
||||
private static final String reduced = "reduced";
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
@Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false)
|
||||
public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
final Set<String> tags = getAllTags(context.getBasePileup());
|
||||
return (tags.contains(original) && !tags.contains(reduced)) ||
|
||||
(OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? ref.getLocus() : null;
|
||||
}
|
||||
|
||||
private Set<String> getAllTags(final ReadBackedPileup pileup) {
|
||||
|
||||
final Set<String> tags = new HashSet<String>(10);
|
||||
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() )
|
||||
tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags());
|
||||
}
|
||||
|
||||
return tags;
|
||||
}
|
||||
|
||||
public void onTraversalDone(GenomeLoc sum) {
|
||||
if ( sum != null )
|
||||
out.println(sum);
|
||||
}
|
||||
|
||||
public GenomeLoc reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
|
||||
if ( lhs == null )
|
||||
return rhs;
|
||||
|
||||
if ( rhs == null )
|
||||
return lhs;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( lhs.contiguousP(rhs) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
|
||||
|
||||
// otherwise, print the lhs and start over with the rhs
|
||||
out.println(lhs);
|
||||
return rhs;
|
||||
}
|
||||
|
||||
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
if ( sum == null )
|
||||
return value;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( sum.contiguousP(value) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
|
||||
|
||||
// otherwise, print the sum and start over with the value
|
||||
out.println(sum);
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,208 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Emits intervals in which the differences between the original and reduced bam quals are bigger than epsilon (unless the quals of
|
||||
* the reduced bam are above a sufficient threshold)
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The original and reduced BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A list of intervals in which the differences between the original and reduced bam quals are bigger than epsilon.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -I:original original.bam \
|
||||
* -I:reduced reduced.bam \
|
||||
* -R ref.fasta \
|
||||
* -T AssessReducedQuals \
|
||||
* -o output.intervals
|
||||
* </pre>
|
||||
*
|
||||
* @author ami
|
||||
*/
|
||||
@Hidden
|
||||
public class AssessReducedQuals extends LocusWalker<GenomeLoc, GenomeLoc> implements TreeReducible<GenomeLoc> {
|
||||
|
||||
private static final String reduced = "reduced";
|
||||
private static final int originalQualsIndex = 0;
|
||||
private static final int reducedQualsIndex = 1;
|
||||
|
||||
@Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false)
|
||||
public int sufficientQualSum = 600;
|
||||
|
||||
@Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false)
|
||||
public double qual_epsilon = 0.10;
|
||||
|
||||
@Argument(fullName = "exclude_low_mq", shortName = "excludeMQ", doc = "ignore reads with mapping quality below this number", required = false)
|
||||
public int excludeMQ = 0;
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
public void initialize() {
|
||||
if ( qual_epsilon < 0.0 || qual_epsilon > 1.0 )
|
||||
throw new UserException.BadArgumentValue("qual_epsilon", "must be a number between 0 and 1");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
@Override
|
||||
public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return null;
|
||||
|
||||
boolean reportLocus;
|
||||
final int[] quals = getPileupQuals(context.getBasePileup());
|
||||
final int epsilon = MathUtils.fastRound(quals[originalQualsIndex] * qual_epsilon);
|
||||
final int calcOriginalQuals = Math.min(quals[originalQualsIndex], sufficientQualSum);
|
||||
final int calcReducedQuals = Math.min(quals[reducedQualsIndex], sufficientQualSum);
|
||||
final int originalReducedQualDiff = calcOriginalQuals - calcReducedQuals;
|
||||
reportLocus = originalReducedQualDiff > epsilon || originalReducedQualDiff < -1 * epsilon;
|
||||
|
||||
return reportLocus ? ref.getLocus() : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the quals separated by version and strand
|
||||
* @param readPileup the pileup
|
||||
* @return 2x2 array with sum of quals separated by version in 1st dimension and strand in the 2nd
|
||||
*/
|
||||
private int[] getPileupQuals(final ReadBackedPileup readPileup) {
|
||||
|
||||
final int[] quals = new int[2];
|
||||
|
||||
for ( final PileupElement p : readPileup ) {
|
||||
final List<String> tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags();
|
||||
if ( isGoodRead(p) ) {
|
||||
final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount();
|
||||
final int tagIndex = getTagIndex(tags);
|
||||
quals[tagIndex] += tempQual;
|
||||
}
|
||||
}
|
||||
|
||||
return quals;
|
||||
}
|
||||
|
||||
private boolean isGoodRead(final PileupElement p) {
|
||||
return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= excludeMQ;
|
||||
}
|
||||
|
||||
private int getTagIndex(final List<String> tags) {
|
||||
return tags.contains(reduced) ? 1 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(GenomeLoc sum) {
|
||||
if ( sum != null )
|
||||
out.println(sum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) {
|
||||
if ( lhs == null )
|
||||
return rhs;
|
||||
|
||||
if ( rhs == null )
|
||||
return lhs;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( lhs.contiguousP(rhs) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop());
|
||||
|
||||
// otherwise, print the lhs and start over with the rhs
|
||||
out.println(lhs);
|
||||
return rhs;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) {
|
||||
if ( value == null )
|
||||
return sum;
|
||||
|
||||
if ( sum == null )
|
||||
return value;
|
||||
|
||||
// if contiguous, just merge them
|
||||
if ( sum.contiguousP(value) )
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop());
|
||||
|
||||
// otherwise, print the sum and start over with the value
|
||||
out.println(sum);
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
|
@ -48,12 +48,10 @@ package org.broadinstitute.sting.utils.recalibration;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.compression.reducereads.ReduceReads;
|
||||
import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -1063,20 +1061,4 @@ public class RecalUtils {
|
|||
private static RecalDatum createDatumObject(final byte reportedQual, final double isError) {
|
||||
return new RecalDatum(1, isError, reportedQual);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks for invalid BAMs that are being used with BQSR and fails with a UserException if it finds one
|
||||
*
|
||||
* @param headers sam file headers being passed into the GATK engine
|
||||
* @param allowBqsrOnReducedBams should we allow BQSR on reduced bams?
|
||||
*/
|
||||
public static void checkForInvalidRecalBams(final List<SAMFileHeader> headers, final boolean allowBqsrOnReducedBams) {
|
||||
// for now, the only check we make is against reduced bams
|
||||
if ( !allowBqsrOnReducedBams ) {
|
||||
for ( final SAMFileHeader header : headers ) {
|
||||
if ( header.getProgramRecord(ReduceReads.PROGRAM_RECORD_NAME) != null )
|
||||
throw new UserException.BadInput("base quality score recalibration should absolutely not be run on reduced BAM files! Please run ReduceReads only after BQSR has been performed");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,8 +46,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.compression.reducereads.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts;
|
||||
import org.broadinstitute.sting.utils.MannWhitneyU;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
@ -122,13 +120,15 @@ public class RankSumUnitTest {
|
|||
|
||||
final List<Integer> dist2 = new ArrayList<>(distribution2);
|
||||
if ( numToReduceIn2 > 0 ) {
|
||||
final org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts counts = new BaseCounts();
|
||||
int counts = 0;
|
||||
int quals = 0;
|
||||
|
||||
for ( int i = 0; i < numToReduceIn2; i++ ) {
|
||||
final int value = dist2.remove(0);
|
||||
counts.incr(BaseIndex.A, (byte)value, 0, false);
|
||||
counts++;
|
||||
quals += dist2.remove(0);
|
||||
}
|
||||
|
||||
final int qual = (int)counts.averageQualsOfBase(BaseIndex.A);
|
||||
final int qual = quals / counts;
|
||||
for ( int i = 0; i < numToReduceIn2; i++ )
|
||||
dist2.add(qual);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,201 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic unit test for BaseCounts in reduced reads
|
||||
*/
|
||||
public class BaseCountsUnitTest extends BaseTest {
|
||||
|
||||
private class BaseCountsTest {
|
||||
public String bases;
|
||||
public byte mostCountBase;
|
||||
public int mostCommonCount;
|
||||
|
||||
private BaseCountsTest(String bases, char mostCountBase, int mostCommonCount) {
|
||||
this.mostCommonCount = mostCommonCount;
|
||||
this.mostCountBase = (byte)mostCountBase;
|
||||
this.bases = bases;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "counting")
|
||||
public Object[][] createCountingData() {
|
||||
List<BaseCountsTest> params = new ArrayList<BaseCountsTest>();
|
||||
|
||||
params.add(new BaseCountsTest("A", 'A', 1 ));
|
||||
params.add(new BaseCountsTest("AA", 'A', 2 ));
|
||||
params.add(new BaseCountsTest("AC", 'A', 1 ));
|
||||
params.add(new BaseCountsTest("AAC", 'A', 2 ));
|
||||
params.add(new BaseCountsTest("AAA", 'A', 3 ));
|
||||
params.add(new BaseCountsTest("AAAN", 'A', 3 ));
|
||||
params.add(new BaseCountsTest("AAANNNN", 'N', 4 ));
|
||||
params.add(new BaseCountsTest("AACTG", 'A', 2 ));
|
||||
params.add(new BaseCountsTest("D", 'D', 1 ));
|
||||
params.add(new BaseCountsTest("DDAAD", 'D', 3));
|
||||
params.add(new BaseCountsTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 ));
|
||||
params.add(new BaseCountsTest("AAIIIAI", 'I', 4 ));
|
||||
|
||||
List<Object[]> params2 = new ArrayList<Object[]>();
|
||||
for ( BaseCountsTest x : params ) params2.add(new Object[]{x});
|
||||
return params2.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "counting", enabled = true)
|
||||
public void testCounting(BaseCountsTest params) {
|
||||
BaseCounts counts = new BaseCounts();
|
||||
|
||||
for ( byte base : params.bases.getBytes() )
|
||||
counts.incr(base);
|
||||
|
||||
String name = String.format("Test-%s", params.bases);
|
||||
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
|
||||
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
|
||||
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
|
||||
|
||||
// test the static creation
|
||||
final int[] countsArray = new int[] { counts.countOfBase(BaseIndex.A), counts.countOfBase(BaseIndex.C),
|
||||
counts.countOfBase(BaseIndex.G), counts.countOfBase(BaseIndex.T)};
|
||||
final BaseCounts countsFromArray = BaseCounts.createWithCounts(countsArray);
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T));
|
||||
Assert.assertEquals(ACGTcounts(counts), countsFromArray.totalCount());
|
||||
|
||||
// test addition
|
||||
counts.add(countsFromArray);
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.A), 2 * countsFromArray.countOfBase(BaseIndex.A));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.C), 2 * countsFromArray.countOfBase(BaseIndex.C));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.G), 2 * countsFromArray.countOfBase(BaseIndex.G));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.T), 2 * countsFromArray.countOfBase(BaseIndex.T));
|
||||
Assert.assertEquals(ACGTcounts(counts), 2 * countsFromArray.totalCount());
|
||||
|
||||
// test subtraction
|
||||
counts.sub(countsFromArray);
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G));
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T));
|
||||
Assert.assertEquals(ACGTcounts(counts), countsFromArray.totalCount());
|
||||
|
||||
// test decrementing
|
||||
if ( counts.countOfBase(BaseIndex.A) > 0 ) {
|
||||
counts.decr((byte)'A');
|
||||
Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A) - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private static int ACGTcounts(final BaseCounts baseCounts) {
|
||||
return baseCounts.totalCountWithoutIndels() - baseCounts.countOfBase(BaseIndex.N);
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////
|
||||
// TEST FOR QUALS IN BASECOUNTS //
|
||||
//////////////////////////////////
|
||||
|
||||
private class BaseCountsQualsTest {
|
||||
public final List<Integer> quals;
|
||||
|
||||
private BaseCountsQualsTest(final List<Integer> quals) {
|
||||
this.quals = quals;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "quals")
|
||||
public Object[][] createQualsData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final int[] quals = new int[]{ 0, 5, 10, 15, 20, 30, 40, 50 };
|
||||
|
||||
for ( final int qual1 : quals ) {
|
||||
for ( final int qual2 : quals ) {
|
||||
for ( final int qual3 : quals ) {
|
||||
tests.add(new Object[]{new BaseCountsQualsTest(Arrays.asList(qual1, qual2, qual3))});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "quals", enabled = true)
|
||||
public void testQuals(BaseCountsQualsTest test) {
|
||||
BaseCounts counts = new BaseCounts();
|
||||
|
||||
for ( int qual : test.quals )
|
||||
counts.incr(BaseIndex.A, (byte)qual, 20, false);
|
||||
|
||||
final int actualSum = (int)counts.getSumQuals((byte)'A');
|
||||
final int expectedSum = qualSum(test.quals);
|
||||
Assert.assertEquals(actualSum, expectedSum);
|
||||
|
||||
final int actualAverage = (int)counts.averageQuals((byte)'A');
|
||||
Assert.assertEquals(actualAverage, expectedSum / test.quals.size());
|
||||
|
||||
// test both proportion methods
|
||||
Assert.assertEquals(counts.baseCountProportion(BaseIndex.A), counts.baseCountProportion((byte)'A'));
|
||||
}
|
||||
|
||||
private static int qualSum(final List<Integer> quals) {
|
||||
int sum = 0;
|
||||
for ( final int qual : quals )
|
||||
sum += qual;
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,214 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class HeaderElementUnitTest extends BaseTest {
|
||||
|
||||
private class HETest {
|
||||
public byte base, baseQual, insQual, delQual;
|
||||
public int MQ;
|
||||
public boolean isClip;
|
||||
|
||||
private HETest(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int MQ, final boolean isClip) {
|
||||
this.base = base;
|
||||
this.baseQual = baseQual;
|
||||
this.insQual = insQual;
|
||||
this.delQual = delQual;
|
||||
this.MQ = MQ;
|
||||
this.isClip = isClip;
|
||||
}
|
||||
}
|
||||
|
||||
private static final byte byteA = (byte)'A';
|
||||
private static final byte byte10 = (byte)10;
|
||||
private static final byte byte20 = (byte)20;
|
||||
private static final int minBaseQual = 20;
|
||||
private static final int minMappingQual = 20;
|
||||
|
||||
@DataProvider(name = "data")
|
||||
public Object[][] createData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, false)});
|
||||
tests.add(new Object[]{new HETest(byteA, byte10, byte20, byte20, 20, false)});
|
||||
tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 10, false)});
|
||||
tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, true)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "data", enabled = true)
|
||||
public void testHE(HETest test) {
|
||||
|
||||
HeaderElement headerElement = new HeaderElement(1000, 0);
|
||||
|
||||
// first test that if we add and then remove it, we have no data
|
||||
headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false);
|
||||
headerElement.addInsertionToTheRight();
|
||||
headerElement.removeBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false);
|
||||
headerElement.removeInsertionToTheRight();
|
||||
testHeaderIsEmpty(headerElement);
|
||||
|
||||
// now, test that the data was added as expected
|
||||
for ( int i = 0; i < 10; i++ )
|
||||
headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false);
|
||||
testHeaderData(headerElement, test);
|
||||
|
||||
// test the insertion adding functionality
|
||||
for ( int i = 0; i < 10; i++ )
|
||||
headerElement.addInsertionToTheRight();
|
||||
Assert.assertEquals(headerElement.numInsertionsToTheRight(), 10);
|
||||
}
|
||||
|
||||
private void testHeaderIsEmpty(final HeaderElement headerElement) {
|
||||
Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS));
|
||||
Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED));
|
||||
Assert.assertFalse(headerElement.hasInsertionToTheRight());
|
||||
Assert.assertTrue(headerElement.isEmpty());
|
||||
}
|
||||
|
||||
private void testHeaderData(final HeaderElement headerElement, final HETest test) {
|
||||
Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip);
|
||||
Assert.assertFalse(headerElement.isEmpty());
|
||||
Assert.assertFalse(headerElement.hasInsertionToTheRight());
|
||||
Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS), test.MQ >= minMappingQual);
|
||||
Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED), test.MQ < minMappingQual);
|
||||
Assert.assertEquals(headerElement.getBaseCounts(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) ? SlidingWindow.ConsensusType.POSITIVE_CONSENSUS : SlidingWindow.ConsensusType.FILTERED).getRMS(), (double)test.MQ);
|
||||
Assert.assertFalse(headerElement.isVariantFromMismatches(0.05, 0.05));
|
||||
Assert.assertEquals(headerElement.isVariant(0.05, 0.05, 0.05), test.isClip);
|
||||
}
|
||||
|
||||
|
||||
private class AllelesTest {
|
||||
public final int[] counts;
|
||||
public final double pvalue;
|
||||
|
||||
private AllelesTest(final int[] counts, final double pvalue) {
|
||||
this.counts = counts;
|
||||
this.pvalue = pvalue;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "alleles")
|
||||
public Object[][] createAllelesData() {
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
final int[] counts = new int[]{ 0, 5, 10, 15, 20 };
|
||||
final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 };
|
||||
|
||||
for ( final int countA : counts ) {
|
||||
for ( final int countC : counts ) {
|
||||
for ( final int countG : counts ) {
|
||||
for ( final int countT : counts ) {
|
||||
for ( final int countD : counts ) {
|
||||
for ( final double pvalue : pvalues ) {
|
||||
tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "alleles", enabled = true)
|
||||
public void testAlleles(AllelesTest test) {
|
||||
|
||||
HeaderElement headerElement = new HeaderElement(1000, 0);
|
||||
for ( int i = 0; i < test.counts.length; i++ ) {
|
||||
final BaseIndex base = BaseIndex.values()[i];
|
||||
for ( int j = 0; j < test.counts[i]; j++ )
|
||||
headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false, false);
|
||||
}
|
||||
|
||||
final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue, test.pvalue);
|
||||
final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue);
|
||||
|
||||
Assert.assertEquals(nAllelesSeen, nAllelesExpected);
|
||||
}
|
||||
|
||||
private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) {
|
||||
int total = 0;
|
||||
for ( final int count : counts ) {
|
||||
total += count;
|
||||
}
|
||||
|
||||
int result = 0;
|
||||
for ( int index = 0; index < counts.length; index++ ) {
|
||||
final int count = counts[index];
|
||||
if ( count == 0 )
|
||||
continue;
|
||||
|
||||
final boolean isSignificant;
|
||||
if ( count <= HeaderElement.MIN_COUNT_FOR_USING_PVALUE ) {
|
||||
isSignificant = MathUtils.binomialCumulativeProbability(total, 0, count) > targetPvalue;
|
||||
} else {
|
||||
isSignificant = (count >= targetPvalue * total);
|
||||
}
|
||||
|
||||
if ( isSignificant ) {
|
||||
if ( index == BaseIndex.D.index )
|
||||
return -1;
|
||||
result++;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,347 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class ReduceReadsIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final static String DBSNP = b37dbSNP132;
|
||||
final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam";
|
||||
final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam";
|
||||
final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556";
|
||||
final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam";
|
||||
final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals";
|
||||
final String L = " -L 20:10,100,000-10,120,000 ";
|
||||
final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam";
|
||||
final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam";
|
||||
final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057";
|
||||
final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam";
|
||||
final String HIGH_COVERAGE_BAM = privateTestDir + "NA20313.highCoverageRegion.bam";
|
||||
final String HIGH_COVERAGE_L = " -L 1:1650830-1650870";
|
||||
final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam";
|
||||
final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam";
|
||||
|
||||
final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e";
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, final WalkerTestSpec spec) {
|
||||
return executeTest(name, spec, emptyFileMd5);
|
||||
}
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) {
|
||||
final Pair<List<File>, List<String>> result = super.executeTest(name, spec);
|
||||
|
||||
// perform some Reduce Reads specific testing now
|
||||
if ( result != null ) {
|
||||
|
||||
// generate a new command-line based on the old one
|
||||
spec.disableImplicitArgs();
|
||||
final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" ");
|
||||
|
||||
final StringBuilder reducedInputs = new StringBuilder();
|
||||
for ( final File file : result.getFirst() ) {
|
||||
reducedInputs.append(" -I:reduced ");
|
||||
reducedInputs.append(file.getAbsolutePath());
|
||||
}
|
||||
|
||||
// the coverage test is a less stricter version of the quals test so we can safely ignore it for now
|
||||
//final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs);
|
||||
//super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5)));
|
||||
|
||||
// run the quals test
|
||||
final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs);
|
||||
super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5)));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a new command-line based on the old one
|
||||
*
|
||||
* @param walkerName the new walker name to use
|
||||
* @param originalArgs the original arguments used for the test
|
||||
* @return the new command line
|
||||
*/
|
||||
private String createCommandLine(final String walkerName, final String[] originalArgs) {
|
||||
|
||||
final StringBuilder newArgs = new StringBuilder();
|
||||
|
||||
for ( int i = 0; i < originalArgs.length; i++ ) {
|
||||
final String arg = originalArgs[i];
|
||||
|
||||
if ( arg.equals("-T") ) {
|
||||
newArgs.append("-T ");
|
||||
newArgs.append(walkerName);
|
||||
} else if ( arg.startsWith("-I") ) {
|
||||
newArgs.append("-I:original ");
|
||||
newArgs.append(originalArgs[++i]);
|
||||
} else if ( arg.equals("-R") || arg.equals("-L") ) {
|
||||
newArgs.append(arg);
|
||||
newArgs.append(" ");
|
||||
newArgs.append(originalArgs[++i]);
|
||||
}
|
||||
|
||||
// always add a trailing space
|
||||
newArgs.append(" ");
|
||||
}
|
||||
|
||||
newArgs.append("-o %s");
|
||||
|
||||
return newArgs.toString();
|
||||
}
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) {
|
||||
return super.executeTest(name, spec);
|
||||
}
|
||||
|
||||
private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) {
|
||||
this.RRTest(testName, args, md5, useKnowns, emptyFileMd5);
|
||||
}
|
||||
|
||||
private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5));
|
||||
executeTest(testName, spec, qualsTestMD5);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "0e503f7b79ace4c89d74f0943a0de1c0", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompressionWithKnowns() {
|
||||
RRTest("testDefaultCompressionWithKnowns ", L, "6db7ce2733d006f8bd61c42a40d23728", true);
|
||||
}
|
||||
|
||||
private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervals() {
|
||||
RRTest("testMultipleIntervals ", intervals, "207f2c6d3db956e19412a45a231ca367", false, "043b2838c27d8f9580379b54c18ff40a");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervalsWithKnowns() {
|
||||
RRTest("testMultipleIntervalsWithKnowns ", intervals, "f3b11a8a7673b301e27137936fafc6b6", true, "043b2838c27d8f9580379b54c18ff40a");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "dcc3716b3665aa1c2dbe6b22d6534aef", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompressionWithKnowns() {
|
||||
RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "97ae655bf0e483ea227b1aac67ced024", true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompression() {
|
||||
RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a1377eb922e0b09a03a280b691b0b3ff", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompressionWithKnowns() {
|
||||
RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "bd7c5b0b210694f364ca6a41f5b89870", true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testBadPvalueInput() {
|
||||
final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class);
|
||||
executeTest("testBadPvalueInput", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testIndelCompression() {
|
||||
final String md5 = "9c9305eda5e4e7f22246ec8a4b242c97";
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false);
|
||||
RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("1bda512143be1016dfaca1f7020b6398")), "4f916da29d91852077f0a2fdbdd2c7f6");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReduction() {
|
||||
String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("58c2bae5a339af2ea3c22a46ce8faa68")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReductionWithKnowns() {
|
||||
String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
|
||||
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5c251932b49d99a810581e3a6f762878")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testInsertionsAtEdgeOfConsensus() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s ";
|
||||
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("c10653a8c21fb32b5cf580d3704b0edd")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead
|
||||
* and a subsequent tail leaves only this read in the stash. The next read to come in is in fact
|
||||
* before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error
|
||||
* that was freaking hard to catch.
|
||||
*
|
||||
* This bam is simplified to replicate the exact bug with the three provided intervals.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fddbec29d0945afbbb34b42994614c15")), "3eab32c215ba68e75efd5ab7e9f7a2e7");
|
||||
}
|
||||
|
||||
/**
|
||||
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
|
||||
* filtered out.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
// we expect to lose coverage due to the downsampling so don't run the systematic tests
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7dfe2647992ce1154db340fc742d523a")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will
|
||||
* break the current hard-clipping routine that protects reduce reads from such reads.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testReadOffContig() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s ";
|
||||
executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("595e5812c37189930cae93e45765def4")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testPairedReadsInVariantRegion() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) +
|
||||
" -o %s --downsample_coverage 250 -dcov 50 ";
|
||||
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b005727119eee27995705959a637085e")), "2af063d1bd3c322b03405dbb3ecf59a9");
|
||||
}
|
||||
|
||||
/**
|
||||
* Confirm that this bam does not fail when multi-sample mode is enabled. The provided example is tricky and used to cause
|
||||
* us to exception out in the code.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testMultiSampleDoesNotFailWithFlag() {
|
||||
String cmd = "-T ReduceReads --cancer_mode -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null";
|
||||
executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, Collections.<String>emptyList()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Confirm that this bam fails when multi-sample mode is not enabled
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testMultiSampleFailsWithoutFlag() {
|
||||
String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null";
|
||||
executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* Confirm that compression is not capping coverage counts to max byte
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testCompressionWorksForHighDepth() {
|
||||
final String base = String.format("-T ReduceReads -npt -R %s -I %s %s", b37KGReference, HIGH_COVERAGE_BAM, HIGH_COVERAGE_L) + " -o %s";
|
||||
final File outputBam = executeTestWithoutAdditionalRRTests("testCompressionWorksForHighDepth",
|
||||
new WalkerTestSpec(base, 1, Arrays.asList(""))).first.get(0); // No MD5s; we only want to check the coverage
|
||||
|
||||
boolean sawHighCoveragePosition = false;
|
||||
final SAMFileReader reader = new SAMFileReader(outputBam);
|
||||
reader.setSAMRecordFactory(new GATKSamRecordFactory());
|
||||
|
||||
for ( final SAMRecord rawRead : reader ) {
|
||||
final GATKSAMRecord read = (GATKSAMRecord)rawRead;
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, rawRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG));
|
||||
|
||||
if ( ! read.isReducedRead() )
|
||||
continue;
|
||||
|
||||
final int[] decodedCounts = read.getReducedReadCounts();
|
||||
for ( final int count : decodedCounts ) {
|
||||
if ( count > Byte.MAX_VALUE ) {
|
||||
sawHighCoveragePosition = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( sawHighCoveragePosition )
|
||||
break;
|
||||
}
|
||||
|
||||
reader.close();
|
||||
|
||||
Assert.assertTrue(sawHighCoveragePosition, "No positions were found with coverage over max byte (127); the coverage is incorrectly being capped somewhere!");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,214 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
|
||||
public class ReduceReadsUnitTest extends BaseTest {
|
||||
|
||||
Random random = new Random(987743);
|
||||
Object2LongOpenHashMap<String> hash = new Object2LongOpenHashMap<String>();
|
||||
long nextNumber = 0L;
|
||||
|
||||
/**
|
||||
* Combinatorial unit test data provider example.
|
||||
*
|
||||
* Creates data for testMyData test function, containing two arguments, start and size at each value
|
||||
*
|
||||
* @return Object[][] for testng DataProvider
|
||||
*/
|
||||
@DataProvider(name = "ReadNameProvider")
|
||||
public Object[][] readNameProvider() {
|
||||
final int readNameLength = 4;
|
||||
final int nReads = 100000;
|
||||
final int charVariety = 20;
|
||||
ObjectArrayList<Object[]> tests = new ObjectArrayList<Object[]>();
|
||||
ObjectOpenHashSet<String> truthSet = new ObjectOpenHashSet<String>();
|
||||
byte[] bytes = new byte[readNameLength];
|
||||
for ( int i = 0; i<nReads; i++) {
|
||||
random.nextBytes(bytes);
|
||||
StringBuilder readNameBuilder = new StringBuilder(readNameLength);
|
||||
for (byte b : bytes) {
|
||||
readNameBuilder.append((char) ('a' + Math.abs(b) % charVariety));
|
||||
}
|
||||
String readName = readNameBuilder.toString();
|
||||
tests.add(new Object[]{readName, truthSet.contains(readName)});
|
||||
truthSet.add(readName);
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the read name compression functionality
|
||||
*/
|
||||
@Test(dataProvider = "ReadNameProvider", enabled = false)
|
||||
public void testReadNameCompression(final String name, final boolean alreadySeen) {
|
||||
GATKSAMRecord read = GATKSAMRecord.createRandomRead(1);
|
||||
read.setReadName(name);
|
||||
final int previousHashSize = hash.keySet().size();
|
||||
final long previousNumber = nextNumber;
|
||||
nextNumber = ReduceReads.compressReadName(hash, read, nextNumber);
|
||||
Assert.assertEquals(hash.keySet().size(), alreadySeen ? previousHashSize : previousHashSize + 1);
|
||||
Assert.assertEquals(nextNumber, alreadySeen ? previousNumber : previousNumber + 1);
|
||||
Assert.assertTrue(hash.containsKey(name));
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests the functionality related to known SNP positions ////
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private static SAMFileHeader header;
|
||||
private static GenomeLocParser genomeLocParser;
|
||||
|
||||
@BeforeClass
|
||||
public void beforeClass() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 100);
|
||||
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
@DataProvider(name = "PopulateKnownsProvider")
|
||||
public Object[][] populateKnownsProvider() {
|
||||
|
||||
final Allele A = Allele.create("A", true);
|
||||
final Allele C = Allele.create("C");
|
||||
final Allele G = Allele.create("G");
|
||||
final Allele AC = Allele.create("AC");
|
||||
|
||||
final VariantContext snp_1_10 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_1_10_2 = new VariantContextBuilder("known", "chr1", 10, 10, Arrays.asList(A, G)).make();
|
||||
final VariantContext snp_1_20 = new VariantContextBuilder("known", "chr1", 20, 20, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_1_30 = new VariantContextBuilder("known", "chr1", 30, 30, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_2_10 = new VariantContextBuilder("known", "chr2", 10, 10, Arrays.asList(A, C)).make();
|
||||
final VariantContext snp_3_10 = new VariantContextBuilder("known", "chr3", 10, 10, Arrays.asList(A, C)).make();
|
||||
final VariantContext indel_1_40 = new VariantContextBuilder("known", "chr1", 40, 40, Arrays.asList(A, AC)).make();
|
||||
final VariantContext indel_2_40 = new VariantContextBuilder("known", "chr2", 40, 40, Arrays.asList(A, AC)).make();
|
||||
|
||||
final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "foo1", 0, 1, 1);
|
||||
final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "foo2", 1, 1, 1);
|
||||
final GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header, "foo3", 2, 1, 1);
|
||||
|
||||
final ObjectArrayList<Object[]> tests = new ObjectArrayList<Object[]>();
|
||||
|
||||
// test single
|
||||
tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))});
|
||||
|
||||
// test multiple at one position
|
||||
tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))});
|
||||
|
||||
// test multiple
|
||||
tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
|
||||
|
||||
// test indel not used
|
||||
tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))});
|
||||
tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))});
|
||||
|
||||
// test read clears
|
||||
tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
|
||||
tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))});
|
||||
tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))});
|
||||
tests.add(new Object[]{4, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))});
|
||||
tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))});
|
||||
tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) {
|
||||
final List<GATKFeature> x = new ArrayList<GATKFeature>();
|
||||
x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known"));
|
||||
final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()));
|
||||
return new RefMetaDataTracker(Arrays.asList(rods));
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PopulateKnownsProvider")
|
||||
public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List<RefMetaDataTracker> trackers) {
|
||||
final ReduceReads rr = new ReduceReads();
|
||||
RodBinding.resetNameCounter();
|
||||
rr.known = Arrays.<RodBinding<VariantContext>>asList(new RodBinding(VariantContext.class, "known"));
|
||||
rr.knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
engine.setGenomeLocParser(genomeLocParser);
|
||||
rr.setToolkit(engine);
|
||||
|
||||
for ( final RefMetaDataTracker tracker : trackers )
|
||||
rr.populateKnownSNPs(tracker);
|
||||
Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear);
|
||||
|
||||
rr.clearStaleKnownPositions(read);
|
||||
Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,964 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public class SlidingWindowUnitTest extends BaseTest {
|
||||
|
||||
private static final int variantRegionLength = 1000;
|
||||
private static final int globalStartPosition = 1000000;
|
||||
|
||||
private static boolean[] createBitset(final List<FinishedGenomeLoc> locs) {
|
||||
final boolean[] variantRegionBitset = new boolean[variantRegionLength];
|
||||
for ( FinishedGenomeLoc loc : locs ) {
|
||||
final int stop = loc.getStop() - globalStartPosition;
|
||||
for ( int i = loc.getStart() - globalStartPosition; i <= stop; i++ )
|
||||
variantRegionBitset[i] = true;
|
||||
}
|
||||
return variantRegionBitset;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
//// Test for leading softclips immediately followed by an insertion in the CIGAR ////
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLeadingSoftClipThenInsertion() {
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 10);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', 10));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, 10));
|
||||
read.setMappingQuality(30);
|
||||
read.setCigarString("2S2I6M");
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 1);
|
||||
slidingWindow.addRead(read);
|
||||
slidingWindow.close(null);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLeadingHardClipThenInsertion() {
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 8);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', 8));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, 8));
|
||||
read.setMappingQuality(30);
|
||||
read.setCigarString("2H2I6M");
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.addRead(read);
|
||||
slidingWindow.close(null);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests the findVariantRegions() method and related functionality ////
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private static final FinishedGenomeLoc loc90to95 = new FinishedGenomeLoc("1", 0, 1000090, 1000095, false);
|
||||
private static final FinishedGenomeLoc loc96to99 = new FinishedGenomeLoc("1", 0, 1000096, 1000099, false);
|
||||
private static final FinishedGenomeLoc loc100to110 = new FinishedGenomeLoc("1", 0, 1000100, 1000110, false);
|
||||
private static final FinishedGenomeLoc loc999 = new FinishedGenomeLoc("1", 0, 1000999, 1000999, false);
|
||||
|
||||
private class FindVariantRegionsTest {
|
||||
public List<FinishedGenomeLoc> locs, expectedResult;
|
||||
public boolean[] variantRegionBitset;
|
||||
|
||||
private FindVariantRegionsTest(final List<FinishedGenomeLoc> locs) {
|
||||
this.locs = locs;
|
||||
this.expectedResult = locs;
|
||||
variantRegionBitset = createBitset(locs);
|
||||
}
|
||||
|
||||
private FindVariantRegionsTest(final List<FinishedGenomeLoc> locs, final List<FinishedGenomeLoc> expectedResult) {
|
||||
this.locs = locs;
|
||||
this.expectedResult = expectedResult;
|
||||
variantRegionBitset = createBitset(locs);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "findVariantRegions")
|
||||
public Object[][] createFindVariantRegionsData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{new FindVariantRegionsTest(Arrays.<FinishedGenomeLoc>asList(loc90to95))});
|
||||
tests.add(new Object[]{new FindVariantRegionsTest(Arrays.<FinishedGenomeLoc>asList(loc90to95, loc100to110))});
|
||||
tests.add(new Object[]{new FindVariantRegionsTest(Arrays.<FinishedGenomeLoc>asList(loc90to95, loc96to99, loc100to110), Arrays.<FinishedGenomeLoc>asList(new FinishedGenomeLoc("1", 0, 1000090, 1000110, false)))});
|
||||
tests.add(new Object[]{new FindVariantRegionsTest(Arrays.<FinishedGenomeLoc>asList(loc90to95, loc999))});
|
||||
tests.add(new Object[]{new FindVariantRegionsTest(Arrays.<FinishedGenomeLoc>asList(loc999))});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "findVariantRegions", enabled = true)
|
||||
public void testFindVariantRegions(FindVariantRegionsTest test) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
|
||||
final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, test.variantRegionBitset, true);
|
||||
int index = 0;
|
||||
for ( final FinishedGenomeLoc loc : locs ) {
|
||||
Assert.assertTrue(loc.equals(test.expectedResult.get(index++)));
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testNoClosingRegions() {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
|
||||
final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, createBitset(Arrays.<FinishedGenomeLoc>asList(loc90to95, loc999)), false);
|
||||
Assert.assertEquals(locs.size(), 1);
|
||||
Assert.assertEquals(locs.iterator().next(), loc90to95);
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests the markSites() method and related functionality ////
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMarkedSitesClass() {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
|
||||
final SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites();
|
||||
|
||||
markedSites.updateRegion(100, 100);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 100);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100);
|
||||
|
||||
markedSites.updateRegion(300, 100);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 300);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100);
|
||||
|
||||
markedSites.getVariantSiteBitSet()[10] = true;
|
||||
markedSites.updateRegion(290, 100);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 290);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100);
|
||||
Assert.assertFalse(markedSites.getVariantSiteBitSet()[10]);
|
||||
|
||||
markedSites.getVariantSiteBitSet()[20] = true;
|
||||
markedSites.updateRegion(290, 100);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 290);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100);
|
||||
Assert.assertTrue(markedSites.getVariantSiteBitSet()[20]);
|
||||
|
||||
markedSites.updateRegion(300, 100);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 300);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100);
|
||||
|
||||
markedSites.getVariantSiteBitSet()[95] = true;
|
||||
markedSites.updateRegion(390, 20);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 390);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 20);
|
||||
Assert.assertTrue(markedSites.getVariantSiteBitSet()[5]);
|
||||
|
||||
markedSites.updateRegion(340, 60);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 340);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 60);
|
||||
|
||||
markedSites.getVariantSiteBitSet()[20] = true;
|
||||
markedSites.updateRegion(350, 60);
|
||||
Assert.assertEquals(markedSites.getStartLocation(), 350);
|
||||
Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 60);
|
||||
Assert.assertTrue(markedSites.getVariantSiteBitSet()[10]);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMarkVariantRegion() {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
|
||||
slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100);
|
||||
|
||||
slidingWindow.markVariantRegion(40);
|
||||
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21);
|
||||
|
||||
slidingWindow.markVariantRegion(5);
|
||||
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37);
|
||||
|
||||
slidingWindow.markVariantRegion(95);
|
||||
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52);
|
||||
}
|
||||
|
||||
private static int countTrueBits(final boolean[] bitset) {
|
||||
int count = 0;
|
||||
for ( final boolean bit : bitset ) {
|
||||
if ( bit )
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMarkingRegionInCancerMode() {
|
||||
|
||||
final int contextSize = 10;
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, contextSize, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.addRead(createSimpleRead("1", 0, 34, 75));
|
||||
slidingWindow.addRead(createSimpleRead("2", 0, 97, 73));
|
||||
slidingWindow.addRead(createSimpleRead("3", 0, 98, 75));
|
||||
slidingWindow.addRead(createSimpleRead("4", 0, 98, 75));
|
||||
slidingWindow.addRead(createSimpleRead("5", 0, 98, 75));
|
||||
|
||||
final CompressionStash regions = new CompressionStash();
|
||||
regions.add(new FinishedGenomeLoc("1", 0, 89, 109, true));
|
||||
|
||||
slidingWindow.closeVariantRegions(regions, null, false);
|
||||
Assert.assertEquals(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet().length, 76 + contextSize);
|
||||
}
|
||||
|
||||
private GATKSAMRecord createSimpleRead(final String name, final int refIndex, final int alignmentStart, final int length) {
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', length));
|
||||
read.setBaseQualities(Utils.dupBytes((byte) 30, length));
|
||||
read.setMappingQuality(60);
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
//// This section tests the consensus creation functionality ////
|
||||
/////////////////////////////////////////////////////////////////
|
||||
|
||||
private static final int readLength = 100;
|
||||
private static final int testRegionSize = 1000;
|
||||
private final ObjectList<GATKSAMRecord> basicReads = new ObjectArrayList<GATKSAMRecord>(20);
|
||||
private IndexedFastaSequenceFile seq;
|
||||
private SAMFileHeader header;
|
||||
|
||||
@BeforeClass
|
||||
public void setup() throws FileNotFoundException {
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary());
|
||||
|
||||
final int readFrequency = 20;
|
||||
|
||||
basicReads.clear();
|
||||
for ( int i = 0; i < testRegionSize; i += readFrequency ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition + i, readLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(i % 40 == 20);
|
||||
basicReads.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
private class ConsensusCreationTest {
|
||||
public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage;
|
||||
public final List<GATKSAMRecord> myReads = new ArrayList<GATKSAMRecord>(20);
|
||||
public final String description;
|
||||
|
||||
private ConsensusCreationTest(final List<GenomeLoc> locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) {
|
||||
this.expectedNumberOfReads = expectedNumberOfReads;
|
||||
this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression;
|
||||
this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage;
|
||||
this.description = String.format("%d %d %d %b %b", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage, readsShouldBeLowQuality, variantBaseShouldBeLowQuality);
|
||||
|
||||
// first, add the basic reads to the collection
|
||||
myReads.addAll(basicReads);
|
||||
|
||||
// then add the permuted reads
|
||||
for ( final GenomeLoc loc : locs )
|
||||
myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality, CigarOperator.M));
|
||||
}
|
||||
|
||||
private ConsensusCreationTest(final List<GenomeLoc> locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) {
|
||||
this.expectedNumberOfReads = expectedNumberOfReads;
|
||||
this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression;
|
||||
this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage;
|
||||
this.description = String.format("%s %d %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage);
|
||||
|
||||
// first, add the basic reads to the collection
|
||||
myReads.addAll(basicReads);
|
||||
|
||||
// then add the permuted reads
|
||||
for ( final GenomeLoc loc : locs )
|
||||
myReads.add(createVariantRead(loc, false, false, operator));
|
||||
}
|
||||
|
||||
public String toString() { return description; }
|
||||
|
||||
private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality,
|
||||
final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) {
|
||||
|
||||
final int startPos = loc.getStart() - 50;
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead" + startPos, 0, startPos, readLength);
|
||||
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
// create a mismatch if requested
|
||||
if ( operator == CigarOperator.M )
|
||||
bases[50] = 'C';
|
||||
read.setReadBases(bases);
|
||||
|
||||
final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength);
|
||||
if ( variantBaseShouldBeLowQuality )
|
||||
baseQuals[50] = (byte)10;
|
||||
read.setBaseQualities(baseQuals);
|
||||
final byte mappingQual = readShouldBeLowQuality ? (byte)10 : (byte)30;
|
||||
read.setMappingQuality(mappingQual);
|
||||
|
||||
if ( operator != CigarOperator.M ) {
|
||||
final List<CigarElement> elements = new ArrayList<CigarElement>(3);
|
||||
elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 51, CigarOperator.M));
|
||||
elements.add(new CigarElement(1, operator));
|
||||
elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 48, CigarOperator.M));
|
||||
read.setCigar(new Cigar(elements));
|
||||
}
|
||||
|
||||
return read;
|
||||
}
|
||||
}
|
||||
|
||||
private static final GenomeLoc loc290 = new UnvalidatingGenomeLoc("1", 0, 1000290, 1000290);
|
||||
private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295);
|
||||
private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309);
|
||||
private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310);
|
||||
private static final GenomeLoc loc320 = new UnvalidatingGenomeLoc("1", 0, 1000320, 1000320);
|
||||
private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100);
|
||||
|
||||
private static final int DEEP_COVERAGE_ITERATIONS = 100;
|
||||
|
||||
@DataProvider(name = "ConsensusCreation")
|
||||
public Object[][] createConsensusCreationTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// test high quality reads and bases
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(), false, false, 2, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), false, false, 11, 8, 7 + DEEP_COVERAGE_ITERATIONS)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), false, false, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), false, false, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), false, false, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc320), false, false, 13, 12, 6 + (6 * DEEP_COVERAGE_ITERATIONS))});
|
||||
|
||||
// test low quality reads
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(), true, false, 2, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), true, false, 3, 3, 3)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), true, false, 3, 3, 3)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), true, false, 3, 3, 3)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), true, false, 3, 3, 3)});
|
||||
|
||||
// test low quality bases
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(), false, true, 2, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), false, true, 2, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), false, true, 2, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), false, true, 2, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), false, true, 2, 2, 2)});
|
||||
|
||||
// test mixture
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), true, false, 3, 3, 3)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), false, true, 2, 2, 2)});
|
||||
|
||||
// test I/D operators
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.D, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), CigarOperator.D, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.I, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), CigarOperator.I, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "ConsensusCreation", enabled = true)
|
||||
public void testConsensusCreationTest(ConsensusCreationTest test) {
|
||||
final ObjectAVLTreeSet<GenomeLoc> knownSNPs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
// test WITHOUT het compression
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads);
|
||||
|
||||
// test WITH het compression at KNOWN sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
for ( int i = 0; i < 1200; i++ )
|
||||
knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
|
||||
|
||||
// test WITH het compression at ALL sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
result = slidingWindow.close(null);
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
|
||||
|
||||
// test with deep coverage
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 0, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( int i = 0; i < DEEP_COVERAGE_ITERATIONS; i++ ) {
|
||||
for ( final GATKSAMRecord read : test.myReads ) {
|
||||
final GATKSAMRecord copy = ArtificialSAMUtils.createArtificialRead(header, read.getReadName() + "_" + (i+1), 0, read.getAlignmentStart(), readLength);
|
||||
copy.setReadBases(read.getReadBases());
|
||||
copy.setBaseQualities(read.getBaseQualities());
|
||||
copy.setMappingQuality(read.getMappingQuality());
|
||||
copy.setReadNegativeStrandFlag(read.getReadNegativeStrandFlag());
|
||||
if ( read.getCigar() != null )
|
||||
copy.setCigar(read.getCigar());
|
||||
slidingWindow.addRead(copy);
|
||||
}
|
||||
}
|
||||
result = slidingWindow.close(null);
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsAtDeepCoverage);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConsensusCreationForMultiallelic() {
|
||||
|
||||
final int totalNumReads = 7;
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<GATKSAMRecord>(totalNumReads);
|
||||
|
||||
for ( int i = 0; i < totalNumReads; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
|
||||
final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 'C' : 'G');
|
||||
read.setReadBases(Utils.dupBytes((byte) base, readLength));
|
||||
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
final ObjectAVLTreeSet<GenomeLoc> knownSNPs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
// test WITHOUT het compression
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
|
||||
// test WITH het compression at KNOWN sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
|
||||
// test WITH het compression at ALL sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConsensusCreationForInsertions() {
|
||||
|
||||
final int totalNumReads = 7;
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<>(totalNumReads);
|
||||
|
||||
// add reads, one with a SNP and one with a SNP and insertion
|
||||
for ( int i = 0; i < totalNumReads; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
if ( i < 2 )
|
||||
bases[20] = 'C';
|
||||
if ( i == 0 )
|
||||
bases[80] = 'C';
|
||||
read.setReadBases(bases);
|
||||
|
||||
if ( i == 0 )
|
||||
read.setCigarString("80M1I19M");
|
||||
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
final Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(null);
|
||||
Assert.assertEquals(result.getFirst().size(), 3); // no compression at all for SNPs
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddingReadPairWithSameCoordinates() {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10);
|
||||
|
||||
final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1);
|
||||
read1.setReadBases(new byte[]{(byte)'A'});
|
||||
read1.setBaseQualities(new byte[]{(byte)'A'});
|
||||
read1.setMappingQuality(30);
|
||||
read1.setReadNegativeStrandFlag(false);
|
||||
slidingWindow.addRead(read1);
|
||||
|
||||
final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1);
|
||||
read2.setReadBases(new byte[]{(byte)'A'});
|
||||
read2.setBaseQualities(new byte[]{(byte)'A'});
|
||||
read2.setMappingQuality(30);
|
||||
read2.setReadNegativeStrandFlag(true);
|
||||
slidingWindow.addRead(read2);
|
||||
|
||||
Assert.assertEquals(slidingWindow.readsInWindow.size(), 2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOnlySpanningReadHasLowQual() {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
|
||||
final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead1", 0, globalStartPosition, 100);
|
||||
final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead2", 0, globalStartPosition + 50, 100);
|
||||
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
read1.setReadBases(bases);
|
||||
read2.setReadBases(bases);
|
||||
|
||||
final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength);
|
||||
baseQuals[80] = (byte)10;
|
||||
read1.setBaseQualities(baseQuals);
|
||||
read2.setBaseQualities(baseQuals);
|
||||
|
||||
read1.setMappingQuality(30);
|
||||
read2.setMappingQuality(30);
|
||||
|
||||
slidingWindow.addRead(read1);
|
||||
slidingWindow.addRead(read2);
|
||||
|
||||
Assert.assertEquals(slidingWindow.close(null).getFirst().size(), 1);
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
//// This section tests the downsampling functionality ////
|
||||
///////////////////////////////////////////////////////////
|
||||
|
||||
@DataProvider(name = "Downsampling")
|
||||
public Object[][] createDownsamplingTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( int i = 1; i < basicReads.size() + 10; i++ )
|
||||
tests.add(new Object[]{i});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "Downsampling", enabled = true)
|
||||
public void testDownsamplingTest(final int dcov) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
final ObjectList<GATKSAMRecord> result = slidingWindow.downsampleVariantRegion(basicReads);
|
||||
|
||||
Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size()));
|
||||
}
|
||||
|
||||
@DataProvider(name = "DownsamplingFromClose")
|
||||
public Object[][] createDownsamplingFromCloseTestData() {
|
||||
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<>(20);
|
||||
for ( int i = 0; i < 21; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength);
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
if ( i < 5 )
|
||||
bases[50] = 'C';
|
||||
read.setReadBases(bases);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( int i = 1; i < 25; i++ )
|
||||
tests.add(new Object[]{myReads, i});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "DownsamplingFromClose", enabled = true)
|
||||
public void testDownsamplingTestFromClose(final ObjectList<GATKSAMRecord> myReads, final int dcov) {
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet<GenomeLoc>()); // no het compression
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), Math.min(dcov, myReads.size()), "Down-sampling was not performed correctly");
|
||||
}
|
||||
|
||||
@DataProvider(name = "NoDownsamplingForConsensusReads")
|
||||
public Object[][] createNoDownsamplingForConsensusReadsData() {
|
||||
|
||||
final ObjectList<GATKSAMRecord> myReads = new ObjectArrayList<>(20);
|
||||
for ( int i = 0; i < 30; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength);
|
||||
final byte[] bases = Utils.dupBytes((byte) 'A', readLength);
|
||||
if ( i < 10 )
|
||||
bases[50] = 'C';
|
||||
read.setReadBases(bases);
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
read.setReadNegativeStrandFlag(i % 2 == 0);
|
||||
myReads.add(read);
|
||||
}
|
||||
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( int i = 0; i < 5; i++ )
|
||||
tests.add(new Object[]{myReads, i});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "NoDownsamplingForConsensusReads", enabled = true)
|
||||
public void testNoDownsamplingForConsensusReads(final ObjectList<GATKSAMRecord> myReads, final int dcov) {
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(null); // allow het compression (so we expect 4 reads)
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), 4, "Down-sampling was performed on consensus reads!");
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
//// This section tests the consensus base quals accuracy ////
|
||||
//////////////////////////////////////////////////////////////
|
||||
|
||||
private class QualsTest {
|
||||
public final List<Integer> quals;
|
||||
public final List<GATKSAMRecord> myReads = new ArrayList<GATKSAMRecord>(5);
|
||||
|
||||
private QualsTest(final List<Integer> quals) {
|
||||
this.quals = quals;
|
||||
for ( int i = 0; i < quals.size(); i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, 1);
|
||||
read.setReadBases(new byte[]{(byte)'A'});
|
||||
read.setBaseQualities(new byte[]{quals.get(i).byteValue()});
|
||||
read.setMappingQuality(30);
|
||||
myReads.add(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "ConsensusQuals")
|
||||
public Object[][] createConsensusQualsData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final int[] quals = new int[]{ 0, 5, 10, 15, 20, 30, 40, 50 };
|
||||
|
||||
for ( final int qual1 : quals ) {
|
||||
for ( final int qual2 : quals ) {
|
||||
for ( final int qual3 : quals ) {
|
||||
tests.add(new Object[]{new QualsTest(Arrays.asList(qual1, qual2, qual3))});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
// base-quality threshold handed to the SlidingWindow built in testConsensusQualsTest(); qualSum() only counts
// qualities at or above it, unless none of the qualities reaches it
private static final byte minUsableConsensusQual = 10;
|
||||
|
||||
@Test(dataProvider = "ConsensusQuals", enabled = true)
|
||||
public void testConsensusQualsTest(QualsTest test) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
final Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet<GenomeLoc>());
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), 1);
|
||||
final GATKSAMRecord read = result.getFirst().iterator().next();
|
||||
final int actualBaseQual = read.getReducedCount(0) * read.getBaseQualities()[0];
|
||||
final int expectedBaseQual = qualSum(test.quals);
|
||||
Assert.assertEquals(actualBaseQual, expectedBaseQual);
|
||||
}
|
||||
|
||||
private static int qualSum(final List<Integer> quals) {
|
||||
int goodBases = 0;
|
||||
int sum = 0;
|
||||
for ( final int qual : quals ) {
|
||||
if ( qual >= minUsableConsensusQual ) {
|
||||
goodBases++;
|
||||
sum += qual;
|
||||
}
|
||||
}
|
||||
|
||||
// handle a low quality consensus
|
||||
if ( sum == 0 ) {
|
||||
for ( final int qual : quals ) {
|
||||
goodBases++;
|
||||
sum += qual;
|
||||
}
|
||||
}
|
||||
|
||||
return sum - (sum % goodBases);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
//// This section tests the new header creation ////
|
||||
////////////////////////////////////////////////////
|
||||
|
||||
@DataProvider(name = "CreateNewHeader")
|
||||
public Object[][] CreateNewHeaderTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int start : Arrays.asList(-10, -1, 0, 1, 10) ) {
|
||||
for ( final int stop : Arrays.asList(-10, -1, 0, 1, 10) ) {
|
||||
tests.add(new Object[]{start, stop});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "CreateNewHeader", enabled = true)
|
||||
public void createNewHeaderTest(final int start, final int stop) {
|
||||
|
||||
// set up the window header
|
||||
final int currentHeaderStart = 100;
|
||||
final int currentHeaderLength = 50;
|
||||
final LinkedList<HeaderElement> windowHeader = new LinkedList<HeaderElement>();
|
||||
for ( int i = 0; i < currentHeaderLength; i++ )
|
||||
windowHeader.add(new HeaderElement(currentHeaderStart + i));
|
||||
|
||||
// set up the read
|
||||
final int readStart = currentHeaderStart + start;
|
||||
final int readLength = currentHeaderLength + stop - start;
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte) 30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start);
|
||||
|
||||
Assert.assertEquals(newIndex, start > 0 ? start : 0);
|
||||
|
||||
final int expectedNewLength = currentHeaderLength + (start < 0 ? -start : 0) + (stop > 0 ? stop : 0);
|
||||
Assert.assertEquals(windowHeader.size(), expectedNewLength);
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
//// This section tests updating the header from a read ////
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
@DataProvider(name = "UpdateHeaderForRead")
|
||||
public Object[][] UpdateHeaderForReadTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int start : Arrays.asList(0, 1, 10) ) {
|
||||
for ( final int readLength : Arrays.asList(1, 5, 10) ) {
|
||||
tests.add(new Object[]{start, readLength});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "UpdateHeaderForRead", enabled = true)
|
||||
public void updateHeaderForReadTest(final int start, final int readLength) {
|
||||
|
||||
// set up the window header
|
||||
final int currentHeaderStart = 100;
|
||||
final int currentHeaderLength = 50;
|
||||
final LinkedList<HeaderElement> windowHeader = new LinkedList<HeaderElement>();
|
||||
for ( int i = 0; i < currentHeaderLength; i++ )
|
||||
windowHeader.add(new HeaderElement(currentHeaderStart + i));
|
||||
|
||||
// set up the read
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + start, readLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(30);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
|
||||
// add the read
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start);
|
||||
for ( int i = 0; i < start; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0);
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
Assert.assertEquals(windowHeader.get(start + i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1);
|
||||
for ( int i = start + readLength; i < currentHeaderLength; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0);
|
||||
|
||||
// now remove the read
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, true, start);
|
||||
for ( int i = 0; i < currentHeaderLength; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUpdateHeaderForReadWithHighMQ() {
|
||||
|
||||
// set up the window header
|
||||
final int currentHeaderStart = 100;
|
||||
final LinkedList<HeaderElement> windowHeader = new LinkedList<>();
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
windowHeader.add(new HeaderElement(currentHeaderStart + i));
|
||||
|
||||
// set up the read
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, readLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(180);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
|
||||
// add the read and make sure it's not filtered because of low MQ (byte vs. int)
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0);
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests functionality related to polyploid consensus creation ////
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@DataProvider(name = "MatchesKnownProvider")
|
||||
public Object[][] matchesKnownProvider() {
|
||||
|
||||
final ObjectArrayList<Object[]> tests = new ObjectArrayList<Object[]>();
|
||||
|
||||
// test no knowns
|
||||
tests.add(new Object[]{new ObjectAVLTreeSet<GenomeLoc>(), loc290.getStart(), false});
|
||||
|
||||
final ObjectSortedSet<GenomeLoc> knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
knownSnpPositions.add(loc290);
|
||||
knownSnpPositions.add(loc295);
|
||||
knownSnpPositions.add(loc310);
|
||||
|
||||
// test overlap
|
||||
tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true});
|
||||
tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true});
|
||||
tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true});
|
||||
tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MatchesKnownProvider")
|
||||
public void testMatchesKnown(final ObjectSortedSet<GenomeLoc> knownSnpPositions, final int targetLoc, final boolean expectedResult) {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10);
|
||||
Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult);
|
||||
}
|
||||
|
||||
@DataProvider(name = "SignificantSoftclipsProvider")
|
||||
public Object[][] SignificantSoftclipsTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) {
|
||||
for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) {
|
||||
tests.add(new Object[]{indexWithSoftclips, indexToSkip});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "SignificantSoftclipsProvider", enabled = true)
|
||||
public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) {
|
||||
|
||||
// set up the window header
|
||||
final int currentHeaderStart = 100;
|
||||
final int currentHeaderLength = 10;
|
||||
final LinkedList<HeaderElement> windowHeader = new LinkedList<HeaderElement>();
|
||||
for ( int i = 0; i < currentHeaderLength; i++ )
|
||||
windowHeader.add(new HeaderElement(currentHeaderStart + i));
|
||||
|
||||
// set up the normal read
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength));
|
||||
read.setMappingQuality(30);
|
||||
|
||||
// add the read
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0);
|
||||
|
||||
// set up and add a soft-clipped read if requested
|
||||
if ( indexWithSoftclips != -1 ) {
|
||||
final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + indexWithSoftclips, 1);
|
||||
softclippedRead.setReadBases(new byte[]{(byte) 'A'});
|
||||
softclippedRead.setBaseQualities(new byte[]{(byte) 30});
|
||||
softclippedRead.setMappingQuality(30);
|
||||
softclippedRead.setCigarString("1S");
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips);
|
||||
}
|
||||
|
||||
final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip);
|
||||
Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,162 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.junit.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
public class BiasedDownsamplingIntegrationTest extends WalkerTest {
|
||||
|
||||
private final static String baseCommandUG = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000";
|
||||
private final static String baseCommandHC = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:4,000,000-5,000,000" + " --useFilteredReadsForAnnotations";
|
||||
|
||||
private final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/";
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing UnifiedGenotyper contamination down-sampling on BAMs with artificially created contaminated.
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
private void testDefaultContamination() {
|
||||
final String bam1 = "NA11918.with.1.NA12842.reduced.bam";
|
||||
final String bam2 = "NA12842.with.1.NA11918.reduced.bam";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination .05 ", 1,
|
||||
Arrays.asList("b13612312ff991cf40ddc44255e76ecd"));
|
||||
executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with .05 downsampling.", spec);
|
||||
}
|
||||
|
||||
|
||||
// verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level
|
||||
|
||||
|
||||
@DataProvider(name="PerSampleEqualFlatContamBams")
|
||||
public Object[][] makePerSampleEqualFlatContamBams() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0}) ;
|
||||
tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ;
|
||||
tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ;
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PerSampleEqualFlatContamBams")
|
||||
private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling) {
|
||||
final String command = baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(""));
|
||||
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
|
||||
|
||||
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
|
||||
Pair<List<File>, List<String>> test1 = executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
|
||||
|
||||
spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(""));
|
||||
|
||||
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
|
||||
Pair<List<File>, List<String>> test2 = executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec);
|
||||
|
||||
//verify that the md5s match up.
|
||||
Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0));
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing HaplotypeCaller Contamination Removal
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
@DataProvider(name="PerSampleEqualFlatContamBamsHC")
|
||||
public Object[][] makePerSampleEqualFlatContamBamsHC() {
|
||||
final List<Object[]> tests = new LinkedList<Object[]>();
|
||||
tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0 }) ;
|
||||
tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ;
|
||||
tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ;
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider = "PerSampleEqualFlatContamBamsHC")
|
||||
private void testPerSampleEqualsFlatHC(final String bam1, final String bam2, final String persampleFile, final Double downsampling) {
|
||||
final String command = baseCommandHC + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ";
|
||||
|
||||
WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(""));
|
||||
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
|
||||
|
||||
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
|
||||
|
||||
Pair<List<File>, List<String>> test1= executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec);
|
||||
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(""));
|
||||
|
||||
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result
|
||||
Pair<List<File>, List<String>> test2=executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec);
|
||||
|
||||
//verify that the md5s match up.
|
||||
Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0));
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void testReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f"));
|
||||
executeTest("test calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamSNPs() {
|
||||
testReducedCalling("SNP", "cc0508b18028f2e84e6a42c1ff23721c");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamINDELs() {
|
||||
testReducedCalling("INDEL", "6fc00d5299b1bf334d39634c3409a69d");
|
||||
}
|
||||
|
||||
|
||||
private void testReducedCalling(final String model, final String md5) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
|
||||
Arrays.asList(md5));
|
||||
executeTest("test calling on a ReducedRead BAM with " + model, spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -171,7 +171,7 @@ public class AssemblyResultSetUnitTest extends BaseTest
|
|||
|
||||
final ReadThreadingGraph rtg = new ReadThreadingGraph(10);
|
||||
for (final Haplotype h : haplotypes)
|
||||
rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference());
|
||||
rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), h.isReference());
|
||||
final SeqGraph seqGraph = rtg.convertToSequenceGraph();
|
||||
final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph);
|
||||
ar.setThreadingGraph(rtg);
|
||||
|
|
|
|||
|
|
@ -217,28 +217,6 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("12c56262ed30db1249b8d722e324357c"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("1627cf5f3a97e8b73b3c095db46aef1b"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// test dbSNP annotation
|
||||
|
|
|
|||
|
|
@ -157,26 +157,6 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCalcNIndelInformativeReducedReads() {
|
||||
final String bases = "ACGGGTTTGGAC";
|
||||
final byte[] quals = Utils.dupBytes((byte)30, bases.length());
|
||||
final int count = 10;
|
||||
final int[] counts = new int[bases.length()];
|
||||
for ( int i = 0; i < counts.length; i++ )
|
||||
counts[i] = count;
|
||||
final int position = 100;
|
||||
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, position, counts.length, counts);
|
||||
read.setReadString(bases);
|
||||
read.setBaseQualities(quals);
|
||||
read.setCigarString(bases.length() + "M");
|
||||
final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, position, position);
|
||||
final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), 0);
|
||||
final int actual = model.calcNIndelInformativeReads(pileup, 0, bases.getBytes(), 3);
|
||||
Assert.assertEquals(actual, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClose() {
|
||||
model.close();
|
||||
|
|
|
|||
|
|
@ -79,8 +79,8 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
|||
final ReadThreadingGraph assembler = new ReadThreadingGraph(11);
|
||||
final String ref = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG";
|
||||
final String alt = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG";
|
||||
assembler.addSequence("anonymous", getBytes(ref), null, true);
|
||||
assembler.addSequence("anonymous", getBytes(alt), null, false);
|
||||
assembler.addSequence("anonymous", getBytes(ref), true);
|
||||
assembler.addSequence("anonymous", getBytes(alt), false);
|
||||
assembler.buildGraphIfNecessary();
|
||||
Assert.assertNotEquals(ref.length() - 11 + 1,assembler.vertexSet().size(),"the number of vertex in the graph is the same as if there was no alternative sequence");
|
||||
Assert.assertEquals(ref.length() - 11 + 1 + 11,assembler.vertexSet().size(),"the number of vertex in the graph is not the same as if there is an alternative sequence");
|
||||
|
|
@ -178,7 +178,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
|||
|
||||
// test that there are cycles detected for small kmer
|
||||
final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25);
|
||||
rtgraph25.addSequence("ref", ref.getBytes(), null, true);
|
||||
rtgraph25.addSequence("ref", ref.getBytes(), true);
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
rtgraph25.addRead(read);
|
||||
rtgraph25.buildGraphIfNecessary();
|
||||
|
|
@ -186,7 +186,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
|||
|
||||
// test that there are no cycles detected for large kmer
|
||||
final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75);
|
||||
rtgraph75.addSequence("ref", ref.getBytes(), null, true);
|
||||
rtgraph75.addSequence("ref", ref.getBytes(), true);
|
||||
for ( final GATKSAMRecord read : reads )
|
||||
rtgraph75.addRead(read);
|
||||
rtgraph75.buildGraphIfNecessary();
|
||||
|
|
@ -200,7 +200,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
|||
final byte[] ref = Utils.dupBytes((byte)'A', length);
|
||||
|
||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25);
|
||||
rtgraph.addSequence("ref", ref, null, true);
|
||||
rtgraph.addSequence("ref", ref, true);
|
||||
|
||||
// add reads with Ns at any position
|
||||
for ( int i = 0; i < length; i++ ) {
|
||||
|
|
@ -250,7 +250,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest {
|
|||
|
||||
// create the graph and populate it
|
||||
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize);
|
||||
rtgraph.addSequence("ref", ref.getBytes(), null, true);
|
||||
rtgraph.addSequence("ref", ref.getBytes(), true);
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M");
|
||||
rtgraph.addRead(read);
|
||||
rtgraph.buildGraphIfNecessary();
|
||||
|
|
|
|||
|
|
@ -55,26 +55,12 @@ public class SequenceForKmersUnitTest extends BaseTest {
|
|||
@Test
|
||||
public void testNoCount() {
|
||||
final byte[] seq = "ACGT".getBytes();
|
||||
final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, null, true);
|
||||
final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, 1, true);
|
||||
Assert.assertEquals(sk.name, "foo");
|
||||
Assert.assertEquals(sk.sequence, seq);
|
||||
Assert.assertEquals(sk.start, 0);
|
||||
Assert.assertEquals(sk.stop, seq.length);
|
||||
Assert.assertEquals(sk.count, 1);
|
||||
Assert.assertEquals(sk.isRef, true);
|
||||
for ( int i = 0; i < seq.length; i++ )
|
||||
Assert.assertEquals(sk.getCount(i), 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithCounts() {
|
||||
final int len = 256;
|
||||
final int[] counts = new int[len];
|
||||
for ( int i = 0; i < len; i++ ) counts[i] = i;
|
||||
final byte[] seq = Utils.dupBytes((byte)'A', len);
|
||||
|
||||
final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, counts, true);
|
||||
|
||||
for ( int i = 0; i < seq.length; i++ )
|
||||
Assert.assertEquals(sk.getCount(i), i);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -116,9 +116,9 @@ public class ActiveRegionTestDataSet {
|
|||
public AssemblyResultSet assemblyResultSet() {
|
||||
if (assemblyResultSet == null) {
|
||||
final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize);
|
||||
rtg.addSequence("anonymous", this.getReference().getBytes(), null, true);
|
||||
rtg.addSequence("anonymous", this.getReference().getBytes(), true);
|
||||
for (final String haplotype : this.haplotypesStrings()) {
|
||||
rtg.addSequence("anonymous", haplotype.getBytes(), null, false);
|
||||
rtg.addSequence("anonymous", haplotype.getBytes(), false);
|
||||
}
|
||||
rtg.buildGraphIfNecessary();
|
||||
if (rtg.hasCycles())
|
||||
|
|
|
|||
|
|
@ -31,7 +31,6 @@ import net.sf.samtools.*;
|
|||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.Tags;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
|
|
@ -48,10 +47,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.*;
|
||||
|
|
@ -327,6 +324,8 @@ public class SAMDataSource {
|
|||
// and read group id (merged) -> read group id (original) mappings.
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
SAMFileReader reader = readers.getReader(id);
|
||||
checkForReducedBamFile(reader.getFileHeader());
|
||||
|
||||
ReadGroupMapping mappingToMerged = new ReadGroupMapping();
|
||||
|
||||
List<SAMReadGroupRecord> readGroups = reader.getFileHeader().getReadGroups();
|
||||
|
|
@ -352,6 +351,16 @@ public class SAMDataSource {
|
|||
resourcePool.releaseReaders(readers);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the provided SAM header if from a reduced bam file.
|
||||
* @param header the SAM header for a given file
|
||||
* @throws UserException if the header is from a reduced bam
|
||||
*/
|
||||
private void checkForReducedBamFile(final SAMFileHeader header) {
|
||||
if ( header.getProgramRecord("GATK ReduceReads") != null )
|
||||
throw new UserException("The GATK no longer supports running off of BAMs produced by ReduceReads");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
|
|
|
|||
|
|
@ -65,18 +65,14 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
alleleStratifiedElements[i] = new PileupElementList();
|
||||
|
||||
// start by stratifying the reads by the alleles they represent at this position
|
||||
boolean sawReducedRead = false;
|
||||
for ( final PileupElement pe : pileup ) {
|
||||
if ( pe.getRead().isReducedRead() )
|
||||
sawReducedRead = true;
|
||||
|
||||
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
|
||||
if ( baseIndex != -1 )
|
||||
alleleStratifiedElements[baseIndex].add(pe);
|
||||
}
|
||||
|
||||
// make a listing of allele counts and calculate the total count
|
||||
final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead);
|
||||
final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements);
|
||||
final int totalAlleleCount = (int)MathUtils.sum(alleleCounts);
|
||||
|
||||
// do smart down-sampling
|
||||
|
|
@ -106,18 +102,12 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
* Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present)
|
||||
*
|
||||
* @param alleleStratifiedElements pileup elements stratified by allele
|
||||
* @param sawReducedRead is at least one read a reduced read?
|
||||
* @return non-null int array representing allele counts
|
||||
*/
|
||||
private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) {
|
||||
private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements) {
|
||||
final int[] alleleCounts = new int[alleleStratifiedElements.length];
|
||||
for ( int i = 0; i < alleleStratifiedElements.length; i++ ) {
|
||||
if ( !sawReducedRead ) {
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
} else {
|
||||
for ( final PileupElement pe : alleleStratifiedElements[i] )
|
||||
alleleCounts[i] += pe.getRepresentativeCount();
|
||||
}
|
||||
alleleCounts[i] = alleleStratifiedElements[i].size();
|
||||
}
|
||||
return alleleCounts;
|
||||
}
|
||||
|
|
@ -211,24 +201,7 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
|
||||
int currentBitSetIndex = 0;
|
||||
for ( final PileupElement element : elements ) {
|
||||
|
||||
final int representativeCount = element.getRepresentativeCount();
|
||||
|
||||
// if it's a reduced read, we need to be smart about how we down-sample
|
||||
if ( representativeCount > 1 ) {
|
||||
// count how many bits are set over the span represented by this read
|
||||
int setBits = 0;
|
||||
for ( int i = 0; i < representativeCount; i++ )
|
||||
setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0;
|
||||
|
||||
// remove that count from the count of the reduced read
|
||||
if ( setBits == representativeCount )
|
||||
elementsToRemove.add(element);
|
||||
else
|
||||
element.adjustRepresentativeCount(-1 * setBits);
|
||||
}
|
||||
// otherwise it's trivial: remove if the corresponding bit is set
|
||||
else if ( itemsToRemove.get(currentBitSetIndex++) ) {
|
||||
if ( itemsToRemove.get(currentBitSetIndex++) ) {
|
||||
elementsToRemove.add(element);
|
||||
}
|
||||
}
|
||||
|
|
@ -255,7 +228,6 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
|
||||
final int numAlleles = alleles.size();
|
||||
|
||||
// TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead
|
||||
final int[] alleleCounts = new int[numAlleles];
|
||||
for ( int i = 0; i < numAlleles; i++ )
|
||||
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
|
||||
|
|
@ -302,9 +274,6 @@ public class AlleleBiasedDownsamplingUtils {
|
|||
|
||||
int currentBitSetIndex = 0;
|
||||
for ( final GATKSAMRecord read : reads ) {
|
||||
if ( read.isReducedRead() )
|
||||
throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords");
|
||||
|
||||
if ( itemsToRemove.get(currentBitSetIndex++) )
|
||||
elementsToRemove.add(read);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,9 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -159,14 +156,6 @@ public abstract class Downsampler<T> {
|
|||
* @return true if the item should not be subject to elimination during downsampling, otherwise false
|
||||
*/
|
||||
protected boolean doNotDiscardItem( final Object item ) {
|
||||
// Use getClass() rather than instanceof for performance reasons. Ugly but fast.
|
||||
if ( item.getClass() == GATKSAMRecord.class ) {
|
||||
return ((GATKSAMRecord)item).isReducedRead();
|
||||
}
|
||||
else if ( item.getClass() == AlignmentStateMachine.class ) {
|
||||
return ((AlignmentStateMachine)item).isReducedRead();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -314,14 +314,13 @@ public class CallableLoci extends LocusWalker<CallableLoci.CallableBaseState, Ca
|
|||
// count up the depths of all and QC+ bases
|
||||
int rawDepth = 0, QCDepth = 0, lowMAPQDepth = 0;
|
||||
for (PileupElement e : context.getBasePileup()) {
|
||||
final int depth = e.getRepresentativeCount();
|
||||
rawDepth += depth;
|
||||
rawDepth++;
|
||||
|
||||
if (e.getMappingQual() <= maxLowMAPQ)
|
||||
lowMAPQDepth += depth;
|
||||
lowMAPQDepth++;
|
||||
|
||||
if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
|
||||
QCDepth += depth;
|
||||
QCDepth++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -217,12 +217,12 @@ public class CoverageUtils {
|
|||
|
||||
private static void updateCounts(int[] counts, PileupElement e) {
|
||||
if ( e.isDeletion() ) {
|
||||
counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount();
|
||||
counts[BaseUtils.Base.D.ordinal()]++;
|
||||
} else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) {
|
||||
counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount();
|
||||
counts[BaseUtils.Base.N.ordinal()]++;
|
||||
} else {
|
||||
try {
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount();
|
||||
counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++;
|
||||
} catch (ArrayIndexOutOfBoundsException exc) {
|
||||
throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ public class DeprecatedToolChecks {
|
|||
private static Object2ObjectMap deprecatedGATKWalkers = new Object2ObjectOpenHashMap();
|
||||
static {
|
||||
// Indicate recommended replacement in parentheses if applicable
|
||||
deprecatedGATKWalkers.put("ReduceReads", "3.0 (use recommended best practices pipeline with the HaplotypeCaller)");
|
||||
deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)");
|
||||
deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)");
|
||||
deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)");
|
||||
|
|
|
|||
|
|
@ -399,12 +399,6 @@ public class ClippingOp {
|
|||
hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION);
|
||||
}
|
||||
|
||||
if (read.isReducedRead()) {
|
||||
final int[] reducedCounts = new int[newLength];
|
||||
System.arraycopy(read.getReducedReadCounts(), copyStart, reducedCounts, 0, newLength);
|
||||
hardClippedRead.setReducedReadCounts(reducedCounts);
|
||||
}
|
||||
|
||||
return hardClippedRead;
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,7 +32,6 @@ import org.broadinstitute.sting.utils.MathUtils;
|
|||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -115,13 +114,9 @@ public class PerReadAlleleLikelihoodMap {
|
|||
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
|
||||
|
||||
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
|
||||
// TODO -- come up with a strategy for down-sampling reduced reads
|
||||
// Currently we are unable to remove reduced reads because their representative base count differs throughout the read
|
||||
if ( !entry.getKey().isReducedRead() ) {
|
||||
final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele.isInformative() )
|
||||
alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey());
|
||||
}
|
||||
final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele.isInformative() )
|
||||
alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey());
|
||||
}
|
||||
|
||||
return alleleReadMap;
|
||||
|
|
@ -233,10 +228,9 @@ public class PerReadAlleleLikelihoodMap {
|
|||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : likelihoodReadMap.entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
final GATKSAMRecord read = entry.getKey();
|
||||
final int count = ReadUtils.getMeanRepresentativeReadCount(read);
|
||||
final double likelihood_iii = entry.getValue().get(iii_allele);
|
||||
final double likelihood_jjj = entry.getValue().get(jjj_allele);
|
||||
haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF);
|
||||
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF;
|
||||
|
||||
// fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair
|
||||
if ( haplotypeLikelihood < maxElement ) break;
|
||||
|
|
|
|||
|
|
@ -123,15 +123,6 @@ public class AlignmentStateMachine {
|
|||
return getRead().getReferenceIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is our read a reduced read?
|
||||
*
|
||||
* @return true if the read we encapsulate is a reduced read, otherwise false
|
||||
*/
|
||||
public boolean isReducedRead() {
|
||||
return read.isReducedRead();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this the left edge state? I.e., one that is before or after the current read?
|
||||
* @return true if this state is an edge state, false otherwise
|
||||
|
|
|
|||
|
|
@ -30,8 +30,6 @@ import com.google.java.contract.Requires;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
|
@ -296,43 +294,6 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
//
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the number of elements in the pileup element.
|
||||
*
|
||||
* Unless this is a reduced read, the number of elements in a pileup element is one. In the event of
|
||||
* this being a reduced read and a deletion, we return the average number of elements between the left
|
||||
* and right elements to the deletion. We assume the deletion to be left aligned.
|
||||
*
|
||||
* @return the representative count
|
||||
*/
|
||||
public int getRepresentativeCount() {
|
||||
if (read.isReducedRead()) {
|
||||
if (isDeletion() && (offset + 1 >= read.getReadLength()) ) // deletion in the end of the read
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString()));
|
||||
|
||||
return isDeletion()
|
||||
? MathUtils.fastRound((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2.0)
|
||||
: read.getReducedCount(offset);
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjusts the representative count of this pileup element.
|
||||
* Throws an exception if this element does not represent a reduced read.
|
||||
*
|
||||
* See GATKSAMRecord.adjustReducedCount() for warnings on the permanency of this operation.
|
||||
*
|
||||
* @param adjustmentFactor how much to adjust the representative count (can be positive or negative)
|
||||
*/
|
||||
public void adjustRepresentativeCount(final int adjustmentFactor) {
|
||||
if ( read.isReducedRead() )
|
||||
read.adjustReducedCount(offset, adjustmentFactor);
|
||||
else
|
||||
throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the cigar element aligning this element to the genome
|
||||
* @return a non-null CigarElement
|
||||
|
|
|
|||
|
|
@ -811,10 +811,7 @@ public class ReadBackedPileupImpl implements ReadBackedPileup {
|
|||
@Override
|
||||
public int depthOfCoverage() {
|
||||
if (depthOfCoverage == UNINITIALIZED_CACHED_INT_VALUE) {
|
||||
depthOfCoverage = 0;
|
||||
for (PileupElement p : pileupElementTracker.unorderedIterable()) {
|
||||
depthOfCoverage += p.getRepresentativeCount();
|
||||
}
|
||||
depthOfCoverage = pileupElementTracker.size();
|
||||
}
|
||||
return depthOfCoverage;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -324,31 +324,6 @@ public class ArtificialSAMUtils {
|
|||
return Arrays.asList(left, right);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an artificial reduced read based on the parameters. The cigar string will be *M, where * is the
|
||||
* length of the read. The base counts specified in the baseCounts array will be stored fully encoded in
|
||||
* the RR attribute.
|
||||
*
|
||||
* @param header the SAM header to associate the read with
|
||||
* @param name the name of the read
|
||||
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
||||
* @param alignmentStart where to start the alignment
|
||||
* @param length the length of the read
|
||||
* @param baseCounts reduced base counts to encode in the RR attribute; length must match the read length
|
||||
* @return the artificial reduced read
|
||||
*/
|
||||
public static GATKSAMRecord createArtificialReducedRead( final SAMFileHeader header,
|
||||
final String name,
|
||||
final int refIndex,
|
||||
final int alignmentStart,
|
||||
final int length,
|
||||
final int[] baseCounts ) {
|
||||
final GATKSAMRecord read = createArtificialRead(header, name, refIndex, alignmentStart, length);
|
||||
read.setReducedReadCounts(baseCounts);
|
||||
read.setReducedReadCountsTag();
|
||||
return read;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a collection of identical artificial reads based on the parameters. The cigar string for each
|
||||
* read will be *M, where * is the length of the read.
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.sam;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
|
|
@ -51,12 +50,6 @@ import java.util.*;
|
|||
* functions, so modifying a GATKSAMRecord in any way may result in stale cached values.
|
||||
*/
|
||||
public class GATKSAMRecord extends BAMRecord {
|
||||
// ReduceReads specific attribute tags
|
||||
public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool
|
||||
public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool
|
||||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start
|
||||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end
|
||||
|
||||
// Base Quality Score Recalibrator specific attribute tags
|
||||
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions
|
||||
public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions
|
||||
|
|
@ -70,17 +63,15 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
// the SAMRecord data we're caching
|
||||
private String mReadString = null;
|
||||
private GATKSAMReadGroupRecord mReadGroup = null;
|
||||
private int[] reducedReadCounts = null;
|
||||
private final static int UNINITIALIZED = -1;
|
||||
private int softStart = UNINITIALIZED;
|
||||
private int softEnd = UNINITIALIZED;
|
||||
private Integer adapterBoundary = null;
|
||||
|
||||
private Boolean isStrandlessRead = null;
|
||||
private boolean isStrandlessRead = false;
|
||||
|
||||
// because some values can be null, we don't want to duplicate effort
|
||||
private boolean retrievedReadGroup = false;
|
||||
private boolean retrievedReduceReadCounts = false;
|
||||
|
||||
// These temporary attributes were added here to make life easier for
|
||||
// certain algorithms by providing a way to label or attach arbitrary data to
|
||||
|
|
@ -160,9 +151,6 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return true if this read doesn't have meaningful strand information
|
||||
*/
|
||||
public boolean isStrandless() {
|
||||
if ( isStrandlessRead == null ) {
|
||||
isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null;
|
||||
}
|
||||
return isStrandlessRead;
|
||||
}
|
||||
|
||||
|
|
@ -342,185 +330,6 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
return getReadGroup().getNGSPlatform();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// *** ReduceReads functions ***//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Get the counts of the bases in this reduced read
|
||||
*
|
||||
* NOTE that this is not the value of the REDUCED_READ_CONSENSUS_TAG, which
|
||||
* is encoded in a special way. This is the actual positive counts of the
|
||||
* depth at each bases. So for a RR with a tag of:
|
||||
*
|
||||
* [10, 5, -1, -5]
|
||||
*
|
||||
* this function returns
|
||||
*
|
||||
* [10, 15, 9, 5]
|
||||
*
|
||||
* as one might expect.
|
||||
*
|
||||
* @return a int[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read
|
||||
*/
|
||||
public int[] getReducedReadCounts() {
|
||||
if ( ! retrievedReduceReadCounts ) {
|
||||
final byte[] tag = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG);
|
||||
if ( tag != null ) reducedReadCounts = decodeReduceReadCounts(tag);
|
||||
retrievedReduceReadCounts = true;
|
||||
}
|
||||
|
||||
return reducedReadCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* The number of bases corresponding the i'th base of the reduced read.
|
||||
*
|
||||
* @param i the read based coordinate inside the read
|
||||
* @return the number of bases corresponding to the i'th base of the reduced read
|
||||
*/
|
||||
public final int getReducedCount(final int i) {
|
||||
if ( !isReducedRead() )
|
||||
throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced");
|
||||
if ( i < 0 || i >= getReadBases().length )
|
||||
throw new IllegalArgumentException("illegal offset used when retrieving reduced counts: " + i);
|
||||
|
||||
final int[] reducedCounts = getReducedReadCounts();
|
||||
return reducedCounts[i];
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this read a reduced read?
|
||||
* @return true if yes
|
||||
*/
|
||||
public boolean isReducedRead() {
|
||||
return getReducedReadCounts() != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the reduced read counts tag for this record.
|
||||
* Note that this method is slightly expensive as it converts to the correct reduced counts representation and sets the
|
||||
* appropriate binary tag. If you want to modify the reduced count in place without triggering the permanent conversion
|
||||
* internally, use the #setReducedCount() method.
|
||||
*
|
||||
* @param counts the count array
|
||||
*/
|
||||
public void setReducedReadCountsTag(final int[] counts) {
|
||||
setAttribute(REDUCED_READ_CONSENSUS_TAG, encodeReduceReadCounts(counts));
|
||||
retrievedReduceReadCounts = false; // need to force new decode in case we had to handle precision problems with the counts
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #setReducedReadCountsTag() and uses the currently stored values of the internal array.
|
||||
* Useful if you've been using #setReducedCount() to modify the reduced count and now want to trigger the expensive conversion.
|
||||
*/
|
||||
public void setReducedReadCountsTag() {
|
||||
if ( !retrievedReduceReadCounts )
|
||||
throw new IllegalStateException("Trying to write the reduced reads counts using an uninitialized internal array of counts");
|
||||
setReducedReadCountsTag(reducedReadCounts);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the reduced read count corresponding the i'th base of the reduced read.
|
||||
*
|
||||
* WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion
|
||||
* and push that value into the read's binary tags, use #setReducedReadCountsTag().
|
||||
*
|
||||
* @param i the read based coordinate inside the read
|
||||
* @param count the new count
|
||||
*/
|
||||
public final void setReducedCount(final int i, final int count) {
|
||||
if ( count < 0 )
|
||||
throw new IllegalArgumentException("the reduced count cannot be set to a negative value");
|
||||
if ( !isReducedRead() )
|
||||
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
|
||||
if ( i < 0 || i >= getReadBases().length )
|
||||
throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i);
|
||||
|
||||
// force the initialization of the counts array if it hasn't happened yet
|
||||
getReducedReadCounts()[i] = count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the reduced read counts tag for this record to counts
|
||||
*
|
||||
* WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion
|
||||
* and push that value into the read's binary tags, use #setReducedReadCountsTag().
|
||||
*
|
||||
* @param counts the count array
|
||||
*/
|
||||
public void setReducedReadCounts(final int[] counts) {
|
||||
if ( counts.length != getReadBases().length )
|
||||
throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length);
|
||||
retrievedReduceReadCounts = true;
|
||||
reducedReadCounts = counts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the number of bases corresponding the i'th base of the reduced read.
|
||||
*
|
||||
* WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion
|
||||
* and push that value into the read's binary tags, use #setReducedReadCountsTag().
|
||||
*
|
||||
* @param i the read based coordinate inside the read
|
||||
* @param adjustmentFactor how much to add/subtract to the current count
|
||||
*/
|
||||
public final void adjustReducedCount(final int i, final int adjustmentFactor) {
|
||||
if ( !isReducedRead() )
|
||||
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
|
||||
if ( i < 0 || i >= getReadBases().length )
|
||||
throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i);
|
||||
|
||||
setReducedCount(i, getReducedReadCounts()[i] + adjustmentFactor);
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually decode the consensus tag of a reduce read, returning a newly allocated
|
||||
* set of values countsFromTag to be the real depth of cover at each base of the reduced read.
|
||||
*
|
||||
* for example, if the tag contains [10, 5, -1, -5], after running this function the
|
||||
* byte[] will contain the true counts [10, 15, 9, 5].
|
||||
*
|
||||
* as one might expect.
|
||||
*
|
||||
* @param countsFromTag a non-null byte[] containing the tag encoded reduce reads counts
|
||||
* @return a non-null int[] containing the true depth values for the vector
|
||||
*/
|
||||
protected static int[] decodeReduceReadCounts(final byte[] countsFromTag) {
|
||||
final int n = countsFromTag.length;
|
||||
final int[] result = new int[n];
|
||||
final int firstCount = countsFromTag[0] & 0xff; // unsigned byte
|
||||
result[0] = firstCount;
|
||||
for ( int i = 1; i < n; i++ ) {
|
||||
final int offsetCount = countsFromTag[i] & 0xff; // unsigned byte
|
||||
result[i] = (firstCount + offsetCount) % 256;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts int array from straight counts to the appropriate reduce reads representation in BAM (offset from first value)
|
||||
*
|
||||
* @param counts the counts array
|
||||
* @return non-null converted byte array
|
||||
*/
|
||||
protected static byte[] encodeReduceReadCounts(final int[] counts) {
|
||||
if ( counts.length == 0 )
|
||||
throw new IllegalArgumentException("Trying to write a reduced read with a counts array of length 0");
|
||||
|
||||
final byte[] compressedCountsArray = new byte[counts.length];
|
||||
final int firstCount = (int) MathUtils.bound(counts[0], 0, 255); // we want an unsigned byte capped at max byte representation
|
||||
compressedCountsArray[0] = (byte)firstCount;
|
||||
for ( int i = 1; i < counts.length; i++ ) {
|
||||
final int count = (int) MathUtils.bound(counts[i], 0, 255);
|
||||
final byte offset = (byte)(count - firstCount + (count >= firstCount ? 0 : 256)); // unsigned byte
|
||||
compressedCountsArray[i] = offset;
|
||||
}
|
||||
|
||||
return compressedCountsArray;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// *** GATKSAMRecord specific methods ***//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
|
@ -682,11 +491,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return the alignment start of a read before it was clipped
|
||||
*/
|
||||
public int getOriginalAlignmentStart() {
|
||||
int originalAlignmentStart = getUnclippedStart();
|
||||
Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT);
|
||||
if (alignmentShift != null)
|
||||
originalAlignmentStart += alignmentShift;
|
||||
return originalAlignmentStart;
|
||||
return getUnclippedStart();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -697,11 +502,7 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
* @return the alignment end of a read before it was clipped
|
||||
*/
|
||||
public int getOriginalAlignmentEnd() {
|
||||
int originalAlignmentEnd = getUnclippedEnd();
|
||||
Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT);
|
||||
if (alignmentShift != null)
|
||||
originalAlignmentEnd -= alignmentShift;
|
||||
return originalAlignmentEnd;
|
||||
return getUnclippedEnd();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -735,7 +536,6 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
emptyRead.setCigarString("");
|
||||
emptyRead.setReadBases(new byte[0]);
|
||||
emptyRead.setBaseQualities(new byte[0]);
|
||||
if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new int[0]);
|
||||
|
||||
SAMReadGroupRecord samRG = read.getReadGroup();
|
||||
emptyRead.clearAttributes();
|
||||
|
|
|
|||
|
|
@ -57,15 +57,6 @@ public class ReadUtils {
|
|||
private static final int DEFAULT_ADAPTOR_SIZE = 100;
|
||||
public static final int CLIPPING_GOAL_NOT_REACHED = -1;
|
||||
|
||||
public static int getMeanRepresentativeReadCount(GATKSAMRecord read) {
|
||||
if (!read.isReducedRead())
|
||||
return 1;
|
||||
|
||||
// compute mean representative read counts
|
||||
final int[] counts = read.getReducedReadCounts();
|
||||
return (int)Math.round((double)MathUtils.sum(counts)/counts.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* A marker to tell which end of the read has been clipped
|
||||
*/
|
||||
|
|
@ -695,8 +686,7 @@ public class ReadUtils {
|
|||
case D:
|
||||
for (int i = 0; i < cigarElement.getLength(); i++) {
|
||||
if (refLocation >= startLocation && refLocation <= stopLocation) {
|
||||
int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1;
|
||||
coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases
|
||||
coverage[refLocation - startLocation]++;
|
||||
}
|
||||
refLocation++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -210,4 +210,21 @@ public class SAMDataSourceUnitTest extends BaseTest {
|
|||
List<SAMProgramRecord> doRemoveProgramRecords = data.getHeader().getProgramRecords();
|
||||
assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true");
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = UserException.class)
|
||||
public void testFailOnReducedReads() {
|
||||
readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags()));
|
||||
|
||||
SAMDataSource data = new SAMDataSource(readers,
|
||||
new ThreadAllocation(),
|
||||
null,
|
||||
genomeLocParser,
|
||||
false,
|
||||
SAMFileReader.ValidationStringency.SILENT,
|
||||
null,
|
||||
null,
|
||||
new ValidationExclusion(),
|
||||
new ArrayList<ReadFilter>(),
|
||||
false);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -124,16 +124,10 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
|
|||
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
|
||||
|
||||
for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) {
|
||||
for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) {
|
||||
for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) {
|
||||
if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount )
|
||||
continue;
|
||||
for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) {
|
||||
if ( toRemove <= originalNormalCount + originalReducedCount )
|
||||
tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove});
|
||||
}
|
||||
}
|
||||
for ( final int originalCount : Arrays.asList(1, 2, 10, 1000) ) {
|
||||
for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) {
|
||||
if ( toRemove <= originalCount )
|
||||
tests.add(new Object[]{header, originalCount, toRemove});
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -141,27 +135,17 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
@Test(dataProvider = "BiasedDownsamplingTest")
|
||||
public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) {
|
||||
public void testBiasedDownsampling(final SAMFileHeader header, final int originalCount, final int toRemove) {
|
||||
|
||||
final LinkedList<PileupElement> elements = new LinkedList<PileupElement>();
|
||||
for ( int i = 0; i < originalNormalCount; i++ ) {
|
||||
final LinkedList<PileupElement> elements = new LinkedList<>();
|
||||
for ( int i = 0; i < originalCount; i++ ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
|
||||
elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
|
||||
}
|
||||
if ( originalReducedCount > 0 ) {
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
|
||||
read.setReducedReadCountsTag(new int[]{originalReducedCount});
|
||||
elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
|
||||
}
|
||||
|
||||
final List<PileupElement> result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove);
|
||||
int pileupCount = 0;
|
||||
for ( final PileupElement pe : elements ) // reduced reads may have gotten modified
|
||||
pileupCount += pe.getRepresentativeCount();
|
||||
for ( final PileupElement pe : result )
|
||||
pileupCount -= pe.getRepresentativeCount();
|
||||
final List<PileupElement> result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalCount, toRemove);
|
||||
|
||||
Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove);
|
||||
Assert.assertEquals(result.size(), toRemove);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -156,36 +155,4 @@ public class FractionalDownsamplerUnitTest extends BaseTest {
|
|||
downsampler.resetStats();
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final ReadsDownsampler<GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(0.0);
|
||||
|
||||
final Collection<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts));
|
||||
}
|
||||
for ( int i = 1; i <= 5; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5));
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 5, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 10, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<GATKSAMRecord> readsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(readsReturned.size(), 10, "wrong number of items returned by the downsampler");
|
||||
|
||||
for ( GATKSAMRecord readReturned : readsReturned ) {
|
||||
Assert.assertTrue(readReturned.isReducedRead(), "non-reduced read survived the downsampling process, but shouldn't have");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,12 +25,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -164,41 +160,4 @@ public class LevelingDownsamplerUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final Downsampler<LinkedList<AlignmentStateMachine>> downsampler = new LevelingDownsampler<LinkedList<AlignmentStateMachine>, AlignmentStateMachine>(1);
|
||||
|
||||
final Collection<LinkedList<AlignmentStateMachine>> groups = new LinkedList<LinkedList<AlignmentStateMachine>>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int alignmentStart : Arrays.asList(1, 2, 3) ) {
|
||||
final LinkedList<AlignmentStateMachine> group = new LinkedList<AlignmentStateMachine>();
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
group.add(new AlignmentStateMachine(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts)));
|
||||
}
|
||||
groups.add(group);
|
||||
}
|
||||
|
||||
downsampler.submit(groups);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 30, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<LinkedList<AlignmentStateMachine>> groupsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(groupsReturned.size(), 3, "wrong number of groups returned by the downsampler");
|
||||
|
||||
for ( LinkedList<AlignmentStateMachine> group : groupsReturned ) {
|
||||
Assert.assertEquals(group.size(), 10, "group has wrong size after downsampling");
|
||||
|
||||
for ( AlignmentStateMachine state : group ) {
|
||||
Assert.assertTrue(state.isReducedRead());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -129,46 +128,4 @@ public class ReservoirDownsamplerUnitTest extends BaseTest {
|
|||
downsampler.resetStats();
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final ReadsDownsampler<GATKSAMRecord> downsampler = new ReservoirDownsampler<GATKSAMRecord>(1);
|
||||
|
||||
final Collection<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts));
|
||||
}
|
||||
for ( int i = 1; i <= 5; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5));
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 4, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 11, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<GATKSAMRecord> readsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(readsReturned.size(), 11, "wrong number of items returned by the downsampler");
|
||||
|
||||
int numReducedReadsReturned = 0;
|
||||
int numNormalReadsReturned = 0;
|
||||
for ( GATKSAMRecord readReturned : readsReturned ) {
|
||||
if ( readReturned.isReducedRead() ) {
|
||||
numReducedReadsReturned++;
|
||||
}
|
||||
else {
|
||||
numNormalReadsReturned++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(numReducedReadsReturned, 10, "wrong number of reduced reads returned by the downsampler");
|
||||
Assert.assertEquals(numNormalReadsReturned, 1, "wrong number of non-reduced reads returned by the downsampler");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -328,48 +328,4 @@ public class SimplePositionalDownsamplerUnitTest extends BaseTest {
|
|||
|
||||
Assert.assertEquals(downsampledReads.size(), 10);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoNotDiscardReducedReads() {
|
||||
GenomeAnalysisEngine.resetRandomGenerator();
|
||||
final ReadsDownsampler<GATKSAMRecord> downsampler = new SimplePositionalDownsampler<GATKSAMRecord>(1);
|
||||
|
||||
final Collection<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int alignmentStart : Arrays.asList(1, 2, 3) ) {
|
||||
for ( int i = 1; i <= 10; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts));
|
||||
}
|
||||
for ( int i = 1; i <= 5; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, alignmentStart, 5));
|
||||
}
|
||||
}
|
||||
|
||||
downsampler.submit(reads);
|
||||
downsampler.signalEndOfInput();
|
||||
|
||||
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 12, "wrong number of items discarded by the downsampler");
|
||||
Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't");
|
||||
Assert.assertEquals(downsampler.size(), 33, "downsampler size() reports wrong number of items");
|
||||
|
||||
final Collection<GATKSAMRecord> readsReturned = downsampler.consumeFinalizedItems();
|
||||
|
||||
Assert.assertEquals(readsReturned.size(), 33, "wrong number of items returned by the downsampler");
|
||||
|
||||
int numReducedReadsReturned = 0;
|
||||
int numNormalReadsReturned = 0;
|
||||
for ( GATKSAMRecord readReturned : readsReturned ) {
|
||||
if ( readReturned.isReducedRead() ) {
|
||||
numReducedReadsReturned++;
|
||||
}
|
||||
else {
|
||||
numNormalReadsReturned++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(numReducedReadsReturned, 30, "wrong number of reduced reads returned by the downsampler");
|
||||
Assert.assertEquals(numNormalReadsReturned, 3, "wrong number of non-reduced reads returned by the downsampler");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,11 +26,9 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
|
|
@ -41,7 +39,6 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
public class TAROrderedReadCacheUnitTest extends BaseTest {
|
||||
|
|
@ -104,47 +101,6 @@ public class TAROrderedReadCacheUnitTest extends BaseTest {
|
|||
verifySortednessOfReads(cacheReads);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadCacheWithReducedReads() {
|
||||
final List<GATKSAMRecord> reads = new ArrayList<GATKSAMRecord>();
|
||||
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
final int[] baseCounts = { 10, 10, 10, 10, 10 };
|
||||
|
||||
for ( int i = 1; i <= 100; i++ ) {
|
||||
reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, i, 5, baseCounts));
|
||||
reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, i, 5));
|
||||
}
|
||||
|
||||
final TAROrderedReadCache cache = new TAROrderedReadCache(50);
|
||||
|
||||
cache.addAll(reads);
|
||||
|
||||
// Our cache should have kept all of the reduced reads (which are retained unconditionally and do not count
|
||||
// towards the capacity limit), and discarded half of the 100 non-reduced reads due to the cache capacity
|
||||
// limit of 50.
|
||||
Assert.assertEquals(cache.size(), 150, "wrong number of reads in the cache at the end");
|
||||
Assert.assertEquals(cache.getNumDiscarded(), 50, "wrong number of reads discarded from the cache");
|
||||
|
||||
final List<GATKSAMRecord> cacheReads = cache.popCurrentReads();
|
||||
|
||||
int numReducedReadsRetained = 0;
|
||||
int numNormalReadsRetained = 0;
|
||||
|
||||
for ( GATKSAMRecord read : cacheReads ) {
|
||||
if ( read.isReducedRead() ) {
|
||||
numReducedReadsRetained++;
|
||||
}
|
||||
else {
|
||||
numNormalReadsRetained++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(numReducedReadsRetained, 100, "wrong number of reduced reads retained in the cache");
|
||||
Assert.assertEquals(numNormalReadsRetained, 50, "wrong number of non-reduced reads retained in the cache");
|
||||
|
||||
verifySortednessOfReads(cacheReads);
|
||||
}
|
||||
|
||||
private void verifySortednessOfReads( final List<GATKSAMRecord> reads) {
|
||||
int lastStart = -1;
|
||||
for ( GATKSAMRecord read : reads ) {
|
||||
|
|
|
|||
|
|
@ -67,13 +67,4 @@ public class CallableLociIntegrationTest extends WalkerTest {
|
|||
Arrays.asList("7f79ad8195c4161060463eeb21d2bb11", "7ee269e5f4581a924529a356cc806e55"));
|
||||
executeTest("formatBed lots of arguments", spec);
|
||||
}
|
||||
|
||||
@Test(enabled=true)
|
||||
public void testWithReducedRead() {
|
||||
String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1,
|
||||
Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0"));
|
||||
executeTest("CallableLoci with ReducedRead", spec);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -411,22 +411,6 @@ public class ReadClipperUnitTest extends BaseTest {
|
|||
|
||||
}
|
||||
|
||||
@Test(enabled = !DEBUG)
|
||||
public void testHardClipReducedRead() {
|
||||
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M");
|
||||
final int[] counts = new int[read.getReadLength()];
|
||||
for ( int i = 0; i < counts.length; i++ ) counts[i] = i;
|
||||
read.setReducedReadCounts(counts);
|
||||
int alnStart = read.getAlignmentStart();
|
||||
int alnEnd = read.getAlignmentEnd();
|
||||
int readLength = read.getReadLength();
|
||||
for (int i = 0; i < readLength / 2; i++) {
|
||||
GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i);
|
||||
final int[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1);
|
||||
Assert.assertEquals(clippedRead.getReducedReadCounts(), expectedReducedCounts);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = !DEBUG)
|
||||
public void testRevertEntirelySoftclippedReads() {
|
||||
GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("2H1S3H");
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.pileup;
|
|||
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine;
|
||||
import org.broadinstitute.sting.utils.locusiterator.LIBS_position;
|
||||
|
|
@ -126,7 +125,6 @@ public class PileupElementUnitTest extends LocusIteratorByStateBaseTest {
|
|||
// TODO -- add meaningful tests
|
||||
pe.getBaseInsertionQual();
|
||||
pe.getBaseDeletionQual();
|
||||
pe.getRepresentativeCount();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -27,22 +27,15 @@ package org.broadinstitute.sting.utils.sam;
|
|||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class GATKSAMRecordUnitTest extends BaseTest {
|
||||
GATKSAMRecord read, reducedRead;
|
||||
GATKSAMRecord read;
|
||||
final static String BASES = "ACTG";
|
||||
final static String QUALS = "!+5?";
|
||||
final private static int[] REDUCED_READ_COUNTS = new int[]{10, 20, 30, 40};
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
|
|
@ -51,121 +44,6 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
read.setReadUnmappedFlag(true);
|
||||
read.setReadBases(new String(BASES).getBytes());
|
||||
read.setBaseQualityString(new String(QUALS));
|
||||
|
||||
reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length());
|
||||
reducedRead.setReadBases(BASES.getBytes());
|
||||
reducedRead.setBaseQualityString(QUALS);
|
||||
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedReads() {
|
||||
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS);
|
||||
|
||||
Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read");
|
||||
Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read");
|
||||
|
||||
Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read");
|
||||
for (int i = 0; i < reducedRead.getReadLength(); i++) {
|
||||
Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testGetReducedCountOnNormalRead() {
|
||||
read.getReducedCount(0);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testSetReducedTagOnNormalRead() {
|
||||
read.setReducedCount(0, 2);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalArgumentException.class)
|
||||
public void testAdjustReducedCountToNegativeNumber() {
|
||||
reducedRead.setReducedCount(0, 1);
|
||||
reducedRead.adjustReducedCount(0, -2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetReducedCountOnReducedRead() {
|
||||
for (int i = 0; i < reducedRead.getReadLength(); i++) {
|
||||
final byte newCount = (byte)i;
|
||||
reducedRead.setReducedCount(i, newCount);
|
||||
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
|
||||
}
|
||||
|
||||
for (int i = 0; i < reducedRead.getReadLength(); i++) {
|
||||
final int newCount = reducedRead.getReducedCount(i) + i;
|
||||
reducedRead.adjustReducedCount(i, i);
|
||||
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedReadEncodeAndDecode() {
|
||||
|
||||
// encode
|
||||
byte[] encoded = GATKSAMRecord.encodeReduceReadCounts(REDUCED_READ_COUNTS);
|
||||
|
||||
// decode
|
||||
int[] decoded = GATKSAMRecord.decodeReduceReadCounts(encoded);
|
||||
|
||||
// for the heck of it, let's encode and decode again!
|
||||
encoded = GATKSAMRecord.encodeReduceReadCounts(decoded);
|
||||
decoded = GATKSAMRecord.decodeReduceReadCounts(encoded);
|
||||
|
||||
for (int i = 0; i < decoded.length; i++)
|
||||
Assert.assertEquals(decoded[i], REDUCED_READ_COUNTS[i]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testByteBoundsOnReducedTag() {
|
||||
reducedRead.setReducedCount(0, 1000);
|
||||
reducedRead.setReducedReadCountsTag();
|
||||
reducedRead.adjustReducedCount(0, -255);
|
||||
Assert.assertEquals(reducedRead.getReducedCount(0), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReducedReadPileupElement() {
|
||||
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS);
|
||||
|
||||
PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0);
|
||||
PileupElement reducedreadp = LocusIteratorByState.createPileupForReadAndOffset(reducedRead, 0);
|
||||
|
||||
Assert.assertFalse(readp.getRead().isReducedRead());
|
||||
|
||||
Assert.assertTrue(reducedreadp.getRead().isReducedRead());
|
||||
Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]);
|
||||
Assert.assertEquals(reducedreadp.getQual(), readp.getQual());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetOriginalAlignments() {
|
||||
final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'};
|
||||
final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 };
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M");
|
||||
|
||||
// A regular read with all matches
|
||||
Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart());
|
||||
Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd());
|
||||
|
||||
// Alignment start shifted
|
||||
int alignmentShift = 2;
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, alignmentShift);
|
||||
Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart());
|
||||
Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd());
|
||||
|
||||
// Both alignments shifted
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, alignmentShift);
|
||||
Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart());
|
||||
Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd());
|
||||
|
||||
// Alignment end shifted
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null);
|
||||
Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart());
|
||||
Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -197,36 +75,4 @@ public class GATKSAMRecordUnitTest extends BaseTest {
|
|||
read.setIsStrandless(true);
|
||||
read.setReadNegativeStrandFlag(true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetReducedCountsIsCorrect() {
|
||||
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS);
|
||||
final int[] counts = reducedRead.getReducedReadCounts();
|
||||
Assert.assertNotSame(counts, reducedRead.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG));
|
||||
for ( int i = 0; i < counts.length; i++ )
|
||||
Assert.assertEquals(counts[i], reducedRead.getReducedCount(i), "Reduced counts vector not equal to getReducedCount(i) at " + i);
|
||||
}
|
||||
|
||||
@DataProvider(name = "ReducedReadCountConversionProvider")
|
||||
public Object[][] ReducedReadCountConversionTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{new int[] {100, 100, 100, 101}, new byte[] {100, 0, 0, 1}});
|
||||
tests.add(new Object[]{new int[] {1, 100, 100, 0}, new byte[] {1, 99, 99, -1}});
|
||||
tests.add(new Object[]{new int[] {127, 100, 0, 1}, new byte[] {127, -27, -127, -126}});
|
||||
tests.add(new Object[]{new int[] {1, 127, 51, 126}, new byte[] {1, 126, 50, 125}});
|
||||
tests.add(new Object[]{new int[] {300, 127, 1, 255}, new byte[] {-1, -128, 2, 0}});
|
||||
tests.add(new Object[]{new int[] {1, 300, 51, 126}, new byte[] {1, -2, 50, 125}});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "ReducedReadCountConversionProvider", enabled = true)
|
||||
public void reducedReadCountConversionTest(final int[] counts, final byte[] expectedConversion) {
|
||||
|
||||
reducedRead.setReducedReadCountsTag(counts);
|
||||
final byte[] actualConversion = reducedRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG);
|
||||
for ( int i = 0; i < actualConversion.length; i++ )
|
||||
Assert.assertEquals(actualConversion[i], expectedConversion[i], "Conversion differs at position " + i + ": " + actualConversion[i] + " vs. " + expectedConversion[i]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue