Merge remote-tracking branch 'unstable/master'
This commit is contained in:
commit
e29b52b9a5
|
|
@ -1,7 +1,6 @@
|
|||
/*.bam
|
||||
/*.bai
|
||||
/*.bed
|
||||
*.idx
|
||||
*~
|
||||
/*.vcf
|
||||
/*.txt
|
||||
|
|
|
|||
65
build.xml
65
build.xml
|
|
@ -91,9 +91,8 @@
|
|||
<property name="key.dir" value="${public.dir}/keys" />
|
||||
|
||||
<!-- Contracts for Java -->
|
||||
<!-- By default, enabled only for test targets -->
|
||||
<!-- To disable for test targets, run with -Duse.contracts=false -->
|
||||
<!-- To enable for non-test targets, run with -Duse.contracts=true -->
|
||||
<!-- Disabled by default -->
|
||||
<!-- To enable, run with -Duse.contracts=true -->
|
||||
<property name="java.contracts.dir" value="${build.dir}/java/contracts" />
|
||||
<property name="contracts.version" value="1.0-r139" />
|
||||
<property name="cofoja.jar" value="${lib.dir}/cofoja-${contracts.version}.jar"/>
|
||||
|
|
@ -675,8 +674,9 @@
|
|||
<include name="org/broadinstitute/sting/utils/GenomeLocParser*.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/GenomeLoc.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/HasGenomeLocation.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/BaseUtils.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/BaseUtils*.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/Utils.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/MRUCaching*.class"/>
|
||||
<include name="org/broadinstitute/sting/utils/exceptions/**/*.class"/>
|
||||
<include name="org/broadinstitute/sting/gatk/walkers/na12878kb/core/**/*.class"/>
|
||||
<include name="net/sf/picard/reference/FastaSequenceFile.class"/>
|
||||
|
|
@ -865,14 +865,18 @@
|
|||
<property name="executable" value="GenomeAnalysisTK" />
|
||||
</target>
|
||||
|
||||
<target name="init.executable.queueall" depends="init.build.all, init.javaandscala">
|
||||
<property name="executable" value="Queue" />
|
||||
<target name="init.executable.gatkall" depends="init.build.all, init.javaonly">
|
||||
<property name="executable" value="GenomeAnalysisTK" />
|
||||
</target>
|
||||
|
||||
<target name="init.executable.queuefull" depends="init.build.publicprotectedonly, init.javaandscala">
|
||||
<property name="executable" value="Queue" />
|
||||
</target>
|
||||
|
||||
<target name="init.executable.queueall" depends="init.build.all, init.javaandscala">
|
||||
<property name="executable" value="Queue" />
|
||||
</target>
|
||||
|
||||
<target name="require.executable">
|
||||
<condition property="no.executable.defined">
|
||||
<or>
|
||||
|
|
@ -921,12 +925,17 @@
|
|||
</target>
|
||||
|
||||
<!-- Package specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
|
||||
|
||||
<!-- GATK "full" == public + protected, ie., the standard binary release of the GATK -->
|
||||
<target name="package.gatk.full" depends="init.executable.gatkfull,package" />
|
||||
|
||||
<target name="package.queue.all" depends="init.executable.queueall,package" />
|
||||
<!-- GATK "all" == public + protected + private. Should never be publicly released -->
|
||||
<target name="package.gatk.all" depends="init.executable.gatkall,package" />
|
||||
|
||||
<target name="package.queue.full" depends="init.executable.queuefull,package" />
|
||||
|
||||
<target name="package.queue.all" depends="init.executable.queueall,package" />
|
||||
|
||||
<!-- Release a build. Don't call this target directly. Call one of the specific release targets below -->
|
||||
<target name="release" depends="require.executable" description="release a build, putting each file in a location specified by the package">
|
||||
<ant antfile="${package.output.dir}/${executable}.xml" target="release" />
|
||||
|
|
@ -1104,7 +1113,7 @@
|
|||
|
||||
<path id="testng.default.classpath">
|
||||
<path refid="build.results" />
|
||||
<pathelement path="${clover.jar}"/>
|
||||
<pathelement path="${clover.jar}"/>
|
||||
<pathelement location="${java.contracts.dir}" />
|
||||
<pathelement location="${java.test.classes}" />
|
||||
<pathelement location="${scala.test.classes}" />
|
||||
|
|
@ -1114,7 +1123,7 @@
|
|||
|
||||
<target name="clover.report">
|
||||
<clover-report coverageCacheSize="nocache">
|
||||
<current outfile="clover_html" title="GATK clover report" showUniqueCoverage="false" numThreads="4">
|
||||
<current outfile="clover_html" title="GATK clover report" showUniqueCoverage="false" numThreads="4">
|
||||
<format type="html" filter="catch,static,property"/>
|
||||
<fileset dir="public">
|
||||
<patternset id="clover.excludes">
|
||||
|
|
@ -1194,8 +1203,8 @@
|
|||
</scalac>
|
||||
</target>
|
||||
|
||||
<!-- NOTE: contracts enabled for all tests -->
|
||||
<target name="test.compile" depends="init.usecontracts,dist,test.java.compile,test.scala.compile" />
|
||||
<!-- NOTE: contracts disabled for all tests now, since contracts don't work with Java 7 -->
|
||||
<target name="test.compile" depends="dist,test.java.compile,test.scala.compile" />
|
||||
|
||||
|
||||
<!-- Run test macro -->
|
||||
|
|
@ -1244,7 +1253,7 @@
|
|||
listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter">
|
||||
<jvmarg value="-Xmx${test.maxmemory}" />
|
||||
<jvmarg value="-ea" />
|
||||
<jvmarg value="-Dclover.pertest.coverage=diff" />
|
||||
<jvmarg value="-Dclover.pertest.coverage=diff" />
|
||||
<jvmarg value="-Djava.awt.headless=true" />
|
||||
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
|
||||
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
|
||||
|
|
@ -1287,7 +1296,7 @@
|
|||
|
||||
<target name="test.init">
|
||||
<property name="testng.classpath" value="testng.default.classpath" />
|
||||
<property name="test.maxmemory" value="${test.default.maxmemory}"/>
|
||||
<property name="test.maxmemory" value="${test.default.maxmemory}"/>
|
||||
</target>
|
||||
|
||||
<target name="init.testgatkjar">
|
||||
|
|
@ -1331,9 +1340,9 @@
|
|||
<target name="committests" depends="unittest,integrationtest,pipelinetest" />
|
||||
|
||||
<!-- Order of the dependencies is significant in the *.release.tests targets -->
|
||||
<target name="gatkfull.binary.release.tests" depends="init.usecontracts,package.gatk.full,init.testgatkjar,unittest,integrationtest" />
|
||||
<target name="gatkfull.binary.release.tests" depends="package.gatk.full,init.testgatkjar,unittest,integrationtest" />
|
||||
|
||||
<target name="queuefull.binary.release.tests" depends="init.usecontracts,package.queue.full,init.testqueuejar,pipelinetest" />
|
||||
<target name="queuefull.binary.release.tests" depends="package.queue.full,init.testqueuejar,pipelinetest" />
|
||||
|
||||
<!-- Our four different test types: UnitTest, IntegrationTest, LargeScaleTest, PipelineTest -->
|
||||
<target name="unittest" depends="test.compile,test.init" description="Run unit tests">
|
||||
|
|
@ -1442,4 +1451,30 @@
|
|||
|
||||
<run-test testtype="${single}" outputdir="${report}/${single}" runfailed="false"/>
|
||||
</target>
|
||||
|
||||
<!-- A target that runs a test without doing ANY compilation or any extra work at all -->
|
||||
<!-- Intended to enable parallel tests that share the same working directory and build -->
|
||||
<target name="runtestonly">
|
||||
<condition property="not.clean">
|
||||
<and>
|
||||
<available file="${build.dir}" />
|
||||
<available file="${lib.dir}" />
|
||||
<available file="${dist.dir}" />
|
||||
<available file="${java.test.classes}" />
|
||||
</and>
|
||||
</condition>
|
||||
<fail message="runtestonly target requires a NON-CLEAN working directory (INCLUDING test classes). Do a full test build using ant test.compile first." unless="not.clean" />
|
||||
|
||||
<condition property="no.single.test.specified">
|
||||
<equals arg1="${single}" arg2="$${single}" />
|
||||
</condition>
|
||||
<fail message="Must specify a specific test. Usage: ant runtestonly -Dsingle=TestClass" if="no.single.test.specified" />
|
||||
|
||||
<property name="testng.classpath" value="testng.default.classpath" />
|
||||
<property name="test.maxmemory" value="${test.default.maxmemory}"/>
|
||||
<property name="include.scala" value="true" />
|
||||
|
||||
<run-test testtype="${single}" outputdir="${report}/${single}" runfailed="false"/>
|
||||
</target>
|
||||
|
||||
</project>
|
||||
|
|
|
|||
2
ivy.xml
2
ivy.xml
|
|
@ -41,6 +41,8 @@
|
|||
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
<dependency org="it.unimi.dsi" name="fastutil" rev="6.5.3" />
|
||||
|
||||
<!-- <dependency org="jboss" name="javassist" rev="3.7.ga"/> -->
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
|
||||
<dependency org="org.apache.bcel" name="bcel" rev="5.2"/>
|
||||
|
|
|
|||
|
|
@ -55,7 +55,9 @@ import org.broadinstitute.variant.variantcontext.VariantContext;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
|
@ -74,6 +76,12 @@ public class StandardCallerArgumentCollection {
|
|||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
|
||||
|
||||
/**
|
||||
* This argument informs the prior probability of having an indel at a site.
|
||||
*/
|
||||
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double INDEL_HETEROZYGOSITY = 1.0/8000;
|
||||
|
||||
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
|
||||
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
|
||||
|
||||
|
|
@ -112,6 +120,29 @@ public class StandardCallerArgumentCollection {
|
|||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 6;
|
||||
|
||||
/**
|
||||
* By default, the prior specified with the argument --heterozygosity/-hets is used for variant discovery at a particular locus, using an infinite sites model,
|
||||
* see e.g. Watterson (1975) or Tajima (1996).
|
||||
* This model asserts that the probability of having a population of k variant sites in N chromosomes is proportional to theta/k, for k=1:N
|
||||
*
|
||||
* There are instances where using this prior might not be desirable, e.g. for population studies where this prior might not be appropriate,
|
||||
* as for example when the ancestral status of the reference allele is not known.
|
||||
* By using this argument, the user can manually specify priors to be used for calling as a vector of doubles, with the following restrictions:
|
||||
* a) User must specify 2N values, where N is the number of samples.
|
||||
* b) Only diploid calls supported.
|
||||
* c) Probability values are specified in double format, in linear space.
|
||||
* d) No negative values allowed.
|
||||
* e) Values will be added and Pr(AC=0) will be 1-sum, so that they sum up to one.
|
||||
* f) If user-defined values add to more than one, an error will be produced.
|
||||
*
|
||||
* If user wants completely flat priors, then user should specify the same value (=1/(2*N+1)) 2*N times, e.g.
|
||||
* -inputPrior 0.33 -inputPrior 0.33
|
||||
* for the single-sample diploid case.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "input_prior", shortName = "inputPrior", doc = "Input prior for calls", required = false)
|
||||
public List<Double> inputPrior = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
|
||||
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
|
||||
|
|
@ -155,10 +186,6 @@ public class StandardCallerArgumentCollection {
|
|||
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
|
||||
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
|
||||
public PrintStream contaminationLog = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
|
|
@ -170,15 +197,16 @@ public class StandardCallerArgumentCollection {
|
|||
this.alleles = SCAC.alleles;
|
||||
this.GenotypingMode = SCAC.GenotypingMode;
|
||||
this.heterozygosity = SCAC.heterozygosity;
|
||||
this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY;
|
||||
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
|
||||
this.OutputMode = SCAC.OutputMode;
|
||||
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
|
||||
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
|
||||
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
|
||||
this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE;
|
||||
this.contaminationLog = SCAC.contaminationLog;
|
||||
this.exactCallsLog = SCAC.exactCallsLog;
|
||||
this.sampleContamination=SCAC.sampleContamination;
|
||||
this.AFmodel = SCAC.AFmodel;
|
||||
this.inputPrior = SCAC.inputPrior;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -58,8 +59,12 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).
|
||||
* Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele).</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*/
|
||||
public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
public List<String> getKeyNames() { return Arrays.asList("BaseQRankSum"); }
|
||||
|
|
@ -86,13 +91,13 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot
|
|||
}
|
||||
|
||||
for (Map<Allele,Double> el : alleleLikelihoodMap.getLikelihoodMapValues()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el);
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el);
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
refQuals.add(-10.0*(double)el.get(a));
|
||||
else if (allAlleles.contains(a))
|
||||
altQuals.add(-10.0*(double)el.get(a));
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele()));
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add(-10.0*(double)el.get(a.getMostLikelyAllele()));
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,9 +65,15 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Allele count in genotypes, for each ALT allele, in the same order as listed;
|
||||
* allele Frequency, for each ALT allele, in the same order as listed; total number
|
||||
* of alleles in called genotypes.
|
||||
* Allele counts and frequency for each ALT allele and total number of alleles in called genotypes
|
||||
*
|
||||
* <p>This annotation tool outputs the following:
|
||||
*
|
||||
* <ul>
|
||||
* <li>Allele count in genotypes, for each ALT allele, in the same order as listed</li>
|
||||
* <li>Allele Frequency, for each ALT allele, in the same order as listed</li>
|
||||
* <li>Total number of alleles in called genotypes</li>
|
||||
* </ul></p>
|
||||
*/
|
||||
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -57,14 +58,15 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 6/28/12
|
||||
*/
|
||||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele)
|
||||
* Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele).</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since 6/28/12
|
||||
*/
|
||||
public class ClippingRankSumTest extends RankSumTest {
|
||||
|
||||
|
|
@ -83,12 +85,12 @@ public class ClippingRankSumTest extends RankSumTest {
|
|||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey()));
|
||||
else if (allAlleles.contains(a))
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey()));
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,10 +70,11 @@ import java.util.Map;
|
|||
/**
|
||||
* Total (unfiltered) depth over all samples.
|
||||
*
|
||||
* While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's
|
||||
* <p>While the sample-level (FORMAT) DP field describes the total depth of reads that passed the caller's
|
||||
* internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
|
||||
* over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
|
||||
* N samples with -dcov D is N * D
|
||||
* </p>
|
||||
*/
|
||||
public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
|
||||
|
|
@ -72,11 +73,11 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* The depth of coverage of each VCF allele in this sample.
|
||||
* The depth of coverage of each allele per sample
|
||||
*
|
||||
* The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
|
||||
* <p>The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
|
||||
* sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
|
||||
* Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
|
||||
* caller's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
|
||||
* REF and ALT fields) is the unfiltered count of all reads that carried with them the
|
||||
* REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
|
||||
* power I have to determine the genotype of the sample at this site, while the AD tells me how many times
|
||||
|
|
@ -86,10 +87,12 @@ import java.util.Map;
|
|||
* normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
|
||||
* the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
|
||||
* Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
|
||||
* many non-informatice reads.
|
||||
* Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation,
|
||||
* many non-informative reads.</p>
|
||||
*
|
||||
* <p>Because the AD includes reads and bases that were filtered by the caller and in case of indels is based on a statistical computation,
|
||||
* <b>one should not base assumptions about the underlying genotype based on it</b>;
|
||||
* instead, the genotype likelihoods (PLs) are what determine the genotype calls.
|
||||
* instead, the genotype likelihoods (PLs) are what determine the genotype calls.</p>
|
||||
*
|
||||
*/
|
||||
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
@ -139,12 +142,12 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
if (!vc.getAlleles().contains(a))
|
||||
if (!vc.getAlleles().contains(a.getMostLikelyAllele()))
|
||||
continue; // sanity check - shouldn't be needed
|
||||
alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
alleleCounts.put(a.getMostLikelyAllele(), alleleCounts.get(a.getMostLikelyAllele()) + (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1));
|
||||
}
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference());
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import cern.jet.math.Arithmetic;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -54,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
|
@ -68,12 +70,19 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
|
||||
* being seen on only the forward or only the reverse strand) in the reads? More bias is
|
||||
* indicative of false positive calls. Note that the fisher strand test may not be
|
||||
* calculated for certain complex indel cases or for multi-allelic sites.
|
||||
* Phred-scaled p-value using Fisher's Exact Test to detect strand bias
|
||||
*
|
||||
* <p>Phred-scaled p-value using Fisher's Exact Test to detect strand bias (the variation
|
||||
* being seen on only the forward or only the reverse strand) in the reads. More bias is
|
||||
* indicative of false positive calls.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.</p>
|
||||
*/
|
||||
public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
private final static Logger logger = Logger.getLogger(FisherStrand.class);
|
||||
|
||||
private static final String FS = "FS";
|
||||
private static final double MIN_PVALUE = 1E-320;
|
||||
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
|
||||
|
|
@ -95,6 +104,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
else if (stratifiedPerReadAlleleLikelihoodMap != null) {
|
||||
// either SNP with no alignment context, or indels: per-read likelihood map needed
|
||||
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc);
|
||||
// logger.info("VC " + vc);
|
||||
// printTable(table, 0.0);
|
||||
return pValueForBestTable(table, null);
|
||||
}
|
||||
else
|
||||
|
|
@ -131,9 +142,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
private Map<String, Object> annotationForOneTable(final double pValue) {
|
||||
final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs
|
||||
return Collections.singletonMap(FS, value);
|
||||
// Map<String, Object> map = new HashMap<String, Object>();
|
||||
// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)));
|
||||
// return map;
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
|
|
@ -192,7 +200,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
|
||||
private static void printTable(int[][] table, double pValue) {
|
||||
System.out.printf("%d %d; %d %d : %f\n", table[0][0], table[0][1], table[1][0], table[1][1], pValue);
|
||||
logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue));
|
||||
}
|
||||
|
||||
private static boolean rotateTable(int[][] table) {
|
||||
|
|
@ -266,10 +274,10 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
|
||||
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
updateTable(table, mostLikelyAllele, read, ref, alt, representativeCount);
|
||||
updateTable(table, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -306,22 +314,31 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
}
|
||||
|
||||
private static void updateTable(final int[][] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) {
|
||||
// ignore reduced reads because they are always on the forward strand!
|
||||
// TODO -- when het compression is enabled in RR, we somehow need to allow those reads through into the Fisher test
|
||||
if ( read.isReducedRead() )
|
||||
return;
|
||||
|
||||
final boolean matchesRef = allele.equals(ref, true);
|
||||
final boolean matchesAlt = allele.equals(alt, true);
|
||||
|
||||
if ( matchesRef || matchesAlt ) {
|
||||
final int row = matchesRef ? 0 : 1;
|
||||
|
||||
final boolean isFW = !read.getReadNegativeStrandFlag();
|
||||
if ( read.isStrandless() ) {
|
||||
|
||||
int row = matchesRef ? 0 : 1;
|
||||
int column = isFW ? 0 : 1;
|
||||
// ignore strandless reduced reads because they are always on the forward strand!
|
||||
if ( !read.isReducedRead() ) {
|
||||
|
||||
table[row][column] += representativeCount;
|
||||
// a strandless read counts as observations on both strands, at 50% weight, with a minimum of 1
|
||||
// (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
|
||||
// if the read is only seen once, because it's a merged read or other)
|
||||
final int toAdd = Math.max(representativeCount / 2, 1);
|
||||
table[row][0] += toAdd;
|
||||
table[row][1] += toAdd;
|
||||
}
|
||||
} else {
|
||||
// a normal read with an actual strand
|
||||
final boolean isFW = !read.getReadNegativeStrandFlag();
|
||||
final int column = isFW ? 0 : 1;
|
||||
table[row][column] += representativeCount;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,10 +68,16 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* The GC content (# GC bases / # all bases) of the reference within 50 bp +/- this site
|
||||
* GC content of the reference around the given site
|
||||
*
|
||||
* <p>The GC content is the number of GC bases relative to the total number of bases (# GC bases / # all bases) around this site on the reference.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The window size used to calculate the GC content around the site is set by the tool used for annotation
|
||||
* (currently UnifiedGenotyper, HaplotypeCaller or VariantAnnotator). See the Technical Document for each tool
|
||||
* to find out what window size they use.</p>
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
public class GCContent extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -87,7 +93,7 @@ public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnota
|
|||
|
||||
public List<String> getKeyNames() { return Arrays.asList("GC"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content within 20 bp +/- the variant")); }
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("GC", 1, VCFHeaderLineType.Integer, "GC content around the variant (see docs for window size details)")); }
|
||||
|
||||
public boolean useZeroQualityReads() { return false; }
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
|
|
@ -68,9 +69,16 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* Phred-scaled P value of genotype-based (using GT field) test for Hardy-Weinberg test for disequilibrium
|
||||
* Hardy-Weinberg test for disequilibrium
|
||||
*
|
||||
* <p>This annotation calculates the Phred-scaled P value of the genotype-based (using GT field) Hardy-Weinberg test for disequilibrium.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <h4>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</h4>
|
||||
* <p>Right now we just ignore genotypes that are not confident, but this throws off our HW ratios.
|
||||
* More analysis is needed to determine the right thing to do when the genotyper cannot decide whether a given sample is het or hom var.</p>
|
||||
*/
|
||||
public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgressAnnotation {
|
||||
public class HardyWeinberg extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
private static final int MIN_SAMPLES = 10;
|
||||
private static final int MIN_GENOTYPE_QUALITY = 10;
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -63,9 +64,16 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Largest contiguous homopolymer run of the variant allele in either direction on the reference. Computed only for bi-allelic sites.
|
||||
* Largest contiguous homopolymer run of the variant allele
|
||||
*
|
||||
* <p>Calculates the length of the largest contiguous homopolymer run of the variant allele in either direction on the reference.</p>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <p>This can only be computed for bi-allelic sites.</p>
|
||||
* <h4>This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.</h4>
|
||||
* <p>This needs to be computed in a more accurate manner. We currently look only at direct runs of the alternate allele adjacent to this position.</p>
|
||||
*/
|
||||
public class HomopolymerRun extends InfoFieldAnnotation {
|
||||
public class HomopolymerRun extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
private boolean ANNOTATE_INDELS = true;
|
||||
|
||||
|
|
|
|||
|
|
@ -65,13 +65,20 @@ import org.broadinstitute.variant.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation
|
||||
* versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is
|
||||
* diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than
|
||||
* the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios.
|
||||
* Likelihood of being a Mendelian Violation
|
||||
*
|
||||
* <p>Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a Mendelian violation
|
||||
* versus the likelihood of the site transmitting according to Mendelian rules.</p>
|
||||
*
|
||||
* <p>Note that this annotation requires a valid ped file.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>This tool assumes that the organism is diploid. When multiple trios are present, the annotation is simply the maximum
|
||||
* of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain
|
||||
* sites and many trios.</p>
|
||||
*/
|
||||
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation {
|
||||
|
||||
private MendelianViolation mendelianViolation = null;
|
||||
public static final String MVLR_KEY = "MVLR";
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -59,8 +60,12 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele)
|
||||
* Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele).</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*/
|
||||
public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
|
|
@ -88,13 +93,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn
|
|||
return;
|
||||
}
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : likelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
// BUGBUG: There needs to be a comparable isUsableBase check here
|
||||
if (a.isNoCall())
|
||||
if (! a.isInformative())
|
||||
continue; // read is non-informative
|
||||
if (a.isReference())
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)el.getKey().getMappingQuality());
|
||||
else if (allAlleles.contains(a))
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)el.getKey().getMappingQuality());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -54,16 +55,14 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Variant confidence (from the QUAL field) / unfiltered depth of non-reference samples. Note that the QD is also normalized by event length.
|
||||
|
|
@ -72,6 +71,7 @@ import java.util.Map;
|
|||
* reads associated with the samples with polymorphic genotypes.
|
||||
*/
|
||||
public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
// private final static Logger logger = Logger.getLogger(QualByDepth.class);
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -113,13 +113,37 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( depth == 0 )
|
||||
return null;
|
||||
|
||||
double altAlleleLength = AverageAltAlleleLength.getMeanAltAlleleLength(vc);
|
||||
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
|
||||
double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength);
|
||||
QD = fixTooHighQD(QD);
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* The haplotype caller generates very high quality scores when multiple events are on the
|
||||
* same haplotype. This causes some very good variants to have unusually high QD values,
|
||||
* and VQSR will filter these out. This code looks at the QD value, and if it is above
|
||||
* threshold we map it down to the mean high QD value, with some jittering
|
||||
*
|
||||
* // TODO -- remove me when HaplotypeCaller bubble caller is live
|
||||
*
|
||||
* @param QD the raw QD score
|
||||
* @return a QD value
|
||||
*/
|
||||
private double fixTooHighQD(final double QD) {
|
||||
if ( QD < MAX_QD_BEFORE_FIXING ) {
|
||||
return QD;
|
||||
} else {
|
||||
return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA;
|
||||
}
|
||||
}
|
||||
|
||||
private final static double MAX_QD_BEFORE_FIXING = 35;
|
||||
private final static double IDEAL_HIGH_QD = 30;
|
||||
private final static double JITTER_SIGMA = 3;
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("QD"); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
|
|
|
|||
|
|
@ -183,6 +183,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
* @param headerLines
|
||||
*/
|
||||
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
|
||||
useDithering = ! toolkit.getArguments().disableRandomization;
|
||||
useDithering = ! toolkit.getArguments().disableDithering;
|
||||
}
|
||||
}
|
||||
|
|
@ -51,6 +51,7 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
|
@ -65,8 +66,12 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error).
|
||||
* Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.
|
||||
* U-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele
|
||||
*
|
||||
* <p>This tool calculates the u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele. If the alternate allele is only seen near the ends of reads, this is indicative of error.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>The read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles.</p>
|
||||
*/
|
||||
public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation {
|
||||
|
||||
|
|
@ -103,8 +108,8 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
}
|
||||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (a.isNoCall())
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
if (! a.isInformative() )
|
||||
continue; // read is non-informative
|
||||
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
|
|
@ -119,9 +124,9 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
|
|||
if (readPos > numAlignedBases / 2)
|
||||
readPos = numAlignedBases - (readPos + 1);
|
||||
|
||||
if (a.isReference())
|
||||
if (a.getMostLikelyAllele().isReference())
|
||||
refQuals.add((double)readPos);
|
||||
else if (allAlleles.contains(a))
|
||||
else if (allAlleles.contains(a.getMostLikelyAllele()))
|
||||
altQuals.add((double)readPos);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -65,7 +66,9 @@ import java.util.Map;
|
|||
|
||||
|
||||
/**
|
||||
* Fraction of reads containing spanning deletions at this site.
|
||||
* Fraction of reads containing spanning deletions at this site
|
||||
*
|
||||
* <p>Note that this annotation is currently not compatible with HaplotypeCaller.</p>
|
||||
*/
|
||||
public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
|
||||
|
|
@ -86,10 +89,12 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn
|
|||
int deletions = 0;
|
||||
int depth = 0;
|
||||
for ( Map.Entry<String, AlignmentContext> sample : stratifiedContexts.entrySet() ) {
|
||||
AlignmentContext context = sample.getValue();
|
||||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
deletions += pileup.getNumberOfDeletions();
|
||||
depth += pileup.getNumberOfElements();
|
||||
for ( final PileupElement p : sample.getValue().getBasePileup() ) {
|
||||
final int actualSampleDepth = p.getRepresentativeCount();
|
||||
depth += actualSampleDepth;
|
||||
if ( p.isDeletion() )
|
||||
deletions += actualSampleDepth;
|
||||
}
|
||||
}
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", depth == 0 ? 0.0 : (double)deletions/(double)depth));
|
||||
|
|
|
|||
|
|
@ -65,7 +65,14 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* Annotates variants that are composed of tandem repeats
|
||||
*
|
||||
* <p>This tool outputs the number of times the tandem repeat unit is repeated, for each allele (including reference).</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>This annotation is currently not compatible with HaplotypeCaller.</p>
|
||||
*/
|
||||
public class TandemRepeatAnnotator extends InfoFieldAnnotation implements StandardAnnotation {
|
||||
private static final String STR_PRESENT = "STR";
|
||||
private static final String REPEAT_UNIT_KEY = "RU";
|
||||
|
|
|
|||
|
|
@ -65,12 +65,21 @@ import org.broadinstitute.variant.variantcontext.VariantContext;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin, lfran, ebanks
|
||||
* Date: 11/14/11
|
||||
* Wittkowski transmission disequilibrium test
|
||||
*
|
||||
* <p>Test statistic from Wittkowski transmission disequilibrium test.
|
||||
* The calculation is based on the following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT</p>
|
||||
*
|
||||
* <p>Note that this annotation requires a valid ped file.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>This annotation can only be used with VariantAnnotator (not with UnifiedGenotyper or HaplotypeCaller).</p>
|
||||
*
|
||||
* @author rpoplin, lfran, ebanks
|
||||
* @since 11/14/11
|
||||
*/
|
||||
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements RodRequiringAnnotation {
|
||||
|
||||
private Set<Sample> trios = null;
|
||||
private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information
|
||||
|
|
|
|||
|
|
@ -50,7 +50,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.IndelUtils;
|
||||
|
|
@ -62,8 +61,11 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.)
|
||||
*
|
||||
* <p>This tool assigns a roughly correct category of the variant type (SNP, MNP, insertion, deletion, etc.).
|
||||
* It also specifies whether the variant is multiallelic (>2 alleles).</p>
|
||||
*/
|
||||
public class VariantType extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
public class VariantType extends InfoFieldAnnotation {
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ import java.util.List;
|
|||
|
||||
public class BQSRGatherer extends Gatherer {
|
||||
|
||||
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty";
|
||||
private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file";
|
||||
private static final String MISSING_OUTPUT_FILE = "missing output file name";
|
||||
|
||||
@Override
|
||||
|
|
@ -80,6 +80,8 @@ public class BQSRGatherer extends Gatherer {
|
|||
RecalibrationReport generalReport = null;
|
||||
for (File input : inputs) {
|
||||
final RecalibrationReport inputReport = new RecalibrationReport(input);
|
||||
if( inputReport.isEmpty() ) { continue; }
|
||||
|
||||
if (generalReport == null)
|
||||
generalReport = inputReport;
|
||||
else
|
||||
|
|
|
|||
|
|
@ -95,14 +95,14 @@ import java.util.List;
|
|||
*
|
||||
* <p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The input read data whose base quality scores need to be assessed.
|
||||
* <p>
|
||||
* A database of known polymorphic sites to skip over.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A GATK Report file with many tables:
|
||||
* <ol>
|
||||
|
|
@ -116,7 +116,7 @@ import java.util.List;
|
|||
* The GATK Report is intended to be easy to read by humans or computers. Check out the documentation of the GATKReport to learn how to manipulate this table.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -T BaseRecalibrator \
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ import java.util.List;
|
|||
* User: rpoplin
|
||||
* Date: Nov 27, 2009
|
||||
*
|
||||
* A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
|
||||
* A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker.
|
||||
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
|
||||
*/
|
||||
|
||||
|
|
@ -91,7 +91,7 @@ public class RecalibrationArgumentCollection {
|
|||
* If not provided, then no plots will be generated (useful for queue scatter/gathering).
|
||||
* However, we *highly* recommend that users generate these plots whenever possible for QC checking.
|
||||
*/
|
||||
@Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false)
|
||||
@Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false, defaultToStdout = false)
|
||||
public File RECAL_PDF_FILE = null;
|
||||
|
||||
/**
|
||||
|
|
@ -131,14 +131,14 @@ public class RecalibrationArgumentCollection {
|
|||
public boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||
* BaseRecalibrator accepts a --solid_recal_mode &lt;MODE&gt; flag which governs how the recalibrator handles the
|
||||
* reads which have had the reference inserted because of color space inconsistencies.
|
||||
*/
|
||||
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
|
||||
public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
|
||||
* BaseRecalibrator accepts a --solid_nocall_strategy &lt;MODE&gt; flag which governs how the recalibrator handles
|
||||
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
|
||||
* their color space tag can not be recalibrated.
|
||||
*/
|
||||
|
|
@ -146,38 +146,38 @@ public class RecalibrationArgumentCollection {
|
|||
public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
|
||||
|
||||
/**
|
||||
* The context covariate will use a context of this size to calculate it's covariate value for base mismatches
|
||||
* The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
|
||||
*/
|
||||
@Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "size of the k-mer context to be used for base mismatches", required = false)
|
||||
@Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false)
|
||||
public int MISMATCHES_CONTEXT_SIZE = 2;
|
||||
|
||||
/**
|
||||
* The context covariate will use a context of this size to calculate it's covariate value for base insertions and deletions
|
||||
* The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size.
|
||||
*/
|
||||
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false)
|
||||
@Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false)
|
||||
public int INDELS_CONTEXT_SIZE = 3;
|
||||
|
||||
/**
|
||||
* The cycle covariate will generate an error if it encounters a cycle greater than this value.
|
||||
* This argument is ignored if the Cycle covariate is not used.
|
||||
*/
|
||||
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false)
|
||||
@Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false)
|
||||
public int MAXIMUM_CYCLE_VALUE = 500;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
|
||||
* A default base quality to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read with this default value. A negative value turns it off. [default is off]
|
||||
*/
|
||||
@Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false)
|
||||
public byte MISMATCHES_DEFAULT_QUALITY = -1;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. (default is on)
|
||||
* A default base quality to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on]
|
||||
*/
|
||||
@Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false)
|
||||
public byte INSERTIONS_DEFAULT_QUALITY = 45;
|
||||
|
||||
/**
|
||||
* A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off)
|
||||
* A default base quality to use as a prior (reported quality) in the deletion covariate model. This value will replace all base qualities in the read with this default value. A negative value turns it off. [default is on]
|
||||
*/
|
||||
@Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false)
|
||||
public byte DELETIONS_DEFAULT_QUALITY = 45;
|
||||
|
|
@ -220,7 +220,7 @@ public class RecalibrationArgumentCollection {
|
|||
public String FORCE_PLATFORM = null;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only")
|
||||
@Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only", defaultToStdout = false)
|
||||
public PrintStream RECAL_TABLE_UPDATE_LOG = null;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -178,7 +178,7 @@ public class RecalibrationEngine {
|
|||
final NestedIntegerArray<RecalDatum> byQualTable = finalRecalibrationTables.getQualityScoreTable();
|
||||
|
||||
// iterate over all values in the qual table
|
||||
for ( NestedIntegerArray.Leaf<RecalDatum> leaf : byQualTable.getAllLeaves() ) {
|
||||
for ( final NestedIntegerArray.Leaf<RecalDatum> leaf : byQualTable.getAllLeaves() ) {
|
||||
final int rgKey = leaf.keys[0];
|
||||
final int eventIndex = leaf.keys[2];
|
||||
final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex);
|
||||
|
|
@ -206,7 +206,9 @@ public class RecalibrationEngine {
|
|||
*/
|
||||
@Requires("! finalized")
|
||||
private RecalibrationTables mergeThreadLocalRecalibrationTables() {
|
||||
if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty");
|
||||
if ( recalibrationTablesList.isEmpty() ) {
|
||||
recalibrationTablesList.add( new RecalibrationTables(covariates, numReadGroups, maybeLogStream) );
|
||||
}
|
||||
|
||||
RecalibrationTables merged = null;
|
||||
for ( final RecalibrationTables table : recalibrationTablesList ) {
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
|
|
@ -55,18 +56,27 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
|
|||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.recalibration.*;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Evaluate the performance of the base recalibration process
|
||||
*
|
||||
* <p>This tool aims to evaluate the results of the Base Quality Score Recalibration (BQSR) process.</p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>This tool is currently experimental. We do not provide documentation nor support for its operation.</p>
|
||||
*
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class})
|
||||
@PartitionBy(PartitionType.READ)
|
||||
public class RecalibrationPerformance extends RodWalker<Integer, Integer> implements NanoSchedulable {
|
||||
|
||||
@Output(doc="Write output to this file", required = true)
|
||||
@Output
|
||||
public PrintStream out;
|
||||
|
||||
@Input(fullName="recal", shortName="recal", required=false, doc="The input covariates table file")
|
||||
|
|
|
|||
|
|
@ -53,39 +53,155 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
* @since 6/15/12
|
||||
*/
|
||||
public class BaseAndQualsCounts extends BaseCounts {
|
||||
private final long[] sumInsertionQuals;
|
||||
private final long[] sumDeletionQuals;
|
||||
|
||||
public BaseAndQualsCounts() {
|
||||
super();
|
||||
this.sumInsertionQuals = new long[BaseIndex.values().length];
|
||||
this.sumDeletionQuals = new long[BaseIndex.values().length];
|
||||
// Java primitive arrays comes zero-filled, so no need to do it explicitly.
|
||||
private long sumInsertionQual_A = 0;
|
||||
private long sumDeletionQual_A = 0;
|
||||
private long sumInsertionQual_C = 0;
|
||||
private long sumDeletionQual_C = 0;
|
||||
private long sumInsertionQual_G = 0;
|
||||
private long sumDeletionQual_G = 0;
|
||||
private long sumInsertionQual_T = 0;
|
||||
private long sumDeletionQual_T = 0;
|
||||
private long sumInsertionQual_D = 0;
|
||||
private long sumDeletionQual_D = 0;
|
||||
private long sumInsertionQual_I = 0;
|
||||
private long sumDeletionQual_I = 0;
|
||||
private long sumInsertionQual_N = 0;
|
||||
private long sumDeletionQual_N = 0;
|
||||
|
||||
/**
|
||||
* Increments the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
*/
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
|
||||
incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false);
|
||||
}
|
||||
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
/**
|
||||
* Increments the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
* @param isSoftClip true if the base is soft-clipped
|
||||
*/
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) {
|
||||
// if we already have high quality bases, ignore low quality ones
|
||||
if ( isLowQualBase && !isLowQuality() )
|
||||
return;
|
||||
|
||||
// if this is a high quality base then remove any low quality bases and start from scratch
|
||||
if ( !isLowQualBase && isLowQuality() ) {
|
||||
if ( totalCount() > 0 )
|
||||
clear();
|
||||
setLowQuality(false);
|
||||
}
|
||||
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.incr(i, baseQual);
|
||||
sumInsertionQuals[i.index] += insQual;
|
||||
sumDeletionQuals[i.index] += delQual;
|
||||
super.incr(i, baseQual, baseMappingQual, isSoftClip);
|
||||
switch (i) {
|
||||
case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break;
|
||||
case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break;
|
||||
case G: sumInsertionQual_G += insQual; sumDeletionQual_G += delQual; break;
|
||||
case T: sumInsertionQual_T += insQual; sumDeletionQual_T += delQual; break;
|
||||
case D: sumInsertionQual_D += insQual; sumDeletionQual_D += delQual; break;
|
||||
case I: sumInsertionQual_I += insQual; sumDeletionQual_I += delQual; break;
|
||||
case N: sumInsertionQual_N += insQual; sumDeletionQual_N += delQual; break;
|
||||
}
|
||||
}
|
||||
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
/**
|
||||
* Decrements the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
*/
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
|
||||
decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decrements the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
* @param isSoftClip true if the base is soft-clipped
|
||||
*/
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) {
|
||||
// if this is not the right type of base, ignore it
|
||||
if ( isLowQualBase != isLowQuality() )
|
||||
return;
|
||||
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.decr(i, baseQual);
|
||||
sumInsertionQuals[i.index] -= insQual;
|
||||
sumDeletionQuals[i.index] -= delQual;
|
||||
super.decr(i, baseQual, baseMappingQual, isSoftClip);
|
||||
switch (i) {
|
||||
case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break;
|
||||
case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break;
|
||||
case G: sumInsertionQual_G -= insQual; sumDeletionQual_G -= delQual; break;
|
||||
case T: sumInsertionQual_T -= insQual; sumDeletionQual_T -= delQual; break;
|
||||
case D: sumInsertionQual_D -= insQual; sumDeletionQual_D -= delQual; break;
|
||||
case I: sumInsertionQual_I -= insQual; sumDeletionQual_I -= delQual; break;
|
||||
case N: sumInsertionQual_N -= insQual; sumDeletionQual_N -= delQual; break;
|
||||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumInsertionQuals);
|
||||
return (byte) (getInsertionQual(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumDeletionQuals);
|
||||
return (byte) (getDeletionQual(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) {
|
||||
return (byte) (sumQuals[base.index] / countOfBase(base));
|
||||
private long getInsertionQual(final BaseIndex base) {
|
||||
switch (base) {
|
||||
case A: return sumInsertionQual_A;
|
||||
case C: return sumInsertionQual_C;
|
||||
case G: return sumInsertionQual_G;
|
||||
case T: return sumInsertionQual_T;
|
||||
case D: return sumInsertionQual_D;
|
||||
case I: return sumInsertionQual_I;
|
||||
case N: return sumInsertionQual_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
private long getDeletionQual(final BaseIndex base) {
|
||||
switch (base) {
|
||||
case A: return sumDeletionQual_A;
|
||||
case C: return sumDeletionQual_C;
|
||||
case G: return sumDeletionQual_G;
|
||||
case T: return sumDeletionQual_T;
|
||||
case D: return sumDeletionQual_D;
|
||||
case I: return sumDeletionQual_I;
|
||||
case N: return sumDeletionQual_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears out all stored data in this object
|
||||
*/
|
||||
public void clear() {
|
||||
super.clear();
|
||||
sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0;
|
||||
sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -62,70 +64,118 @@ import com.google.java.contract.Requires;
|
|||
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
|
||||
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
|
||||
|
||||
private final int[] counts; // keeps track of the base counts
|
||||
private final long[] sumQuals; // keeps track of the quals of each base
|
||||
private int totalCount = 0; // keeps track of total count since this is requested so often
|
||||
|
||||
public BaseCounts() {
|
||||
counts = new int[BaseIndex.values().length];
|
||||
sumQuals = new long[BaseIndex.values().length];
|
||||
// Java primitive arrays comes zero-filled, so no need to do it explicitly.
|
||||
}
|
||||
private int count_A = 0; // keeps track of the base counts
|
||||
private int sumQual_A = 0; // keeps track of the quals of each base
|
||||
private int count_C = 0;
|
||||
private int sumQual_C = 0;
|
||||
private int count_G = 0;
|
||||
private int sumQual_G = 0;
|
||||
private int count_T = 0;
|
||||
private int sumQual_T = 0;
|
||||
private int count_D = 0;
|
||||
private int sumQual_D = 0;
|
||||
private int count_I = 0;
|
||||
private int sumQual_I = 0;
|
||||
private int count_N = 0;
|
||||
private int sumQual_N = 0;
|
||||
private int totalCount = 0; // keeps track of total count since this is requested so often
|
||||
private int nSoftClippedBases = 0;
|
||||
private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this
|
||||
private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise
|
||||
|
||||
|
||||
public static BaseCounts createWithCounts(int[] countsACGT) {
|
||||
BaseCounts baseCounts = new BaseCounts();
|
||||
baseCounts.counts[BaseIndex.A.index] = countsACGT[0];
|
||||
baseCounts.counts[BaseIndex.C.index] = countsACGT[1];
|
||||
baseCounts.counts[BaseIndex.G.index] = countsACGT[2];
|
||||
baseCounts.counts[BaseIndex.T.index] = countsACGT[3];
|
||||
baseCounts.count_A = countsACGT[0];
|
||||
baseCounts.count_C = countsACGT[1];
|
||||
baseCounts.count_G = countsACGT[2];
|
||||
baseCounts.count_T = countsACGT[3];
|
||||
baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3];
|
||||
return baseCounts;
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void add(final BaseCounts other) {
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
final int otherCount = other.counts[i.index];
|
||||
counts[i.index] += otherCount;
|
||||
totalCount += otherCount;
|
||||
}
|
||||
this.count_A += other.count_A;
|
||||
this.count_C += other.count_C;
|
||||
this.count_G += other.count_G;
|
||||
this.count_T += other.count_T;
|
||||
this.count_D += other.count_D;
|
||||
this.count_I += other.count_I;
|
||||
this.count_N += other.count_N;
|
||||
this.totalCount += other.totalCount;
|
||||
// Accumulate rather than overwrite: every other counter in add() uses +=, and sub() uses -= on this same field.
this.nSoftClippedBases += other.nSoftClippedBases;
|
||||
this.mappingQualities.addAll(other.mappingQualities);
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(final BaseCounts other) {
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
final int otherCount = other.counts[i.index];
|
||||
counts[i.index] -= otherCount;
|
||||
totalCount -= otherCount;
|
||||
}
|
||||
this.count_A -= other.count_A;
|
||||
this.count_C -= other.count_C;
|
||||
this.count_G -= other.count_G;
|
||||
this.count_T -= other.count_T;
|
||||
this.count_D -= other.count_D;
|
||||
this.count_I -= other.count_I;
|
||||
this.count_N -= other.count_N;
|
||||
this.totalCount -= other.totalCount;
|
||||
this.nSoftClippedBases -= other.nSoftClippedBases;
|
||||
this.mappingQualities.removeAll(other.mappingQualities);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final byte base) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
counts[i.index]++;
|
||||
totalCount++;
|
||||
add(BaseIndex.byteToBase(base), 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final BaseIndex base, final byte qual) {
|
||||
counts[base.index]++;
|
||||
totalCount++;
|
||||
sumQuals[base.index] += qual;
|
||||
public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) {
|
||||
switch (base) {
|
||||
case A: ++count_A; sumQual_A += qual; break;
|
||||
case C: ++count_C; sumQual_C += qual; break;
|
||||
case G: ++count_G; sumQual_G += qual; break;
|
||||
case T: ++count_T; sumQual_T += qual; break;
|
||||
case D: ++count_D; sumQual_D += qual; break;
|
||||
case I: ++count_I; sumQual_I += qual; break;
|
||||
case N: ++count_N; sumQual_N += qual; break;
|
||||
}
|
||||
++totalCount;
|
||||
nSoftClippedBases += isSoftclip ? 1 : 0;
|
||||
mappingQualities.add(mappingQuality);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(final byte base) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
counts[i.index]--;
|
||||
totalCount--;
|
||||
add(BaseIndex.byteToBase(base), -1);
|
||||
}
|
||||
|
||||
private void add(final BaseIndex base, int amount) {
|
||||
switch(base) {
|
||||
case A: count_A += amount; break;
|
||||
case C: count_C += amount; break;
|
||||
case G: count_G += amount; break;
|
||||
case T: count_T += amount; break;
|
||||
case D: count_D += amount; break;
|
||||
case I: count_I += amount; break;
|
||||
case N: count_N += amount; break;
|
||||
}
|
||||
totalCount += amount;
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(final BaseIndex base, final byte qual) {
|
||||
counts[base.index]--;
|
||||
totalCount--;
|
||||
sumQuals[base.index] -= qual;
|
||||
public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) {
|
||||
switch (base) {
|
||||
case A: --count_A; sumQual_A -= qual; break;
|
||||
case C: --count_C; sumQual_C -= qual; break;
|
||||
case G: --count_G; sumQual_G -= qual; break;
|
||||
case T: --count_T; sumQual_T -= qual; break;
|
||||
case D: --count_D; sumQual_D -= qual; break;
|
||||
case I: --count_I; sumQual_I -= qual; break;
|
||||
case N: --count_N; sumQual_N -= qual; break;
|
||||
}
|
||||
--totalCount;
|
||||
nSoftClippedBases -= isSoftclip ? 1 : 0;
|
||||
mappingQualities.remove((Integer) mappingQuality);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
|
|
@ -135,7 +185,16 @@ import com.google.java.contract.Requires;
|
|||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(final BaseIndex base) {
|
||||
return sumQuals[base.index];
|
||||
switch (base) {
|
||||
case A: return sumQual_A;
|
||||
case C: return sumQual_C;
|
||||
case G: return sumQual_G;
|
||||
case T: return sumQual_T;
|
||||
case D: return sumQual_D;
|
||||
case I: return sumQual_I;
|
||||
case N: return sumQual_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
|
|
@ -155,12 +214,21 @@ import com.google.java.contract.Requires;
|
|||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final BaseIndex base) {
|
||||
return counts[base.index];
|
||||
switch (base) {
|
||||
case A: return count_A;
|
||||
case C: return count_C;
|
||||
case G: return count_G;
|
||||
case T: return count_T;
|
||||
case D: return count_D;
|
||||
case I: return count_I;
|
||||
case N: return count_N;
|
||||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfBase(final BaseIndex base) {
|
||||
return sumQuals[base.index];
|
||||
return getSumQuals(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
|
|
@ -168,12 +236,25 @@ import com.google.java.contract.Requires;
|
|||
return (byte) (sumQualsOfBase(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int nSoftclips() {
|
||||
return nSoftClippedBases;
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int totalCount() {
|
||||
return totalCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* The RMS of the mapping qualities of all reads that contributed to this object
|
||||
*
|
||||
* @return the RMS of the mapping qualities of all reads that contributed to this object
|
||||
*/
|
||||
public double getRMS() {
|
||||
return MathUtils.rms(mappingQualities);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base, it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
|
|
@ -193,14 +274,14 @@ import com.google.java.contract.Requires;
|
|||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(final BaseIndex baseIndex) {
|
||||
return (totalCount == 0) ? 0.0 : (double)counts[baseIndex.index] / (double)totalCount;
|
||||
return (totalCount == 0) ? 0.0 : (double)countOfBase(baseIndex) / (double)totalCount;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
b.append(i.toString()).append("=").append(counts[i.index]).append(",");
|
||||
b.append(i.toString()).append("=").append(countOfBase(i)).append(",");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
|
@ -209,22 +290,42 @@ import com.google.java.contract.Requires;
|
|||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the base index for which the count is highest, including indel indexes
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (counts[i.index] > counts[maxI.index])
|
||||
maxI = i;
|
||||
}
|
||||
return maxI;
|
||||
return baseIndexWithMostCounts(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the base index for which the count is highest, excluding indel indexes
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
return baseIndexWithMostCounts(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the base index with the most counts
|
||||
*
|
||||
* @param allowIndels should we allow base indexes representing indels?
|
||||
* @return non-null base index
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
int maxCount = countOfBase(maxI);
|
||||
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (i.isNucleotide() && counts[i.index] > counts[maxI.index])
|
||||
if ( !allowIndels && !i.isNucleotide() )
|
||||
continue;
|
||||
|
||||
final int myCount = countOfBase(i);
|
||||
if (myCount > maxCount) {
|
||||
maxI = i;
|
||||
maxCount = myCount;
|
||||
}
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
|
@ -235,27 +336,41 @@ import com.google.java.contract.Requires;
|
|||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (sumQuals[i.index] > sumQuals[maxI.index])
|
||||
maxI = i;
|
||||
}
|
||||
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts());
|
||||
return baseIndexWithMostProbability(true);
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
|
||||
return baseIndexWithMostProbability(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the base index with the most probability
|
||||
*
|
||||
* @param allowIndels should we allow base indexes representing indels?
|
||||
* @return non-null base index
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
long maxSum = getSumQuals(maxI);
|
||||
|
||||
for (final BaseIndex i : BaseIndex.values()) {
|
||||
if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index])
|
||||
if ( !allowIndels && !i.isNucleotide() )
|
||||
continue;
|
||||
|
||||
final long mySum = getSumQuals(i);
|
||||
if (mySum > maxSum) {
|
||||
maxI = i;
|
||||
maxSum = mySum;
|
||||
}
|
||||
}
|
||||
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
|
||||
return (maxSum > 0L ? maxI : baseIndexWithMostCounts(allowIndels));
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index];
|
||||
return totalCount - countOfBase(BaseIndex.D) - countOfBase(BaseIndex.I);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -268,10 +383,29 @@ import com.google.java.contract.Requires;
|
|||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex base) {
|
||||
final int total = totalCountWithoutIndels();
|
||||
return (total == 0) ? 0.0 : (double)counts[base.index] / (double)total;
|
||||
return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total;
|
||||
}
|
||||
|
||||
public int[] countsArray() {
|
||||
return counts.clone();
|
||||
/**
|
||||
* @return true if this instance represents low quality bases
|
||||
*/
|
||||
public boolean isLowQuality() { return isLowQuality; }
|
||||
|
||||
/**
|
||||
* Sets the low quality value
|
||||
*
|
||||
* @param value true if this instance represents low quality bases, false otherwise
|
||||
*/
|
||||
public void setLowQuality(final boolean value) { isLowQuality = value; }
|
||||
|
||||
/**
|
||||
* Clears out all stored data in this object
|
||||
*/
|
||||
public void clear() {
|
||||
count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0;
|
||||
sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0;
|
||||
totalCount = 0;
|
||||
nSoftClippedBases = 0;
|
||||
mappingQualities.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -121,7 +121,7 @@ public enum BaseIndex {
|
|||
*
|
||||
* @return whether or not it is a nucleotide, given the definition above
|
||||
*/
|
||||
public boolean isNucleotide() {
|
||||
public final boolean isNucleotide() {
|
||||
return !isIndel();
|
||||
}
|
||||
|
||||
|
|
@ -130,7 +130,7 @@ public enum BaseIndex {
|
|||
*
|
||||
* @return true for I or D, false otherwise
|
||||
*/
|
||||
public boolean isIndel() {
|
||||
public final boolean isIndel() {
|
||||
return this == D || this == I;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,15 +69,15 @@ import java.util.Map;
|
|||
* <p>
|
||||
* This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files.
|
||||
* </p>
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* Two BAM files (using -I) with different read group IDs
|
||||
* </p>
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
|
|
|
|||
|
|
@ -46,10 +46,12 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.TreeSet;
|
||||
|
||||
|
||||
/**
|
||||
* A stash of regions that must be kept uncompressed in all samples
|
||||
|
|
@ -61,7 +63,7 @@ import java.util.TreeSet;
|
|||
* Date: 10/15/12
|
||||
* Time: 4:08 PM
|
||||
*/
|
||||
public class CompressionStash extends TreeSet<FinishedGenomeLoc> {
|
||||
public class CompressionStash extends ObjectAVLTreeSet<FinishedGenomeLoc> {
|
||||
public CompressionStash() {
|
||||
super();
|
||||
}
|
||||
|
|
@ -75,7 +77,7 @@ public class CompressionStash extends TreeSet<FinishedGenomeLoc> {
|
|||
*/
|
||||
@Override
|
||||
public boolean add(final FinishedGenomeLoc insertLoc) {
|
||||
TreeSet<FinishedGenomeLoc> removedLocs = new TreeSet<FinishedGenomeLoc>();
|
||||
ObjectSortedSet<FinishedGenomeLoc> removedLocs = new ObjectAVLTreeSet<FinishedGenomeLoc>();
|
||||
for (FinishedGenomeLoc existingLoc : this) {
|
||||
if (existingLoc.isPast(insertLoc)) {
|
||||
break; // if we're past the loc we're done looking for overlaps.
|
||||
|
|
|
|||
|
|
@ -46,10 +46,10 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* The element that describes the header of the sliding window.
|
||||
|
|
@ -62,9 +62,9 @@ public class HeaderElement {
|
|||
private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site.
|
||||
private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site.
|
||||
private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right
|
||||
private int nSoftClippedBases; // How many bases in this site came from soft clipped bases
|
||||
private int location; // Genome location of this site (the sliding window knows which contig we're at
|
||||
private LinkedList<Integer> mappingQuality; // keeps the mapping quality of each read that contributed to this element (site)
|
||||
|
||||
protected static final int MIN_COUNT_FOR_USING_PVALUE = 2;
|
||||
|
||||
public int getLocation() {
|
||||
return location;
|
||||
|
|
@ -85,7 +85,7 @@ public class HeaderElement {
|
|||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(final int location) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList<Integer>());
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -95,7 +95,7 @@ public class HeaderElement {
|
|||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(final int location, final int insertionsToTheRight) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new LinkedList<Integer>());
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -104,55 +104,67 @@ public class HeaderElement {
|
|||
* @param consensusBaseCounts the BaseCounts object for the running consensus synthetic read
|
||||
* @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read
|
||||
* @param insertionsToTheRight number of insertions to the right of this HeaderElement
|
||||
* @param nSoftClippedBases number of softclipped bases of this HeaderElement
|
||||
* @param location the reference location of this reference element
|
||||
* @param mappingQuality the list of mapping quality values of all reads that contributed to this
|
||||
* HeaderElement
|
||||
*/
|
||||
public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList<Integer> mappingQuality) {
|
||||
public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int location) {
|
||||
this.consensusBaseCounts = consensusBaseCounts;
|
||||
this.filteredBaseCounts = filteredBaseCounts;
|
||||
this.insertionsToTheRight = insertionsToTheRight;
|
||||
this.nSoftClippedBases = nSoftClippedBases;
|
||||
this.location = location;
|
||||
this.mappingQuality = mappingQuality;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
|
||||
* by insertion, deletion and mismatches.
|
||||
*
|
||||
* @param minVariantPvalue min p-value for deciding that a position is or is not variable due to mismatches
|
||||
* @param minVariantProportion min proportion for deciding that a position is or is not variable due to mismatches
|
||||
* @param minIndelProportion min proportion for deciding that a position is or is not variable due to indels
|
||||
* @return true if site is variant by any definition. False otherwise.
|
||||
*/
|
||||
public boolean isVariant(double minVariantProportion, double minIndelProportion) {
|
||||
return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
|
||||
public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) {
|
||||
return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new base to the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the base insertion quality
|
||||
* @param delQual the base deletion quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
* @param minBaseQual the minimum base qual allowed to be a good base
|
||||
* @param minMappingQual the minimum mapping qual allowed to be a good read
|
||||
* @param isSoftClipped true if the base is soft-clipped in the original read
|
||||
*/
|
||||
public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
|
||||
if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
|
||||
consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
|
||||
// If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
|
||||
if ( baseMappingQuality >= minMappingQual )
|
||||
consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
|
||||
else
|
||||
filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
|
||||
|
||||
this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
|
||||
nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
|
||||
filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes a base from the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the base insertion quality
|
||||
* @param delQual the base deletion quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
* @param minBaseQual the minimum base qual allowed to be a good base
|
||||
* @param minMappingQual the minimum mapping qual allowed to be a good read
|
||||
* @param isSoftClipped true if the base is soft-clipped in the original read
|
||||
*/
|
||||
public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
|
||||
if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
|
||||
consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
|
||||
// If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
|
||||
if ( baseMappingQuality >= minMappingQual )
|
||||
consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped);
|
||||
else
|
||||
filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
|
||||
|
||||
this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
|
||||
nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
|
||||
filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
}
|
||||
/**
|
||||
* Adds an insertion to the right of the HeaderElement and updates all counts accordingly. All insertions
|
||||
|
|
@ -189,15 +201,6 @@ public class HeaderElement {
|
|||
return (!hasFilteredData() && !hasConsensusData());
|
||||
}
|
||||
|
||||
/**
|
||||
* The RMS of the mapping qualities of all reads that contributed to this HeaderElement
|
||||
*
|
||||
* @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
|
||||
*/
|
||||
public double getRMS() {
|
||||
return MathUtils.rms(mappingQuality);
|
||||
}
|
||||
|
||||
/**
|
||||
* removes an insertion from this element (if you removed a read that had an insertion)
|
||||
*/
|
||||
|
|
@ -232,7 +235,7 @@ public class HeaderElement {
|
|||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess deletions
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
* @return whether or not the HeaderElement is variant due to excess deletions
|
||||
*/
|
||||
private boolean isVariantFromDeletions(double minIndelProportion) {
|
||||
return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
|
||||
|
|
@ -241,12 +244,15 @@ public class HeaderElement {
|
|||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess mismatches
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage).
|
||||
* @param minVariantProportion the minimum proportion to call a site variant (used with high coverage).
|
||||
* @return whether or not the HeaderElement is variant due to excess mismatches
|
||||
*/
|
||||
protected boolean isVariantFromMismatches(double minVariantProportion) {
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
|
||||
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
|
||||
protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) {
|
||||
final int totalCount = consensusBaseCounts.totalCountWithoutIndels();
|
||||
final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon);
|
||||
return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -256,37 +262,88 @@ public class HeaderElement {
|
|||
* @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
|
||||
*/
|
||||
protected boolean isVariantFromSoftClips() {
|
||||
final int nSoftClippedBases = consensusBaseCounts.nSoftclips();
|
||||
return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
|
||||
}
|
||||
|
||||
protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
|
||||
return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
|
||||
/**
|
||||
* Calculates the number of alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return the number of alleles necessary to represent this site or -1 if there are too many indels
|
||||
*/
|
||||
public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) {
|
||||
final ObjectArrayList<BaseIndex> alleles = getAlleles(minVariantPvalue, minVariantProportion);
|
||||
return alleles == null ? -1 : alleles.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of haplotypes necessary to represent this site.
|
||||
* Calculates the alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return the number of alleles necessary to represent this site.
|
||||
* @return the list of alleles necessary to represent this site or null if there are too many indels
|
||||
*/
|
||||
public int getNumberOfAlleles(final double minVariantProportion) {
|
||||
public ObjectArrayList<BaseIndex> getAlleles(final double minVariantPvalue, final double minVariantProportion) {
|
||||
// make sure we have bases at all
|
||||
final int totalBaseCount = consensusBaseCounts.totalCount();
|
||||
if (totalBaseCount == 0)
|
||||
return 0;
|
||||
if ( totalBaseCount == 0 )
|
||||
return new ObjectArrayList<BaseIndex>(0);
|
||||
|
||||
final int minBaseCountForRelevantAlleles = (int)(minVariantProportion * totalBaseCount);
|
||||
// next, check for insertions; technically, the insertion count can be greater than totalBaseCount
|
||||
// (because of the way insertions are counted), so we need to account for that
|
||||
if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) )
|
||||
return null;
|
||||
|
||||
int nAlleles = 0;
|
||||
for ( BaseIndex base : BaseIndex.values() ) {
|
||||
// finally, check for the bases themselves (including deletions)
|
||||
final ObjectArrayList<BaseIndex> alleles = new ObjectArrayList<BaseIndex>(4);
|
||||
for ( final BaseIndex base : BaseIndex.values() ) {
|
||||
final int baseCount = consensusBaseCounts.countOfBase(base);
|
||||
|
||||
// don't consider this allele if the count is 0
|
||||
if ( baseCount == 0 )
|
||||
continue;
|
||||
|
||||
if ( baseCount >= minBaseCountForRelevantAlleles )
|
||||
nAlleles++;
|
||||
if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) {
|
||||
if ( base == BaseIndex.D )
|
||||
return null;
|
||||
alleles.add(base);
|
||||
}
|
||||
}
|
||||
return nAlleles;
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether there are a significant number of softclips.
|
||||
*
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return true if there are significant softclips, false otherwise
|
||||
*/
|
||||
public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) {
|
||||
return hasSignificantCount(consensusBaseCounts.nSoftclips(), consensusBaseCounts.totalCount(), minVariantPvalue, minVariantProportion);
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether the given count is significant relative to the total.
|
||||
*
|
||||
* @param count the count (k) to test against
|
||||
* @param total the total (n) to test against
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return true if there is a significant count given the provided pvalue, false otherwise
|
||||
*/
|
||||
private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) {
|
||||
if ( count == 0 || total == 0 )
|
||||
return false;
|
||||
|
||||
// use p-values for low counts of k
|
||||
if ( count <= MIN_COUNT_FOR_USING_PVALUE ) {
|
||||
final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count);
|
||||
return pvalue > minVariantPvalue;
|
||||
}
|
||||
|
||||
// otherwise, use straight proportions
|
||||
final int minBaseCountForSignificance = (int)(minVariantProportion * total);
|
||||
return count >= minBaseCountForSignificance;
|
||||
}
|
||||
}
|
||||
|
|
@ -46,18 +46,17 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -91,52 +90,72 @@ import java.util.TreeSet;
|
|||
public class MultiSampleCompressor {
|
||||
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
||||
|
||||
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
|
||||
protected Object2ObjectMap<String, SingleSampleCompressor> compressorsPerSample = new Object2ObjectOpenHashMap<String, SingleSampleCompressor>();
|
||||
|
||||
public MultiSampleCompressor(SAMFileHeader header,
|
||||
final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltPValueToTriggerVariant,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final boolean allowPolyploidReduction) {
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, allowPolyploidReduction));
|
||||
minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
}
|
||||
|
||||
public Set<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||
/**
|
||||
* Add an alignment to the compressor
|
||||
*
|
||||
* @param read the read to be added
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public ObjectSet<GATKSAMRecord> addAlignment(final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
String sampleName = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sampleName);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sampleName);
|
||||
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read);
|
||||
Set<GATKSAMRecord> reads = readsAndStash.getFirst();
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions);
|
||||
ObjectSet<GATKSAMRecord> reads = readsAndStash.getFirst();
|
||||
CompressionStash regions = readsAndStash.getSecond();
|
||||
|
||||
reads.addAll(closeVariantRegionsInAllSamples(regions));
|
||||
reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions));
|
||||
|
||||
return reads;
|
||||
}
|
||||
|
||||
public Set<GATKSAMRecord> close() {
|
||||
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
/**
|
||||
* Properly closes the compressor.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ObjectSet<GATKSAMRecord> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor sample : compressorsPerSample.values() ) {
|
||||
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close();
|
||||
reads = readsAndStash.getFirst();
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = sample.close(knownSnpPositions);
|
||||
reads.addAll(readsAndStash.getFirst());
|
||||
}
|
||||
return reads;
|
||||
}
|
||||
|
||||
private Set<GATKSAMRecord> closeVariantRegionsInAllSamples(CompressionStash regions) {
|
||||
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
/**
|
||||
* Finalizes current variant regions.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
private ObjectSet<GATKSAMRecord> closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if (!regions.isEmpty()) {
|
||||
for (SingleSampleCompressor sample : compressorsPerSample.values()) {
|
||||
reads.addAll(sample.closeVariantRegions(regions));
|
||||
reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions));
|
||||
}
|
||||
}
|
||||
return reads;
|
||||
|
|
|
|||
|
|
@ -46,13 +46,15 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMProgramRecord;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -65,13 +67,17 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
|
||||
|
|
@ -83,17 +89,17 @@ import java.util.*;
|
|||
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
|
||||
* savings in file size and performance of the downstream tools.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The BAM file to be compressed
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* The compressed (reduced) BAM file.
|
||||
*
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
@ -107,9 +113,9 @@ import java.util.*;
|
|||
@PartitionBy(PartitionType.CONTIG)
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40)
|
||||
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
|
||||
@Output
|
||||
@Output(required = false, defaultToStdout = false)
|
||||
private StingSAMFileWriter out = null;
|
||||
private SAMFileWriter writerToUse = null;
|
||||
|
||||
|
|
@ -117,7 +123,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* The number of bases to keep around mismatches (potential variation)
|
||||
*/
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
|
||||
private int contextSize = 10;
|
||||
public int contextSize = 10;
|
||||
|
||||
/**
|
||||
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
|
||||
|
|
@ -125,7 +131,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
|
||||
private int minMappingQuality = 20;
|
||||
public int minMappingQuality = 20;
|
||||
|
||||
/**
|
||||
* The minimum base quality to be considered for the consensus synthetic read. Reads that have
|
||||
|
|
@ -133,41 +139,45 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
|
||||
private byte minBaseQual = 20;
|
||||
public byte minBaseQual = 15;
|
||||
|
||||
/**
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
|
||||
* lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with
|
||||
* quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
*/
|
||||
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
|
||||
private byte minTailQuality = 2;
|
||||
public byte minTailQuality = 2;
|
||||
|
||||
/**
|
||||
* Allow the experimental polyploid-based reduction capabilities of this tool
|
||||
* Any number of VCF files representing known SNPs to be used for the polyploid-based reduction.
|
||||
* Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored.
|
||||
* If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present
|
||||
* in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will
|
||||
* be triggered anywhere there is a single SNP present in a consensus window.
|
||||
*/
|
||||
@Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
|
||||
private boolean USE_POLYPLOID_REDUCTION = false;
|
||||
@Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
|
||||
* and read group).
|
||||
*/
|
||||
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
|
||||
private boolean DONT_SIMPLIFY_READS = false;
|
||||
public boolean DONT_SIMPLIFY_READS = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
|
||||
* The program will behave correctly in those cases.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
|
||||
private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
|
||||
* quality.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
|
||||
private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
public boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
|
||||
/**
|
||||
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
|
||||
|
|
@ -175,7 +185,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
|
||||
*/
|
||||
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
|
||||
private boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
public boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
|
||||
/**
|
||||
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
|
||||
|
|
@ -183,55 +193,68 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
|
||||
*/
|
||||
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
|
||||
private boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
public boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
|
||||
/**
|
||||
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
|
||||
* border.
|
||||
*/
|
||||
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
|
||||
private boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
public boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
|
||||
/**
|
||||
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
* considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that
|
||||
* this value is used only in regions with high coverage.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
|
||||
private double minAltProportionToTriggerVariant = 0.05;
|
||||
public double minAltProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region.
|
||||
* Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to
|
||||
* trigger polyploid compression). Note that this value is used only in regions with low coverage.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false)
|
||||
public double minAltPValueToTriggerVariant = 0.01;
|
||||
|
||||
/**
|
||||
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
|
||||
private double minIndelProportionToTriggerVariant = 0.05;
|
||||
public double minIndelProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
|
||||
* The number of reads emitted per sample in a variant region can be downsampled for better compression.
|
||||
* This level of downsampling only happens after the region has been evaluated, therefore it can
|
||||
* be combined with the engine level downsampling.
|
||||
* A value of 0 turns downsampling off.
|
||||
*/
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
|
||||
private int downsampleCoverage = 250;
|
||||
public int downsampleCoverage = 250;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
|
||||
private boolean nwayout = false;
|
||||
public boolean nwayout = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
|
||||
private int debugLevel = 0;
|
||||
public int debugLevel = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
|
||||
private String debugRead = "";
|
||||
public String debugRead = "";
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
|
||||
private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
|
||||
private boolean NO_PG_TAG = false;
|
||||
public boolean NO_PG_TAG = false;
|
||||
|
||||
public enum DownsampleStrategy {
|
||||
Normal,
|
||||
|
|
@ -240,10 +263,12 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
|
||||
int nCompressedReads = 0;
|
||||
|
||||
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
Object2LongOpenHashMap<String> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
|
||||
|
||||
SortedSet<GenomeLoc> intervalList;
|
||||
ObjectSortedSet<GenomeLoc> intervalList;
|
||||
|
||||
ObjectSortedSet<GenomeLoc> knownSnpPositions;
|
||||
|
||||
// IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
|
||||
public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
|
|
@ -256,17 +281,33 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
if ( !nwayout && out == null )
|
||||
throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes");
|
||||
|
||||
if ( nwayout && out != null )
|
||||
throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other");
|
||||
|
||||
if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 )
|
||||
throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
||||
if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 )
|
||||
throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
||||
if ( known.isEmpty() )
|
||||
knownSnpPositions = null;
|
||||
else
|
||||
knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
GenomeAnalysisEngine toolkit = getToolkit();
|
||||
readNameHash = new HashMap<String, Long>(); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
intervalList = new TreeSet<GenomeLoc>(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
|
||||
readNameHash = new Object2LongOpenHashMap<String>(100000); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
intervalList = new ObjectAVLTreeSet<GenomeLoc>(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
|
||||
|
||||
if (toolkit.getIntervals() != null)
|
||||
intervalList.addAll(toolkit.getIntervals());
|
||||
|
||||
|
||||
final boolean preSorted = true;
|
||||
final boolean indexOnTheFly = true;
|
||||
final boolean keep_records = true;
|
||||
final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate;
|
||||
if (nwayout) {
|
||||
SAMProgramRecord programRecord = NO_PG_TAG ? null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME);
|
||||
|
|
@ -276,7 +317,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
writerToUse = out;
|
||||
out.setPresorted(false);
|
||||
if (!NO_PG_TAG) {
|
||||
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, keep_records, this, PROGRAM_RECORD_NAME);
|
||||
Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), !preSorted, this, PROGRAM_RECORD_NAME);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -295,8 +336,8 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* @return a linked list with all the reads produced by the clipping operations
|
||||
*/
|
||||
@Override
|
||||
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
LinkedList<GATKSAMRecord> mappedReads;
|
||||
public ObjectArrayList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
ObjectArrayList<GATKSAMRecord> mappedReads;
|
||||
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
|
||||
System.out.println("Found debug read!");
|
||||
|
||||
|
|
@ -325,18 +366,18 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
if (HARD_CLIP_TO_INTERVAL)
|
||||
mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval
|
||||
else {
|
||||
mappedReads = new LinkedList<GATKSAMRecord>();
|
||||
mappedReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
mappedReads.add(read);
|
||||
}
|
||||
}
|
||||
else {
|
||||
mappedReads = new LinkedList<GATKSAMRecord>();
|
||||
mappedReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
if (!read.isEmpty())
|
||||
mappedReads.add(read);
|
||||
}
|
||||
|
||||
if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
|
||||
LinkedList<GATKSAMRecord> tempList = new LinkedList<GATKSAMRecord>();
|
||||
ObjectArrayList<GATKSAMRecord> tempList = new ObjectArrayList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord mRead : mappedReads) {
|
||||
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
|
||||
if (!clippedRead.isEmpty())
|
||||
|
|
@ -349,8 +390,22 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
for (GATKSAMRecord mappedRead : mappedReads)
|
||||
System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
|
||||
|
||||
return mappedReads;
|
||||
// add the SNPs to the list of known positions
|
||||
populateKnownSNPs(metaDataTracker);
|
||||
|
||||
return mappedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the location of every SNP that the tracker returns for {@code known} to {@code knownSnpPositions}.
|
||||
*
|
||||
* @param metaDataTracker the ref meta data tracker whose values for {@code known} are scanned
|
||||
*/
|
||||
protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) {
|
||||
// NOTE(review): initialize() sets knownSnpPositions to null when known is empty, and nothing here checks for that;
// presumably getValues(known) then yields no variants so add() is never reached -- confirm
for ( final VariantContext vc : metaDataTracker.getValues(known) ) {
|
||||
// non-SNP records are skipped
if ( vc.isSNP() )
|
||||
knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -363,7 +418,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
// Seeds the reduce step with a new ReduceReadsStash wrapping a MultiSampleCompressor configured from this walker's fields.
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, USE_POLYPLOID_REDUCTION));
|
||||
// This form additionally passes minAltPValueToTriggerVariant and does not pass USE_POLYPLOID_REDUCTION.
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -375,7 +430,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* @param stash the stash that keeps the reads in order for processing
|
||||
* @return the stash with all reads that have not been processed yet
|
||||
*/
|
||||
public ReduceReadsStash reduce(LinkedList<GATKSAMRecord> mappedReads, ReduceReadsStash stash) {
|
||||
public ReduceReadsStash reduce(ObjectArrayList<GATKSAMRecord> mappedReads, ReduceReadsStash stash) {
|
||||
if (debugLevel == 1)
|
||||
stash.print();
|
||||
|
||||
|
|
@ -387,7 +442,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
|
||||
|
||||
if (originalRead) {
|
||||
List<GATKSAMRecord> readsReady = new LinkedList<GATKSAMRecord>();
|
||||
ObjectArrayList<GATKSAMRecord> readsReady = new ObjectArrayList<GATKSAMRecord>();
|
||||
readsReady.addAll(stash.getAllReadsBefore(read));
|
||||
readsReady.add(read);
|
||||
|
||||
|
|
@ -395,9 +450,16 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
if (debugLevel == 1)
|
||||
System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());
|
||||
|
||||
for (GATKSAMRecord compressedRead : stash.compress(readReady))
|
||||
for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions))
|
||||
outputRead(compressedRead);
|
||||
|
||||
// We only care about maintaining the link between read pairs if they are in the same variant
|
||||
// region. Since an entire variant region's worth of reads is returned in a single call to
|
||||
// stash.compress(), the readNameHash can be cleared after the for() loop above.
|
||||
// The advantage of clearing the hash is that otherwise it holds all reads that have been encountered,
|
||||
// which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory.
|
||||
readNameHash.clear();
|
||||
|
||||
}
|
||||
} else
|
||||
stash.add(read);
|
||||
|
|
@ -405,6 +467,10 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
firstRead = false;
|
||||
}
|
||||
|
||||
// reduce memory requirements by removing old positions
|
||||
if ( !mappedReads.isEmpty() )
|
||||
clearStaleKnownPositions(mappedReads.get(0));
|
||||
|
||||
return stash;
|
||||
}
|
||||
|
||||
|
|
@ -417,13 +483,38 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
public void onTraversalDone(ReduceReadsStash stash) {
|
||||
|
||||
// output any remaining reads in the compressor
|
||||
for (GATKSAMRecord read : stash.close())
|
||||
// this form hands the known SNP positions (possibly null, see initialize()) to the stash while flushing
for (GATKSAMRecord read : stash.close(knownSnpPositions))
|
||||
outputRead(read);
|
||||
|
||||
// the writer is closed here only in nwayout mode
// NOTE(review): presumably the engine closes "out" itself in the other mode -- confirm
if (nwayout)
|
||||
writerToUse.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes known positions that are no longer relevant for use with het compression.
* Positions are treated as stale once the first stored position lies on a different contig than the given read;
* at that point only positions on the read's contig are kept.
|
||||
*
|
||||
* @param read the current read, used for checking whether there are stale positions we can remove
|
||||
*/
|
||||
protected void clearStaleKnownPositions(final GATKSAMRecord read) {
|
||||
// nothing to clear if not used or empty
|
||||
if ( knownSnpPositions == null || knownSnpPositions.isEmpty() )
|
||||
return;
|
||||
|
||||
// not ready to be cleared until we encounter a read from a different contig
|
||||
final int contigIndexOfRead = read.getReferenceIndex();
|
||||
// only the first element of the sorted set is inspected
// NOTE(review): assumes the set's ordering places positions from earlier contigs first -- confirm against GenomeLoc's comparison
if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead )
|
||||
return;
|
||||
|
||||
// because we expect most elements to be stale, it's not going to be efficient to remove them one at a time
|
||||
final ObjectAVLTreeSet<GenomeLoc> goodLocs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
for ( final GenomeLoc loc : knownSnpPositions ) {
|
||||
if ( loc.getContigIndex() == contigIndexOfRead )
|
||||
goodLocs.add(loc);
|
||||
}
|
||||
// the existing set instance is emptied and refilled rather than replaced, so the field keeps pointing at the same object
knownSnpPositions.clear();
|
||||
knownSnpPositions.addAll(goodLocs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hard clips away all parts of the read that don't agree with the intervals selected.
|
||||
*
|
||||
|
|
@ -433,8 +524,8 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* @param read the read to be hard clipped to the interval.
|
||||
* @return a shallow copy of the read hard clipped to the interval
|
||||
*/
|
||||
private LinkedList<GATKSAMRecord> hardClipReadToInterval(GATKSAMRecord read) {
|
||||
LinkedList<GATKSAMRecord> clippedReads = new LinkedList<GATKSAMRecord>();
|
||||
private ObjectArrayList<GATKSAMRecord> hardClipReadToInterval(GATKSAMRecord read) {
|
||||
ObjectArrayList<GATKSAMRecord> clippedReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)
|
||||
|
||||
|
|
@ -588,7 +679,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());
|
||||
|
||||
if (!DONT_COMPRESS_READ_NAMES)
|
||||
compressReadName(read);
|
||||
nextReadNumber = compressReadName(readNameHash, read, nextReadNumber);
|
||||
|
||||
writerToUse.addAlignment(read);
|
||||
}
|
||||
|
|
@ -623,21 +714,28 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* Compresses the read name using the readNameHash if we have already compressed
|
||||
* this read name before.
|
||||
*
|
||||
* @param read any read
|
||||
* @param hash the hash table containing the read name to compressed read name map
|
||||
* @param read any read
|
||||
* @param nextReadNumber the number to use in the compressed read name in case this is a new read name
|
||||
* @return the next number to use in the compressed read name
|
||||
*/
|
||||
private void compressReadName(GATKSAMRecord read) {
|
||||
String name = read.getReadName();
|
||||
String compressedName = read.isReducedRead() ? "C" : "";
|
||||
final Long readNumber = readNameHash.get(name);
|
||||
if (readNumber != null) {
|
||||
compressedName += readNumber.toString();
|
||||
} else {
|
||||
readNameHash.put(name, nextReadNumber);
|
||||
compressedName += nextReadNumber.toString();
|
||||
nextReadNumber++;
|
||||
protected static long compressReadName(final Object2LongOpenHashMap<String> hash, final GATKSAMRecord read, final long nextReadNumber) {
|
||||
final String name = read.getReadName();
|
||||
final StringBuilder compressedName = new StringBuilder();
|
||||
long result = nextReadNumber;
|
||||
if (read.isReducedRead()) {
|
||||
compressedName.append("C");
|
||||
}
|
||||
|
||||
read.setReadName(compressedName);
|
||||
final Long readNumber = hash.get(name);
|
||||
if (readNumber != null) {
|
||||
compressedName.append(readNumber);
|
||||
} else {
|
||||
hash.put(name, nextReadNumber);
|
||||
compressedName.append(nextReadNumber);
|
||||
result++;
|
||||
}
|
||||
read.setReadName(compressedName.toString());
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -649,8 +747,8 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* @param read the read
|
||||
* @return Returns true if the read is the original read that went through map().
|
||||
*/
|
||||
private boolean isOriginalRead(LinkedList<GATKSAMRecord> list, GATKSAMRecord read) {
|
||||
return isWholeGenome() || list.getFirst().equals(read);
|
||||
private boolean isOriginalRead(ObjectArrayList<GATKSAMRecord> list, GATKSAMRecord read) {
|
||||
// In whole-genome mode every read counts as original; otherwise only the first element of the list does.
// list.get(0) is evaluated only when isWholeGenome() is false, and it requires a non-empty list.
return isWholeGenome() || list.get(0).equals(read);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectSortedSet;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -106,11 +108,12 @@ public class ReduceReadsStash {
|
|||
/**
|
||||
* Sends the read to the MultiSampleCompressor and returns whatever the compressor hands back.
|
||||
*
|
||||
* @param read the read to be compressed
|
||||
* @param read the read to be compressed
|
||||
* @param knownSnpPositions the set of known SNP positions; passed through unchanged to the compressor
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> compress(GATKSAMRecord read) {
|
||||
return compressor.addAlignment(read);
|
||||
public Iterable<GATKSAMRecord> compress(final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return compressor.addAlignment(read, knownSnpPositions);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -125,18 +128,19 @@ public class ReduceReadsStash {
|
|||
/**
|
||||
* Close the stash, processing all remaining reads in order
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return a list of all the reads produced by the SlidingWindow machinery
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
public Iterable<GATKSAMRecord> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
LinkedList<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// compress all the stashed reads (in order)
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions))
|
||||
result.add(compressedRead);
|
||||
|
||||
// output any remaining reads from the compressor
|
||||
for (GATKSAMRecord read : compressor.close())
|
||||
for (GATKSAMRecord read : compressor.close(knownSnpPositions))
|
||||
result.add(read);
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -46,14 +46,13 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author carneiro, depristo
|
||||
|
|
@ -63,38 +62,45 @@ public class SingleSampleCompressor {
|
|||
final private int contextSize;
|
||||
final private int downsampleCoverage;
|
||||
final private int minMappingQuality;
|
||||
final private double minAltPValueToTriggerVariant;
|
||||
final private double minAltProportionToTriggerVariant;
|
||||
final private double minIndelProportionToTriggerVariant;
|
||||
final private int minBaseQual;
|
||||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
final private boolean allowPolyploidReduction;
|
||||
|
||||
private SlidingWindow slidingWindow;
|
||||
private int slidingWindowCounter;
|
||||
|
||||
public static Pair<Set<GATKSAMRecord>, CompressionStash> emptyPair = new Pair<Set<GATKSAMRecord>,CompressionStash>(new TreeSet<GATKSAMRecord>(), new CompressionStash());
|
||||
public static Pair<ObjectSet<GATKSAMRecord>, CompressionStash> emptyPair = new Pair<ObjectSet<GATKSAMRecord>,CompressionStash>(new ObjectAVLTreeSet<GATKSAMRecord>(), new CompressionStash());
|
||||
|
||||
public SingleSampleCompressor(final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltPValueToTriggerVariant,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final boolean allowPolyploidReduction) {
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
this.slidingWindowCounter = 0;
|
||||
this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant;
|
||||
this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
|
||||
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
|
||||
this.minBaseQual = minBaseQual;
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
public Pair<Set<GATKSAMRecord>, CompressionStash> addAlignment( GATKSAMRecord read ) {
|
||||
Set<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
/**
|
||||
* Add an alignment to the compressor
|
||||
*
|
||||
* @param read the read to be added
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet<GenomeLoc> knownSnpPositions ) {
|
||||
ObjectSet<GATKSAMRecord> reads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash stash = new CompressionStash();
|
||||
int readOriginalStart = read.getUnclippedStart();
|
||||
|
||||
|
|
@ -104,27 +110,43 @@ public class SingleSampleCompressor {
|
|||
(readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window
|
||||
|
||||
// close the current sliding window
|
||||
Pair<Set<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close();
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions);
|
||||
reads = readsAndStash.getFirst();
|
||||
stash = readsAndStash.getSecond();
|
||||
slidingWindow = null; // so we create a new one on the next if
|
||||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), allowPolyploidReduction);
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(),
|
||||
slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant,
|
||||
minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
stash.addAll(slidingWindow.addRead(read));
|
||||
return new Pair<Set<GATKSAMRecord>, CompressionStash>(reads, stash);
|
||||
return new Pair<ObjectSet<GATKSAMRecord>, CompressionStash>(reads, stash);
|
||||
}
|
||||
|
||||
public Pair<Set<GATKSAMRecord>, CompressionStash> close() {
|
||||
return (slidingWindow != null) ? slidingWindow.close() : emptyPair;
|
||||
/**
|
||||
* Closes the current sliding window, if there is one, and returns what it produced.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions; forwarded to the sliding window
|
||||
* @return a non-null pair of (reads generated, compression stash); the shared static emptyPair when no sliding window exists
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Pair<ObjectSet<GATKSAMRecord>, CompressionStash> close(final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
// NOTE(review): emptyPair is a public, non-final static built around a mutable set and stash, so every caller
// that receives it shares one instance -- confirm that callers never add to it
return (slidingWindow != null) ? slidingWindow.close(knownSnpPositions) : emptyPair;
|
||||
}
|
||||
|
||||
public Set<GATKSAMRecord> closeVariantRegions(CompressionStash regions) {
|
||||
return slidingWindow == null ? Collections.<GATKSAMRecord>emptySet() : slidingWindow.closeVariantRegions(regions);
|
||||
/**
|
||||
* Finalizes current variant regions by delegating to the sliding window, if there is one.
|
||||
*
* @param regions the regions handed on to the sliding window's closeVariantRegions
|
||||
* @param knownSnpPositions the set of known SNP positions; forwarded to the sliding window
|
||||
* @return a non-null set of all reads generated; an empty set when no sliding window exists
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
// NOTE(review): ObjectSets.EMPTY_SET is used without a type argument, unlike the typed
// Collections.<GATKSAMRecord>emptySet() in the other form -- presumably an unchecked conversion; confirm whether
// fastutil offers a typed empty-set accessor that should be used instead
return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -47,20 +47,18 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Running Consensus is a read that is compressed as a sliding window travels over the reads
|
||||
|
|
@ -76,17 +74,25 @@ import java.util.List;
|
|||
* @since 8/26/11
|
||||
*/
|
||||
public class SyntheticRead {
|
||||
// Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce
|
||||
// memory footprint.
|
||||
// TODO: better name
|
||||
|
||||
/**
|
||||
* The types of strandedness for synthetic reads.
* When the read is emitted, any value other than STRANDLESS causes the stranded tag to be set and the
* negative-strand flag to be written; STRANDLESS (the field's default) leaves both untouched.
|
||||
*/
|
||||
public enum StrandType {
|
||||
// stranded; emitted with the negative-strand flag set to false
POSITIVE,
|
||||
// stranded; emitted with the negative-strand flag set to true
NEGATIVE,
|
||||
// no strand information is written on the emitted read
STRANDLESS
|
||||
}
|
||||
|
||||
// Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint.
|
||||
private static class SingleBaseInfo {
|
||||
byte baseIndexOrdinal; // enum BaseIndex.ordinal
|
||||
byte count;
|
||||
int count;
|
||||
byte qual;
|
||||
byte insertionQual;
|
||||
byte deletionQual;
|
||||
|
||||
SingleBaseInfo(byte baseIndexOrdinal, byte count, byte qual, byte insertionQual, byte deletionQual) {
|
||||
SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) {
|
||||
this.baseIndexOrdinal = baseIndexOrdinal;
|
||||
this.count = count;
|
||||
this.qual = qual;
|
||||
|
|
@ -123,9 +129,8 @@ public class SyntheticRead {
|
|||
}
|
||||
|
||||
|
||||
private final List<SingleBaseInfo> basesCountsQuals;
|
||||
private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
|
||||
private String readTag;
|
||||
private final ObjectArrayList<SingleBaseInfo> basesCountsQuals;
|
||||
private double mappingQuality;
|
||||
|
||||
// Information to produce a GATKSAMRecord
|
||||
private SAMFileHeader header;
|
||||
|
|
@ -135,7 +140,7 @@ public class SyntheticRead {
|
|||
private String readName;
|
||||
private int refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
private boolean isNegativeStrand = false;
|
||||
private StrandType strandType = StrandType.STRANDLESS;
|
||||
|
||||
/**
|
||||
* Full initialization of the running consensus if you have all the information and are ready to
|
||||
|
|
@ -147,14 +152,12 @@ public class SyntheticRead {
|
|||
* @param contigIndex the read's contig index
|
||||
* @param readName the read's name
|
||||
* @param refStart the alignment start (reference based)
|
||||
* @param readTag the reduce reads tag for the synthetic read
|
||||
*/
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) {
|
||||
final int initialCapacity = 10000;
|
||||
basesCountsQuals = new ArrayList<SingleBaseInfo>(initialCapacity);
|
||||
basesCountsQuals = new ObjectArrayList<SingleBaseInfo>(initialCapacity);
|
||||
mappingQuality = 0.0;
|
||||
|
||||
this.readTag = readTag;
|
||||
this.header = header;
|
||||
this.readGroupRecord = readGroupRecord;
|
||||
this.contig = contig;
|
||||
|
|
@ -162,24 +165,7 @@ public class SyntheticRead {
|
|||
this.readName = readName;
|
||||
this.refStart = refStart;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.isNegativeStrand = isNegativeRead;
|
||||
}
|
||||
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
basesCountsQuals = new ArrayList<SingleBaseInfo>(bases.size());
|
||||
for (int i = 0; i < bases.size(); ++i) {
|
||||
basesCountsQuals.add(new SingleBaseInfo(bases.get(i).getOrdinalByte(), counts.get(i), quals.get(i), insertionQuals.get(i), deletionQuals.get(i)));
|
||||
}
|
||||
this.mappingQuality = mappingQuality;
|
||||
this.readTag = readTag;
|
||||
this.header = header;
|
||||
this.readGroupRecord = readGroupRecord;
|
||||
this.contig = contig;
|
||||
this.contigIndex = contigIndex;
|
||||
this.readName = readName;
|
||||
this.refStart = refStart;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.isNegativeStrand = isNegativeRead;
|
||||
this.strandType = strandType;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -190,7 +176,7 @@ public class SyntheticRead {
|
|||
* @param count number of reads with this base
|
||||
*/
|
||||
// NOTE(review): count is declared as int in the second signature below, yet this precondition still caps it at
// Byte.MAX_VALUE; convertBaseCounts() now returns int[] as well, so the cap looks stale -- confirm whether the
// contract should be relaxed (it is only enforced when contracts are enabled at build time)
@Requires("count <= Byte.MAX_VALUE")
|
||||
public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
|
||||
public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) {
|
||||
// one SingleBaseInfo entry is appended per call
basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual));
|
||||
// running sum; it is divided by the number of entries when the read's mapping quality is set at emit time
this.mappingQuality += mappingQuality;
|
||||
}
|
||||
|
|
@ -220,15 +206,18 @@ public class SyntheticRead {
|
|||
read.setReferenceIndex(contigIndex);
|
||||
read.setReadPairedFlag(false);
|
||||
read.setReadUnmappedFlag(false);
|
||||
read.setReadNegativeStrandFlag(isNegativeStrand);
|
||||
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
|
||||
if ( strandType != StrandType.STRANDLESS ) {
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line
|
||||
read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE);
|
||||
}
|
||||
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
|
||||
read.setAlignmentStart(refStart);
|
||||
read.setReadName(readName);
|
||||
read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
|
||||
read.setReadBases(convertReadBases());
|
||||
read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size()));
|
||||
read.setReadGroup(readGroupRecord);
|
||||
read.setAttribute(readTag, convertBaseCounts());
|
||||
read.setReducedReadCountsTag(convertBaseCounts());
|
||||
|
||||
if (hasIndelQualities) {
|
||||
read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
|
||||
|
|
@ -278,22 +267,14 @@ public class SyntheticRead {
|
|||
});
|
||||
}
|
||||
|
||||
protected byte [] convertBaseCounts() {
|
||||
byte[] countsArray = convertVariableGivenBases(new SingleBaseInfoIterator() {
|
||||
public Byte next() {
|
||||
return it.next().count;
|
||||
}
|
||||
});
|
||||
|
||||
if (countsArray.length == 0)
|
||||
throw new ReviewedStingException("Reduced read has counts array of length 0");
|
||||
|
||||
byte[] compressedCountsArray = new byte [countsArray.length];
|
||||
compressedCountsArray[0] = countsArray[0];
|
||||
for (int i = 1; i < countsArray.length; i++)
|
||||
compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);
|
||||
|
||||
return compressedCountsArray;
|
||||
protected int[] convertBaseCounts() {
|
||||
int[] variableArray = new int[getReadLengthWithNoDeletions()];
|
||||
int i = 0;
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
|
||||
if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte())
|
||||
variableArray[i++] = singleBaseInfo.count;
|
||||
}
|
||||
return variableArray;
|
||||
}
|
||||
|
||||
private byte [] convertReadBases() {
|
||||
|
|
@ -316,7 +297,7 @@ public class SyntheticRead {
|
|||
* @return the cigar string for the synthetic read
|
||||
*/
|
||||
private Cigar buildCigar() {
|
||||
LinkedList<CigarElement> cigarElements = new LinkedList<CigarElement>();
|
||||
ObjectArrayList<CigarElement> cigarElements = new ObjectArrayList<CigarElement>();
|
||||
CigarOperator cigarOperator = null;
|
||||
int length = 0;
|
||||
for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) {
|
||||
|
|
@ -369,7 +350,6 @@ public class SyntheticRead {
|
|||
variableArray[i++] = count;
|
||||
}
|
||||
return variableArray;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -44,10 +44,11 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -55,6 +56,8 @@ import org.broadinstitute.sting.gatk.report.GATKReport;
|
|||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -63,25 +66,25 @@ import java.util.LinkedList;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Simple walker to plot the coverage distribution per base.
|
||||
* Simple walker to plot the coverage distribution per base
|
||||
*
|
||||
* <p>
|
||||
* Features of this walker:
|
||||
* <li>includes a smart counting of uncovered bases without visiting the uncovered loci.</li>
|
||||
* <li>includes a smart counting of uncovered bases without visiting the uncovered loci</li>
|
||||
* <li>includes reads with deletions in the loci (optionally can be turned off)</li>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The BAM file and an optional interval list (works for WGS as well)
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A GATK Report with the coverage distribution per base
|
||||
*
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
@ -91,15 +94,16 @@ import java.util.Map;
|
|||
* -fd \
|
||||
* -o report.grp
|
||||
* </pre>
|
||||
* User: carneiro
|
||||
* Date: 1/27/13
|
||||
* Time: 11:16 AM
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 1/27/13
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
public class BaseCoverageDistribution extends LocusWalker<ArrayList<Integer>, Map<Integer, ArrayList<Long>>> {
|
||||
/**
|
||||
* The output GATK Report table
|
||||
*/
|
||||
@Output(required = true, doc = "The output GATK Report table")
|
||||
@Output(doc = "The output GATK Report table")
|
||||
private PrintStream out;
|
||||
|
||||
/**
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
|
|
@ -63,11 +63,36 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
|
|||
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Outputs a list of intervals that are covered above a given threshold.
|
||||
*
|
||||
* <p>The list can be used as an interval list for other walkers. Note that if the -uncovered argument is given, the tool will instead output intervals that fail the coverage threshold.</p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* One or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* List of covered (or uncovered) intervals.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Example</h3>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -T FindCoveredIntervals \
|
||||
* -R ref.fasta \
|
||||
* -I my_file.bam \
|
||||
* -o output.list
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
@PartitionBy(PartitionType.CONTIG)
|
||||
@ActiveRegionTraversalParameters(extension = 0, maxRegion = 50000)
|
||||
public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
|
||||
@Output(required = true)
|
||||
@Output
|
||||
private PrintStream out;
|
||||
|
||||
@Argument(fullName = "uncovered", shortName = "u", required = false, doc = "output intervals that fail the coverage threshold instead")
|
||||
|
|
@ -80,7 +105,7 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
|
|||
// Look to see if the region has sufficient coverage
|
||||
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
|
||||
int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup());
|
||||
int depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage();
|
||||
|
||||
// note the linear probability scale
|
||||
return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
|
||||
|
|
@ -0,0 +1,150 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Generic code for Diagnose Target Statistics
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 4/23/13
|
||||
*/
|
||||
abstract class AbstractStratification {
|
||||
|
||||
private long preComputedTotalCoverage = -1;
|
||||
private Map<CallableStatus, Integer> statusTally = null;
|
||||
protected ThresHolder thresholds;
|
||||
|
||||
/**
|
||||
* Calculates the average "good" coverage of this sample. Good means "passes the base and
|
||||
* mapping quality requirements.
|
||||
*
|
||||
* @return the average "good" coverage
|
||||
*/
|
||||
public double averageCoverage(final int size) {
|
||||
if (preComputedTotalCoverage < 0)
|
||||
preComputedTotalCoverage = calculateTotalCoverage(getElements());
|
||||
return (double) preComputedTotalCoverage / size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the total "good" coverage of this sample. Good means "passes the base and
|
||||
* mapping quality requirements.
|
||||
*
|
||||
* @return the total "good" coverage across the interval for this sample
|
||||
*/
|
||||
public long getCoverage() {
|
||||
if (preComputedTotalCoverage < 0)
|
||||
preComputedTotalCoverage = calculateTotalCoverage(getElements());
|
||||
return preComputedTotalCoverage;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This is how the extending class will calculate it's own total coverage
|
||||
*
|
||||
* @return the total coverage
|
||||
*/
|
||||
private long calculateTotalCoverage(Iterable<AbstractStratification> elements) {
|
||||
long cov = 0;
|
||||
for (AbstractStratification element : elements) {
|
||||
cov += element.getCoverage();
|
||||
}
|
||||
return cov;
|
||||
}
|
||||
|
||||
/**
|
||||
* What are the list of elements in your class? For example:
|
||||
*
|
||||
* IntervalStatistics => List<SampleStatistics>
|
||||
* SampleStatistics => List<LocusStatistics>
|
||||
*
|
||||
* @return the corresponding list of elements of the extending class
|
||||
*/
|
||||
public abstract Iterable<AbstractStratification> getElements();
|
||||
|
||||
/**
|
||||
* Calculates the Callable statuses for the statistic as a whole (interval, sample or locus)
|
||||
*
|
||||
* @return the callable status(es) for the whole object
|
||||
*/
|
||||
public abstract Iterable<CallableStatus> callableStatuses();
|
||||
|
||||
|
||||
/**
|
||||
* Tally up all the callable status of all the loci in this sample.
|
||||
*
|
||||
* @return a map of callable status and counts
|
||||
*/
|
||||
public Map<CallableStatus, Integer> getStatusTally() {
|
||||
if (statusTally == null) {
|
||||
statusTally = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);
|
||||
for (AbstractStratification stats : getElements()) {
|
||||
for (CallableStatus status : stats.callableStatuses()) {
|
||||
statusTally.put(status, !statusTally.containsKey(status) ? 1 : statusTally.get(status) + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
return statusTally;
|
||||
}
|
||||
|
||||
public static List<CallableStatus> queryStatus(List<Metric> statList, AbstractStratification stratification) {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stat : statList) {
|
||||
final CallableStatus status = stat.status(stratification);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -44,7 +44,7 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* Short one line description of the walker.
|
||||
|
|
@ -52,9 +52,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
|||
* @author Mauricio Carneiro
|
||||
* @since 2/1/12
|
||||
*/
|
||||
public enum CallableStatus {
|
||||
|
||||
REF_N("the reference base was an N, which is not considered callable the GATK"),
|
||||
enum CallableStatus {
|
||||
|
||||
PASS("the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE"),
|
||||
|
||||
|
|
@ -68,12 +66,7 @@ public enum CallableStatus {
|
|||
|
||||
BAD_MATE("the reads are not properly mated, suggesting mapping errors"),
|
||||
|
||||
NO_READS("there are no reads contained in the interval"),
|
||||
|
||||
//
|
||||
// Interval-level statuses
|
||||
//
|
||||
LOW_MEDIAN_DEPTH("interval has insufficient median depth across samples");
|
||||
NO_READS("there are no reads contained in the interval");
|
||||
|
||||
public final String description;
|
||||
|
||||
|
|
@ -44,10 +44,10 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
|
@ -56,13 +56,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -75,7 +76,7 @@ import java.util.*;
|
|||
* </p>
|
||||
* <p/>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>A reference file</li>
|
||||
|
|
@ -84,12 +85,12 @@ import java.util.*;
|
|||
* </ul>
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A modified VCF detailing each interval by sample
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
|
|
@ -110,79 +111,52 @@ import java.util.*;
|
|||
@PartitionBy(PartitionType.INTERVAL)
|
||||
public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||
|
||||
@Output(doc = "File to which variants should be written", required = true)
|
||||
private static final String AVG_INTERVAL_DP_KEY = "IDP";
|
||||
|
||||
@Output(doc = "File to which interval statistics should be written")
|
||||
private VariantContextWriter vcfWriter = null;
|
||||
|
||||
@Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false)
|
||||
private int minimumBaseQuality = 20;
|
||||
@ArgumentCollection
|
||||
private ThresHolder thresholds = new ThresHolder();
|
||||
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false)
|
||||
private int minimumMappingQuality = 20;
|
||||
private Map<GenomeLoc, IntervalStratification> intervalMap = null; // maps each interval => statistics
|
||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||
private Set<String> samples = null; // all the samples being processed
|
||||
private static final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||
private static final Allele UNCOVERED_ALLELE = Allele.create("A", true); // avoid creating the 'fake' ref allele for uncovered intervals multiple times
|
||||
|
||||
@Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false)
|
||||
private int minimumCoverage = 5;
|
||||
|
||||
@Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false)
|
||||
private int maximumCoverage = 700;
|
||||
|
||||
@Argument(fullName = "minimum_median_depth", shortName = "med", doc = "The minimum allowable median coverage, used for calling LOW_MEDIAN_DEPTH", required = false)
|
||||
private int minMedianDepth = 10;
|
||||
|
||||
@Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false)
|
||||
private int maxInsertSize = 500;
|
||||
|
||||
@Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed percentage of samples containing a call for the interval to adopt the call ", required = false)
|
||||
private double votePercentage = 0.50;
|
||||
|
||||
@Argument(fullName = "low_median_depth_status_threshold", shortName = "stMED", doc = "The percentage of the loci needed for calling LOW_MEDIAN_DEPTH", required = false)
|
||||
private double lowMedianDepthPercentage = 0.20;
|
||||
|
||||
@Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The percentage of the loci needed for calling BAD_MATE", required = false)
|
||||
private double badMateStatusThreshold = 0.50;
|
||||
|
||||
@Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The percentage of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false)
|
||||
private double coverageStatusThreshold = 0.20;
|
||||
|
||||
@Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The percentage of the loci needed for calling EXCESSIVE_COVERAGE", required = false)
|
||||
private double excessiveCoverageThreshold = 0.20;
|
||||
|
||||
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The percentage of the loci needed for calling POOR_QUALITY", required = false)
|
||||
private double qualityStatusThreshold = 0.50;
|
||||
|
||||
@Argument(fullName = "print_debug_log", shortName = "dl", doc = "Used only for debugging the walker. Prints extra info to screen", required = false)
|
||||
private boolean debug = false;
|
||||
|
||||
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // maps each interval => statistics
|
||||
private PeekableIterator<GenomeLoc> intervalListIterator; // an iterator to go over all the intervals provided as we traverse the genome
|
||||
private Set<String> samples = null; // all the samples being processed
|
||||
private final Allele SYMBOLIC_ALLELE = Allele.create("<DT>", false); // avoid creating the symbolic allele multiple times
|
||||
private ThresHolder thresholds = null;
|
||||
private static final int INITIAL_HASH_SIZE = 500000;
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
if (getToolkit().getIntervals() == null)
|
||||
throw new UserException("This tool only works if you provide one or more intervals. ( Use the -L argument )");
|
||||
if (getToolkit().getIntervals() == null || getToolkit().getIntervals().isEmpty())
|
||||
throw new UserException("This tool only works if you provide one or more intervals (use the -L argument). If you want to run whole genome, use -T DepthOfCoverage instead.");
|
||||
|
||||
thresholds = new ThresHolder(minimumBaseQuality, minimumMappingQuality, minimumCoverage, maximumCoverage, minMedianDepth, maxInsertSize, votePercentage, lowMedianDepthPercentage, badMateStatusThreshold, coverageStatusThreshold, excessiveCoverageThreshold, qualityStatusThreshold);
|
||||
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>();
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStratification>(INITIAL_HASH_SIZE);
|
||||
intervalListIterator = new PeekableIterator<GenomeLoc>(getToolkit().getIntervals().iterator());
|
||||
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header
|
||||
vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header
|
||||
// get all of the unique sample names for the VCF Header
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples));
|
||||
|
||||
// pre load all the statistics classes because it is costly to operate on the JVM and we only want to do it once.
|
||||
loadAllPlugins(thresholds);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
GenomeLoc refLocus = ref.getLocus();
|
||||
|
||||
removePastIntervals(refLocus, ref.getBase()); // process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
addNewOverlappingIntervals(refLocus); // add all new intervals that may overlap this reference locus
|
||||
// process and remove any intervals in the map that are don't overlap the current locus anymore
|
||||
// and add all new intervals that may overlap this reference locus
|
||||
outputFinishedIntervals(refLocus, ref.getBase());
|
||||
addNewOverlappingIntervals(refLocus);
|
||||
|
||||
for (IntervalStatistics intervalStatistics : intervalMap.values())
|
||||
intervalStatistics.addLocus(context, ref, thresholds); // Add current locus to stats
|
||||
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
|
||||
for (IntervalStratification intervalStratification : intervalMap.values())
|
||||
intervalStratification.addLocus(context);
|
||||
|
||||
return 1L;
|
||||
}
|
||||
|
|
@ -212,53 +186,40 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
@Override
|
||||
public void onTraversalDone(Long result) {
|
||||
for (GenomeLoc interval : intervalMap.keySet())
|
||||
outputStatsToVCF(intervalMap.get(interval), Allele.create("A", true));
|
||||
}
|
||||
outputStatsToVCF(intervalMap.get(interval), UNCOVERED_ALLELE);
|
||||
|
||||
private GenomeLoc getIntervalMapSpan() {
|
||||
GenomeLoc loc = null;
|
||||
for (GenomeLoc interval : intervalMap.keySet()) {
|
||||
if (loc == null) {
|
||||
loc = interval;
|
||||
} else
|
||||
loc = interval.union(loc);
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
while (interval != null) {
|
||||
outputStatsToVCF(createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
return loc;
|
||||
}
|
||||
|
||||
private GenomeLoc getFinishedIntervalSpan(GenomeLoc pos) {
|
||||
GenomeLoc loc = null;
|
||||
for (GenomeLoc interval : intervalMap.keySet()) {
|
||||
if (interval.isBefore(pos)) {
|
||||
if (loc == null)
|
||||
loc = interval;
|
||||
else
|
||||
loc = interval.union(loc);
|
||||
}
|
||||
}
|
||||
|
||||
return loc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all intervals that are behind the current reference locus from the intervalMap
|
||||
* Outputs all intervals that are behind the current reference locus
|
||||
*
|
||||
* @param refLocus the current reference locus
|
||||
* @param refBase the reference allele
|
||||
*/
|
||||
private void removePastIntervals(GenomeLoc refLocus, byte refBase) {
|
||||
// if there are statistics to output/ check to see that we can output them in order
|
||||
if (getFinishedIntervalSpan(refLocus) != null &&
|
||||
getIntervalMapSpan().getStart() == getFinishedIntervalSpan(refLocus).getStart()) {
|
||||
private void outputFinishedIntervals(final GenomeLoc refLocus, final byte refBase) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
for (GenomeLoc interval : intervalMap.keySet()) {
|
||||
if (interval.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(interval), Allele.create(refBase, true));
|
||||
intervalMap.remove(interval);
|
||||
}
|
||||
// output empty statistics for uncovered intervals
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
final IntervalStratification stats = intervalMap.get(interval);
|
||||
outputStatsToVCF(stats != null ? stats : createIntervalStatistic(interval), UNCOVERED_ALLELE);
|
||||
if (stats != null) intervalMap.remove(interval);
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// remove any potential leftover interval in intervalMap (this will only happen when we have overlapping intervals)
|
||||
for (GenomeLoc key : intervalMap.keySet()) {
|
||||
if (key.isBefore(refLocus)) {
|
||||
outputStatsToVCF(intervalMap.get(key), Allele.create(refBase, true));
|
||||
intervalMap.remove(key);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -269,17 +230,9 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
*/
|
||||
private void addNewOverlappingIntervals(GenomeLoc refLocus) {
|
||||
GenomeLoc interval = intervalListIterator.peek();
|
||||
|
||||
// skip any intervals with no coverage that we have passed
|
||||
while (interval != null && interval.isBefore(refLocus)) {
|
||||
intervalListIterator.next(); // discard the interval (we've already added it to the map)
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
|
||||
// add any intervals that overlap this one
|
||||
while (interval != null && !interval.isPast(refLocus)) {
|
||||
intervalMap.put(interval, createIntervalStatistic(interval));
|
||||
intervalListIterator.next(); // discard the interval (we've already added it to the map)
|
||||
intervalListIterator.next();
|
||||
interval = intervalListIterator.peek();
|
||||
}
|
||||
}
|
||||
|
|
@ -290,7 +243,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param stats The statistics of the interval
|
||||
* @param refAllele the reference allele
|
||||
*/
|
||||
private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) {
|
||||
private void outputStatsToVCF(IntervalStratification stats, Allele refAllele) {
|
||||
GenomeLoc interval = stats.getInterval();
|
||||
|
||||
|
||||
|
|
@ -302,37 +255,26 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
alleles.add(SYMBOLIC_ALLELE);
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles);
|
||||
|
||||
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF
|
||||
vcb.filters(new HashSet<String>(statusesToStrings(stats.callableStatuses(thresholds), true)));
|
||||
vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR);
|
||||
vcb.filters(new LinkedHashSet<String>(statusToStrings(stats.callableStatuses(), true)));
|
||||
|
||||
attributes.put(VCFConstants.END_KEY, interval.getStop());
|
||||
attributes.put(ThresHolder.AVG_INTERVAL_DP_KEY, stats.averageCoverage());
|
||||
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
|
||||
|
||||
vcb = vcb.attributes(attributes);
|
||||
if (debug) {
|
||||
System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage());
|
||||
}
|
||||
for (String sample : samples) {
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sample);
|
||||
|
||||
SampleStatistics sampleStat = stats.getSample(sample);
|
||||
gb.attribute(ThresHolder.AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage());
|
||||
gb.attribute("Q1", sampleStat.getQuantileDepth(0.25));
|
||||
gb.attribute("MED", sampleStat.getQuantileDepth(0.50));
|
||||
gb.attribute("Q3", sampleStat.getQuantileDepth(0.75));
|
||||
SampleStratification sampleStat = stats.getSampleStatistics(sample);
|
||||
gb.attribute(AVG_INTERVAL_DP_KEY, sampleStat.averageCoverage(interval.size()));
|
||||
|
||||
if (debug) {
|
||||
System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads());
|
||||
}
|
||||
gb.filters(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds), false));
|
||||
gb.filters(statusToStrings(stats.getSampleStatistics(sample).callableStatuses(), false));
|
||||
|
||||
genotypes.add(gb.make());
|
||||
}
|
||||
vcb = vcb.genotypes(genotypes);
|
||||
|
||||
|
||||
vcfWriter.add(vcb.make());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -341,17 +283,74 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
* @param statuses the set of statuses to be converted
|
||||
* @return a matching set of strings
|
||||
*/
|
||||
private List<String> statusesToStrings(Set<CallableStatus> statuses, final boolean includePASS) {
|
||||
List<String> output = new ArrayList<String>(statuses.size());
|
||||
private List<String> statusToStrings(Iterable<CallableStatus> statuses, final boolean isInfoField) {
|
||||
List<String> output = new LinkedList<String>();
|
||||
|
||||
for (CallableStatus status : statuses)
|
||||
if ( includePASS || status != CallableStatus.PASS ) // adding pass => results in a filter for genotypes
|
||||
if ( isInfoField || status != CallableStatus.PASS )
|
||||
output.add(status.name());
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
private IntervalStatistics createIntervalStatistic(GenomeLoc interval) {
|
||||
return new IntervalStatistics(samples, interval);
|
||||
private IntervalStratification createIntervalStatistic(GenomeLoc interval) {
|
||||
return new IntervalStratification(samples, interval, thresholds);
|
||||
}
|
||||
|
||||
/**
 * Instantiates every LocusMetric, SampleMetric and IntervalMetric plugin class returned by
 * PluginManager.getPlugins(), calls initialize(thresholds) on each new instance and appends it to
 * the matching list on the thresholds object (locusMetricList, sampleMetricList, intervalMetricList).
 * The lists are only appended to here, never cleared.
 *
 * @param thresholds passed to every metric's initialize() call; also owns the three metric lists that are filled
 * @throws DynamicClassResolutionException wrapping any exception raised while creating, initializing or registering a plugin
 */
protected static void loadAllPlugins(final ThresHolder thresholds) {
|
||||
for (Class<?> stat : new PluginManager<LocusMetric>(LocusMetric.class).getPlugins()) { // locus-level metrics
|
||||
try {
|
||||
final LocusMetric stats = (LocusMetric) stat.newInstance(); // reflective no-arg construction. NOTE(review): Class.newInstance() is deprecated as of Java 9
|
||||
stats.initialize(thresholds);
|
||||
thresholds.locusMetricList.add(stats);
|
||||
} catch (Exception e) { // broad on purpose: every failure in the try block is reported against the plugin class
|
||||
throw new DynamicClassResolutionException(stat, e);
|
||||
}
|
||||
}
|
||||
|
||||
for (Class<?> stat : new PluginManager<SampleMetric>(SampleMetric.class).getPlugins()) { // sample-level metrics, same steps as above
|
||||
try {
|
||||
final SampleMetric stats = (SampleMetric) stat.newInstance();
|
||||
stats.initialize(thresholds);
|
||||
thresholds.sampleMetricList.add(stats);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(stat, e);
|
||||
}
|
||||
}
|
||||
|
||||
for (Class<?> stat : new PluginManager<IntervalMetric>(IntervalMetric.class).getPlugins()) { // interval-level metrics, same steps as above
|
||||
try {
|
||||
final IntervalMetric stats = (IntervalMetric) stat.newInstance();
|
||||
stats.initialize(thresholds);
|
||||
thresholds.intervalMetricList.add(stats);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(stat, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds the header lines for the VCF writer: the END and average-depth INFO lines, a "Diagnose Targets" flag, the genotype-filter and average-depth FORMAT lines, and one FILTER line per CallableStatus value.
|
||||
*
|
||||
* @return a new HashSet holding those header lines
|
||||
*/
|
||||
private static Set<VCFHeaderLine> getHeaderInfo() {
|
||||
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// INFO fields for overall data
|
||||
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
|
||||
headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
|
||||
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // NOTE(review): this INFO ID contains a space; confirm that is valid for the VCF version being written
|
||||
|
||||
// FORMAT fields for each genotype
|
||||
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average sample depth across the interval. Sum of the sample specific depth in all loci divided by interval size.")); // same key as the INFO line above, declared again at the per-sample (FORMAT) level
|
||||
|
||||
// FILTER fields: one per CallableStatus constant, named after the constant and described by its description field
|
||||
for (CallableStatus stat : CallableStatus.values())
|
||||
headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));
|
||||
|
||||
return headerLines;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* Marker type for interval-level metrics: it declares no members of its own, so its whole contract comes from Metric.
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:30 PM
|
||||
* Elsewhere in this package, plugin classes of this type are instantiated by loadAllPlugins into ThresHolder.intervalMetricList and queried by IntervalStratification.callableStatuses().
|
||||
*/
|
||||
interface IntervalMetric extends Metric {
|
||||
}
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Stratification for a single interval: holds one SampleStratification per sample name, forwards
 * each locus' per-sample pileup to it, and derives the interval's callable statuses from the
 * status tally plus the interval metrics registered on the thresholds object.
 */
final class IntervalStratification extends AbstractStratification {
|
||||
private final Map<String, AbstractStratification> samples; // sample name -> its SampleStratification; presumably typed as the base class so getElements() can return values() directly
|
||||
private final GenomeLoc interval; // the interval this object covers
|
||||
private final ThresHolder thresholds; // supplies votePercentageThreshold and intervalMetricList, and is handed to every SampleStratification
|
||||
|
||||
/** Creates one SampleStratification (built from interval and thresholds) for each name in samples. */
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
||||
this.interval = interval;
|
||||
this.thresholds = thresholds;
|
||||
this.samples = new HashMap<String, AbstractStratification>(samples.size());
|
||||
for (String sample : samples)
|
||||
this.samples.put(sample, new SampleStratification(interval, thresholds));
|
||||
}
|
||||
|
||||
/** Returns the stratification stored for the given sample name, or null when the name is not in the map. */
public SampleStratification getSampleStatistics(String sample) {
|
||||
return (SampleStratification) samples.get(sample); // cast is safe: the constructor is the only writer and stores SampleStratification values only
|
||||
}
|
||||
|
||||
/** Returns the interval passed to the constructor. */
public GenomeLoc getInterval() {
|
||||
return interval;
|
||||
}
|
||||
|
||||
/** Returns the number of samples held by this interval. */
public int getNSamples() {
|
||||
return samples.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds one locus to this interval: the context's base pileup is split by sample name and each sample's pileup,
|
||||
* together with the context's location, is handed to that sample's SampleStratification. Throws ReviewedStingException if the split yields a sample name this interval does not hold.
|
||||
*
|
||||
* @param context The alignment context given from the walker
|
||||
*/
|
||||
public void addLocus(AlignmentContext context) {
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
|
||||
Map<String, ReadBackedPileup> samplePileups = pileup.getPileupsForSamples(samples.keySet()); // only the sample names held by this interval are requested
|
||||
|
||||
for (Map.Entry<String, ReadBackedPileup> entry : samplePileups.entrySet()) {
|
||||
String sample = entry.getKey();
|
||||
ReadBackedPileup samplePileup = entry.getValue();
|
||||
SampleStratification sampleStratification = (SampleStratification) samples.get(sample);
|
||||
|
||||
if (sampleStratification == null) // guard against a pileup key that is not one of this interval's samples
|
||||
throw new ReviewedStingException(String.format("Trying to add locus statistics to a sample (%s) that doesn't exist in the Interval.", sample));
|
||||
|
||||
sampleStratification.addLocus(context.getLocation(), samplePileup);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc} Here the elements are the per-sample stratifications, returned as the live values() view of the internal map.
|
||||
*/
|
||||
@Override
|
||||
public Iterable<AbstractStratification> getElements() {
|
||||
return samples.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc} A status is reported when its count from getStatusTally(), divided by the number of samples, is strictly greater than thresholds.votePercentageThreshold; statuses returned by queryStatus for the interval metrics are appended after those.
|
||||
*/
|
||||
@Override
|
||||
public Iterable<CallableStatus> callableStatuses() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// check if any of the votes pass the threshold
|
||||
final int nSamples = getNSamples();
|
||||
for (Map.Entry<CallableStatus, Integer> entry : getStatusTally().entrySet()) {
|
||||
if ((double) entry.getValue() / nSamples > thresholds.votePercentageThreshold) { // cast forces floating-point division; strict '>' means a ratio exactly at the threshold is not reported
|
||||
output.add(entry.getKey());
|
||||
}
|
||||
}
|
||||
|
||||
output.addAll(queryStatus(thresholds.intervalMetricList, this)); // interval-level metrics evaluated against this object
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:29 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
interface LocusMetric extends Metric {
|
||||
public CallableStatus sampleStatus (SampleStratification sampleStratification);
|
||||
}
|
||||
|
|
@ -44,55 +44,30 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
|
||||
import java.util.Arrays;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:44 PM
|
||||
*/
|
||||
// simple node class for storing kmer sequences
|
||||
@Invariant("kmer > 0")
|
||||
public class DeBruijnVertex {
|
||||
final class LocusMetricCoverageGap implements LocusMetric {
|
||||
private double threshold;
|
||||
private static final CallableStatus CALL = CallableStatus.COVERAGE_GAPS;
|
||||
|
||||
protected final byte[] sequence;
|
||||
public final int kmer;
|
||||
|
||||
public DeBruijnVertex( final byte[] sequence, final int kmer ) {
|
||||
this.sequence = sequence.clone();
|
||||
this.kmer = kmer;
|
||||
@Override
|
||||
public void initialize(ThresHolder thresholds) {
|
||||
threshold = thresholds.coverageStatusThreshold;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals( Object v ) {
|
||||
return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence);
|
||||
public CallableStatus status(AbstractStratification statistics) {
|
||||
final LocusStratification locusStratification = (LocusStratification) statistics;
|
||||
return locusStratification.getRawCoverage() == 0 ? CALL : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
|
||||
return Arrays.hashCode(sequence);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return new String(sequence);
|
||||
}
|
||||
|
||||
public String getSuffixString() {
|
||||
return new String(getSuffix());
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public byte[] getSequence() {
|
||||
return sequence.clone();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public byte[] getSuffix() {
|
||||
return Arrays.copyOfRange( sequence, kmer - 1, sequence.length );
|
||||
public CallableStatus sampleStatus(SampleStratification sampleStratification) {
|
||||
return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:44 PM
|
||||
*/
|
||||
final class LocusMetricExcessiveCoverage implements LocusMetric {
|
||||
private int excessiveCoverage;
|
||||
private double threshold;
|
||||
private static final CallableStatus CALL = CallableStatus.EXCESSIVE_COVERAGE ;
|
||||
|
||||
@Override
|
||||
public void initialize(ThresHolder thresholds) {
|
||||
this.excessiveCoverage = thresholds.maximumCoverage;
|
||||
this.threshold = thresholds.coverageStatusThreshold;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus status(AbstractStratification statistics) {
|
||||
final LocusStratification locusStratification = (LocusStratification) statistics;
|
||||
return locusStratification.getCoverage() > excessiveCoverage ? CALL : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus sampleStatus(SampleStratification sampleStratification) {
|
||||
return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:44 PM
|
||||
*/
|
||||
final class LocusMetricLowCoverage implements LocusMetric {
|
||||
private int minCoverage;
|
||||
private double threshold;
|
||||
private static final CallableStatus CALL = CallableStatus.LOW_COVERAGE ;
|
||||
|
||||
@Override
|
||||
public void initialize(ThresHolder thresholds) {
|
||||
this.minCoverage = thresholds.minimumCoverage;
|
||||
this.threshold = thresholds.coverageStatusThreshold;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus status(AbstractStratification statistics) {
|
||||
final LocusStratification locusStratification = (LocusStratification) statistics;
|
||||
final long raw = locusStratification.getRawCoverage();
|
||||
return raw > 0 && raw < minCoverage ? CALL: null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus sampleStatus(SampleStratification sampleStratification) {
|
||||
return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:44 PM
|
||||
*/
|
||||
final class LocusMetricPoorQuality implements LocusMetric {
|
||||
private int minCoverage;
|
||||
private double threshold;
|
||||
private static final CallableStatus CALL = CallableStatus.POOR_QUALITY ;
|
||||
|
||||
@Override
|
||||
public void initialize(ThresHolder thresholds) {
|
||||
this.minCoverage = thresholds.minimumCoverage;
|
||||
this.threshold = thresholds.coverageStatusThreshold;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus status(AbstractStratification statistics) {
|
||||
final LocusStratification locusStratification = (LocusStratification) statistics;
|
||||
return locusStratification.getCoverage() < minCoverage && locusStratification.getRawCoverage() >= minCoverage ? CALL: null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus sampleStatus(SampleStratification sampleStratification) {
|
||||
return PluginUtils.genericSampleStatus(sampleStratification, CALL, threshold);
|
||||
}
|
||||
}
|
||||
|
|
@ -44,61 +44,54 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
class LocusStatistics {
|
||||
private final int coverage;
|
||||
private final int rawCoverage;
|
||||
final class LocusStratification extends AbstractStratification {
|
||||
private long coverage;
|
||||
private long rawCoverage;
|
||||
private final List<Metric> locusStatisticsList;
|
||||
|
||||
public LocusStatistics() {
|
||||
this.coverage = 0;
|
||||
this.rawCoverage = 0;
|
||||
public LocusStratification(ThresHolder thresholds) {
|
||||
this(0,0,thresholds);
|
||||
}
|
||||
|
||||
public LocusStatistics(int coverage, int rawCoverage) {
|
||||
protected LocusStratification(int coverage, int rawCoverage, ThresHolder thresholds) {
|
||||
this.coverage = coverage;
|
||||
this.rawCoverage = rawCoverage;
|
||||
this.locusStatisticsList = thresholds.locusMetricList;
|
||||
}
|
||||
|
||||
public int getCoverage() {
|
||||
return coverage;
|
||||
}
|
||||
@Override
|
||||
public long getCoverage() {return coverage;}
|
||||
public long getRawCoverage() {return rawCoverage;}
|
||||
|
||||
public int getRawCoverage() {
|
||||
return rawCoverage;
|
||||
public void addLocus(final int coverage, final int rawCoverage) {
|
||||
this.coverage = coverage;
|
||||
this.rawCoverage = rawCoverage;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates all applicable statuses from the coverages in this locus
|
||||
*
|
||||
* @param thresholds the class contains the statistical threshold for making calls
|
||||
* @return a set of all statuses that apply
|
||||
*/
|
||||
public Set<CallableStatus> callableStatuses(ThresHolder thresholds) {
|
||||
Set<CallableStatus> output = new HashSet<CallableStatus>();
|
||||
|
||||
// if too much coverage
|
||||
if (getCoverage() > thresholds.getMaximumCoverage())
|
||||
output.add(CallableStatus.EXCESSIVE_COVERAGE);
|
||||
|
||||
// if not enough coverage
|
||||
if (getCoverage() < thresholds.getMinimumCoverage()) {
|
||||
// was there a lot of low Qual coverage?
|
||||
if (getRawCoverage() >= thresholds.getMinimumCoverage())
|
||||
output.add(CallableStatus.POOR_QUALITY);
|
||||
// no?
|
||||
else {
|
||||
// is there any coverage?
|
||||
if (getRawCoverage() > 0)
|
||||
output.add(CallableStatus.LOW_COVERAGE);
|
||||
else
|
||||
output.add(CallableStatus.COVERAGE_GAPS);
|
||||
public List<CallableStatus> callableStatuses() {
|
||||
List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
for (Metric stats : locusStatisticsList) {
|
||||
CallableStatus status = stats.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<AbstractStratification> getElements() {
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 4/23/13
|
||||
*/
|
||||
interface Metric {
|
||||
public void initialize(ThresHolder thresholds);
|
||||
public CallableStatus status (AbstractStratification statistic);
|
||||
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 4/21/13
|
||||
* Time: 11:23 AM
|
||||
*/
|
||||
final class PluginUtils {
|
||||
public static CallableStatus genericSampleStatus (final SampleStratification sampleStratification, final CallableStatus CALL, final double threshold) {
|
||||
final Map<CallableStatus, Integer> totals = sampleStratification.getStatusTally();
|
||||
final int size = sampleStratification.getIntervalSize();
|
||||
final int statusCount = totals.containsKey(CALL) ? totals.get(CALL) : 0;
|
||||
return ( (double) statusCount / size) >= threshold ? CALL: null;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:30 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
interface SampleMetric extends Metric {
|
||||
}
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:44 PM
|
||||
*/
|
||||
final class SampleMetricBadMates implements SampleMetric {
|
||||
private static final CallableStatus CALL = CallableStatus.NO_READS ;
|
||||
|
||||
private double threshold;
|
||||
private double votingThreshold;
|
||||
|
||||
@Override
|
||||
public void initialize(ThresHolder thresholds) {
|
||||
threshold = thresholds.badMateStatusThreshold;
|
||||
votingThreshold = thresholds.votePercentageThreshold;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus status(AbstractStratification statistics) {
|
||||
final SampleStratification sampleStratification = (SampleStratification) statistics;
|
||||
final int nReads = sampleStratification.getnReads();
|
||||
return nReads > 0 && (double) sampleStratification.getnBadMates() / nReads > threshold ? CALL : null;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
/**
|
||||
* User: carneiro
|
||||
* Date: 4/20/13
|
||||
* Time: 11:44 PM
|
||||
*/
|
||||
final class SampleMetricNoReads implements SampleMetric {
|
||||
private static final CallableStatus CALL = CallableStatus.NO_READS;
|
||||
@Override
|
||||
public void initialize(ThresHolder thresholds) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public CallableStatus status(AbstractStratification statistics) {
|
||||
final SampleStratification sampleStratification = (SampleStratification) statistics;
|
||||
return sampleStratification.getnReads() == 0 ? CALL : null;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The statistics calculator for a specific sample given the interval
|
||||
*/
|
||||
final class SampleStratification extends AbstractStratification {
|
||||
private final GenomeLoc interval;
|
||||
private final ArrayList<AbstractStratification> loci;
|
||||
private final ThresHolder thresholds;
|
||||
|
||||
private int nReads = -1;
|
||||
private int nBadMates = -1;
|
||||
|
||||
public SampleStratification(final GenomeLoc interval, final ThresHolder thresholds) {
|
||||
this.interval = interval;
|
||||
this.loci = new ArrayList<AbstractStratification>(interval.size());
|
||||
this.thresholds = thresholds;
|
||||
nReads = 0;
|
||||
nBadMates = 0;
|
||||
|
||||
// Initialize every loci (this way we don't have to worry about non-existent loci in the object
|
||||
for (int i = 0; i < interval.size(); i++)
|
||||
this.loci.add(new LocusStratification(thresholds));
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple Getters
|
||||
*/
|
||||
public int getIntervalSize() {return interval.size();}
|
||||
public int getnReads() {return nReads;}
|
||||
public int getnBadMates() {return nBadMates;}
|
||||
|
||||
/**
|
||||
* Adds a locus to the interval wide stats
|
||||
*
|
||||
* @param locus The locus given as a GenomeLoc
|
||||
* @param pileup The pileup of that locus, this exclusively contains the sample
|
||||
*/
|
||||
public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) {
|
||||
if (!interval.containsP(locus))
|
||||
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval));
|
||||
|
||||
// a null pileup means there nothing to add
|
||||
if (pileup != null) {
|
||||
final int locusIndex = locus.getStart() - interval.getStart();
|
||||
final int rawCoverage = pileup.depthOfCoverage();
|
||||
final int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.minimumBaseQuality, thresholds.minimumMappingQuality).depthOfCoverage();
|
||||
final LocusStratification locusData = (LocusStratification) loci.get(locusIndex);
|
||||
locusData.addLocus(coverage, rawCoverage);
|
||||
|
||||
// process all the reads in this pileup (tallying number of reads and bad mates)
|
||||
for (GATKSAMRecord read : pileup.getReads())
|
||||
processRead(read);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<AbstractStratification> getElements() {
|
||||
return loci;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public Iterable<CallableStatus> callableStatuses() {
|
||||
final List<CallableStatus> output = new LinkedList<CallableStatus>();
|
||||
|
||||
// get the tally of all the locus callable statuses
|
||||
for (Metric locusStat : thresholds.locusMetricList) {
|
||||
final CallableStatus status = ((LocusMetric) locusStat).sampleStatus(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
|
||||
// get the sample specific statitics statuses
|
||||
for (Metric sampleStat : thresholds.sampleMetricList) {
|
||||
final CallableStatus status = sampleStat.status(this);
|
||||
if (status != null) {
|
||||
output.add(status);
|
||||
}
|
||||
}
|
||||
|
||||
// special case, if there are no reads, then there is no sense reporting coverage gaps.
|
||||
if (output.contains(CallableStatus.NO_READS) && output.contains(CallableStatus.COVERAGE_GAPS))
|
||||
output.remove(CallableStatus.COVERAGE_GAPS);
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Account for the read and check it for any statistics necessary. Reads are marked in the temporary
|
||||
* attribute "seen" to make sure they're not counted twice.
|
||||
*
|
||||
* @param read the read
|
||||
*/
|
||||
private void processRead(GATKSAMRecord read) {
|
||||
if (read.getTemporaryAttribute("seen") == null) {
|
||||
nReads++;
|
||||
if (read.getReadPairedFlag() && !read.getProperPairFlag())
|
||||
nBadMates++;
|
||||
read.setTemporaryAttribute("seen", true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -44,131 +44,101 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
class ThresHolder {
|
||||
public static final String AVG_INTERVAL_DP_KEY = "AVG_INTERVAL_DP";
|
||||
public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5);
|
||||
final class ThresHolder {
|
||||
|
||||
private final int minimumBaseQuality;
|
||||
private final int minimumMappingQuality;
|
||||
/**
|
||||
* Only bases with quality greater than this will be considered in the coverage metrics.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality", shortName = "BQ", doc = "The minimum Base Quality that is considered for calls", required = false)
|
||||
public int minimumBaseQuality = 20;
|
||||
|
||||
private final int minimumCoverage;
|
||||
private final int maximumCoverage;
|
||||
private final int minimumMedianDepth;
|
||||
/**
|
||||
* Only reads with mapping quality greater than this will be considered in the coverage metrics.
|
||||
*/
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "MQ", doc = "The minimum read mapping quality considered for calls", required = false)
|
||||
public int minimumMappingQuality = 20;
|
||||
|
||||
private final int maximumInsertSize;
|
||||
/**
|
||||
* If at any locus, a sample has less coverage than this, it will be reported as LOW_COVERAGE
|
||||
*/
|
||||
@Argument(fullName = "minimum_coverage", shortName = "min", doc = "The minimum allowable coverage, used for calling LOW_COVERAGE", required = false)
|
||||
public int minimumCoverage = 5;
|
||||
|
||||
private final double votePercentageThreshold;
|
||||
private final double lowMedianDepthThreshold;
|
||||
private final double badMateStatusThreshold;
|
||||
private final double coverageStatusThreshold;
|
||||
private final double excessiveCoverageThreshold;
|
||||
private final double qualityStatusThreshold;
|
||||
/**
|
||||
* If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE
|
||||
*/
|
||||
@Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false)
|
||||
public int maximumCoverage = 700;
|
||||
|
||||
public ThresHolder(int minimumBaseQuality,
|
||||
int minimumMappingQuality,
|
||||
int minimumCoverage,
|
||||
int maximumCoverage,
|
||||
int minimumMedianDepth,
|
||||
int maximumInsertSize,
|
||||
double votePercentageThreshold,
|
||||
double lowMedianDepthThreshold,
|
||||
double badMateStatusThreshold,
|
||||
double coverageStatusThreshold,
|
||||
double excessiveCoverageThreshold,
|
||||
double qualityStatusThreshold) {
|
||||
/**
|
||||
* If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE
|
||||
*/
|
||||
@Argument(fullName = "maximum_insert_size", shortName = "ins", doc = "The maximum allowed distance between a read and its mate", required = false)
|
||||
public int maximumInsertSize = 500;
|
||||
|
||||
/**
|
||||
* The proportion of samples that must have a status for it to filter the entire interval. Example: 8 out of 10 samples have low coverage status on the interval,
|
||||
* with a threshold higher than 0.2, this interval will be filtered as LOW_COVERAGE.
|
||||
*/
|
||||
@Argument(fullName = "voting_status_threshold", shortName = "stV", doc = "The needed proportion of samples containing a call for the interval to adopt the call ", required = false)
|
||||
public double votePercentageThreshold = 0.50;
|
||||
|
||||
/**
|
||||
* The proportion of reads in the loci that must have bad mates for the sample to be reported as BAD_MATE
|
||||
*/
|
||||
@Argument(fullName = "bad_mate_status_threshold", shortName = "stBM", doc = "The proportion of the loci needed for calling BAD_MATE", required = false)
|
||||
public double badMateStatusThreshold = 0.50;
|
||||
|
||||
/**
|
||||
* The proportion of loci in a sample that must fall under the LOW_COVERAGE or COVERAGE_GAPS category for the sample to be reported as either (or both)
|
||||
*/
|
||||
@Argument(fullName = "coverage_status_threshold", shortName = "stC", doc = "The proportion of the loci needed for calling LOW_COVERAGE and COVERAGE_GAPS", required = false)
|
||||
public double coverageStatusThreshold = 0.20;
|
||||
|
||||
/**
|
||||
* The proportion of loci in a sample that must fall under the EXCESSIVE_COVERAGE category for the sample to be reported as EXCESSIVE_COVERAGE
|
||||
*/
|
||||
@Argument(fullName = "excessive_coverage_status_threshold", shortName = "stXC", doc = "The proportion of the loci needed for calling EXCESSIVE_COVERAGE", required = false)
|
||||
public double excessiveCoverageThreshold = 0.20;
|
||||
|
||||
/**
|
||||
* The proportion of loci in a sample that must fall under the POOR_QUALITY category for the sample to be reported as POOR_QUALITY
|
||||
*/
|
||||
@Argument(fullName = "quality_status_threshold", shortName = "stQ", doc = "The proportion of the loci needed for calling POOR_QUALITY", required = false)
|
||||
public double qualityStatusThreshold = 0.50;
|
||||
|
||||
public final List<Metric> locusMetricList = new LinkedList<Metric>();
|
||||
public final List<Metric> sampleMetricList = new LinkedList<Metric>();
|
||||
public final List<Metric> intervalMetricList = new LinkedList<Metric>();
|
||||
|
||||
public ThresHolder() {}
|
||||
|
||||
public ThresHolder(final int minimumBaseQuality,
|
||||
final int minimumMappingQuality,
|
||||
final int minimumCoverage,
|
||||
final int maximumCoverage,
|
||||
final int maximumInsertSize,
|
||||
final double votePercentageThreshold,
|
||||
final double badMateStatusThreshold,
|
||||
final double coverageStatusThreshold,
|
||||
final double excessiveCoverageThreshold,
|
||||
final double qualityStatusThreshold) {
|
||||
this.minimumBaseQuality = minimumBaseQuality;
|
||||
this.minimumMappingQuality = minimumMappingQuality;
|
||||
this.minimumCoverage = minimumCoverage;
|
||||
this.maximumCoverage = maximumCoverage;
|
||||
this.minimumMedianDepth = minimumMedianDepth;
|
||||
this.maximumInsertSize = maximumInsertSize;
|
||||
this.votePercentageThreshold = votePercentageThreshold;
|
||||
this.lowMedianDepthThreshold = lowMedianDepthThreshold;
|
||||
this.badMateStatusThreshold = badMateStatusThreshold;
|
||||
this.coverageStatusThreshold = coverageStatusThreshold;
|
||||
this.excessiveCoverageThreshold = excessiveCoverageThreshold;
|
||||
this.qualityStatusThreshold = qualityStatusThreshold;
|
||||
}
|
||||
|
||||
public int getMinimumCoverage() {
|
||||
return minimumCoverage;
|
||||
}
|
||||
|
||||
public int getMaximumCoverage() {
|
||||
return maximumCoverage;
|
||||
}
|
||||
|
||||
public int getMinimumMedianDepth() {
|
||||
return minimumMedianDepth;
|
||||
}
|
||||
|
||||
public int getMaximumInsertSize() {
|
||||
return maximumInsertSize;
|
||||
}
|
||||
|
||||
public double getVotePercentageThreshold() {
|
||||
return votePercentageThreshold;
|
||||
}
|
||||
|
||||
public double getLowMedianDepthThreshold() {
|
||||
return lowMedianDepthThreshold;
|
||||
}
|
||||
|
||||
public double getBadMateStatusThreshold() {
|
||||
return badMateStatusThreshold;
|
||||
}
|
||||
|
||||
public double getCoverageStatusThreshold() {
|
||||
return coverageStatusThreshold;
|
||||
}
|
||||
|
||||
public double getExcessiveCoverageThreshold() {
|
||||
return excessiveCoverageThreshold;
|
||||
}
|
||||
|
||||
public double getQualityStatusThreshold() {
|
||||
return qualityStatusThreshold;
|
||||
}
|
||||
|
||||
public int getFilteredCoverage(ReadBackedPileup pileup) {
|
||||
return pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the header lines for the VCF writer
|
||||
*
|
||||
* @return A set of VCF header lines
|
||||
*/
|
||||
public static Set<VCFHeaderLine> getHeaderInfo() {
|
||||
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
|
||||
|
||||
// INFO fields for overall data
|
||||
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
|
||||
headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
|
||||
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
|
||||
|
||||
// FORMAT fields for each genotype
|
||||
// todo -- find the appropriate VCF constants
|
||||
headerLines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
|
||||
headerLines.add(new VCFFormatHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
|
||||
headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution."));
|
||||
headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution."));
|
||||
headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution."));
|
||||
|
||||
|
||||
// FILTER fields
|
||||
for (CallableStatus stat : CallableStatus.values())
|
||||
headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description));
|
||||
|
||||
return headerLines;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,320 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* The statistics calculator for a specific sample given the interval
|
||||
*/
|
||||
class SampleStatistics {
|
||||
private final GenomeLoc interval;
|
||||
private final ArrayList<LocusStatistics> loci;
|
||||
|
||||
private int[] preSortedDepths = null;
|
||||
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
|
||||
|
||||
private int nReads = -1;
|
||||
private int nBadMates = -1;
|
||||
|
||||
private SampleStatistics(GenomeLoc interval, ArrayList<LocusStatistics> loci) {
|
||||
this.interval = interval;
|
||||
this.loci = loci;
|
||||
nReads = 0;
|
||||
nBadMates = 0;
|
||||
}
|
||||
|
||||
public SampleStatistics(GenomeLoc interval) {
|
||||
this(interval, new ArrayList<LocusStatistics>(interval.size()));
|
||||
|
||||
// Initialize every loci (this way we don't have to worry about non-existent loci in the object
|
||||
for (int i = 0; i < interval.size(); i++)
|
||||
this.loci.add(new LocusStatistics());
|
||||
|
||||
}
|
||||
|
||||
public long totalCoverage() {
|
||||
if (preComputedTotalCoverage < 0)
|
||||
calculateTotalCoverage();
|
||||
return preComputedTotalCoverage;
|
||||
}
|
||||
|
||||
public double averageCoverage() {
|
||||
if (preComputedTotalCoverage < 0)
|
||||
calculateTotalCoverage();
|
||||
return (double) preComputedTotalCoverage / loci.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the callable statuses of the entire sample
|
||||
*
|
||||
* @param thresholds the class contains the statistical threshold for making calls
|
||||
* @return the callable statuses of the entire sample
|
||||
*/
|
||||
public Set<CallableStatus> getCallableStatuses(ThresHolder thresholds) {
|
||||
// We check if reads are present to prevent div / 0 exceptions
|
||||
if (nReads == 0) {
|
||||
return Collections.singleton(CallableStatus.NO_READS);
|
||||
}
|
||||
|
||||
Set<CallableStatus> output = new HashSet<CallableStatus>();
|
||||
Map<CallableStatus, Double> totals = new HashMap<CallableStatus, Double>(CallableStatus.values().length);
|
||||
|
||||
// initialize map
|
||||
for (CallableStatus status : CallableStatus.values())
|
||||
totals.put(status, 0.0);
|
||||
|
||||
// sum up all the callable statuses for each locus
|
||||
for (int i = 0; i < interval.size(); i++) {
|
||||
for (CallableStatus status : callableStatus(i, thresholds)) {
|
||||
double count = totals.get(status);
|
||||
|
||||
totals.put(status, count + 1);
|
||||
}
|
||||
}
|
||||
|
||||
double intervalSize = interval.size();
|
||||
|
||||
if (((double) nBadMates / nReads) >= thresholds.getBadMateStatusThreshold())
|
||||
output.add(CallableStatus.BAD_MATE);
|
||||
|
||||
if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) >= thresholds.getCoverageStatusThreshold())
|
||||
output.add(CallableStatus.COVERAGE_GAPS);
|
||||
|
||||
if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) >= thresholds.getCoverageStatusThreshold())
|
||||
output.add(CallableStatus.LOW_COVERAGE);
|
||||
|
||||
if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) >= thresholds.getExcessiveCoverageThreshold())
|
||||
output.add(CallableStatus.EXCESSIVE_COVERAGE);
|
||||
|
||||
if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold())
|
||||
output.add(CallableStatus.POOR_QUALITY);
|
||||
|
||||
if (totals.get(CallableStatus.REF_N) > 0)
|
||||
output.add(CallableStatus.REF_N);
|
||||
|
||||
|
||||
if (output.isEmpty()) {
|
||||
output.add(CallableStatus.PASS);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a locus to the interval wide stats
|
||||
*
|
||||
* @param locus The locus given as a GenomeLoc
|
||||
* @param pileup The pileup of that locus, this exclusively contains the sample
|
||||
* @param thresholds the class contains the statistical threshold for making calls
|
||||
*/
|
||||
public void addLocus(GenomeLoc locus, ReadBackedPileup pileup, ThresHolder thresholds) {
|
||||
if (!interval.containsP(locus))
|
||||
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval %s", locus, interval));
|
||||
|
||||
// a null pileup means there is nothing to add
|
||||
if (pileup != null) {
|
||||
|
||||
int locusIndex = locus.getStart() - interval.getStart();
|
||||
|
||||
int rawCoverage = pileup.depthOfCoverage();
|
||||
int coverage = thresholds.getFilteredCoverage(pileup);
|
||||
|
||||
LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage);
|
||||
|
||||
loci.set(locusIndex, locusData);
|
||||
|
||||
for (GATKSAMRecord read : pileup.getReads())
|
||||
processRead(read, thresholds);
|
||||
}
|
||||
}
|
||||
|
||||
private void processRead(GATKSAMRecord read, ThresHolder thresholds) {
|
||||
// Was this read already processed?
|
||||
if (read.getTemporaryAttribute("checkedBadMate") == null) {
|
||||
nReads++;
|
||||
if (!hasValidMate(read, thresholds))
|
||||
nBadMates++;
|
||||
read.setTemporaryAttribute("checkedBadMate", true);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the callable status of a given locus without taking the reference base into account.
|
||||
*
|
||||
* @param locusIndex location in the genome to inquire (only one locus)
|
||||
* @param thresholds the class contains the statistical threshold for making calls
|
||||
* @return the callable status of a locus
|
||||
*/
|
||||
private Set<CallableStatus> callableStatus(int locusIndex, ThresHolder thresholds) {
|
||||
LocusStatistics locus = loci.get(locusIndex);
|
||||
|
||||
return locus.callableStatuses(thresholds);
|
||||
}
|
||||
|
||||
private void calculateTotalCoverage() {
|
||||
preComputedTotalCoverage = 0;
|
||||
for (LocusStatistics locus : loci)
|
||||
preComputedTotalCoverage += locus.getCoverage();
|
||||
}
|
||||
|
||||
public double getQuantileDepth(double percentage) {
|
||||
if (preSortedDepths == null)
|
||||
getDepthsAsSortedArray();
|
||||
|
||||
return getQuartile(preSortedDepths, percentage);
|
||||
}
|
||||
|
||||
static double getQuartile(int[] data, double percentage) {
|
||||
int size = data.length;
|
||||
if (size == 1)
|
||||
return (double) data[0];
|
||||
|
||||
if (percentage == 0.5) {
|
||||
return getMedian(data);
|
||||
}
|
||||
|
||||
double position = (size - 1.0) / 2;
|
||||
if (percentage == 0.25) {
|
||||
// if the position is a whole number
|
||||
return getMedian(Arrays.copyOfRange(data, 0, (int) position + 1));
|
||||
|
||||
}
|
||||
if (percentage == 0.75) {
|
||||
if (position % 1 == 0) {
|
||||
return getMedian(Arrays.copyOfRange(data, (int) position, size));
|
||||
} else {
|
||||
return getMedian(Arrays.copyOfRange(data, (int) position + 1, size));
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Assumes data is sorted
|
||||
private static double getMedian(int[] data) {
|
||||
double size = (double) data.length;
|
||||
if (size == 1)
|
||||
return (double) data[0];
|
||||
|
||||
double position = (size - 1.0) / 2;
|
||||
|
||||
if (position % 1 == 0)
|
||||
return (double) data[(int) position];
|
||||
|
||||
else {
|
||||
double high = (double) data[(int) Math.ceil(position)];
|
||||
double low = (double) data[(int) Math.floor(position)];
|
||||
|
||||
return (high + low) / 2;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void getDepthsAsSortedArray() {
|
||||
preSortedDepths = new int[loci.size()];
|
||||
|
||||
for (int i = 0; i < loci.size(); i++)
|
||||
preSortedDepths[i] = loci.get(i).getCoverage();
|
||||
|
||||
Arrays.sort(preSortedDepths);
|
||||
}
|
||||
|
||||
boolean hasValidMate(GATKSAMRecord read, ThresHolder thresholds) {
|
||||
/** Check the following
|
||||
* Does it have a pair?
|
||||
* reasonable insert size?
|
||||
* inverted?
|
||||
* same orientation?
|
||||
* same contig?
|
||||
* is pair mapped?
|
||||
* todo - is forced mate?
|
||||
*
|
||||
*/
|
||||
|
||||
// has NO pair
|
||||
if (!read.getReadPairedFlag())
|
||||
return false;
|
||||
|
||||
// different contigs
|
||||
if (!read.getMateReferenceIndex().equals(read.getReferenceIndex()))
|
||||
return false;
|
||||
|
||||
// unmapped
|
||||
if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag())
|
||||
return false;
|
||||
|
||||
// same orientation
|
||||
if (read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag())
|
||||
return false;
|
||||
|
||||
// inverted
|
||||
if (read.getReadNegativeStrandFlag() ==
|
||||
read.getAlignmentStart() < read.getMateAlignmentStart())
|
||||
return false;
|
||||
|
||||
// TODO note: IGV uses a different algorithm for insert size, there should be a common util class that does this for you
|
||||
// mates are too far apart
|
||||
if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize())
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public int getnReads() {
|
||||
return nReads;
|
||||
}
|
||||
|
||||
public int getnBadMates() {
|
||||
return nBadMates;
|
||||
}
|
||||
}
|
||||
|
|
@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
* @param capBaseQualsAtMappingQual Cap base at mapping qual
|
||||
* @param minBaseQual Minimum base quality to consider
|
||||
* @param errorModel Site error model
|
||||
* @return Number of bases added
|
||||
* @return Number of bases added - only good bases actually added to GLs are counted.
|
||||
*/
|
||||
private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) {
|
||||
// Number of [A C G T]'s in pileup, in that order
|
||||
|
|
@ -235,28 +235,29 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
for (byte b: BaseUtils.BASES)
|
||||
numSeenBases.add(0);
|
||||
|
||||
if (hasReferenceSampleData) {
|
||||
// count number of elements in pileup
|
||||
for (PileupElement elt : pileup) {
|
||||
byte obsBase = elt.getBase();
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
if ( qual == 0 )
|
||||
continue;
|
||||
|
||||
int idx = 0;
|
||||
|
||||
for (byte base:BaseUtils.BASES) {
|
||||
int cnt = numSeenBases.get(idx);
|
||||
numSeenBases.set(idx++,cnt + (base == obsBase?1:0));
|
||||
|
||||
}
|
||||
|
||||
int nGoodBases = 0;
|
||||
// count number of elements in pileup
|
||||
for (PileupElement elt : pileup) {
|
||||
byte obsBase = elt.getBase();
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
if ( qual == 0 )
|
||||
continue;
|
||||
|
||||
int idx = 0;
|
||||
|
||||
for (byte base:BaseUtils.BASES) {
|
||||
int cnt = numSeenBases.get(idx);
|
||||
numSeenBases.set(idx++,cnt + (base == obsBase?1:0));
|
||||
|
||||
}
|
||||
if (VERBOSE)
|
||||
System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3));
|
||||
nGoodBases++;
|
||||
}
|
||||
|
||||
if (VERBOSE)
|
||||
System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3));
|
||||
|
||||
computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup);
|
||||
return pileup.getNumberOfElements();
|
||||
return nGoodBases;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -281,7 +282,8 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
double p1 = 0.0;
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
// no error model: loop throught pileup to compute likalihoods just on base qualities
|
||||
// no error model: loop through pileup to compute likelihoods just on base qualities
|
||||
// In this case, vector numObservations is not used directly for GL computation
|
||||
for (final PileupElement elt : pileup) {
|
||||
final byte obsBase = elt.getBase();
|
||||
final byte qual = qualToUse(elt, true, true, mbq);
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
final ReadBackedPileup pileup = context.getBasePileup();
|
||||
if (pileup != null) {
|
||||
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
|
||||
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog);
|
||||
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()));
|
||||
b.PL(genotypeLikelihoods);
|
||||
b.DP(getFilteredDepth(pileup));
|
||||
genotypes.add(b.make());
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
final Double contamination = UAC.getSampleContamination().get(sample.getKey());
|
||||
if( contamination > 0.0 ) //no need to enter if no contamination reduction
|
||||
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog);
|
||||
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination);
|
||||
if ( useBAQedPileup )
|
||||
pileup = createBAQedPileup(pileup);
|
||||
|
||||
|
|
|
|||
|
|
@ -113,12 +113,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false)
|
||||
public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25;
|
||||
|
||||
/**
|
||||
* This argument informs the prior probability of having an indel at a site.
|
||||
*/
|
||||
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
|
||||
public double INDEL_HETEROZYGOSITY = 1.0/8000;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName = "indelGapContinuationPenalty", shortName = "indelGCP", doc = "Indel gap continuation penalty, as Phred-scaled probability. I.e., 30 => 10^-30/10", required = false)
|
||||
public byte INDEL_GAP_CONTINUATION_PENALTY = 10;
|
||||
|
|
@ -238,7 +232,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION;
|
||||
this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE;
|
||||
this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY;
|
||||
this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY;
|
||||
this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY;
|
||||
this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO;
|
||||
|
|
|
|||
|
|
@ -86,17 +86,17 @@ import java.util.*;
|
|||
* both single sample data and multi-sample data.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* The read data from which to make variant calls.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A raw, unfiltered, highly sensitive callset in VCF format.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Example generic command for multi-sample SNP calling</h2>
|
||||
* <h3>Example generic command for multi-sample SNP calling</h3>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R resources/Homo_sapiens_assembly18.fasta \
|
||||
|
|
@ -117,7 +117,7 @@ import java.util.*;
|
|||
* argument descriptions below.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Example command for generating calls at all sites</h2>
|
||||
* <h3>Example command for generating calls at all sites</h3>
|
||||
* <pre>
|
||||
* java -jar /path/to/GenomeAnalysisTK.jar \
|
||||
* -l INFO \
|
||||
|
|
@ -128,7 +128,7 @@ import java.util.*;
|
|||
* --output_mode EMIT_ALL_SITES
|
||||
* </pre>
|
||||
*
|
||||
* <h2>Caveats</h2>
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and
|
||||
* file formats are likely to change.</li>
|
||||
|
|
@ -167,7 +167,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
@Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
||||
|
|
@ -180,7 +180,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
* A raw, unfiltered, highly sensitive callset in VCF format.
|
||||
*/
|
||||
//@Gather(className = "org.broadinstitute.sting.queue.extensions.gatk.CatVariantsGatherer")
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
@Output(doc="File to which variants should be written")
|
||||
protected VariantContextWriter writer = null;
|
||||
|
||||
@Hidden
|
||||
|
|
@ -205,7 +205,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
protected List<String> annotationsToExclude = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
* If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups.
|
||||
* Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs.
|
||||
*/
|
||||
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
|
||||
protected String[] annotationClassesToUse = { "Standard" };
|
||||
|
|
|
|||
|
|
@ -159,8 +159,8 @@ public class UnifiedGenotyperEngine {
|
|||
this.N = samples.size() * ploidy;
|
||||
log10AlleleFrequencyPriorsSNPs = new double[N+1];
|
||||
log10AlleleFrequencyPriorsIndels = new double[N+1];
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity,UAC.inputPrior);
|
||||
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY, UAC.inputPrior);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
|
||||
|
|
@ -385,11 +385,23 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
|
||||
|
||||
// TODO TODO TODO TODO
|
||||
// REFACTOR THIS FUNCTION, TOO UNWIELDY!!
|
||||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( afcm.get() == null ) {
|
||||
afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
|
||||
}
|
||||
|
||||
// if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call
|
||||
if (!canVCbeGenotyped(vc)) {
|
||||
if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext)
|
||||
return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext);
|
||||
else
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
// estimate our confidence in a reference call and return
|
||||
if ( vc.getNSamples() == 0 ) {
|
||||
if ( limitedContext )
|
||||
|
|
@ -544,6 +556,23 @@ public class UnifiedGenotyperEngine {
|
|||
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed.
|
||||
* @param vc Input VC
|
||||
* @return Status check
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
protected boolean canVCbeGenotyped(final VariantContext vc) {
|
||||
// protect against too many alternate alleles that we can't even run AF on:
|
||||
if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
|
||||
logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
|
||||
" alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart());
|
||||
return false;
|
||||
}
|
||||
else return true;
|
||||
|
||||
}
|
||||
|
||||
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
|
||||
|
||||
if ( !BaseUtils.isRegularBase(refContext.getBase()) )
|
||||
|
|
@ -570,9 +599,9 @@ public class UnifiedGenotyperEngine {
|
|||
int numDeletions = 0;
|
||||
for ( final PileupElement p : rawContext.getBasePileup() ) {
|
||||
if ( p.isDeletion() )
|
||||
numDeletions++;
|
||||
numDeletions += p.getRepresentativeCount();
|
||||
}
|
||||
if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) {
|
||||
if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -581,20 +610,8 @@ public class UnifiedGenotyperEngine {
|
|||
return stratifiedContexts;
|
||||
}
|
||||
|
||||
private final static double[] binomialProbabilityDepthCache = new double[10000];
|
||||
private final static double REF_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5);
|
||||
|
||||
static {
|
||||
for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) {
|
||||
binomialProbabilityDepthCache[i] = MathUtils.log10BinomialProbability(i, 0, REF_BINOMIAL_PROB_LOG10_0_5);
|
||||
}
|
||||
}
|
||||
|
||||
private final double getRefBinomialProbLog10(final int depth) {
|
||||
if ( depth < binomialProbabilityDepthCache.length )
|
||||
return binomialProbabilityDepthCache[depth];
|
||||
else
|
||||
return MathUtils.log10BinomialProbability(depth, 0, REF_BINOMIAL_PROB_LOG10_0_5);
|
||||
return MathUtils.log10BinomialProbability(depth, 0);
|
||||
}
|
||||
|
||||
private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map<String, AlignmentContext> contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
|
||||
|
|
@ -722,17 +739,45 @@ public class UnifiedGenotyperEngine {
|
|||
return GGAmodel;
|
||||
}
|
||||
|
||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
|
||||
/**
|
||||
* Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
|
||||
* where Pr(AC=i) = theta/i where theta is heterozygosity
|
||||
* @param N Number of chromosomes
|
||||
* @param priors (output) array to be filled with priors
|
||||
* @param heterozygosity default heterozygosity to use, if inputPriors is empty
|
||||
* @param inputPriors Input priors to use (in which case heterozygosity is ignored)
|
||||
*/
|
||||
public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List<Double> inputPriors) {
|
||||
|
||||
|
||||
double sum = 0.0;
|
||||
|
||||
// for each i
|
||||
for (int i = 1; i <= N; i++) {
|
||||
final double value = theta / (double)i;
|
||||
priors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
if (!inputPriors.isEmpty()) {
|
||||
// user-specified priors
|
||||
if (inputPriors.size() != N)
|
||||
throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
|
||||
|
||||
int idx = 1;
|
||||
for (final double prior: inputPriors) {
|
||||
if (prior < 0.0)
|
||||
throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
|
||||
priors[idx++] = Math.log10(prior);
|
||||
sum += prior;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// for each i
|
||||
for (int i = 1; i <= N; i++) {
|
||||
final double value = heterozygosity / (double)i;
|
||||
priors[i] = Math.log10(value);
|
||||
sum += value;
|
||||
}
|
||||
}
|
||||
|
||||
// protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
|
||||
if (sum > 1.0) {
|
||||
throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
|
||||
}
|
||||
// null frequency for AF=0 is (1 - sum(all other frequencies))
|
||||
priors[0] = Math.log10(1.0 - sum);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ public class AFCalcFactory {
|
|||
/** original biallelic exact model, for testing only */
|
||||
EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2),
|
||||
|
||||
/** implementation that supports any sample ploidy */
|
||||
/** implementation that supports any sample ploidy. Currently not available for the HaplotypeCaller */
|
||||
EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1);
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -111,7 +112,7 @@ public class AFCalcTestBuilder {
|
|||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001, new ArrayList<Double>());
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
|
|
|
|||
|
|
@ -52,18 +52,22 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -73,30 +77,38 @@ import java.util.*;
|
|||
*/
|
||||
|
||||
public class DeBruijnAssembler extends LocalAssemblyEngine {
|
||||
private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class);
|
||||
|
||||
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
|
||||
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
|
||||
private static final byte MIN_QUALITY = (byte) 16;
|
||||
|
||||
// TODO -- this number is very low, and limits our ability to explore low-frequency variants. It should
|
||||
// TODO -- be increased to a large number or eliminated altogether when moving to the bubble caller where
|
||||
// TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases
|
||||
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 25;
|
||||
private static final int GRAPH_KMER_STEP = 6;
|
||||
|
||||
// Smith-Waterman parameters originally copied from IndelRealigner, only used during GGA mode
|
||||
private static final double SW_MATCH = 5.0; // 1.0;
|
||||
private static final double SW_MISMATCH = -10.0; //-1.0/3.0;
|
||||
private static final double SW_GAP = -22.0; //-1.0-1.0/3.0;
|
||||
private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;
|
||||
private final boolean debug;
|
||||
private final boolean debugGraphTransformations;
|
||||
private final int minKmer;
|
||||
private final boolean allowCyclesInKmerGraphToGeneratePaths;
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final PrintStream GRAPH_WRITER;
|
||||
private final List<DeBruijnAssemblyGraph> graphs = new ArrayList<DeBruijnAssemblyGraph>();
|
||||
private final int MIN_KMER;
|
||||
private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms;
|
||||
|
||||
private int PRUNE_FACTOR = 2;
|
||||
|
||||
public DeBruijnAssembler(final boolean debug, final PrintStream graphWriter, final int minKmer) {
|
||||
|
||||
protected DeBruijnAssembler() {
|
||||
this(false, -1, 11, false);
|
||||
}
|
||||
|
||||
public DeBruijnAssembler(final boolean debug,
|
||||
final int debugGraphTransformations,
|
||||
final int minKmer,
|
||||
final boolean allowCyclesInKmerGraphToGeneratePaths) {
|
||||
super();
|
||||
DEBUG = debug;
|
||||
GRAPH_WRITER = graphWriter;
|
||||
MIN_KMER = minKmer;
|
||||
this.debug = debug;
|
||||
this.debugGraphTransformations = debugGraphTransformations > 0;
|
||||
this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = debugGraphTransformations;
|
||||
this.minKmer = minKmer;
|
||||
this.allowCyclesInKmerGraphToGeneratePaths = allowCyclesInKmerGraphToGeneratePaths;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -105,150 +117,119 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
* @param refHaplotype reference haplotype object
|
||||
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
|
||||
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
|
||||
* @param PRUNE_FACTOR prune kmers from the graph if their weight is <= this value
|
||||
* @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
|
||||
* @return a non-empty list of all the haplotypes that are produced during assembly
|
||||
*/
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List<VariantContext> activeAllelesToGenotype ) {
|
||||
public List<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype ) {
|
||||
if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
|
||||
if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
|
||||
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
|
||||
if( PRUNE_FACTOR < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// set the pruning factor for this run of the assembly engine
|
||||
this.PRUNE_FACTOR = PRUNE_FACTOR;
|
||||
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
||||
// create the graphs
|
||||
createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
final List<SeqGraph> graphs = createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
|
||||
// print the graphs if the appropriate debug option has been turned on
|
||||
if( GRAPH_WRITER != null ) {
|
||||
printGraphs();
|
||||
if( graphWriter != null ) {
|
||||
printGraphs(graphs);
|
||||
}
|
||||
|
||||
// find the best paths in the graphs and return them as haplotypes
|
||||
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
|
||||
return findBestPaths( graphs, refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
|
||||
}
|
||||
|
||||
@Requires({"reads != null", "refHaplotype != null"})
|
||||
protected void createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
|
||||
graphs.clear();
|
||||
protected List<SeqGraph> createDeBruijnGraphs( final List<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
|
||||
final List<SeqGraph> graphs = new LinkedList<SeqGraph>();
|
||||
|
||||
final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1;
|
||||
if( maxKmer < MIN_KMER ) { return; } // Reads are too small for assembly so don't try to create any assembly graphs
|
||||
|
||||
if( maxKmer < minKmer) {
|
||||
// Reads are too small for assembly so don't try to create any assembly graphs
|
||||
return Collections.emptyList();
|
||||
}
|
||||
// create the graph for each possible kmer
|
||||
for( int kmer = maxKmer; kmer >= MIN_KMER; kmer -= GRAPH_KMER_STEP ) {
|
||||
final DeBruijnAssemblyGraph graph = createGraphFromSequences( reads, kmer, refHaplotype, DEBUG );
|
||||
for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) {
|
||||
if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms)
|
||||
continue;
|
||||
|
||||
if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads");
|
||||
DeBruijnGraph graph = createGraphFromSequences( reads, kmer, refHaplotype);
|
||||
if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object
|
||||
// do a series of steps to clean up the raw assembly graph to make it analysis-ready
|
||||
pruneGraph(graph, PRUNE_FACTOR);
|
||||
cleanNonRefPaths(graph);
|
||||
mergeNodes(graph);
|
||||
if( graph.getReferenceSourceVertex() != null ) { // if the graph contains interesting variation from the reference
|
||||
sanityCheckReferenceGraph(graph, refHaplotype);
|
||||
graphs.add(graph);
|
||||
if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor);
|
||||
|
||||
if ( shouldErrorCorrectKmers() ) {
|
||||
throw new UserException("Error correction no longer supported because of the " +
|
||||
"incredibly naive way this was implemented. The command line argument remains because some" +
|
||||
" future subsystem will actually go and error correct the reads");
|
||||
}
|
||||
|
||||
final SeqGraph seqGraph = toSeqGraph(graph);
|
||||
|
||||
if ( seqGraph != null ) { // if the graph contains interesting variation from the reference
|
||||
sanityCheckReferenceGraph(seqGraph, refHaplotype);
|
||||
graphs.add(seqGraph);
|
||||
|
||||
if ( debugGraphTransformations ) // we only want to use one graph size
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return graphs;
|
||||
}
|
||||
|
||||
@Requires({"graph != null"})
|
||||
protected static void mergeNodes( final DeBruijnAssemblyGraph graph ) {
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
private SeqGraph toSeqGraph(final DeBruijnGraph deBruijnGraph) {
|
||||
final SeqGraph seqGraph = deBruijnGraph.convertToSequenceGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.1.dot"), pruneFactor);
|
||||
|
||||
for( final DeBruijnEdge e : graph.edgeSet() ) {
|
||||
final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e);
|
||||
final DeBruijnVertex incomingVertex = graph.getEdgeSource(e);
|
||||
if( !outgoingVertex.equals(incomingVertex) && graph.outDegreeOf(incomingVertex) == 1 && graph.inDegreeOf(outgoingVertex) == 1 &&
|
||||
graph.inDegreeOf(incomingVertex) <= 1 && graph.outDegreeOf(outgoingVertex) <= 1 && graph.isReferenceNode(incomingVertex) == graph.isReferenceNode(outgoingVertex) ) {
|
||||
final Set<DeBruijnEdge> outEdges = graph.outgoingEdgesOf(outgoingVertex);
|
||||
final Set<DeBruijnEdge> inEdges = graph.incomingEdgesOf(incomingVertex);
|
||||
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
} else if( inEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
} else if( outEdges.size() == 1 ) {
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
}
|
||||
// TODO -- we need to come up with a consistent pruning algorithm. The current pruning algorithm
|
||||
// TODO -- works well but it doesn't differentiate between an isolated chain that doesn't connect
|
||||
// TODO -- to anything from one that actually has good support along the chain but just happens
|
||||
// TODO -- to have a connection in the middle that has weight of < pruneFactor. Ultimately
|
||||
// TODO -- the pruning algorithm really should be an error correction algorithm that knows more
|
||||
// TODO -- about the structure of the data and can differentiate between an infrequent path but
|
||||
// TODO -- without evidence against it (such as occurs when a region is hard to get any reads through)
|
||||
// TODO -- from an error with lots of weight going along another similar path
|
||||
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
|
||||
seqGraph.zipLinearChains();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.2.zipped.dot"), pruneFactor);
|
||||
|
||||
final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer );
|
||||
graph.addVertex(addedVertex);
|
||||
for( final DeBruijnEdge edge : outEdges ) {
|
||||
graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.isRef(), edge.getMultiplicity()));
|
||||
}
|
||||
for( final DeBruijnEdge edge : inEdges ) {
|
||||
graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.isRef(), edge.getMultiplicity()));
|
||||
}
|
||||
// now go through and prune the graph, removing vertices no longer connected to the reference chain
|
||||
// IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight
|
||||
// edges to maintain graph connectivity.
|
||||
seqGraph.pruneGraph(pruneFactor);
|
||||
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();
|
||||
|
||||
graph.removeVertex( incomingVertex );
|
||||
graph.removeVertex( outgoingVertex );
|
||||
foundNodesToMerge = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.3.pruned.dot"), pruneFactor);
|
||||
seqGraph.simplifyGraph();
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.4.merged.dot"), pruneFactor);
|
||||
|
||||
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
|
||||
// happen in cases where for example the reference somehow manages to acquire a cycle, or
|
||||
// where the entire assembly collapses back into the reference sequence.
|
||||
if ( seqGraph.getReferenceSourceVertex() == null || seqGraph.getReferenceSinkVertex() == null )
|
||||
return null;
|
||||
|
||||
seqGraph.removePathsNotConnectedToRef();
|
||||
seqGraph.simplifyGraph();
|
||||
if ( seqGraph.vertexSet().size() == 1 ) {
|
||||
// we've perfectly assembled into a single reference haplotype, add an empty seq vertex to stop
|
||||
// the code from blowing up.
|
||||
// TODO -- ref properties should really be on the vertices, not the graph itself
|
||||
final SeqVertex complete = seqGraph.vertexSet().iterator().next();
|
||||
final SeqVertex dummy = new SeqVertex("");
|
||||
seqGraph.addVertex(dummy);
|
||||
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
|
||||
}
|
||||
if ( debugGraphTransformations ) seqGraph.printGraph(new File("sequenceGraph.5.final.dot"), pruneFactor);
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
|
||||
protected static void cleanNonRefPaths( final DeBruijnAssemblyGraph graph ) {
|
||||
if( graph.getReferenceSourceVertex() == null || graph.getReferenceSinkVertex() == null ) {
|
||||
return;
|
||||
}
|
||||
// Remove non-ref edges connected before and after the reference path
|
||||
final Set<DeBruijnEdge> edgesToCheck = new HashSet<DeBruijnEdge>();
|
||||
edgesToCheck.addAll(graph.incomingEdgesOf(graph.getReferenceSourceVertex()));
|
||||
while( !edgesToCheck.isEmpty() ) {
|
||||
final DeBruijnEdge e = edgesToCheck.iterator().next();
|
||||
if( !e.isRef() ) {
|
||||
edgesToCheck.addAll( graph.incomingEdgesOf(graph.getEdgeSource(e)) );
|
||||
graph.removeEdge(e);
|
||||
}
|
||||
edgesToCheck.remove(e);
|
||||
}
|
||||
edgesToCheck.addAll(graph.outgoingEdgesOf(graph.getReferenceSinkVertex()));
|
||||
while( !edgesToCheck.isEmpty() ) {
|
||||
final DeBruijnEdge e = edgesToCheck.iterator().next();
|
||||
if( !e.isRef() ) {
|
||||
edgesToCheck.addAll( graph.outgoingEdgesOf(graph.getEdgeTarget(e)) );
|
||||
graph.removeEdge(e);
|
||||
}
|
||||
edgesToCheck.remove(e);
|
||||
}
|
||||
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
graph.removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
protected static void pruneGraph( final DeBruijnAssemblyGraph graph, final int pruneFactor ) {
|
||||
final List<DeBruijnEdge> edgesToRemove = new ArrayList<DeBruijnEdge>();
|
||||
for( final DeBruijnEdge e : graph.edgeSet() ) {
|
||||
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
|
||||
edgesToRemove.add(e);
|
||||
}
|
||||
}
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
graph.removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
protected static void sanityCheckReferenceGraph(final DeBruijnAssemblyGraph graph, final Haplotype refHaplotype) {
|
||||
protected <T extends BaseVertex> void sanityCheckReferenceGraph(final BaseGraph<T> graph, final Haplotype refHaplotype) {
|
||||
if( graph.getReferenceSourceVertex() == null ) {
|
||||
throw new IllegalStateException("All reference graphs must have a reference source vertex.");
|
||||
}
|
||||
|
|
@ -263,86 +244,131 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
@Requires({"reads != null", "KMER_LENGTH > 0", "refHaplotype != null"})
|
||||
protected static DeBruijnAssemblyGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
|
||||
|
||||
final DeBruijnAssemblyGraph graph = new DeBruijnAssemblyGraph();
|
||||
@Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"})
|
||||
protected DeBruijnGraph createGraphFromSequences( final List<GATKSAMRecord> reads, final int kmerLength, final Haplotype refHaplotype ) {
|
||||
final DeBruijnGraph graph = new DeBruijnGraph(kmerLength);
|
||||
final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph);
|
||||
|
||||
// First pull kmers from the reference haplotype and add them to the graph
|
||||
final byte[] refSequence = refHaplotype.getBases();
|
||||
if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
if( !graph.addKmersToGraph(Arrays.copyOfRange(refSequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(refSequence, iii + 1, iii + 1 + KMER_LENGTH), true) ) {
|
||||
if( DEBUG ) {
|
||||
System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping");
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) )
|
||||
// something went wrong, so abort right now with a null graph
|
||||
return null;
|
||||
|
||||
// now go through the graph already seeded with the reference sequence and add the read kmers to it
|
||||
if ( ! addReadKmersToGraph(builder, reads) )
|
||||
// some problem was detected adding the reads to the graph, return null to indicate we failed
|
||||
return null;
|
||||
|
||||
graph.cleanNonRefPaths();
|
||||
return graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the high-quality kmers from the reads to the graph
|
||||
*
|
||||
* @param builder a debruijn graph builder to add the read kmers to
|
||||
* @param reads a non-null list of reads whose kmers we want to add to the graph
|
||||
* @return true if we successfully added the read kmers to the graph without corrupting it in some way
|
||||
*/
|
||||
protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List<GATKSAMRecord> reads) {
|
||||
final int kmerLength = builder.getKmerSize();
|
||||
|
||||
// Next pull kmers out of every read and throw them on the graph
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
|
||||
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
// if the qualities of all the bases in the kmers are high enough
|
||||
boolean badKmer = false;
|
||||
for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) {
|
||||
if( qualities[jjj] < MIN_QUALITY ) {
|
||||
badKmer = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( !badKmer ) {
|
||||
final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
|
||||
if ( sequence.length > kmerLength + KMER_OVERLAP ) {
|
||||
int lastGood = -1; // the index of the last good base we've seen
|
||||
for( int end = 0; end < sequence.length; end++ ) {
|
||||
if ( qualities[end] < minBaseQualityToUseInAssembly ) {
|
||||
lastGood = -1; // reset the last good base
|
||||
} else if ( lastGood == -1 ) {
|
||||
lastGood = end; // we're at a good base, the last good one is us
|
||||
} else if ( end - kmerLength >= lastGood ) {
|
||||
// end - kmerLength (the start) is at or after the lastGood base, so that kmer is good
|
||||
final int start = end - kmerLength;
|
||||
// how many observations of this kmer have we seen? A normal read counts for 1, but
|
||||
// a reduced read might imply a higher multiplicity for the edge
|
||||
int countNumber = 1;
|
||||
if( read.isReducedRead() ) {
|
||||
if ( read.isReducedRead() ) {
|
||||
// compute max number of reduced read counts in current kmer span
|
||||
// precise rounding can make a difference with low consensus counts
|
||||
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, iii, iii + KMER_LENGTH));
|
||||
// TODO -- optimization: should extend arrayMax function to take start stop values
|
||||
countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end));
|
||||
}
|
||||
|
||||
final byte[] kmer1 = Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH);
|
||||
final byte[] kmer2 = Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH);
|
||||
|
||||
for( int kkk=0; kkk < countNumber; kkk++ ) {
|
||||
graph.addKmersToGraph(kmer1, kmer2, false);
|
||||
}
|
||||
builder.addKmerPairFromSeqToGraph(sequence, start, countNumber);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return graph;
|
||||
|
||||
builder.flushKmersToGraph(false);
|
||||
|
||||
// always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way
|
||||
return true;
|
||||
}
|
||||
|
||||
protected void printGraphs() {
|
||||
GRAPH_WRITER.println("digraph assemblyGraphs {");
|
||||
for( final DeBruijnAssemblyGraph graph : graphs ) {
|
||||
for( final DeBruijnEdge edge : graph.edgeSet() ) {
|
||||
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
|
||||
}
|
||||
if( edge.isRef() ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
if( !edge.isRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
|
||||
}
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + new String(graph.getAdditionalSequence(v)) + "\"]");
|
||||
}
|
||||
/**
|
||||
* Add the kmers from the reference sequence to the DeBruijnGraph
|
||||
*
|
||||
* @param builder the graph to add the reference kmers to. Must be empty
|
||||
* @param refSequence the reference sequence from which we'll get our kmers
|
||||
* @return true if we succeeded in creating a good graph from the reference sequence, false otherwise
|
||||
*/
|
||||
protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) {
|
||||
if ( builder == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( builder.getGraph().vertexSet().size() != 0 )
|
||||
throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph());
|
||||
if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null");
|
||||
|
||||
final int kmerLength = builder.getKmerSize();
|
||||
if( refSequence.length < kmerLength + KMER_OVERLAP ) {
|
||||
// not enough reference sequence to build a kmer graph of this length, return false
|
||||
return false;
|
||||
}
|
||||
GRAPH_WRITER.println("}");
|
||||
|
||||
final int kmersInSequence = refSequence.length - kmerLength + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
builder.addKmerPairFromSeqToGraph(refSequence, iii, 1);
|
||||
}
|
||||
builder.flushKmersToGraph(true);
|
||||
|
||||
// we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices
|
||||
if ( builder.getGraph().vertexSet().size() != kmersInSequence ) {
|
||||
if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected void printGraphs(final List<SeqGraph> graphs) {
|
||||
final int writeFirstGraphWithSizeSmallerThan = 50;
|
||||
|
||||
graphWriter.println("digraph assemblyGraphs {");
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
if ( debugGraphTransformations && graph.getKmerSize() >= writeFirstGraphWithSizeSmallerThan ) {
|
||||
logger.info("Skipping writing of graph with kmersize " + graph.getKmerSize());
|
||||
continue;
|
||||
}
|
||||
|
||||
graph.printGraph(graphWriter, false, pruneFactor);
|
||||
|
||||
if ( debugGraphTransformations )
|
||||
break;
|
||||
}
|
||||
|
||||
graphWriter.println("}");
|
||||
}
|
||||
|
||||
@Requires({"refWithPadding.length > refHaplotype.getBases().length", "refLoc.containsP(activeRegionWindow)"})
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
private List<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
|
||||
private List<Haplotype> findBestPaths( final List<SeqGraph> graphs, final Haplotype refHaplotype, final byte[] refWithPadding, final GenomeLoc refLoc, final List<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
|
||||
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
// TODO -- this use of an array with contains() below may be a performance problem, resulting in an O(N^2) algorithm
|
||||
final List<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();
|
||||
refHaplotype.setAlignmentStartHapwrtRef(activeRegionWindow.getStart() - refLoc.getStart());
|
||||
final Cigar c = new Cigar();
|
||||
|
|
@ -361,8 +387,14 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
for( final DeBruijnAssemblyGraph graph : graphs ) {
|
||||
for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
|
||||
for( final SeqGraph graph : graphs ) {
|
||||
final SeqVertex source = graph.getReferenceSourceVertex();
|
||||
final SeqVertex sink = graph.getReferenceSinkVertex();
|
||||
if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph);
|
||||
|
||||
final KBestPaths<SeqVertex> pathFinder = new KBestPaths<SeqVertex>(allowCyclesInKmerGraphToGeneratePaths);
|
||||
for ( final Path<SeqVertex> path : pathFinder.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH, source, sink) ) {
|
||||
// logger.info("Found path " + path);
|
||||
Haplotype h = new Haplotype( path.getBases() );
|
||||
if( !returnHaplotypes.contains(h) ) {
|
||||
final Cigar cigar = path.calculateCigar();
|
||||
|
|
@ -383,12 +415,16 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
if( !returnHaplotypes.contains(h) ) {
|
||||
h.setAlignmentStartHapwrtRef(activeRegionStart);
|
||||
h.setCigar( leftAlignedCigar );
|
||||
h.setCigar(leftAlignedCigar);
|
||||
h.setScore(path.getScore());
|
||||
returnHaplotypes.add(h);
|
||||
|
||||
if ( debug )
|
||||
logger.info("Adding haplotype " + h.getCigar() + " from debruijn graph with kmer " + graph.getKmerSize());
|
||||
|
||||
// for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
if( !activeAllelesToGenotype.isEmpty() ) {
|
||||
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), refWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
final Map<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h, refWithPadding, refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
|
||||
final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
|
||||
|
||||
|
|
@ -409,17 +445,24 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) {
|
||||
// add genome locs to the haplotypes
|
||||
for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);
|
||||
|
||||
if ( returnHaplotypes.size() < returnHaplotypes.size() )
|
||||
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
|
||||
|
||||
if( debug ) {
|
||||
if( returnHaplotypes.size() > 1 ) {
|
||||
System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
|
||||
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
|
||||
} else {
|
||||
System.out.println("Found only the reference haplotype in the assembly graph.");
|
||||
logger.info("Found only the reference haplotype in the assembly graph.");
|
||||
}
|
||||
for( final Haplotype h : returnHaplotypes ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() );
|
||||
logger.info( h.toString() );
|
||||
logger.info( "> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() );
|
||||
}
|
||||
}
|
||||
|
||||
return returnHaplotypes;
|
||||
}
|
||||
|
||||
|
|
@ -430,7 +473,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
* @param refWithPadding the full reference byte array with padding which encompasses the active region
|
||||
* @return a haplotype fully extended to encompass the active region
|
||||
*/
|
||||
@Requires({"haplotype != null", "activeRegionStart > 0", "refWithPadding != null", "refWithPadding.length > 0"})
|
||||
@Requires({"haplotype != null", "activeRegionStart >= 0", "refWithPadding != null", "refWithPadding.length > 0"})
|
||||
@Ensures({"result != null", "result.getCigar() != null"})
|
||||
private Haplotype extendPartialHaplotype( final Haplotype haplotype, final int activeRegionStart, final byte[] refWithPadding ) {
|
||||
final Cigar cigar = haplotype.getCigar();
|
||||
|
|
@ -438,7 +481,8 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
byte[] newHaplotypeBases = haplotype.getBases();
|
||||
int refPos = activeRegionStart;
|
||||
int hapPos = 0;
|
||||
for( CigarElement ce : cigar.getCigarElements() ) {
|
||||
for( int iii = 0; iii < cigar.getCigarElements().size(); iii++ ) {
|
||||
final CigarElement ce = cigar.getCigarElement(iii);
|
||||
switch (ce.getOperator()) {
|
||||
case M:
|
||||
refPos += ce.getLength();
|
||||
|
|
@ -450,16 +494,17 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
newCigar.add(ce);
|
||||
break;
|
||||
case D:
|
||||
refPos += ce.getLength();
|
||||
newCigar.add(ce);
|
||||
break;
|
||||
case X:
|
||||
newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos),
|
||||
ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()),
|
||||
Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length)));
|
||||
refPos += ce.getLength();
|
||||
hapPos += ce.getLength();
|
||||
newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M));
|
||||
if( iii == 0 || iii == cigar.getCigarElements().size() - 1 ) {
|
||||
newHaplotypeBases = ArrayUtils.addAll( Arrays.copyOfRange(newHaplotypeBases, 0, hapPos),
|
||||
ArrayUtils.addAll(Arrays.copyOfRange(refWithPadding, refPos, refPos + ce.getLength()),
|
||||
Arrays.copyOfRange(newHaplotypeBases, hapPos, newHaplotypeBases.length)));
|
||||
hapPos += ce.getLength();
|
||||
refPos += ce.getLength();
|
||||
newCigar.add(new CigarElement(ce.getLength(), CigarOperator.M));
|
||||
} else {
|
||||
refPos += ce.getLength();
|
||||
newCigar.add(ce);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new IllegalStateException("Unsupported cigar operator detected: " + ce.getOperator());
|
||||
|
|
@ -496,7 +541,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
* @return the left-aligned cigar
|
||||
*/
|
||||
@Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"})
|
||||
protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
|
||||
protected Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
|
||||
final Cigar cigarToReturn = new Cigar();
|
||||
Cigar cigarToAlign = new Cigar();
|
||||
for (int i = 0; i < cigar.numCigarElements(); i++) {
|
||||
|
|
@ -537,7 +582,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
private boolean addHaplotypeForGGA( final Haplotype haplotype, final byte[] ref, final List<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) {
|
||||
if( haplotype == null ) { return false; }
|
||||
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SWParameterSet.STANDARD_NGS );
|
||||
haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
|
||||
|
||||
if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 || swConsensus.getAlignmentStart2wrt1() < 0 ) { // protect against unhelpful haplotype alignments
|
||||
|
|
@ -566,7 +611,7 @@ public class DeBruijnAssembler extends LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
final Haplotype h = new Haplotype( newHaplotypeBases );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SWParameterSet.STANDARD_NGS );
|
||||
|
||||
h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
|
||||
if ( haplotype.isArtificialHaplotype() ) {
|
||||
|
|
|
|||
|
|
@ -1,321 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 2/6/13
|
||||
*/
|
||||
|
||||
public class DeBruijnAssemblyGraph extends DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> {
|
||||
|
||||
public DeBruijnAssemblyGraph() {
|
||||
super(DeBruijnEdge.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph)
|
||||
*/
|
||||
public boolean isReferenceNode( final DeBruijnVertex v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final DeBruijnEdge e : edgesOf(v) ) {
|
||||
if( e.isRef() ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a source node
|
||||
*/
|
||||
public boolean isSource( final DeBruijnVertex v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
return inDegreeOf(v) == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull out the additional sequence implied by traversing this node in the graph
|
||||
* @param v the vertex from which to pull out the additional base sequence
|
||||
* @return non-null byte array
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getAdditionalSequence( final DeBruijnVertex v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); }
|
||||
return ( isSource(v) ? v.getSequence() : v.getSuffix() );
|
||||
}
|
||||
|
||||
/**
|
||||
* @param e the edge to test
|
||||
* @return true if this edge is a reference source edge
|
||||
*/
|
||||
public boolean isRefSource( final DeBruijnEdge e ) {
|
||||
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
|
||||
for( final DeBruijnEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference source
|
||||
*/
|
||||
public boolean isRefSource( final DeBruijnVertex v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param e the edge to test
|
||||
* @return true if this edge is a reference sink edge
|
||||
*/
|
||||
public boolean isRefSink( final DeBruijnEdge e ) {
|
||||
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
|
||||
for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference sink
|
||||
*/
|
||||
public boolean isRefSink( final DeBruijnVertex v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph
|
||||
*/
|
||||
public DeBruijnVertex getReferenceSourceVertex( ) {
|
||||
for( final DeBruijnVertex v : vertexSet() ) {
|
||||
if( isReferenceNode(v) && isRefSource(v) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph
|
||||
*/
|
||||
public DeBruijnVertex getReferenceSinkVertex( ) {
|
||||
for( final DeBruijnVertex v : vertexSet() ) {
|
||||
if( isReferenceNode(v) && isRefSink(v) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse the graph and get the next reference vertex if it exists
|
||||
* @param v the current vertex, can be null
|
||||
* @return the next reference vertex if it exists
|
||||
*/
|
||||
public DeBruijnVertex getNextReferenceVertex( final DeBruijnVertex v ) {
|
||||
if( v == null ) { return null; }
|
||||
for( final DeBruijnEdge edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) {
|
||||
return getEdgeTarget(edgeToTest);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse the graph and get the previous reference vertex if it exists
|
||||
* @param v the current vertex, can be null
|
||||
* @return the previous reference vertex if it exists
|
||||
*/
|
||||
public DeBruijnVertex getPrevReferenceVertex( final DeBruijnVertex v ) {
|
||||
if( v == null ) { return null; }
|
||||
for( final DeBruijnEdge edgeToTest : incomingEdgesOf(v) ) {
|
||||
if( isReferenceNode(getEdgeSource(edgeToTest)) ) {
|
||||
return getEdgeSource(edgeToTest);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does a reference path exist between the two vertices?
|
||||
* @param fromVertex from this vertex, can be null
|
||||
* @param toVertex to this vertex, can be null
|
||||
* @return true if a reference path exists in the graph between the two vertices
|
||||
*/
|
||||
public boolean referencePathExists(final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex) {
|
||||
DeBruijnVertex v = fromVertex;
|
||||
if( v == null ) {
|
||||
return false;
|
||||
}
|
||||
v = getNextReferenceVertex(v);
|
||||
if( v == null ) {
|
||||
return false;
|
||||
}
|
||||
while( !v.equals(toVertex) ) {
|
||||
v = getNextReferenceVertex(v);
|
||||
if( v == null ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Walk along the reference path in the graph and pull out the corresponding bases
|
||||
* @param fromVertex starting vertex
|
||||
* @param toVertex ending vertex
|
||||
* @param includeStart should the starting vertex be included in the path
|
||||
* @param includeStop should the ending vertex be included in the path
|
||||
* @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
|
||||
*/
|
||||
public byte[] getReferenceBytes( final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex, final boolean includeStart, final boolean includeStop ) {
|
||||
if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); }
|
||||
if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); }
|
||||
|
||||
byte[] bytes = null;
|
||||
DeBruijnVertex v = fromVertex;
|
||||
if( includeStart ) {
|
||||
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
|
||||
}
|
||||
v = getNextReferenceVertex(v); // advance along the reference path
|
||||
while( v != null && !v.equals(toVertex) ) {
|
||||
bytes = ArrayUtils.addAll( bytes, getAdditionalSequence(v) );
|
||||
v = getNextReferenceVertex(v); // advance along the reference path
|
||||
}
|
||||
if( includeStop && v != null && v.equals(toVertex)) {
|
||||
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
|
||||
}
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull kmers out of the given long sequence and throw them on in the graph
|
||||
* @param sequence byte array holding the sequence with which to build the assembly graph
|
||||
* @param KMER_LENGTH the desired kmer length to use
|
||||
* @param isRef if true the kmers added to the graph will have reference edges linking them
|
||||
*/
|
||||
public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
|
||||
if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add edge to assembly graph connecting the two kmers
|
||||
* @param kmer1 the source kmer for the edge
|
||||
* @param kmer2 the target kmer for the edge
|
||||
* @param isRef true if the added edge is a reference edge
|
||||
* @return will return false if trying to add a reference edge which creates a cycle in the assembly graph
|
||||
*/
|
||||
public boolean addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef ) {
|
||||
if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
|
||||
if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
|
||||
if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); }
|
||||
|
||||
final int numVertexBefore = vertexSet().size();
|
||||
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length );
|
||||
addVertex(v1);
|
||||
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length );
|
||||
addVertex(v2);
|
||||
if( isRef && vertexSet().size() == numVertexBefore ) { return false; }
|
||||
|
||||
final DeBruijnEdge targetEdge = getEdge(v1, v2);
|
||||
if ( targetEdge == null ) {
|
||||
addEdge(v1, v2, new DeBruijnEdge( isRef ));
|
||||
} else {
|
||||
if( isRef ) {
|
||||
targetEdge.setIsRef( true );
|
||||
}
|
||||
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print out the graph in the dot language for visualization
|
||||
* @param GRAPH_WRITER PrintStream to write to
|
||||
*/
|
||||
public void printGraph( final PrintStream GRAPH_WRITER ) {
|
||||
if( GRAPH_WRITER == null ) { throw new IllegalArgumentException("PrintStream cannot be null."); }
|
||||
|
||||
GRAPH_WRITER.println("digraph assembly {");
|
||||
for( final DeBruijnEdge edge : edgeSet() ) {
|
||||
GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + "label=\""+ edge.getMultiplicity() +"\"" + "];");
|
||||
if( edge.isRef() ) {
|
||||
GRAPH_WRITER.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
}
|
||||
for( final DeBruijnVertex v : vertexSet() ) {
|
||||
final String label = ( inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() );
|
||||
GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]");
|
||||
}
|
||||
GRAPH_WRITER.println("}");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,150 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph;
|
||||
|
||||
/**
|
||||
* Fast approach to building a DeBruijnGraph
|
||||
*
|
||||
* Follows the model:
|
||||
*
|
||||
* for each X that has bases for the final graph:
|
||||
* addKmer pair (single kmer with kmer size + 1 spanning the pair)
|
||||
*
|
||||
* flushKmersToGraph
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 4/7/13
|
||||
* Time: 4:14 PM
|
||||
*/
|
||||
public class DeBruijnGraphBuilder {
|
||||
/** The size of the kmer graph we want to build */
|
||||
private final int kmerSize;
|
||||
|
||||
/** The graph we're going to add kmers to */
|
||||
private final DeBruijnGraph graph;
|
||||
|
||||
/** keeps counts of all kmer pairs added since the last flush */
|
||||
private final KMerCounter counter;
|
||||
|
||||
/**
|
||||
* Create a new builder that will write out kmers to graph
|
||||
*
|
||||
* @param graph a non-null graph that can contain already added kmers
|
||||
*/
|
||||
public DeBruijnGraphBuilder(final DeBruijnGraph graph) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null");
|
||||
this.kmerSize = graph.getKmerSize();
|
||||
this.graph = graph;
|
||||
this.counter = new KMerCounter(kmerSize + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* The graph we're building
|
||||
* @return a non-null graph
|
||||
*/
|
||||
public DeBruijnGraph getGraph() {
|
||||
return graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* The kmer size of our graph
|
||||
* @return positive integer
|
||||
*/
|
||||
public int getKmerSize() {
|
||||
return kmerSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this
|
||||
* graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). The
|
||||
* edge between them is added with isRef and multiplicity
|
||||
*
|
||||
* @param sequence a sequence of bases from which we want to extract a pair of kmers
|
||||
* @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize()
|
||||
* @param multiplicity what's the multiplicity of the edge between these two kmers
|
||||
*/
|
||||
public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) {
|
||||
if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
|
||||
if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start);
|
||||
if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length);
|
||||
final Kmer kmerPair = new Kmer(sequence, start, getKmerSize() + 1);
|
||||
addKmerPair(kmerPair, multiplicity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a single kmer pair to this builder
|
||||
* @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1
|
||||
* will have an edge added to this
|
||||
* @param multiplicity the desired multiplicity of this edge
|
||||
*/
|
||||
public void addKmerPair(final Kmer kmerPair, final int multiplicity) {
|
||||
if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length());
|
||||
counter.addKmer(kmerPair, multiplicity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes the currently added kmers to the graph
|
||||
*
|
||||
* After this function is called the builder is reset to an empty state
|
||||
*
|
||||
* This flushing is expensive, so many kmers should be added to the builder before flushing. The most
|
||||
* efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases)
|
||||
* then and do one flush when completed
|
||||
*
|
||||
* @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges?
|
||||
*/
|
||||
public void flushKmersToGraph(final boolean addRefEdges) {
|
||||
for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) {
|
||||
final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases();
|
||||
final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases();
|
||||
graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount());
|
||||
}
|
||||
counter.clear();
|
||||
}
|
||||
}
|
||||
|
|
@ -48,44 +48,84 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.EventMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.MergeVariantsAcrossHaplotypes;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
private final static Logger logger = Logger.getLogger(GenotypingEngine.class);
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
private final VariantAnnotatorEngine annotationEngine;
|
||||
private final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine, final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ) {
|
||||
public GenotypingEngine( final boolean DEBUG, final VariantAnnotatorEngine annotationEngine,
|
||||
final boolean USE_FILTERED_READ_MAP_FOR_ANNOTATIONS,
|
||||
final MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger) {
|
||||
this.DEBUG = DEBUG;
|
||||
this.annotationEngine = annotationEngine;
|
||||
this.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS = USE_FILTERED_READ_MAP_FOR_ANNOTATIONS;
|
||||
noCall.add(Allele.NO_CALL);
|
||||
this.crossHaplotypeEventMerger = crossHaplotypeEventMerger;
|
||||
}
|
||||
|
||||
/**
|
||||
* Carries the result of a call to #assignGenotypeLikelihoods
|
||||
*/
|
||||
public static class CalledHaplotypes {
|
||||
private final List<VariantContext> calls;
|
||||
private final Set<Haplotype> calledHaplotypes;
|
||||
|
||||
protected CalledHaplotypes(final List<VariantContext> calls, final Set<Haplotype> calledHaplotypes) {
|
||||
if ( calls == null ) throw new IllegalArgumentException("calls cannot be null");
|
||||
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
|
||||
if ( Utils.xor(calls.isEmpty(), calledHaplotypes.isEmpty()) )
|
||||
throw new IllegalArgumentException("Calls and calledHaplotypes should both be empty or both not but got calls=" + calls + " calledHaplotypes=" + calledHaplotypes);
|
||||
this.calls = calls;
|
||||
this.calledHaplotypes = calledHaplotypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of calls made at this location
|
||||
* @return a non-null (but potentially empty) list of calls
|
||||
*/
|
||||
public List<VariantContext> getCalls() {
|
||||
return calls;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of haplotypes that we actually called (i.e., underlying one of the VCs in getCalls().
|
||||
* @return a non-null set of haplotypes
|
||||
*/
|
||||
public Set<Haplotype> getCalledHaplotypes() {
|
||||
return calledHaplotypes;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main entry point of class - given a particular set of haplotypes, samples and reference context, compute
|
||||
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
|
||||
*
|
||||
* The list of samples we're working with is obtained from the haplotypeReadMap
|
||||
*
|
||||
* @param UG_engine UG Engine with basic input parameters
|
||||
* @param haplotypes Haplotypes to assign likelihoods to
|
||||
* @param samples Samples to genotype
|
||||
* @param haplotypeReadMap Map from reads->(haplotypes,likelihoods)
|
||||
* @param perSampleFilteredReadList
|
||||
* @param ref Reference bytes at active region
|
||||
|
|
@ -93,113 +133,40 @@ public class GenotypingEngine {
|
|||
* @param activeRegionWindow Active window
|
||||
* @param genomeLocParser GenomeLocParser
|
||||
* @param activeAllelesToGenotype Alleles to genotype
|
||||
* @return List of VC's with genotyped events
|
||||
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
|
||||
*/
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
@Ensures("result != null")
|
||||
// TODO - can this be refactored? this is hard to follow!
|
||||
public List<VariantContext> assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine,
|
||||
final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
public CalledHaplotypes assignGenotypeLikelihoods( final UnifiedGenotyperEngine UG_engine,
|
||||
final List<Haplotype> haplotypes,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final GenomeLoc activeRegionWindow,
|
||||
final GenomeLocParser genomeLocParser,
|
||||
final List<VariantContext> activeAllelesToGenotype ) {
|
||||
// sanity check input arguments
|
||||
if (UG_engine == null)
|
||||
throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
|
||||
if (haplotypes == null || haplotypes.isEmpty())
|
||||
throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
||||
if (samples == null || samples.isEmpty())
|
||||
throw new IllegalArgumentException("samples input must be non-empty and non-null, got "+samples);
|
||||
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty())
|
||||
throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
|
||||
if (ref == null || ref.length == 0 )
|
||||
throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
|
||||
if (refLoc == null || refLoc.getStop()-refLoc.getStart()+1 != ref.length)
|
||||
throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||
if (activeRegionWindow == null )
|
||||
throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||
if (activeAllelesToGenotype == null )
|
||||
throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
||||
if (genomeLocParser == null )
|
||||
throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
|
||||
if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine);
|
||||
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
|
||||
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap);
|
||||
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got "+ref);
|
||||
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
|
||||
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
|
||||
if (activeAllelesToGenotype == null ) throw new IllegalArgumentException("activeAllelesToGenotype must be non-null, got "+activeAllelesToGenotype);
|
||||
if (genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser must be non-null, got "+genomeLocParser);
|
||||
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
|
||||
int count = 0;
|
||||
if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// Walk along the alignment and turn any difference from the reference into an event
|
||||
h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) );
|
||||
if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
|
||||
if( DEBUG ) {
|
||||
System.out.println( h.toString() );
|
||||
System.out.println( "> Cigar = " + h.getCigar() );
|
||||
System.out.println( ">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( !in_GGA_mode && samples.size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, samples, haplotypeReadMap, startPosKeySet, ref, refLoc );
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
|
||||
}
|
||||
if( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
}
|
||||
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
|
||||
// that carry events among the haplotypes
|
||||
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype);
|
||||
|
||||
// Walk along each position in the key set and create each event to be outputted
|
||||
final Set<Haplotype> calledHaplotypes = new HashSet<Haplotype>();
|
||||
final List<VariantContext> returnCalls = new ArrayList<VariantContext>();
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final List<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
if( !in_GGA_mode ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final Map<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
priorityList.add(vc.getSource());
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
int compCount = 0;
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
if( compVC.getStart() == loc ) {
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
List<Allele> alleleSet = new ArrayList<Allele>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
|
||||
// check if this event is already in the list of events due to a repeat in the input alleles track
|
||||
final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make();
|
||||
boolean alreadyExists = false;
|
||||
for( final VariantContext eventToTest : eventsAtThisLoc ) {
|
||||
if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) {
|
||||
alreadyExists = true;
|
||||
}
|
||||
}
|
||||
if( !alreadyExists ) {
|
||||
priorityList.add(vcSourceName);
|
||||
eventsAtThisLoc.add(candidateEventToAdd);
|
||||
}
|
||||
alleleCount++;
|
||||
}
|
||||
}
|
||||
compCount++;
|
||||
}
|
||||
}
|
||||
final List<VariantContext> eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype);
|
||||
|
||||
if( eventsAtThisLoc.isEmpty() ) { continue; }
|
||||
|
||||
|
|
@ -207,7 +174,7 @@ public class GenotypingEngine {
|
|||
final Map<Event, List<Haplotype>> eventMapper = createEventMapper(loc, eventsAtThisLoc, haplotypes);
|
||||
|
||||
// Sanity check the priority list for mistakes
|
||||
validatePriorityList( priorityList, eventsAtThisLoc );
|
||||
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
|
|
@ -225,75 +192,157 @@ public class GenotypingEngine {
|
|||
final Map<Allele, List<Haplotype>> alleleMapper = createAlleleMapper(mergeMap, eventMapper);
|
||||
|
||||
if( DEBUG ) {
|
||||
System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
//System.out.println("Event/haplotype allele mapping = " + alleleMapper);
|
||||
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
|
||||
}
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );
|
||||
|
||||
final GenotypesContext genotypes = calculateGLsForThisEvent( samples, alleleReadMap, mergedVC );
|
||||
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
|
||||
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
if( call != null ) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
|
||||
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) );
|
||||
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
|
||||
VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, call);
|
||||
|
||||
VariantContext annotatedCall = call;
|
||||
if( annotatedCall.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
|
||||
}
|
||||
|
||||
annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, annotatedCall);
|
||||
|
||||
// maintain the set of all called haplotypes
|
||||
for ( final Allele calledAllele : call.getAlleles() )
|
||||
calledHaplotypes.addAll(alleleMapper.get(calledAllele));
|
||||
|
||||
returnCalls.add( annotatedCall );
|
||||
}
|
||||
}
|
||||
}
|
||||
return returnCalls;
|
||||
return new CalledHaplotypes(returnCalls, calledHaplotypes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Go through the haplotypes we assembled, and decompose them into their constituent variant contexts
|
||||
*
|
||||
* @param haplotypes the list of haplotypes we're working with
|
||||
* @param haplotypeReadMap map from samples -> the per read allele likelihoods
|
||||
* @param ref the reference bases (over the same interval as the haplotypes)
|
||||
* @param refLoc the span of the reference bases
|
||||
* @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode)
|
||||
* @return
|
||||
*/
|
||||
private TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
|
||||
|
||||
// Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
|
||||
final TreeSet<Integer> startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG);
|
||||
|
||||
if ( in_GGA_mode ) startPosKeySet.clear();
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if ( !in_GGA_mode ) {
|
||||
// run the event merger if we're not in GGA mode
|
||||
final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc);
|
||||
if ( mergedAnything )
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
|
||||
}
|
||||
|
||||
if ( in_GGA_mode ) {
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
startPosKeySet.add( compVC.getStart() );
|
||||
}
|
||||
}
|
||||
|
||||
return startPosKeySet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the priority list (just the list of sources for these variant context) used to merge overlapping events into common reference view
|
||||
* @param vcs a list of variant contexts
|
||||
* @return the list of the sources of vcs in the same order
|
||||
*/
|
||||
private List<String> makePriorityList(final List<VariantContext> vcs) {
|
||||
final List<String> priorityList = new LinkedList<String>();
|
||||
for ( final VariantContext vc : vcs ) priorityList.add(vc.getSource());
|
||||
return priorityList;
|
||||
}
|
||||
|
||||
private List<VariantContext> getVCsAtThisLocation(final List<Haplotype> haplotypes,
|
||||
final int loc,
|
||||
final List<VariantContext> activeAllelesToGenotype) {
|
||||
// the overlapping events to merge into a common reference view
|
||||
final List<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final EventMap eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
int compCount = 0;
|
||||
for( final VariantContext compVC : activeAllelesToGenotype ) {
|
||||
if( compVC.getStart() == loc ) {
|
||||
int alleleCount = 0;
|
||||
for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
|
||||
List<Allele> alleleSet = new ArrayList<Allele>(2);
|
||||
alleleSet.add(compVC.getReference());
|
||||
alleleSet.add(compAltAllele);
|
||||
final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount;
|
||||
// check if this event is already in the list of events due to a repeat in the input alleles track
|
||||
final VariantContext candidateEventToAdd = new VariantContextBuilder(compVC).alleles(alleleSet).source(vcSourceName).make();
|
||||
boolean alreadyExists = false;
|
||||
for( final VariantContext eventToTest : eventsAtThisLoc ) {
|
||||
if( eventToTest.hasSameAllelesAs(candidateEventToAdd) ) {
|
||||
alreadyExists = true;
|
||||
}
|
||||
}
|
||||
if( !alreadyExists ) {
|
||||
eventsAtThisLoc.add(candidateEventToAdd);
|
||||
}
|
||||
alleleCount++;
|
||||
}
|
||||
}
|
||||
compCount++;
|
||||
}
|
||||
}
|
||||
|
||||
return eventsAtThisLoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele
|
||||
* @param samples List of samples to genotype
|
||||
* @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods
|
||||
* @param mergedVC Input VC with event to genotype
|
||||
* @return GenotypesContext object wrapping genotype objects with PLs
|
||||
*/
|
||||
@Requires({"samples != null","alleleReadMap!= null", "mergedVC != null"})
|
||||
@Requires({"alleleReadMap!= null", "mergedVC != null"})
|
||||
@Ensures("result != null")
|
||||
private GenotypesContext calculateGLsForThisEvent( final List<String> samples, final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap, final VariantContext mergedVC ) {
|
||||
final GenotypesContext genotypes = GenotypesContext.create(samples.size());
|
||||
private GenotypesContext calculateGLsForThisEvent( final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap, final VariantContext mergedVC ) {
|
||||
final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size());
|
||||
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
|
||||
for( final String sample : samples ) {
|
||||
for( final String sample : alleleReadMap.keySet() ) {
|
||||
final int numHaplotypes = mergedVC.getAlleles().size();
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles());
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
|
||||
}
|
||||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
genotypes.add(new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make());
|
||||
}
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
private void validatePriorityList( final List<String> priorityList, final List<VariantContext> eventsAtThisLoc ) {
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if( !priorityList.contains(vc.getSource()) ) {
|
||||
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
for( final String name : priorityList ) {
|
||||
boolean found = false;
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if(vc.getSource().equals(name)) { found = true; break; }
|
||||
}
|
||||
if( !found ) {
|
||||
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, PerReadAlleleLikelihoodMap> filterToOnlyOverlappingReads( final GenomeLocParser parser,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
|
|
@ -337,10 +386,10 @@ public class GenotypingEngine {
|
|||
protected static void cleanUpSymbolicUnassembledEvents( final List<Haplotype> haplotypes ) {
|
||||
final List<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
for( final VariantContext vc : h.getEventMap().getVariantContexts() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
for( final Haplotype h2 : haplotypes ) {
|
||||
for( final VariantContext vc2 : h2.getEventMap().values() ) {
|
||||
for( final VariantContext vc2 : h2.getEventMap().getVariantContexts() ) {
|
||||
if( vc.getStart() == vc2.getStart() && (vc2.isIndel() || vc2.isMNP()) ) { // unfortunately symbolic alleles can't currently be combined with non-point events
|
||||
haplotypesToRemove.add(h);
|
||||
break;
|
||||
|
|
@ -356,8 +405,7 @@ public class GenotypingEngine {
|
|||
// BUGBUG: ugh, too complicated
|
||||
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final Map<Allele, List<Haplotype>> alleleMapper,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog ) {
|
||||
final double downsamplingFraction ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
|
||||
|
|
@ -374,165 +422,13 @@ public class GenotypingEngine {
|
|||
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
|
||||
}
|
||||
}
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
|
||||
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
return alleleReadMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge pairs of nearby events into single events when the haplotype likelihoods say they co-occur.
* For each pair of consecutive start positions less than MAX_SIZE_TO_COMBINE apart whose non-symbolic
* events all share the same alleles, an R^2 value is computed from per-sample haplotype likelihoods;
* above MERGE_EVENTS_R2_THRESHOLD the two events are replaced by one merged event and the scan restarts.
* TODO - clean me, refactor me!
|
||||
* @param haplotypes haplotypes whose event maps are read and updated in place
|
||||
* @param samples samples whose haplotype likelihoods are accumulated
|
||||
* @param haplotypeReadMap per-sample likelihood maps handed to LikelihoodCalculationEngine
|
||||
* @param startPosKeySet sorted event start positions; updated in place as events are merged
|
||||
* @param ref reference bases, passed on to createMergedVariantContext
|
||||
* @param refLoc location of the reference bases, passed on to createMergedVariantContext
|
||||
*/
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final List<Haplotype> haplotypes,
|
||||
final List<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
|
||||
final TreeSet<Integer> startPosKeySet,
|
||||
final byte[] ref,
|
||||
final GenomeLoc refLoc ) {
|
||||
|
||||
// only pairs of start positions closer together than this are considered for merging
final int MAX_SIZE_TO_COMBINE = 15;
|
||||
// two events are merged only when their R^2 exceeds this value
final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
|
||||
// a pair needs at least two start positions
if( startPosKeySet.size() <= 1 ) { return; }
|
||||
|
||||
// keep making full passes over the start positions until a pass performs no merge
boolean mapWasUpdated = true;
|
||||
while( mapWasUpdated ) {
|
||||
mapWasUpdated = false;
|
||||
|
||||
// loop over the set of start locations and consider pairs that start near each other
|
||||
final Iterator<Integer> iter = startPosKeySet.iterator();
|
||||
int thisStart = iter.next();
|
||||
while( iter.hasNext() ) {
|
||||
final int nextStart = iter.next();
|
||||
if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) {
|
||||
boolean isBiallelic = true;
|
||||
VariantContext thisVC = null;
|
||||
VariantContext nextVC = null;
|
||||
// log10-space accumulators of haplotype likelihood, bucketed by which events the haplotype has:
// x11 = neither event, x12 = only the next event, x21 = only this event, x22 = both events
double x11 = Double.NEGATIVE_INFINITY;
|
||||
double x12 = Double.NEGATIVE_INFINITY;
|
||||
double x21 = Double.NEGATIVE_INFINITY;
|
||||
double x22 = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// only make complex substitutions out of consecutive biallelic sites
|
||||
final VariantContext thisHapVC = h.getEventMap().get(thisStart);
|
||||
if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype
|
||||
if( thisVC == null ) {
|
||||
thisVC = thisHapVC;
|
||||
} else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) {
|
||||
isBiallelic = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
final VariantContext nextHapVC = h.getEventMap().get(nextStart);
|
||||
if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype
|
||||
if( nextVC == null ) {
|
||||
nextVC = nextHapVC;
|
||||
} else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) {
|
||||
isBiallelic = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// count up the co-occurrences of the events for the R^2 calculation
|
||||
for( final String sample : samples ) {
|
||||
// [0][0] entry of the diploid likelihood matrix computed with h as the only allele
// (presumably the likelihood of the h/h genotype for this sample -- TODO confirm)
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( Collections.singleton(sample), haplotypeReadMap, Collections.singletonList(Allele.create(h, true)) )[0][0];
|
||||
// presence is tested with a null check only, so a symbolic event counts as present here
if( thisHapVC == null ) {
|
||||
if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); }
|
||||
else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); }
|
||||
} else {
|
||||
if( nextHapVC == null ) { x21 = MathUtils.approximateLog10SumLog10(x21, haplotypeLikelihood); }
|
||||
else { x22 = MathUtils.approximateLog10SumLog10(x22, haplotypeLikelihood); }
|
||||
}
|
||||
}
|
||||
}
|
||||
// NOTE(review): this continue also skips the thisStart = nextStart update at the bottom of the loop,
// so the following pair is measured from the old thisStart -- confirm this is intended
if( thisVC == null || nextVC == null ) {
|
||||
continue;
|
||||
}
|
||||
if( isBiallelic ) {
|
||||
// the accumulators hold log10 values, so convert back to linear scale for the R^2 calculation
final double R2 = calculateR2LD( Math.pow(10.0, x11), Math.pow(10.0, x12), Math.pow(10.0, x21), Math.pow(10.0, x22) );
|
||||
if( DEBUG ) {
|
||||
System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
|
||||
System.out.println("-- " + thisVC);
|
||||
System.out.println("-- " + nextVC);
|
||||
}
|
||||
if( R2 > MERGE_EVENTS_R2_THRESHOLD ) {
|
||||
|
||||
final VariantContext mergedVC = createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
|
||||
// remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final Map<Integer, VariantContext> eventMap = h.getEventMap();
|
||||
// only haplotypes carrying both events get them replaced by the merged event
if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) {
|
||||
eventMap.remove(thisStart);
|
||||
eventMap.remove(nextStart);
|
||||
eventMap.put(mergedVC.getStart(), mergedVC);
|
||||
}
|
||||
}
|
||||
startPosKeySet.add(mergedVC.getStart());
|
||||
// a start position is dropped from the key set only when no haplotype still has an event there
boolean containsStart = false;
|
||||
boolean containsNext = false;
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final Map<Integer, VariantContext> eventMap = h.getEventMap();
|
||||
if( eventMap.containsKey(thisStart) ) { containsStart = true; }
|
||||
if( eventMap.containsKey(nextStart) ) { containsNext = true; }
|
||||
}
|
||||
if(!containsStart) { startPosKeySet.remove(thisStart); }
|
||||
if(!containsNext) { startPosKeySet.remove(nextStart); }
|
||||
|
||||
if( DEBUG ) { System.out.println("====> " + mergedVC); }
|
||||
mapWasUpdated = true;
|
||||
break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
|
||||
}
|
||||
}
|
||||
}
|
||||
thisStart = nextStart;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// BUGBUG: make this merge function more general
|
||||
protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
final int thisStart = thisVC.getStart();
|
||||
final int nextStart = nextVC.getStart();
|
||||
byte[] refBases = new byte[]{};
|
||||
byte[] altBases = new byte[]{};
|
||||
refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases());
|
||||
altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases());
|
||||
int locus;
|
||||
for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
|
||||
final byte refByte = ref[locus - refLoc.getStart()];
|
||||
refBases = ArrayUtils.add(refBases, refByte);
|
||||
altBases = ArrayUtils.add(altBases, refByte);
|
||||
}
|
||||
refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel
|
||||
altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());
|
||||
|
||||
int iii = 0;
|
||||
if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele
|
||||
while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; }
|
||||
}
|
||||
final List<Allele> mergedAlleles = new ArrayList<Allele>();
|
||||
mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) );
|
||||
mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) );
|
||||
return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make();
|
||||
}
|
||||
|
||||
protected static double calculateR2LD( final double x11, final double x12, final double x21, final double x22 ) {
|
||||
final double total = x11 + x12 + x21 + x22;
|
||||
final double pa1b1 = x11 / total;
|
||||
final double pa1b2 = x12 / total;
|
||||
final double pa2b1 = x21 / total;
|
||||
final double pa1 = pa1b1 + pa1b2;
|
||||
final double pb1 = pa1b1 + pa2b1;
|
||||
return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) );
|
||||
}
|
||||
|
||||
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) {
|
||||
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<Allele, List<Haplotype>>();
|
||||
for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) {
|
||||
|
|
@ -559,8 +455,8 @@ public class GenotypingEngine {
|
|||
alleles.add(h.getArtificialRefAllele());
|
||||
alleles.add(h.getArtificialAltAllele());
|
||||
final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype")
|
||||
.alleles(alleles)
|
||||
.loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() );
|
||||
.alleles(alleles)
|
||||
.loc(refVC.getChr(), refVC.getStart(), refVC.getStart() + h.getArtificialRefAllele().length() - 1).make() );
|
||||
if( eventMapper.containsKey(artificialVC) ) {
|
||||
eventMapper.get(artificialVC).add(h);
|
||||
}
|
||||
|
|
@ -588,6 +484,10 @@ public class GenotypingEngine {
|
|||
if( eventToTest.getKey().equals(new Event(null)) )
|
||||
continue;
|
||||
|
||||
// only try to disambiguate for alleles that have had haplotypes previously assigned above
|
||||
if( eventToTest.getValue().isEmpty() )
|
||||
continue;
|
||||
|
||||
final Haplotype artificialHaplotype = eventToTest.getValue().get(0);
|
||||
if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) {
|
||||
matchingEvent = eventToTest.getKey();
|
||||
|
|
@ -648,6 +548,11 @@ public class GenotypingEngine {
|
|||
return eventAllelesForSample;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd ) {
|
||||
return new EventMap(haplotype, ref, refLoc, sourceNameToAdd);
|
||||
}
|
||||
|
||||
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
|
||||
for( final VariantContext vc : list ) {
|
||||
if( vc.hasSameAllelesAs(vcToTest) ) {
|
||||
|
|
@ -657,91 +562,7 @@ public class GenotypingEngine {
|
|||
return false;
|
||||
}
|
||||
|
||||
protected static Map<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) {
|
||||
final Map<Integer,VariantContext> vcs = new LinkedHashMap<Integer,VariantContext>();
|
||||
|
||||
int refPos = alignmentStartHapwrtRef;
|
||||
if( refPos < 0 ) { return null; } // Protection against SW failures
|
||||
int alignmentPos = 0;
|
||||
|
||||
for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) {
|
||||
final CigarElement ce = cigar.getCigarElement(cigarIndex);
|
||||
final int elementLength = ce.getLength();
|
||||
switch( ce.getOperator() ) {
|
||||
case I:
|
||||
{
|
||||
final List<Allele> insertionAlleles = new ArrayList<Allele>();
|
||||
final int insertionStart = refLoc.getStart() + refPos - 1;
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) ) {
|
||||
insertionAlleles.add( Allele.create(refByte, true) );
|
||||
}
|
||||
if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { // if the insertion isn't completely resolved in the haplotype then make it a symbolic allele
|
||||
insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
|
||||
} else {
|
||||
byte[] insertionBases = new byte[]{};
|
||||
insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base
|
||||
insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ));
|
||||
if( BaseUtils.isAllRegularBases(insertionBases) ) {
|
||||
insertionAlleles.add( Allele.create(insertionBases, false) );
|
||||
}
|
||||
}
|
||||
if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
|
||||
vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
|
||||
}
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case S:
|
||||
{
|
||||
alignmentPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case D:
|
||||
{
|
||||
final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base
|
||||
final List<Allele> deletionAlleles = new ArrayList<Allele>();
|
||||
final int deletionStart = refLoc.getStart() + refPos - 1;
|
||||
final byte refByte = ref[refPos-1];
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
|
||||
deletionAlleles.add( Allele.create(deletionBases, true) );
|
||||
deletionAlleles.add( Allele.create(refByte, false) );
|
||||
vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
|
||||
}
|
||||
refPos += elementLength;
|
||||
break;
|
||||
}
|
||||
case M:
|
||||
case EQ:
|
||||
case X:
|
||||
{
|
||||
for( int iii = 0; iii < elementLength; iii++ ) {
|
||||
final byte refByte = ref[refPos];
|
||||
final byte altByte = alignment[alignmentPos];
|
||||
if( refByte != altByte ) { // SNP!
|
||||
if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) {
|
||||
final List<Allele> snpAlleles = new ArrayList<Allele>();
|
||||
snpAlleles.add( Allele.create( refByte, true ) );
|
||||
snpAlleles.add( Allele.create( altByte, false ) );
|
||||
vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make());
|
||||
}
|
||||
}
|
||||
refPos++;
|
||||
alignmentPos++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case N:
|
||||
case H:
|
||||
case P:
|
||||
default:
|
||||
throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
|
||||
}
|
||||
}
|
||||
return vcs;
|
||||
}
|
||||
|
||||
private static class Event {
|
||||
protected static class Event {
|
||||
public VariantContext vc;
|
||||
|
||||
public Event( final VariantContext vc ) {
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
|
|
@ -56,7 +55,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -67,27 +67,30 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalcul
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileState;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.*;
|
||||
import org.broadinstitute.sting.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -96,17 +99,17 @@ import java.util.*;
|
|||
/**
|
||||
* Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* Input bam file(s) from which to make calls
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* VCF file with raw, unrecalibrated SNP and indel calls.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
|
|
@ -120,7 +123,7 @@ import java.util.*;
|
|||
* -o output.raw.snps.indels.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h2>Caveats</h2>
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.</li>
|
||||
* </ul>
|
||||
|
|
@ -132,33 +135,60 @@ import java.util.*;
|
|||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
|
||||
@ActiveRegionTraversalParameters(extension=85, maxRegion=300)
|
||||
@ActiveRegionTraversalParameters(extension=200, maxRegion=300)
|
||||
@ReadFilters({HCMappingQualityFilter.class})
|
||||
@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250)
|
||||
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
|
||||
|
||||
/**
|
||||
* A raw, unfiltered, highly sensitive callset in VCF format.
|
||||
*/
|
||||
@Output(doc="File to which variants should be written", required = true)
|
||||
@Output(doc="File to which variants should be written")
|
||||
protected VariantContextWriter vcfWriter = null;
|
||||
|
||||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
|
||||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false)
|
||||
protected PrintStream graphWriter = null;
|
||||
|
||||
/**
|
||||
* The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here
|
||||
* does not include uninformative reads so that not every input read is emitted to the bam.
|
||||
* The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only.
|
||||
* Note that the output here does not include uninformative reads so that not every input read is emitted to the bam.
|
||||
*
|
||||
* Turning on this mode may result in serious performance cost for the HC. It's really only appropriate to
|
||||
* use in specific areas where you want to better understand why the HC is making specific calls.
|
||||
*
|
||||
* The reads are written out containing a HC tag (integer) that encodes which haplotype each read best matches
|
||||
* according to the haplotype caller's likelihood calculation. The use of this tag is primarily intended
|
||||
* to allow good coloring of reads in IGV. Simply go to Color Alignments By > Tag and enter HC to more
|
||||
* easily see which reads go with these haplotypes.
|
||||
*
|
||||
* Note that the haplotypes (called or all, depending on mode) are emitted as single reads covering the entire
|
||||
* active region, coming from read HC and a special read group.
|
||||
*
|
||||
* Note that only reads that are actually informative about the haplotypes are emitted. By informative we mean
|
||||
* that there's a meaningful difference in the likelihood of the read coming from one haplotype compared to
|
||||
* its next best haplotype.
|
||||
*
|
||||
* The best way to visualize the output of this mode is with IGV. Tell IGV to color the alignments by tag,
|
||||
* and give it the HC tag, so you can see which reads support each haplotype. Finally, you can tell IGV
|
||||
* to group by sample, which will separate the potential haplotypes from the reads. All of this can be seen
|
||||
* in the following screenshot: https://www.dropbox.com/s/xvy7sbxpf13x5bp/haplotypecaller%20bamout%20for%20docs.png
|
||||
*
|
||||
*/
|
||||
@Hidden
|
||||
@Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false)
|
||||
@Advanced
|
||||
@Output(fullName="bamOutput", shortName="bamout", doc="File to which assembled haplotypes should be written", required = false, defaultToStdout = false)
|
||||
protected StingSAMFileWriter bamWriter = null;
|
||||
private SAMFileHeader bamHeader = null;
|
||||
private long uniqueNameCounter = 1;
|
||||
private final static String readGroupId = "ArtificialHaplotype";
|
||||
private HaplotypeBAMWriter haplotypeBAMWriter;
|
||||
|
||||
/**
|
||||
* The type of BAM output we want to see.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="bamWriterType", shortName="bamWriterType", doc="How should haplotypes be written to the BAM?", required = false)
|
||||
public HaplotypeBAMWriter.Type bamWriterType = HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES;
|
||||
|
||||
/**
|
||||
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
|
||||
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
|
||||
|
||||
|
|
@ -166,8 +196,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
|
||||
protected String keepRG = null;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 2;
|
||||
protected int MIN_PRUNE_FACTOR = 0;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false)
|
||||
|
|
@ -175,7 +206,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
|
||||
@Advanced
|
||||
@Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false)
|
||||
protected int maxNumHaplotypesInPopulation = 13;
|
||||
protected int maxNumHaplotypesInPopulation = 25;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false)
|
||||
|
|
@ -188,9 +219,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* the mates contig and alignment start. If this flag is provided the haplotype caller will see such reads,
|
||||
* and may make use of them in assembly and calling, where possible.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName="includeUmappedReads", shortName="unmapped", doc="If provided, unmapped reads with chromosomal coordinates (i.e., those placed to their maps) will be included in the assembly and calling", required = false)
|
||||
protected boolean includeUnmappedReads = false;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
|
||||
protected boolean USE_ALLELES_TRIGGER = false;
|
||||
|
||||
|
|
@ -202,6 +235,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@Argument(fullName="justDetermineActiveRegions", shortName="justDetermineActiveRegions", doc = "If specified, the HC won't actually do any assembly or calling, it'll just run the upfront active region determination code. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean justDetermineActiveRegions = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="dontGenotype", shortName="dontGenotype", doc = "If specified, the HC will do any assembly but won't do calling. Useful for benchmarking and scalability testing", required=false)
|
||||
protected boolean dontGenotype = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="errorCorrectKmers", shortName="errorCorrectKmers", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. May cause fundamental problems with the assembly graph itself", required=false)
|
||||
protected boolean errorCorrectKmers = false;
|
||||
|
||||
/**
|
||||
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
|
||||
* dbSNP is not used in any way for the calculations themselves.
|
||||
|
|
@ -216,6 +257,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* Records that are filtered in the comp track will be ignored.
|
||||
* Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
|
||||
*/
|
||||
@Advanced
|
||||
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
|
||||
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
|
||||
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }
|
||||
|
|
@ -228,6 +270,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
/**
|
||||
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
|
||||
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));
|
||||
|
||||
|
|
@ -235,9 +278,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
|
||||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false)
|
||||
protected boolean mergeVariantsViaLD = false;
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
*/
|
||||
|
|
@ -247,9 +295,27 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
@ArgumentCollection
|
||||
private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
|
||||
protected boolean DEBUG;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false)
|
||||
protected int debugGraphTransformations = -1;
|
||||
|
||||
@Hidden // TODO -- not currently useful
|
||||
@Argument(fullName="useLowQualityBasesForAssembly", shortName="useLowQualityBasesForAssembly", doc="If specified, we will include low quality bases when doing the assembly", required = false)
|
||||
protected boolean useLowQualityBasesForAssembly = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="dontTrimActiveRegions", shortName="dontTrimActiveRegions", doc="If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping", required = false)
|
||||
protected boolean dontTrimActiveRegions = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false)
|
||||
protected boolean allowCyclesInKmerGraphToGeneratePaths = false;
|
||||
|
||||
|
||||
// the UG engines
|
||||
private UnifiedGenotyperEngine UG_engine = null;
|
||||
private UnifiedGenotyperEngine UG_engine_simple_genotyper = null;
|
||||
|
|
@ -271,6 +337,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 500;
|
||||
|
||||
// include at least this many bases around an event for calling it
|
||||
private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20;
|
||||
private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150;
|
||||
|
||||
// the maximum extent into the full active region extension that we're willing to go in genotyping our events
|
||||
private final static int MAX_GENOTYPING_ACTIVE_REGION_EXTENSION = 25;
|
||||
|
||||
private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument
|
||||
private final static int minReadsPerAlignmentStart = 5; // TODO -- should be an argument
|
||||
|
||||
// bases with quality less than or equal to this value are trimmed off the tails of the reads
|
||||
private static final byte MIN_TAIL_QUALITY = 20;
|
||||
|
||||
|
|
@ -291,6 +367,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY )
|
||||
throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel);
|
||||
|
||||
// get all of the unique sample names
|
||||
Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
samplesList.addAll( samples );
|
||||
|
|
@ -349,12 +428,21 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
|
||||
}
|
||||
|
||||
assemblyEngine = new DeBruijnAssembler( DEBUG, graphWriter, minKmer );
|
||||
// setup the assembler
|
||||
assemblyEngine = new DeBruijnAssembler(DEBUG, debugGraphTransformations, minKmer, allowCyclesInKmerGraphToGeneratePaths);
|
||||
assemblyEngine.setErrorCorrectKmers(errorCorrectKmers);
|
||||
assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR);
|
||||
if ( graphWriter != null ) assemblyEngine.setGraphWriter(graphWriter);
|
||||
if ( useLowQualityBasesForAssembly ) assemblyEngine.setMinBaseQualityToUseInAssembly((byte)1);
|
||||
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS );
|
||||
|
||||
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();
|
||||
|
||||
genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS, variantMerger );
|
||||
|
||||
if ( bamWriter != null )
|
||||
setupBamWriter();
|
||||
haplotypeBAMWriter = HaplotypeBAMWriter.create(bamWriterType, bamWriter, getToolkit().getSAMFileHeader());
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -391,12 +479,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
public ActivityProfileState isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
|
||||
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
|
||||
if( !allelesToGenotype.contains(vc) ) {
|
||||
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
|
||||
}
|
||||
}
|
||||
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
|
||||
final VariantContext vcFromAllelesRod = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), false, logger, UG_engine.getUAC().alleles);
|
||||
if( vcFromAllelesRod != null ) {
|
||||
allelesToGenotype.add(vcFromAllelesRod); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
|
||||
return new ActivityProfileState(ref.getLocus(), 1.0);
|
||||
}
|
||||
}
|
||||
|
|
@ -423,7 +508,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
final byte qual = p.getQual();
|
||||
if( p.isDeletion() || qual > (byte) 18) {
|
||||
int AA = 0; final int AB = 1; int BB = 2;
|
||||
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
|
||||
if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
|
||||
AA = 2;
|
||||
BB = 0;
|
||||
if( p.isNextToSoftClip() ) {
|
||||
|
|
@ -454,87 +539,240 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||
public Integer map( final ActiveRegion originalActiveRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||
if ( justDetermineActiveRegions )
|
||||
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
|
||||
return 1;
|
||||
|
||||
final List<VariantContext> activeAllelesToGenotype = new ArrayList<VariantContext>();
|
||||
if( !originalActiveRegion.isActive() ) { return 0; } // Not active so nothing to do!
|
||||
|
||||
final List<VariantContext> activeAllelesToGenotype = new ArrayList<VariantContext>();
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : allelesToGenotype ) {
|
||||
if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
|
||||
if( originalActiveRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
|
||||
activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode
|
||||
}
|
||||
}
|
||||
allelesToGenotype.removeAll( activeAllelesToGenotype );
|
||||
// No alleles found in this region so nothing to do!
|
||||
if ( activeAllelesToGenotype.isEmpty() ) { return 0; }
|
||||
} else {
|
||||
if( originalActiveRegion.size() == 0 ) { return 0; } // No reads here so nothing to do!
|
||||
}
|
||||
|
||||
if( !activeRegion.isActive() ) { return 0; } // Not active so nothing to do!
|
||||
if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do!
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
|
||||
// run the local assembler, getting back a collection of information on how we should proceed
|
||||
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
|
||||
|
||||
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
|
||||
// abort early if something is out of the acceptable range
|
||||
if( assemblyResult.haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
if (dontGenotype) return 1; // user requested we not proceed
|
||||
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
|
||||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) );
|
||||
// filter out reads from genotyping which fail mapping quality based criteria
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final List<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ?
|
||||
likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes );
|
||||
if( assemblyResult.regionForGenotyping.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
|
||||
|
||||
for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
||||
bestHaplotypes,
|
||||
samplesList,
|
||||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
fullReferenceWithPadding,
|
||||
paddedReferenceLoc,
|
||||
activeRegion.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
activeAllelesToGenotype ) ) {
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesForGenotyping(assemblyResult.haplotypes, stratifiedReadMap);
|
||||
|
||||
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
||||
bestHaplotypes,
|
||||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
assemblyResult.fullReferenceWithPadding,
|
||||
assemblyResult.paddedReferenceLoc,
|
||||
assemblyResult.regionForGenotyping.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
activeAllelesToGenotype );
|
||||
|
||||
for( final VariantContext call : calledHaplotypes.getCalls() ) {
|
||||
// TODO -- uncomment this line once ART-based walkers have a proper RefMetaDataTracker.
|
||||
// annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call);
|
||||
vcfWriter.add( call );
|
||||
}
|
||||
|
||||
if ( bamWriter != null ) {
|
||||
// write the haplotypes to the bam
|
||||
for ( Haplotype haplotype : haplotypes )
|
||||
writeHaplotype(haplotype, paddedReferenceLoc, bestHaplotypes.contains(haplotype));
|
||||
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(assemblyResult.haplotypes, assemblyResult.paddedReferenceLoc,
|
||||
bestHaplotypes,
|
||||
calledHaplotypes.getCalledHaplotypes(),
|
||||
stratifiedReadMap);
|
||||
}
|
||||
|
||||
// we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently
|
||||
final Map<Allele, Haplotype> alleleToHaplotypeMap = new HashMap<Allele, Haplotype>(haplotypes.size());
|
||||
for ( final Haplotype haplotype : haplotypes )
|
||||
alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype);
|
||||
if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
|
||||
|
||||
// next, output the interesting reads for each sample aligned against the appropriate haplotype
|
||||
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
|
||||
for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final Allele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue());
|
||||
if ( bestAllele != Allele.NO_CALL )
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele), paddedReferenceLoc.getStart());
|
||||
}
|
||||
return 1; // One active region was processed during this map call
|
||||
}
|
||||
|
||||
private final static class AssemblyResult {
|
||||
final List<Haplotype> haplotypes;
|
||||
final ActiveRegion regionForGenotyping;
|
||||
final byte[] fullReferenceWithPadding;
|
||||
final GenomeLoc paddedReferenceLoc;
|
||||
|
||||
private AssemblyResult(List<Haplotype> haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc) {
|
||||
this.haplotypes = haplotypes;
|
||||
this.regionForGenotyping = regionForGenotyping;
|
||||
this.fullReferenceWithPadding = fullReferenceWithPadding;
|
||||
this.paddedReferenceLoc = paddedReferenceLoc;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* High-level function that runs the assembler on the active region reads,
|
||||
* returning a data structure with the resulting information needed
|
||||
* for further HC steps
|
||||
*
|
||||
* @param activeRegion the region we should assemble
|
||||
* @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty)
|
||||
* @return the AssemblyResult describing how to proceed with genotyping
|
||||
*/
|
||||
protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List<VariantContext> activeAllelesToGenotype) {
|
||||
// Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails
|
||||
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true);
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING);
|
||||
final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion);
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype );
|
||||
|
||||
if ( ! dontTrimActiveRegions ) {
|
||||
return trimActiveRegion(activeRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
} else {
|
||||
// we don't want to or cannot create a trimmed active region, so go ahead and use the old one
|
||||
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Trim down the active region to just enough to properly genotype the events among the haplotypes
|
||||
*
|
||||
* This function merely creates the region, but it doesn't populate the reads back into the region
|
||||
*
|
||||
* @param region our full active region
|
||||
* @param haplotypes the list of haplotypes we've created from assembly
|
||||
* @param ref the reference bases over the full padded location
|
||||
* @param refLoc the span of the reference bases
|
||||
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
|
||||
*/
|
||||
private ActiveRegion createTrimmedRegion(final ActiveRegion region, final List<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc) {
|
||||
EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, DEBUG);
|
||||
final TreeSet<VariantContext> allContexts = EventMap.getAllVariantContexts(haplotypes);
|
||||
final GenomeLocParser parser = getToolkit().getGenomeLocParser();
|
||||
|
||||
if ( allContexts.isEmpty() ) // no variants, so return null and let the caller fall back to the current region
|
||||
return null;
|
||||
|
||||
final List<VariantContext> withinActiveRegion = new LinkedList<VariantContext>();
|
||||
int pad = PADDING_AROUND_SNPS_FOR_CALLING;
|
||||
GenomeLoc trimLoc = null;
|
||||
for ( final VariantContext vc : allContexts ) {
|
||||
final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
|
||||
if ( region.getLocation().overlapsP(vcLoc) ) {
|
||||
if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
|
||||
pad = PADDING_AROUND_OTHERS_FOR_CALLING;
|
||||
trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
|
||||
withinActiveRegion.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
// we don't actually have anything in the region after removing variants that don't overlap the region's full location
|
||||
if ( trimLoc == null ) return null;
|
||||
|
||||
return 1; // One active region was processed during this map call
|
||||
final GenomeLoc maxSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(region.getLocation(), MAX_GENOTYPING_ACTIVE_REGION_EXTENSION);
|
||||
final GenomeLoc idealSpan = getToolkit().getGenomeLocParser().createPaddedGenomeLoc(trimLoc, pad);
|
||||
final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
|
||||
|
||||
final ActiveRegion trimmedRegion = region.trim(finalSpan);
|
||||
if ( DEBUG ) {
|
||||
logger.info("events : " + withinActiveRegion);
|
||||
logger.info("trimLoc : " + trimLoc);
|
||||
logger.info("pad : " + pad);
|
||||
logger.info("idealSpan : " + idealSpan);
|
||||
logger.info("maxSpan : " + maxSpan);
|
||||
logger.info("finalSpan : " + finalSpan);
|
||||
logger.info("regionSpan : " + trimmedRegion.getExtendedLoc() + " size is " + trimmedRegion.getExtendedLoc().size());
|
||||
}
|
||||
return trimmedRegion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Trim down the active region to just enough to properly genotype the events among the haplotypes
|
||||
*
|
||||
* @param originalActiveRegion our full active region
|
||||
* @param haplotypes the list of haplotypes we've created from assembly
|
||||
* @param fullReferenceWithPadding the reference bases over the full padded location
|
||||
* @param paddedReferenceLoc the span of the reference bases
|
||||
* @return an AssemblyResult containing the trimmed active region with all of the reads we should use
|
||||
* trimmed down as well, and a revised set of haplotypes. If trimming failed this function
|
||||
* may choose to use the originalActiveRegion without modification
|
||||
*/
|
||||
private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion,
|
||||
final List<Haplotype> haplotypes,
|
||||
final byte[] fullReferenceWithPadding,
|
||||
final GenomeLoc paddedReferenceLoc) {
|
||||
final ActiveRegion trimmedActiveRegion = createTrimmedRegion(originalActiveRegion, haplotypes, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
|
||||
if ( trimmedActiveRegion == null )
|
||||
return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
|
||||
// trim down the haplotypes
|
||||
final Set<Haplotype> haplotypeSet = new HashSet<Haplotype>(haplotypes.size());
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
|
||||
if ( trimmed != null ) {
|
||||
haplotypeSet.add(trimmed);
|
||||
} else if ( DEBUG ) {
|
||||
logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc());
|
||||
}
|
||||
}
|
||||
|
||||
// create the final list of trimmed haplotypes
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<Haplotype>(haplotypeSet);
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() );
|
||||
|
||||
if ( DEBUG ) {
|
||||
logger.info("Trimming haplotypes reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
|
||||
for ( final Haplotype remaining: trimmedHaplotypes ) {
|
||||
logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// trim down the reads and add them to the trimmed active region
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<GATKSAMRecord>(originalActiveRegion.getReads().size());
|
||||
for( final GATKSAMRecord read : originalActiveRegion.getReads() ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
|
||||
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
trimmedReads.add(clippedRead);
|
||||
}
|
||||
}
|
||||
trimmedActiveRegion.clearReads();
|
||||
trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
|
||||
|
||||
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Select the best N haplotypes according to their likelihoods, if appropriate
|
||||
*
|
||||
* @param haplotypes a list of haplotypes to consider
|
||||
* @param stratifiedReadMap a map from samples -> read likelihoods
|
||||
* @return the list of haplotypes to genotype
|
||||
*/
|
||||
protected List<Haplotype> selectBestHaplotypesForGenotyping(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
if ( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
return haplotypes;
|
||||
} else {
|
||||
return likelihoodCalculationEngine.selectBestHaplotypesFromEachSample(haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation);
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -564,8 +802,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
final List<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( activeRegion.getReads() );
|
||||
activeRegion.clearReads();
|
||||
|
|
@ -581,20 +819,33 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
for( final GATKSAMRecord myRead : finalizedReadList ) {
|
||||
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
|
||||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
GATKSAMRecord clippedRead = useLowQualityBasesForAssembly ? postAdapterRead : ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
|
||||
// revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
|
||||
// TODO -- WARNING -- there is still a possibility that unclipping the soft clips will introduce bases that aren't
|
||||
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
|
||||
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
|
||||
// TODO -- reference haplotype start must be removed
|
||||
clippedRead = ReadClipper.revertSoftClippedBases(clippedRead);
|
||||
|
||||
// uncomment to remove soft-clipped bases from consideration entirely
|
||||
//clippedRead = ReadClipper.hardClipSoftClippedBases(clippedRead);
|
||||
|
||||
clippedRead = ReadClipper.hardClipToRegion( clippedRead, activeRegion.getExtendedLoc().getStart(), activeRegion.getExtendedLoc().getStop() );
|
||||
if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
//logger.info("Keeping read " + clippedRead + " start " + clippedRead.getAlignmentStart() + " end " + clippedRead.getAlignmentEnd());
|
||||
readsToUse.add(clippedRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
activeRegion.addAll(ReadUtils.sortReadsByCoordinate(readsToUse));
|
||||
|
||||
activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart));
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < 24 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
if( rec.getReadLength() < 10 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
readsToRemove.add(rec);
|
||||
}
|
||||
}
|
||||
|
|
@ -624,92 +875,5 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
return returnMap;
|
||||
}
|
||||
|
||||
private void setupBamWriter() {
|
||||
// prepare the bam header
|
||||
bamHeader = new SAMFileHeader();
|
||||
bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary());
|
||||
bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
|
||||
// include the original read groups plus a new artificial one for the haplotypes
|
||||
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(getToolkit().getSAMFileHeader().getReadGroups());
|
||||
final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId);
|
||||
rg.setSample("HC");
|
||||
rg.setSequencingCenter("BI");
|
||||
readGroups.add(rg);
|
||||
bamHeader.setReadGroups(readGroups);
|
||||
|
||||
bamWriter.setPresorted(false);
|
||||
bamWriter.writeHeader(bamHeader);
|
||||
}
|
||||
|
||||
private void writeHaplotype(final Haplotype haplotype, final GenomeLoc paddedRefLoc, final boolean isAmongBestHaplotypes) {
|
||||
final GATKSAMRecord record = new GATKSAMRecord(bamHeader);
|
||||
record.setReadBases(haplotype.getBases());
|
||||
record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
|
||||
record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
|
||||
record.setCigar(haplotype.getCigar());
|
||||
record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0);
|
||||
record.setReadName("HC" + uniqueNameCounter++);
|
||||
record.setReadUnmappedFlag(false);
|
||||
record.setReferenceIndex(paddedRefLoc.getContigIndex());
|
||||
record.setAttribute(SAMTag.RG.toString(), readGroupId);
|
||||
record.setFlags(16);
|
||||
bamWriter.addAlignment(record);
|
||||
}
|
||||
|
||||
private void writeReadAgainstHaplotype(final GATKSAMRecord read, final Haplotype haplotype, final int referenceStart) {
|
||||
|
||||
final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), read.getReadBases(), 5.0, -10.0, -22.0, -1.2);
|
||||
final int readStartOnHaplotype = swPairwiseAlignment.getAlignmentStart2wrt1();
|
||||
final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype;
|
||||
read.setAlignmentStart(readStartOnReference);
|
||||
|
||||
final Cigar cigar = generateReadCigarFromHaplotype(read, readStartOnHaplotype, haplotype.getCigar());
|
||||
read.setCigar(cigar);
|
||||
|
||||
bamWriter.addAlignment(read);
|
||||
}
|
||||
|
||||
private Cigar generateReadCigarFromHaplotype(final GATKSAMRecord read, final int readStartOnHaplotype, final Cigar haplotypeCigar) {
|
||||
|
||||
int currentReadPos = 0;
|
||||
int currentHapPos = 0;
|
||||
final List<CigarElement> readCigarElements = new ArrayList<CigarElement>();
|
||||
|
||||
for ( final CigarElement cigarElement : haplotypeCigar.getCigarElements() ) {
|
||||
|
||||
if ( cigarElement.getOperator() == CigarOperator.D ) {
|
||||
if ( currentReadPos > 0 )
|
||||
readCigarElements.add(cigarElement);
|
||||
} else if ( cigarElement.getOperator() == CigarOperator.M || cigarElement.getOperator() == CigarOperator.I ) {
|
||||
|
||||
final int elementLength = cigarElement.getLength();
|
||||
final int nextReadPos = currentReadPos + elementLength;
|
||||
final int nextHapPos = currentHapPos + elementLength;
|
||||
|
||||
// do we want this element?
|
||||
if ( currentReadPos > 0 ) {
|
||||
// do we want the entire element?
|
||||
if ( nextReadPos < read.getReadLength() ) {
|
||||
readCigarElements.add(cigarElement);
|
||||
currentReadPos = nextReadPos;
|
||||
}
|
||||
// otherwise, we can finish up and return the cigar
|
||||
else {
|
||||
readCigarElements.add(new CigarElement(read.getReadLength() - currentReadPos, cigarElement.getOperator()));
|
||||
return new Cigar(readCigarElements);
|
||||
}
|
||||
}
|
||||
// do we want part of the element to start?
|
||||
else if ( currentReadPos == 0 && nextHapPos > readStartOnHaplotype ) {
|
||||
currentReadPos = Math.min(nextHapPos - readStartOnHaplotype, read.getReadLength());
|
||||
readCigarElements.add(new CigarElement(currentReadPos, cigarElement.getOperator()));
|
||||
}
|
||||
|
||||
currentHapPos = nextHapPos;
|
||||
}
|
||||
}
|
||||
|
||||
return new Cigar(readCigarElements);
|
||||
}
|
||||
}
|
||||
|
|
@ -58,8 +58,8 @@ import org.broadinstitute.sting.gatk.walkers.Reference;
|
|||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.help.HelpConstants;
|
||||
import org.broadinstitute.variant.vcf.VCFHeader;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
|
|
@ -84,17 +84,17 @@ import java.util.*;
|
|||
* From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants.
|
||||
* Records are annotated with the set and status attributes.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* 2 variant files to resolve.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A single consensus VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx1g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
@ -125,7 +125,7 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
|
|||
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||
public List<RodBinding<VariantContext>> variants;
|
||||
|
||||
@Output(doc="File to which variants should be written", required=true)
|
||||
@Output(doc="File to which variants should be written")
|
||||
protected VariantContextWriter baseWriter = null;
|
||||
private VariantContextWriter writer;
|
||||
|
||||
|
|
@ -360,8 +360,8 @@ public class HaplotypeResolver extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
// order results by start position
|
||||
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1));
|
||||
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2));
|
||||
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype, false, 0, swConsensus1.getCigar()), refContext.getBases(), refContext.getWindow(), source1));
|
||||
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype, false, 0, swConsensus2.getCigar()), refContext.getBases(), refContext.getWindow(), source2));
|
||||
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
|
||||
// TODO -- handle errors appropriately
|
||||
logger.debug("No source alleles; aborting at " + refContext.getLocus());
|
||||
|
|
|
|||
|
|
@ -1,445 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks, rpoplin
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
|
||||
// This is different from most graph traversals because we want to test paths from any source node to any sink node.
|
||||
public class KBestPaths {
|
||||
|
||||
// static access only
|
||||
protected KBestPaths() { }
|
||||
private static int MAX_PATHS_TO_HOLD = 100;
|
||||
|
||||
/** Mutable int holder; findBestPaths passes one instance down its recursion as a shared counter. */
protected static class MyInt { public int val = 0; }
|
||||
|
||||
// class to keep track of paths
|
||||
protected static class Path {
|
||||
|
||||
// the last vertex seen in the path
|
||||
private final DeBruijnVertex lastVertex;
|
||||
|
||||
// the list of edges comprising the path
|
||||
private final List<DeBruijnEdge> edges;
|
||||
|
||||
// the scores for the path
|
||||
private final int totalScore;
|
||||
|
||||
// the graph from which this path originated
|
||||
private final DeBruijnAssemblyGraph graph;
|
||||
|
||||
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
|
||||
// these values were chosen via optimization against the NA12878 knowledge base
|
||||
private static final double SW_MATCH = 20.0;
|
||||
private static final double SW_MISMATCH = -15.0;
|
||||
private static final double SW_GAP = -26.0;
|
||||
private static final double SW_GAP_EXTEND = -1.1;
|
||||
private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes();
|
||||
|
||||
/**
 * Create a path made of a single vertex and no edges, with a score of zero.
 *
 * @param initialVertex the vertex at which the path starts (and, so far, ends)
 * @param graph the graph this path is drawn from
 */
public Path( final DeBruijnVertex initialVertex, final DeBruijnAssemblyGraph graph ) {
    this.graph = graph;
    this.lastVertex = initialVertex;
    this.edges = new ArrayList<DeBruijnEdge>(0); // nothing traversed yet
    this.totalScore = 0;
}
|
||||
|
||||
public Path( final Path p, final DeBruijnEdge edge ) {
|
||||
if( !p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); }
|
||||
|
||||
graph = p.graph;
|
||||
lastVertex = p.graph.getEdgeTarget(edge);
|
||||
edges = new ArrayList<DeBruijnEdge>(p.edges);
|
||||
edges.add(edge);
|
||||
totalScore = p.totalScore + edge.getMultiplicity();
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this path contain the given edge
|
||||
* @param edge the given edge to test
|
||||
* @return true if the edge is found in this path
|
||||
*/
|
||||
public boolean containsEdge( final DeBruijnEdge edge ) {
|
||||
if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
|
||||
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
if( e.equals(graph, edge) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the number of times this edge appears in the path
|
||||
* @param edge the given edge to test
|
||||
* @return number of times this edge appears in the path
|
||||
*/
|
||||
public int numInPath( final DeBruijnEdge edge ) {
|
||||
if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
|
||||
|
||||
int numInPath = 0;
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
if( e.equals(graph, edge) ) {
|
||||
numInPath++;
|
||||
}
|
||||
}
|
||||
|
||||
return numInPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this path contain a reference edge?
|
||||
* @return true if the path contains a reference edge
|
||||
*/
|
||||
public boolean containsRefEdge() {
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
if( e.isRef() ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** @return the edges of this path in the order they were appended; this is the internal list, not a copy */
public List<DeBruijnEdge> getEdges() { return edges; }
|
||||
|
||||
/** @return the score of this path: the sum of the multiplicities of its edges (0 for a path without edges) */
public int getScore() { return totalScore; }
|
||||
|
||||
/** @return the vertex at which this path currently ends */
public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
|
||||
|
||||
/**
|
||||
* The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes
|
||||
* @return non-null sequence of bases corresponding to this path
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getBases() {
|
||||
if( edges.size() == 0 ) { return graph.getAdditionalSequence(lastVertex); }
|
||||
|
||||
byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edges.get(0)));
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
|
||||
}
|
||||
return bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble
|
||||
* @return non-null Cigar string with reference length equal to the refHaplotype's reference length
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Cigar calculateCigar() {
|
||||
|
||||
final Cigar cigar = new Cigar();
|
||||
// special case for paths that start on reference but not at the reference source node
|
||||
if( edges.get(0).isRef() && !graph.isRefSource(edges.get(0)) ) {
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edges.get(0))).getCigarElements() ) {
|
||||
cigar.add(ce);
|
||||
}
|
||||
}
|
||||
|
||||
// reset the bubble state machine
|
||||
final BubbleStateMachine bsm = new BubbleStateMachine(cigar);
|
||||
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
if( e.equals(graph, edges.get(0)) ) {
|
||||
advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
|
||||
}
|
||||
advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e );
|
||||
}
|
||||
|
||||
// special case for paths that don't end on reference
|
||||
if( bsm.inBubble ) {
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
} else if( edges.get(edges.size()-1).isRef() && !graph.isRefSink(edges.get(edges.size()-1)) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edges.get(edges.size()-1)), null).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
}
|
||||
|
||||
return AlignmentUtils.consolidateCigar(bsm.cigar);
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the bubble state machine by incorporating the next node in the path.
|
||||
* @param bsm the current bubble state machine
|
||||
* @param node the node to be incorporated
|
||||
* @param e the edge which generated this node in the path
|
||||
*/
|
||||
@Requires({"bsm != null", "graph != null", "node != null"})
|
||||
private void advanceBubbleStateMachine( final BubbleStateMachine bsm, final DeBruijnVertex node, final DeBruijnEdge e ) {
|
||||
if( graph.isReferenceNode( node ) ) {
|
||||
if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else
|
||||
if( e !=null && !e.isRef() ) {
|
||||
if( graph.referencePathExists( graph.getEdgeSource(e), node) ) {
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
|
||||
} else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) );
|
||||
} else {
|
||||
bsm.inBubble = true;
|
||||
bsm.bubbleBytes = null;
|
||||
bsm.lastSeenReferenceNode = graph.getEdgeSource(e);
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
}
|
||||
} else {
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
|
||||
}
|
||||
} else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
} else { // close the bubble and use a local SW to determine the Cigar string
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
bsm.inBubble = false;
|
||||
bsm.bubbleBytes = null;
|
||||
bsm.lastSeenReferenceNode = null;
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
|
||||
}
|
||||
} else { // non-ref vertex
|
||||
if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
} else { // open up a bubble
|
||||
bsm.inBubble = true;
|
||||
bsm.bubbleBytes = null;
|
||||
bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null );
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble
|
||||
* @param bubbleBytes the bytes that comprise the alternate allele path in this bubble
|
||||
* @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex)
|
||||
* @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex)
|
||||
* @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble
|
||||
*/
|
||||
@Requires({"graph != null"})
|
||||
@Ensures({"result != null"})
|
||||
private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final DeBruijnVertex fromVertex, final DeBruijnVertex toVertex ) {
|
||||
final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null);
|
||||
|
||||
final Cigar returnCigar = new Cigar();
|
||||
|
||||
// add padding to anchor ref/alt bases in the SW matrix
|
||||
byte[] padding = STARTING_SW_ANCHOR_BYTES;
|
||||
boolean goodAlignment = false;
|
||||
SWPairwiseAlignment swConsensus = null;
|
||||
while( !goodAlignment && padding.length < 1000 ) {
|
||||
padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time
|
||||
final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding );
|
||||
final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding );
|
||||
swConsensus = new SWPairwiseAlignment( reference, alternate, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) {
|
||||
goodAlignment = true;
|
||||
}
|
||||
}
|
||||
if( !goodAlignment ) {
|
||||
returnCigar.add(new CigarElement(1, CigarOperator.N));
|
||||
return returnCigar;
|
||||
}
|
||||
|
||||
final Cigar swCigar = swConsensus.getCigar();
|
||||
if( swCigar.numCigarElements() > 6 ) { // this bubble is too divergent from the reference
|
||||
returnCigar.add(new CigarElement(1, CigarOperator.N));
|
||||
} else {
|
||||
int skipElement = -1;
|
||||
if( fromVertex == null ) {
|
||||
for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
|
||||
final CigarElement ce = swCigar.getCigarElement(iii);
|
||||
if( ce.getOperator().equals(CigarOperator.D) ) {
|
||||
skipElement = iii;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (toVertex == null ) {
|
||||
for( int iii = swCigar.numCigarElements() - 1; iii >= 0; iii-- ) {
|
||||
final CigarElement ce = swCigar.getCigarElement(iii);
|
||||
if( ce.getOperator().equals(CigarOperator.D) ) {
|
||||
skipElement = iii;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
|
||||
// now we need to remove the padding from the cigar string
|
||||
int length = swCigar.getCigarElement(iii).getLength();
|
||||
if( iii == 0 ) { length -= padding.length; }
|
||||
if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; }
|
||||
if( length > 0 ) {
|
||||
returnCigar.add(new CigarElement(length, (skipElement == iii ? CigarOperator.X : swCigar.getCigarElement(iii).getOperator())));
|
||||
}
|
||||
}
|
||||
if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) {
|
||||
throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar());
|
||||
}
|
||||
}
|
||||
|
||||
return returnCigar;
|
||||
}
|
||||
|
||||
// class to keep track of the bubble state machine
|
||||
protected static class BubbleStateMachine {
|
||||
public boolean inBubble = false;
|
||||
public byte[] bubbleBytes = null;
|
||||
public DeBruijnVertex lastSeenReferenceNode = null;
|
||||
public Cigar cigar = null;
|
||||
|
||||
public BubbleStateMachine( final Cigar initialCigar ) {
|
||||
inBubble = false;
|
||||
bubbleBytes = null;
|
||||
lastSeenReferenceNode = null;
|
||||
cigar = initialCigar;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static class PathComparatorTotalScore implements Comparator<Path>, Serializable {
|
||||
@Override
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path1.totalScore - path2.totalScore;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse the graph and pull out the best k paths.
|
||||
* Paths are scored via their comparator function. The default being PathComparatorTotalScore()
|
||||
* @param graph the graph from which to pull paths
|
||||
* @param k the number of paths to find
|
||||
* @return a list with at most k top-scoring paths from the graph
|
||||
*/
|
||||
@Ensures({"result != null", "result.size() <= k"})
|
||||
public static List<Path> getKBestPaths( final DeBruijnAssemblyGraph graph, final int k ) {
|
||||
if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); }
|
||||
if( k > MAX_PATHS_TO_HOLD/2 ) { throw new IllegalArgumentException("Asked for more paths than internal parameters allow for."); }
|
||||
|
||||
final ArrayList<Path> bestPaths = new ArrayList<Path>();
|
||||
|
||||
// run a DFS for best paths
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 ) {
|
||||
findBestPaths(new Path(v, graph), bestPaths);
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(bestPaths, new PathComparatorTotalScore() );
|
||||
Collections.reverse(bestPaths);
|
||||
return bestPaths.subList(0, Math.min(k, bestPaths.size()));
|
||||
}
|
||||
|
||||
private static void findBestPaths( final Path path, final List<Path> bestPaths ) {
|
||||
findBestPaths(path, bestPaths, new MyInt());
|
||||
}
|
||||
|
||||
private static void findBestPaths( final Path path, final List<Path> bestPaths, final MyInt n ) {
|
||||
|
||||
// did we hit the end of a path?
|
||||
if ( allOutgoingEdgesHaveBeenVisited(path) ) {
|
||||
if( path.containsRefEdge() ) {
|
||||
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
|
||||
// clean out some low scoring paths
|
||||
Collections.sort(bestPaths, new PathComparatorTotalScore() );
|
||||
for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); } // BUGBUG: assumes MAX_PATHS_TO_HOLD >> 20
|
||||
}
|
||||
bestPaths.add(path);
|
||||
}
|
||||
} else if( n.val > 10000) {
|
||||
// do nothing, just return
|
||||
} else {
|
||||
// recursively run DFS
|
||||
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
|
||||
edgeArrayList.addAll(path.graph.outgoingEdgesOf(path.lastVertex));
|
||||
Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator());
|
||||
Collections.reverse(edgeArrayList);
|
||||
for ( final DeBruijnEdge edge : edgeArrayList ) {
|
||||
// make sure the edge is not already in the path
|
||||
if ( path.containsEdge(edge) )
|
||||
continue;
|
||||
|
||||
final Path newPath = new Path(path, edge);
|
||||
n.val++;
|
||||
findBestPaths(newPath, bestPaths, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param path the path to test
|
||||
* @return true if all the outgoing edges at the end of this path have already been visited
|
||||
*/
|
||||
private static boolean allOutgoingEdgesHaveBeenVisited( final Path path ) {
|
||||
for( final DeBruijnEdge edge : path.graph.outgoingEdgesOf(path.lastVertex) ) {
|
||||
if( !path.containsEdge(edge) ) { // TODO -- investigate allowing numInPath < 2 to allow cycles
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* generic utility class that counts kmers
|
||||
*
|
||||
* Basically you add kmers to the counter, and it tells you how many occurrences of each kmer it's seen.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/8/13
|
||||
* Time: 1:16 PM
|
||||
*/
|
||||
public class KMerCounter {
|
||||
//private final static Logger logger = Logger.getLogger(KMerCounter.class);
|
||||
|
||||
/**
|
||||
* A map of for each kmer to its num occurrences in addKmers
|
||||
*/
|
||||
private final Map<Kmer, CountedKmer> countsByKMer = new HashMap<Kmer, CountedKmer>();
|
||||
private final int kmerLength;
|
||||
|
||||
/**
|
||||
* Create a new kmer counter
|
||||
*
|
||||
* @param kmerLength the length of kmers we'll be counting to error correct, must be >= 1
|
||||
*/
|
||||
public KMerCounter(final int kmerLength) {
|
||||
if ( kmerLength < 1 ) throw new IllegalArgumentException("kmerLength must be > 0 but got " + kmerLength);
|
||||
this.kmerLength = kmerLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the count of kmer in this kmer counter
|
||||
* @param kmer a non-null counter to get
|
||||
* @return a positive integer
|
||||
*/
|
||||
public int getKmerCount(final Kmer kmer) {
|
||||
if ( kmer == null ) throw new IllegalArgumentException("kmer cannot be null");
|
||||
final CountedKmer counted = countsByKMer.get(kmer);
|
||||
return counted == null ? 0 : counted.count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an unordered collection of the counted kmers in this counter
|
||||
* @return a non-null collection
|
||||
*/
|
||||
public Collection<CountedKmer> getCountedKmers() {
|
||||
return countsByKMer.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all current counts, resetting the counter to an empty state
|
||||
*/
|
||||
public void clear() {
|
||||
countsByKMer.clear();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a kmer that occurred kmerCount times
|
||||
*
|
||||
* @param kmer a kmer
|
||||
* @param kmerCount the number of occurrences
|
||||
*/
|
||||
public void addKmer(final Kmer kmer, final int kmerCount) {
|
||||
if ( kmer.length() != kmerLength ) throw new IllegalArgumentException("bad kmer length " + kmer + " expected size " + kmerLength);
|
||||
if ( kmerCount < 0 ) throw new IllegalArgumentException("bad kmerCount " + kmerCount);
|
||||
|
||||
CountedKmer countFromMap = countsByKMer.get(kmer);
|
||||
if ( countFromMap == null ) {
|
||||
countFromMap = new CountedKmer(kmer);
|
||||
countsByKMer.put(kmer, countFromMap);
|
||||
}
|
||||
countFromMap.count += kmerCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder b = new StringBuilder("KMerCounter{");
|
||||
b.append("counting ").append(countsByKMer.size()).append(" distinct kmers");
|
||||
b.append("\n}");
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
protected static class CountedKmer implements Comparable<CountedKmer> {
|
||||
final Kmer kmer;
|
||||
int count = 0;
|
||||
|
||||
private CountedKmer(final Kmer kmer) {
|
||||
this.kmer = kmer;
|
||||
}
|
||||
|
||||
public Kmer getKmer() {
|
||||
return kmer;
|
||||
}
|
||||
|
||||
public int getCount() {
|
||||
return count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "CountedKmer{" +
|
||||
"kmer='" + kmer + '\'' +
|
||||
", count=" + count +
|
||||
'}';
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(CountedKmer o) {
|
||||
return o.count - count;
|
||||
}
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
// Protected methods for testing purposes only
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* For testing purposes only
|
||||
*/
|
||||
protected void addKmer(final String rawKmer, final int kmerCount) {
|
||||
addKmer(new Kmer(rawKmer), kmerCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* For testing purposes
|
||||
*
|
||||
* @param kmers
|
||||
*/
|
||||
protected void addKmers(final String ... kmers) {
|
||||
for ( final String kmer : kmers )
|
||||
addKmer(kmer, 1);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,207 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Fast wrapper for byte[] kmers
|
||||
*
|
||||
* This object has several important features that make it better than using a raw byte[] for a kmer:
|
||||
*
|
||||
* -- Can create kmer from a range of a larger byte[], allowing us to avoid Arrays.copyOfRange
|
||||
* -- Fast equals and hashcode methods
|
||||
* -- can get actual byte[] of the kmer, even if it's from a larger byte[], and this operation
|
||||
* only does the work of that operation once, updating its internal state
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 4/8/13
|
||||
* Time: 7:54 AM
|
||||
*/
|
||||
public class Kmer {
|
||||
// this values may be updated in the course of interacting with this kmer
|
||||
private byte[] bases;
|
||||
protected int start;
|
||||
|
||||
// two constants
|
||||
final protected int length;
|
||||
final protected int hash;
|
||||
|
||||
/**
|
||||
* Create a new kmer using all bases in kmer
|
||||
* @param kmer a non-null byte[]
|
||||
*/
|
||||
public Kmer(byte[] kmer) {
|
||||
this(kmer, 0, kmer.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new kmer based on the string kmer
|
||||
*
|
||||
* This is not a good method to use for performance
|
||||
*
|
||||
* @param kmer the bases as a string
|
||||
*/
|
||||
public Kmer(final String kmer) {
|
||||
this(kmer.getBytes());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new kmer backed by the bases in bases, spanning start -> start + length
|
||||
*
|
||||
* Under no circumstances can bases be modified anywhere in the client code. This does not make a copy
|
||||
* of bases for performance reasons
|
||||
*
|
||||
* @param bases an array of bases
|
||||
* @param start the start of the kmer in bases, must be >= 0 and < bases.length
|
||||
* @param length the length of the kmer. Must be >= 0 and start + length < bases.length
|
||||
*/
|
||||
public Kmer(final byte[] bases, final int start, final int length) {
|
||||
if ( bases == null ) throw new IllegalArgumentException("bases cannot be null");
|
||||
if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start);
|
||||
if ( length < 0 ) throw new IllegalArgumentException("length must be >= 0 but got " + length);
|
||||
if ( (start + length) > bases.length ) throw new IllegalArgumentException("start + length " + (start + length) + " must be <= bases.length " + bases.length + " but got " + start + " with length " + length);
|
||||
this.bases = bases;
|
||||
this.start = start;
|
||||
this.length = length;
|
||||
this.hash = myHashCode(bases, start, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new kmer that's a shallow copy of kmer
|
||||
* @param kmer the kmer to shallow copy
|
||||
*/
|
||||
public Kmer(final Kmer kmer) {
|
||||
this.bases = kmer.bases;
|
||||
this.start = kmer.start;
|
||||
this.length = kmer.length;
|
||||
this.hash = kmer.hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a derived shallow kmer that starts at newStart and has newLength bases
|
||||
* @param newStart the new start of kmer, where 0 means that start of the kmer, 1 means skip the first base
|
||||
* @param newLength the new length
|
||||
* @return a new kmer based on the data in this kmer. Does not make a copy, so shares most of the data
|
||||
*/
|
||||
public Kmer subKmer(final int newStart, final int newLength) {
|
||||
return new Kmer(bases, start + newStart, newLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the bases of this kmer. May create a copy of the bases, depending on how this kmer was constructed.
|
||||
*
|
||||
* Note that this function is efficient in that if it needs to copy the bases this only occurs once.
|
||||
*
|
||||
* @return a non-null byte[] containing length() bases of this kmer, regardless of how this kmer was created
|
||||
*/
|
||||
public byte[] bases() {
|
||||
if ( start != 0 || bases.length != length ) {
|
||||
// update operation. Rip out the exact byte[] and update start so we don't ever do this again
|
||||
bases = Arrays.copyOfRange(bases, start, start + length);
|
||||
start = 0;
|
||||
}
|
||||
|
||||
return bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* The length of this kmer
|
||||
* @return an integer >= 0
|
||||
*/
|
||||
public int length() {
|
||||
return length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Kmer{" + new String(bases()) + "}";
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
final Kmer kmer = (Kmer) o;
|
||||
|
||||
// very fast test. If hash aren't equal you are done, otherwise compare the bases
|
||||
if ( hash != kmer.hash ) return false;
|
||||
if ( length != kmer.length ) return false;
|
||||
|
||||
for ( int i = 0; i < length; i++ )
|
||||
if ( bases[start + i] != kmer.bases[kmer.start + i] )
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method that computes the hashcode for this kmer based only the bases in
|
||||
* a[], starting at start and running length bases
|
||||
*
|
||||
* @param a a non-null bases array
|
||||
* @param start where to start in bases
|
||||
* @param length the length of the bases
|
||||
* @return a hashcode value appropriate for a[start] -> a[start + length]
|
||||
*/
|
||||
private static int myHashCode(final byte a[], final int start, final int length) {
|
||||
if (a == null)
|
||||
return 0;
|
||||
|
||||
int result = 1;
|
||||
for (int i = 0; i < length; i++)
|
||||
result = 31 * result + a[start + i];
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -48,12 +48,15 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
|
||||
import org.broadinstitute.sting.utils.pairhmm.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -62,11 +65,20 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
import java.util.*;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private final byte constantGCP;
|
||||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
private final int minReadLength = 20;
|
||||
|
||||
/**
|
||||
* The expected rate of random sequencing errors for a read originating from its true haplotype.
|
||||
*
|
||||
* For example, if this is 0.01, then we'd expect 1 error per 100 bp.
|
||||
*/
|
||||
private final double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
|
||||
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
|
||||
|
||||
|
|
@ -78,7 +90,7 @@ public class LikelihoodCalculationEngine {
|
|||
pairHMM = new Log10PairHMM(false);
|
||||
break;
|
||||
case LOGLESS_CACHING:
|
||||
pairHMM = new LoglessCachingPairHMM();
|
||||
pairHMM = new LoglessPairHMM();
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
|
||||
|
|
@ -88,9 +100,16 @@ public class LikelihoodCalculationEngine {
|
|||
DEBUG = debug;
|
||||
}
|
||||
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
|
||||
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
/**
|
||||
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
|
||||
*
|
||||
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
|
||||
* against the set of haplotypes
|
||||
*
|
||||
* @param haplotypes a non-null list of haplotypes
|
||||
* @param perSampleReadList a mapping from sample -> reads
|
||||
*/
|
||||
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
for( final GATKSAMRecord read : sample.getValue() ) {
|
||||
|
|
@ -104,19 +123,29 @@ public class LikelihoodCalculationEngine {
|
|||
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
|
||||
}
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
X_METRIC_LENGTH += 2;
|
||||
Y_METRIC_LENGTH += 2;
|
||||
|
||||
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
}
|
||||
|
||||
// for each sample's reads
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
|
||||
// configure the HMM
|
||||
initializePairHMM(haplotypes, perSampleReadList);
|
||||
|
||||
// Add likelihoods for each sample's reads to our stratifiedReadMap
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue()));
|
||||
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
|
||||
|
||||
final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
|
||||
// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey());
|
||||
// for ( final GATKSAMRecord read : removedReads )
|
||||
// logger.info("\tRemoved " + read.getReadName());
|
||||
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), map);
|
||||
}
|
||||
|
||||
return stratifiedReadMap;
|
||||
}
|
||||
|
||||
|
|
@ -130,10 +159,14 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
if ( read.getReadLength() < minReadLength )
|
||||
// don't consider any reads that have a read length < the minimum
|
||||
continue;
|
||||
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
Haplotype previousHaplotypeSeen = null;
|
||||
final byte[] readQuals = read.getBaseQualities();
|
||||
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
|
||||
final byte[] readQuals = read.getBaseQualities().clone();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities();
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
|
|
@ -146,15 +179,14 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
final boolean isFirstHaplotype = jjj == 0;
|
||||
final double log10l = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
|
||||
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
|
||||
|
||||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : PairHMM.findFirstPositionWhereHaplotypesDiffer(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype),
|
||||
pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0));
|
||||
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
|
||||
}
|
||||
}
|
||||
|
||||
return perReadAlleleLikelihoodMap;
|
||||
}
|
||||
|
||||
|
|
@ -162,17 +194,17 @@ public class LikelihoodCalculationEngine {
|
|||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering ) {
|
||||
final TreeSet<String> sampleSet = new TreeSet<String>();
|
||||
sampleSet.add(sample);
|
||||
return computeDiploidHaplotypeLikelihoods(sampleSet, stratifiedReadMap, alleleOrdering);
|
||||
final List<Allele> alleleOrdering,
|
||||
final boolean normalize ) {
|
||||
return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize);
|
||||
}
|
||||
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering ) {
|
||||
final List<Allele> alleleOrdering,
|
||||
final boolean normalize) {
|
||||
|
||||
final int numHaplotypes = alleleOrdering.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
|
|
@ -199,7 +231,7 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix;
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
|
|
@ -223,54 +255,127 @@ public class LikelihoodCalculationEngine {
|
|||
return likelihoodMatrix;
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public List<Haplotype> selectBestHaplotypes( final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation ) {
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// System to compute the best N haplotypes for genotyping
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = stratifiedReadMap.keySet();
|
||||
final List<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
|
||||
final List<Allele> haplotypesAsAlleles = new ArrayList<Allele>();
|
||||
for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h, true)); }
|
||||
/**
|
||||
* Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
|
||||
* @param map an annoying map object that moves us between the allele and haplotype representation
|
||||
* @param haplotypeAsAllele the allele version of the haplotype
|
||||
* @return the haplotype version, with its score incremented by 1 if it's non-reference
|
||||
*/
|
||||
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
|
||||
final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
|
||||
if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
|
||||
return h;
|
||||
}
|
||||
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, stratifiedReadMap, haplotypesAsAlleles ); // all samples pooled together
|
||||
/**
|
||||
* Take the best N haplotypes and return them as a list
|
||||
*
|
||||
* Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
|
||||
* as its preferred haplotype.  Takes the best N haplotypes from selectedHaplotypes in decreasing
|
||||
* order of score (so higher score haplotypes are preferred). The N we take is determined by
|
||||
*
|
||||
* N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
|
||||
*
|
||||
* where 2 * nSamples + 1 is the number of chromosomes in nSamples samples (two per sample) plus the reference, and our workload is
|
||||
* bounded by maxNumHaplotypesInPopulation as that number can grow without bound
|
||||
*
|
||||
* @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
|
||||
* @param nSamples the number of samples used to select the haplotypes
|
||||
* @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
|
||||
* @return a list of N or fewer haplotypes, with the reference haplotype first
|
||||
*/
|
||||
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
|
||||
final List<Haplotype> selectedHaplotypesList = new ArrayList<Haplotype>(selectedHaplotypes);
|
||||
Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
|
||||
final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
|
||||
final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
|
||||
final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
|
||||
if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
//double bestElement = Double.NEGATIVE_INFINITY;
|
||||
final int maxChosenHaplotypes = Math.min( maxNumHaplotypesInPopulation, sampleKeySet.size() * 2 + 1 );
|
||||
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
|
||||
maxElement = haplotypeLikelihoodMatrix[iii][jjj];
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
|
||||
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
|
||||
/**
|
||||
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
|
||||
*
|
||||
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
|
||||
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
|
||||
* all possible pairs of haplotypes, and each haplotype in the pair with the highest likelihood gets
|
||||
* one extra count (so hom-var haplotypes get two counts). After performing this calculation
|
||||
* the best N haplotypes are selected (see {@link #selectBestHaplotypesAccordingToScore}) and a list of the
|
||||
* haplotypes in order of score is returned, ensuring that at least one of the haplotypes is reference.
|
||||
*
|
||||
* @param haplotypes a list of all haplotypes we're considering
|
||||
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
|
||||
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
|
||||
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
|
||||
*/
|
||||
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
|
||||
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
|
||||
|
||||
// all of the haplotypes that at least one sample called as one of the most likely
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<Haplotype>();
|
||||
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
|
||||
|
||||
// our annoying map from allele -> haplotype
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<Allele, Haplotype>();
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
|
||||
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
// for each sample, compute the most likely pair of haplotypes
|
||||
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
|
||||
// get the two most likely haplotypes under a diploid model for this sample
|
||||
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
|
||||
|
||||
final List<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
bestHaplotypes.add( haplotypes.get(hIndex) );
|
||||
if ( mla != null ) { // there was something to evaluate in this sample
|
||||
// note that there must be at least 2 haplotypes
|
||||
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
|
||||
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
|
||||
|
||||
// if ( DEBUG ) {
|
||||
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
|
||||
// }
|
||||
|
||||
// add these two haplotypes to the set of haplotypes that have been selected
|
||||
selectedHaplotypes.add(best);
|
||||
selectedHaplotypes.add(second);
|
||||
|
||||
// we've already selected all of our haplotypes, and we don't need to prune them down
|
||||
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// take the best N haplotypes forward, in order of the number of samples that choose them
|
||||
final int nSamples = stratifiedReadMap.size();
|
||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
|
||||
|
||||
if ( DEBUG ) {
|
||||
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
|
||||
for ( final Haplotype h : bestHaplotypes ) {
|
||||
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
|
||||
}
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
|
||||
/**
|
||||
* Find the haplotype that isReference(), or throw a ReviewedStingException if one isn't found
|
||||
* @param haplotypes non-null list of haplotypes
|
||||
* @return the reference haplotype
|
||||
*/
|
||||
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isReference() ) { return haplotypes.indexOf(h); }
|
||||
if( h.isReference() ) return h;
|
||||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,10 +47,11 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
|
@ -59,13 +60,46 @@ import java.util.List;
|
|||
* Date: Mar 14, 2011
|
||||
*/
|
||||
public abstract class LocalAssemblyEngine {
|
||||
// initial value of minBaseQualityToUseInAssembly (see the field declaration below)
public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8;
|
||||
|
||||
public enum ASSEMBLER {
|
||||
SIMPLE_DE_BRUIJN
|
||||
// stream exposed through getGraphWriter/setGraphWriter; null until a caller provides one
protected PrintStream graphWriter = null;
|
||||
// exposed through get/setMinBaseQualityToUseInAssembly; how subclasses apply it is not visible in this class
protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
|
||||
// exposed through get/setPruneFactor; starts at 2
protected int pruneFactor = 2;
|
||||
// exposed through shouldErrorCorrectKmers/setErrorCorrectKmers; off unless a caller turns it on
protected boolean errorCorrectKmers = false;
|
||||
|
||||
// empty constructor: every field keeps the initial value declared above
protected LocalAssemblyEngine() { }
|
||||
|
||||
/** @return the current value of the pruneFactor field */
public int getPruneFactor() {
|
||||
return pruneFactor;
|
||||
}
|
||||
|
||||
protected LocalAssemblyEngine() {
|
||||
/** Stores pruneFactor in this engine; the value is not validated here */
public void setPruneFactor(int pruneFactor) {
|
||||
this.pruneFactor = pruneFactor;
|
||||
}
|
||||
|
||||
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List<VariantContext> activeAllelesToGenotype);
|
||||
/** @return the current value of the errorCorrectKmers flag */
public boolean shouldErrorCorrectKmers() {
|
||||
return errorCorrectKmers;
|
||||
}
|
||||
|
||||
/** Stores the errorCorrectKmers flag in this engine */
public void setErrorCorrectKmers(boolean errorCorrectKmers) {
|
||||
this.errorCorrectKmers = errorCorrectKmers;
|
||||
}
|
||||
|
||||
/** @return the current graphWriter, which is null unless one has been set */
public PrintStream getGraphWriter() {
|
||||
return graphWriter;
|
||||
}
|
||||
|
||||
/** Stores graphWriter in this engine; null is accepted as-is */
public void setGraphWriter(PrintStream graphWriter) {
|
||||
this.graphWriter = graphWriter;
|
||||
}
|
||||
|
||||
/** @return the current value of minBaseQualityToUseInAssembly */
public byte getMinBaseQualityToUseInAssembly() {
|
||||
return minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
/** Stores minBaseQualityToUseInAssembly in this engine; the value is not validated here */
public void setMinBaseQualityToUseInAssembly(byte minBaseQualityToUseInAssembly) {
|
||||
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
|
||||
}
|
||||
|
||||
// Abstract assembly entry point that subclasses must implement.  Unlike the signature it replaces, it takes
// no PRUNE_FACTOR argument; the prune factor is now held in the pruneFactor field of this class.
// NOTE(review): parameter semantics are not visible in this class -- confirm against the implementations
public abstract List<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, List<VariantContext> activeAllelesToGenotype);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,190 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* simple edge class for connecting nodes in the graph
|
||||
*
|
||||
* Works equally well for all graph types (kmer or sequence)
|
||||
*
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
public class BaseEdge {
|
||||
private int multiplicity;
|
||||
private boolean isRef;
|
||||
|
||||
/**
|
||||
* Create a new BaseEdge with weight multiplicity and, if isRef == true, indicates a path through the reference
|
||||
*
|
||||
* @param isRef indicates whether this edge is a path through the reference
|
||||
* @param multiplicity the number of observations of this edge
|
||||
*/
|
||||
public BaseEdge(final boolean isRef, final int multiplicity) {
|
||||
if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0 but got " + multiplicity);
|
||||
|
||||
this.multiplicity = multiplicity;
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy constructor
|
||||
*
|
||||
* @param toCopy
|
||||
*/
|
||||
public BaseEdge(final BaseEdge toCopy) {
|
||||
this(toCopy.isRef(), toCopy.getMultiplicity());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of observations of paths connecting two vertices
|
||||
* @return a positive integer >= 0
|
||||
*/
|
||||
public int getMultiplicity() {
|
||||
return multiplicity;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the multiplicity of this edge to value
|
||||
* @param value an integer >= 0
|
||||
*/
|
||||
public void setMultiplicity( final int value ) {
|
||||
if ( multiplicity < 0 ) throw new IllegalArgumentException("multiplicity must be >= 0");
|
||||
multiplicity = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this edge indicate a path through the reference graph?
|
||||
* @return true if so
|
||||
*/
|
||||
public boolean isRef() {
|
||||
return isRef;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicate that this edge follows the reference sequence, or not
|
||||
* @param isRef true if this is a reference edge
|
||||
*/
|
||||
public void setIsRef( final boolean isRef ) {
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this and edge have the same source and target vertices in graph?
|
||||
*
|
||||
* @param graph the graph containing both this and edge
|
||||
* @param edge our comparator edge
|
||||
* @param <T>
|
||||
* @return true if we have the same source and target vertices
|
||||
*/
|
||||
public <T extends BaseVertex> boolean hasSameSourceAndTarget(final BaseGraph<T> graph, final BaseEdge edge) {
|
||||
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
// For use when comparing edges across graphs!
|
||||
public <T extends BaseVertex> boolean seqEquals( final BaseGraph<T> graph, final BaseEdge edge, final BaseGraph<T> graph2 ) {
|
||||
return (graph.getEdgeSource(this).seqEquals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).seqEquals(graph2.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts a collection of BaseEdges in decreasing order of weight, so that the most
|
||||
* heavily weighted is at the start of the list
|
||||
*/
|
||||
public static class EdgeWeightComparator implements Comparator<BaseEdge>, Serializable {
|
||||
@Override
|
||||
public int compare(final BaseEdge edge1, final BaseEdge edge2) {
|
||||
return edge2.multiplicity - edge1.multiplicity;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add edge to this edge, updating isRef and multiplicity as appropriate
|
||||
*
|
||||
* isRef is simply the or of this and edge
|
||||
* multiplicity is the sum
|
||||
*
|
||||
* @param edge the edge to add
|
||||
* @return this
|
||||
*/
|
||||
public BaseEdge add(final BaseEdge edge) {
|
||||
if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
|
||||
this.multiplicity += edge.getMultiplicity();
|
||||
this.isRef = this.isRef || edge.isRef();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new BaseEdge with multiplicity and isRef that's an or of all edges
|
||||
*
|
||||
* @param edges a collection of edges to or their isRef values
|
||||
* @param multiplicity our desired multiplicity
|
||||
* @return a newly allocated BaseEdge
|
||||
*/
|
||||
public static BaseEdge orRef(final Collection<BaseEdge> edges, final int multiplicity) {
|
||||
for ( final BaseEdge e : edges )
|
||||
if ( e.isRef() )
|
||||
return new BaseEdge(true, multiplicity);
|
||||
return new BaseEdge(false, multiplicity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new edge whose multiplicity is the max of this and edge, and isRef is or of this and edge
|
||||
*
|
||||
* isRef is simply the or of this and edge
|
||||
* multiplicity is the max
|
||||
*
|
||||
* @param edge the edge to max
|
||||
*/
|
||||
public BaseEdge max(final BaseEdge edge) {
|
||||
if ( edge == null ) throw new IllegalArgumentException("edge cannot be null");
|
||||
return new BaseEdge(isRef() || edge.isRef(), Math.max(getMultiplicity(), edge.getMultiplicity()));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,636 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Invariant;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.jgrapht.EdgeFactory;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 2/6/13
|
||||
*/
|
||||
@Invariant("!this.isAllowingMultipleEdges()")
|
||||
public class BaseGraph<T extends BaseVertex> extends DefaultDirectedGraph<T, BaseEdge> {
|
||||
protected final static Logger logger = Logger.getLogger(BaseGraph.class);
|
||||
private final int kmerSize;
|
||||
|
||||
/**
|
||||
* Construct an empty BaseGraph
|
||||
*/
|
||||
public BaseGraph() {
|
||||
this(11);
|
||||
}
|
||||
|
||||
/**
|
||||
* Edge factory that creates non-reference multiplicity 1 edges
|
||||
* @param <T> the new of our vertices
|
||||
*/
|
||||
private static class MyEdgeFactory<T extends BaseVertex> implements EdgeFactory<T, BaseEdge> {
|
||||
@Override
|
||||
public BaseEdge createEdge(T sourceVertex, T targetVertex) {
|
||||
return new BaseEdge(false, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a DeBruijnGraph with kmerSize
|
||||
* @param kmerSize
|
||||
*/
|
||||
public BaseGraph(final int kmerSize) {
|
||||
super(new MyEdgeFactory<T>());
|
||||
|
||||
if ( kmerSize < 1 ) throw new IllegalArgumentException("kmerSize must be >= 1 but got " + kmerSize);
|
||||
this.kmerSize = kmerSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* How big of a kmer did we use to create this graph?
|
||||
* @return
|
||||
*/
|
||||
public int getKmerSize() {
|
||||
return kmerSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference node (meaning that it appears on the reference path in the graph)
|
||||
*/
|
||||
public boolean isReferenceNode( final T v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final BaseEdge e : edgesOf(v) ) {
|
||||
if( e.isRef() ) { return true; }
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a source node (in degree == 0)
|
||||
*/
|
||||
public boolean isSource( final T v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
return inDegreeOf(v) == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a sink node (out degree == 0)
|
||||
*/
|
||||
public boolean isSink( final T v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
return outDegreeOf(v) == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of source vertices of this graph
|
||||
* @return a non-null set
|
||||
*/
|
||||
public Set<T> getSources() {
|
||||
final Set<T> set = new LinkedHashSet<T>();
|
||||
for ( final T v : vertexSet() )
|
||||
if ( isSource(v) )
|
||||
set.add(v);
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of sink vertices of this graph
|
||||
* @return a non-null set
|
||||
*/
|
||||
public Set<T> getSinks() {
|
||||
final Set<T> set = new LinkedHashSet<T>();
|
||||
for ( final T v : vertexSet() )
|
||||
if ( isSink(v) )
|
||||
set.add(v);
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull out the additional sequence implied by traversing this node in the graph
|
||||
* @param v the vertex from which to pull out the additional base sequence
|
||||
* @return non-null byte array
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getAdditionalSequence( final T v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to pull sequence from a null vertex."); }
|
||||
return v.getAdditionalSequence(isSource(v));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param e the edge to test
|
||||
* @return true if this edge is a reference source edge
|
||||
*/
|
||||
public boolean isRefSource( final BaseEdge e ) {
|
||||
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
|
||||
for( final BaseEdge edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference source
|
||||
*/
|
||||
public boolean isRefSource( final T v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param e the edge to test
|
||||
* @return true if this edge is a reference sink edge
|
||||
*/
|
||||
public boolean isRefSink( final BaseEdge e ) {
|
||||
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
|
||||
for( final BaseEdge edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference sink
|
||||
*/
|
||||
public boolean isRefSink( final T v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the reference source vertex pulled from the graph, can be null if it doesn't exist in the graph
|
||||
*/
|
||||
public T getReferenceSourceVertex( ) {
|
||||
for( final T v : vertexSet() ) {
|
||||
if( isReferenceNode(v) && isRefSource(v) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the reference sink vertex pulled from the graph, can be null if it doesn't exist in the graph
|
||||
*/
|
||||
public T getReferenceSinkVertex( ) {
|
||||
for( final T v : vertexSet() ) {
|
||||
if( isReferenceNode(v) && isRefSink(v) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse the graph and get the next reference vertex if it exists
|
||||
* @param v the current vertex, can be null
|
||||
* @return the next reference vertex if it exists
|
||||
*/
|
||||
public T getNextReferenceVertex( final T v ) {
|
||||
if( v == null ) { return null; }
|
||||
for( final BaseEdge edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) {
|
||||
return getEdgeTarget(edgeToTest);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse the graph and get the previous reference vertex if it exists
|
||||
* @param v the current vertex, can be null
|
||||
* @return the previous reference vertex if it exists
|
||||
*/
|
||||
public T getPrevReferenceVertex( final T v ) {
|
||||
if( v == null ) { return null; }
|
||||
for( final BaseEdge edgeToTest : incomingEdgesOf(v) ) {
|
||||
if( isReferenceNode(getEdgeSource(edgeToTest)) ) {
|
||||
return getEdgeSource(edgeToTest);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does a reference path exist between the two vertices?
|
||||
* @param fromVertex from this vertex, can be null
|
||||
* @param toVertex to this vertex, can be null
|
||||
* @return true if a reference path exists in the graph between the two vertices
|
||||
*/
|
||||
public boolean referencePathExists(final T fromVertex, final T toVertex) {
|
||||
T v = fromVertex;
|
||||
if( v == null ) {
|
||||
return false;
|
||||
}
|
||||
v = getNextReferenceVertex(v);
|
||||
if( v == null ) {
|
||||
return false;
|
||||
}
|
||||
while( !v.equals(toVertex) ) {
|
||||
v = getNextReferenceVertex(v);
|
||||
if( v == null ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Walk along the reference path in the graph and pull out the corresponding bases
|
||||
* @param fromVertex starting vertex
|
||||
* @param toVertex ending vertex
|
||||
* @param includeStart should the starting vertex be included in the path
|
||||
* @param includeStop should the ending vertex be included in the path
|
||||
* @return byte[] array holding the reference bases, this can be null if there are no nodes between the starting and ending vertex (insertions for example)
|
||||
*/
|
||||
public byte[] getReferenceBytes( final T fromVertex, final T toVertex, final boolean includeStart, final boolean includeStop ) {
|
||||
if( fromVertex == null ) { throw new IllegalArgumentException("Starting vertex in requested path cannot be null."); }
|
||||
if( toVertex == null ) { throw new IllegalArgumentException("From vertex in requested path cannot be null."); }
|
||||
|
||||
byte[] bytes = null;
|
||||
T v = fromVertex;
|
||||
if( includeStart ) {
|
||||
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
|
||||
}
|
||||
v = getNextReferenceVertex(v); // advance along the reference path
|
||||
while( v != null && !v.equals(toVertex) ) {
|
||||
bytes = ArrayUtils.addAll( bytes, getAdditionalSequence(v) );
|
||||
v = getNextReferenceVertex(v); // advance along the reference path
|
||||
}
|
||||
if( includeStop && v != null && v.equals(toVertex)) {
|
||||
bytes = ArrayUtils.addAll(bytes, getAdditionalSequence(v));
|
||||
}
|
||||
return bytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple vertices to the graph at once
|
||||
* @param vertices one or more vertices to add
|
||||
*/
|
||||
public void addVertices(final T ... vertices) {
|
||||
for ( final T v : vertices )
|
||||
addVertex(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple vertices to the graph at once
|
||||
* @param vertices one or more vertices to add
|
||||
*/
|
||||
public void addVertices(final Collection<T> vertices) {
|
||||
for ( final T v : vertices )
|
||||
addVertex(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple edges to the graph
|
||||
* @param start the first vertex to connect
|
||||
* @param remaining all additional vertices to connect
|
||||
*/
|
||||
public void addEdges(final T start, final T ... remaining) {
|
||||
addEdges(new BaseEdge(false, 1), start, remaining);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function to add multiple edges to the graph
|
||||
* @param start the first vertex to connect
|
||||
* @param remaining all additional vertices to connect
|
||||
*/
|
||||
public void addEdges(final BaseEdge template, final T start, final T ... remaining) {
|
||||
T prev = start;
|
||||
for ( final T next : remaining ) {
|
||||
addEdge(prev, next, new BaseEdge(template));
|
||||
prev = next;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices connected by outgoing edges of V
|
||||
* @param v a non-null vertex
|
||||
* @return a set of vertices connected by outgoing edges from v
|
||||
*/
|
||||
public Set<T> outgoingVerticesOf(final T v) {
|
||||
final Set<T> s = new LinkedHashSet<T>();
|
||||
for ( final BaseEdge e : outgoingEdgesOf(v) ) {
|
||||
s.add(getEdgeTarget(e));
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of vertices connected to v by incoming edges
|
||||
* @param v a non-null vertex
|
||||
* @return a set of vertices {X} connected X -> v
|
||||
*/
|
||||
public Set<T> incomingVerticesOf(final T v) {
|
||||
final Set<T> s = new LinkedHashSet<T>();
|
||||
for ( final BaseEdge e : incomingEdgesOf(v) ) {
|
||||
s.add(getEdgeSource(e));
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print out the graph in the dot language for visualization
|
||||
* @param destination File to write to
|
||||
*/
|
||||
public void printGraph(final File destination, final int pruneFactor) {
|
||||
PrintStream stream = null;
|
||||
|
||||
try {
|
||||
stream = new PrintStream(new FileOutputStream(destination));
|
||||
printGraph(stream, true, pruneFactor);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
if ( stream != null ) stream.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) {
|
||||
if ( writeHeader )
|
||||
graphWriter.println("digraph assemblyGraphs {");
|
||||
|
||||
for( final BaseEdge edge : edgeSet() ) {
|
||||
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? "style=dotted,color=grey," : "") + "label=\"" + edge.getMultiplicity() + "\"];");
|
||||
if( edge.isRef() ) {
|
||||
graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
}
|
||||
|
||||
for( final T v : vertexSet() ) {
|
||||
graphWriter.println("\t" + v.toString() + " [label=\"" + new String(getAdditionalSequence(v)) + "\",shape=box]");
|
||||
}
|
||||
|
||||
if ( writeHeader )
|
||||
graphWriter.println("}");
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove edges that are connected before the reference source and after the reference sink
|
||||
*
|
||||
* Also removes all vertices that are orphaned by this process
|
||||
*/
|
||||
public void cleanNonRefPaths() {
|
||||
if( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Remove non-ref edges connected before and after the reference path
|
||||
final Set<BaseEdge> edgesToCheck = new HashSet<BaseEdge>();
|
||||
edgesToCheck.addAll(incomingEdgesOf(getReferenceSourceVertex()));
|
||||
while( !edgesToCheck.isEmpty() ) {
|
||||
final BaseEdge e = edgesToCheck.iterator().next();
|
||||
if( !e.isRef() ) {
|
||||
edgesToCheck.addAll( incomingEdgesOf(getEdgeSource(e)) );
|
||||
removeEdge(e);
|
||||
}
|
||||
edgesToCheck.remove(e);
|
||||
}
|
||||
|
||||
edgesToCheck.addAll(outgoingEdgesOf(getReferenceSinkVertex()));
|
||||
while( !edgesToCheck.isEmpty() ) {
|
||||
final BaseEdge e = edgesToCheck.iterator().next();
|
||||
if( !e.isRef() ) {
|
||||
edgesToCheck.addAll( outgoingEdgesOf(getEdgeTarget(e)) );
|
||||
removeEdge(e);
|
||||
}
|
||||
edgesToCheck.remove(e);
|
||||
}
|
||||
|
||||
removeSingletonOrphanVertices();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well
|
||||
*
|
||||
* @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed
|
||||
*/
|
||||
public void pruneGraph( final int pruneFactor ) {
|
||||
final List<BaseEdge> edgesToRemove = new ArrayList<BaseEdge>();
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
if( e.getMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
|
||||
edgesToRemove.add(e);
|
||||
}
|
||||
}
|
||||
removeAllEdges(edgesToRemove);
|
||||
|
||||
removeSingletonOrphanVertices();
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all vertices in the graph that have in and out degree of 0
|
||||
*/
|
||||
protected void removeSingletonOrphanVertices() {
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<T> verticesToRemove = new LinkedList<T>();
|
||||
for( final T v : vertexSet() ) {
|
||||
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all vertices on the graph that cannot be accessed by following any edge,
|
||||
* regardless of its direction, from the reference source vertex
|
||||
*/
|
||||
public void removeVerticesNotConnectedToRefRegardlessOfEdgeDirection() {
|
||||
final HashSet<T> toRemove = new HashSet<T>(vertexSet());
|
||||
|
||||
final T refV = getReferenceSourceVertex();
|
||||
if ( refV != null ) {
|
||||
for ( final T v : new BaseGraphIterator<T>(this, refV, true, true) ) {
|
||||
toRemove.remove(v);
|
||||
}
|
||||
}
|
||||
|
||||
removeAllVertices(toRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all vertices in the graph that aren't on a path from the reference source vertex to the reference sink vertex
|
||||
*
|
||||
* More aggressive reference pruning algorithm than removeVerticesNotConnectedToRefRegardlessOfEdgeDirection,
|
||||
* as it requires vertices to not only be connected by a series of directed edges but also prunes away
|
||||
* paths that do not also meet eventually with the reference sink vertex
|
||||
*/
|
||||
public void removePathsNotConnectedToRef() {
|
||||
if ( getReferenceSourceVertex() == null || getReferenceSinkVertex() == null ) {
|
||||
throw new IllegalStateException("Graph must have ref source and sink vertices");
|
||||
}
|
||||
|
||||
// get the set of vertices we can reach by going forward from the ref source
|
||||
final Set<T> onPathFromRefSource = new HashSet<T>(vertexSet().size());
|
||||
for ( final T v : new BaseGraphIterator<T>(this, getReferenceSourceVertex(), false, true) ) {
|
||||
onPathFromRefSource.add(v);
|
||||
}
|
||||
|
||||
// get the set of vertices we can reach by going backward from the ref sink
|
||||
final Set<T> onPathFromRefSink = new HashSet<T>(vertexSet().size());
|
||||
for ( final T v : new BaseGraphIterator<T>(this, getReferenceSinkVertex(), true, false) ) {
|
||||
onPathFromRefSink.add(v);
|
||||
}
|
||||
|
||||
// we want to remove anything that's not in both the sink and source sets
|
||||
final Set<T> verticesToRemove = new HashSet<T>(vertexSet());
|
||||
onPathFromRefSource.retainAll(onPathFromRefSink);
|
||||
verticesToRemove.removeAll(onPathFromRefSource);
|
||||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
* Semi-lenient comparison of two graphs, truing true if g1 and g2 have similar structure
|
||||
*
|
||||
* By similar this means that both graphs have the same number of vertices, where each vertex can find
|
||||
* a vertex in the other graph that's seqEqual to it. A similar constraint applies to the edges,
|
||||
* where all edges in g1 must have a corresponding edge in g2 where both source and target vertices are
|
||||
* seqEqual
|
||||
*
|
||||
* @param g1 the first graph to compare
|
||||
* @param g2 the second graph to compare
|
||||
* @param <T> the type of the nodes in those graphs
|
||||
* @return true if g1 and g2 are equals
|
||||
*/
|
||||
public static <T extends BaseVertex> boolean graphEquals(final BaseGraph<T> g1, BaseGraph<T> g2) {
|
||||
final Set<T> vertices1 = g1.vertexSet();
|
||||
final Set<T> vertices2 = g2.vertexSet();
|
||||
final Set<BaseEdge> edges1 = g1.edgeSet();
|
||||
final Set<BaseEdge> edges2 = g2.edgeSet();
|
||||
|
||||
if ( vertices1.size() != vertices2.size() || edges1.size() != edges2.size() )
|
||||
return false;
|
||||
|
||||
for ( final T v1 : vertices1 ) {
|
||||
boolean found = false;
|
||||
for ( final T v2 : vertices2 )
|
||||
found = found || v1.getSequenceString().equals(v2.getSequenceString());
|
||||
if ( ! found ) return false;
|
||||
}
|
||||
|
||||
for( final BaseEdge e1 : g1.edgeSet() ) {
|
||||
boolean found = false;
|
||||
for( BaseEdge e2 : g2.edgeSet() ) {
|
||||
if( e1.seqEquals(g1, e2, g2) ) { found = true; break; }
|
||||
}
|
||||
if( !found ) { return false; }
|
||||
}
|
||||
for( final BaseEdge e2 : g2.edgeSet() ) {
|
||||
boolean found = false;
|
||||
for( BaseEdge e1 : g1.edgeSet() ) {
|
||||
if( e2.seqEquals(g2, e1, g1) ) { found = true; break; }
|
||||
}
|
||||
if( !found ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the incoming edge of v. Requires that there be only one such edge or throws an error
|
||||
* @param v our vertex
|
||||
* @return the single incoming edge to v, or null if none exists
|
||||
*/
|
||||
public BaseEdge incomingEdgeOf(final T v) {
|
||||
return getSingletonEdge(incomingEdgesOf(v));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the outgoing edge of v. Requires that there be only one such edge or throws an error
|
||||
* @param v our vertex
|
||||
* @return the single outgoing edge from v, or null if none exists
|
||||
*/
|
||||
public BaseEdge outgoingEdgeOf(final T v) {
|
||||
return getSingletonEdge(outgoingEdgesOf(v));
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that gets the a single edge from edges, null if edges is empty, or
|
||||
* throws an error is edges has more than 1 element
|
||||
* @param edges a set of edges
|
||||
* @return a edge
|
||||
*/
|
||||
@Requires("edges != null")
|
||||
private BaseEdge getSingletonEdge(final Collection<BaseEdge> edges) {
|
||||
if ( edges.size() > 1 ) throw new IllegalArgumentException("Cannot get a single incoming edge for a vertex with multiple incoming edges " + edges);
|
||||
return edges.isEmpty() ? null : edges.iterator().next();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add edge between source -> target if none exists, or add e to an already existing one if present
|
||||
*
|
||||
* @param source source vertex
|
||||
* @param target vertex
|
||||
* @param e edge to add
|
||||
*/
|
||||
public void addOrUpdateEdge(final T source, final T target, final BaseEdge e) {
|
||||
final BaseEdge prev = getEdge(source, target);
|
||||
if ( prev != null ) {
|
||||
prev.add(e);
|
||||
} else {
|
||||
addEdge(source, target, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* General iterator that can iterate over all vertices in a BaseGraph, following either
|
||||
* incoming, outgoing edge (as well as both or none) edges. Supports traversal of graphs
|
||||
* with cycles and other crazy structures. Will only ever visit each vertex once. The
|
||||
* order in which the vertices are visited is undefined.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/24/13
|
||||
* Time: 4:41 PM
|
||||
*/
|
||||
public class BaseGraphIterator<T extends BaseVertex> implements Iterator<T>, Iterable<T> {
|
||||
final HashSet<T> visited = new HashSet<T>();
|
||||
final LinkedList<T> toVisit = new LinkedList<T>();
|
||||
final BaseGraph<T> graph;
|
||||
final boolean followIncomingEdges, followOutgoingEdges;
|
||||
|
||||
/**
|
||||
* Create a new BaseGraphIterator starting its traversal at start
|
||||
*
|
||||
* Note that if both followIncomingEdges and followOutgoingEdges are false, we simply return the
|
||||
* start vertex
|
||||
*
|
||||
* @param graph the graph to iterator over. Cannot be null
|
||||
* @param start the vertex to start at. Cannot be null
|
||||
* @param followIncomingEdges should we follow incoming edges during our
|
||||
* traversal? (goes backward through the graph)
|
||||
* @param followOutgoingEdges should we follow outgoing edges during out traversal?
|
||||
*/
|
||||
public BaseGraphIterator(final BaseGraph<T> graph, final T start,
|
||||
final boolean followIncomingEdges, final boolean followOutgoingEdges) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( start == null ) throw new IllegalArgumentException("start cannot be null");
|
||||
if ( ! graph.containsVertex(start) ) throw new IllegalArgumentException("start " + start + " must be in graph but it isn't");
|
||||
this.graph = graph;
|
||||
this.followIncomingEdges = followIncomingEdges;
|
||||
this.followOutgoingEdges = followOutgoingEdges;
|
||||
|
||||
toVisit.add(start);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<T> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return ! toVisit.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
final T v = toVisit.pop();
|
||||
|
||||
if ( ! visited.contains(v) ) {
|
||||
visited.add(v);
|
||||
if ( followIncomingEdges ) for ( final T prev : graph.incomingVerticesOf(v) ) toVisit.add(prev);
|
||||
if ( followOutgoingEdges ) for ( final T next : graph.outgoingVerticesOf(v) ) toVisit.add(next);
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Doesn't implement remove");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,179 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* A graph vertex that holds some sequence information
|
||||
*
|
||||
* @author depristo
|
||||
* @since 03/2013
|
||||
*/
|
||||
public class BaseVertex {
|
||||
final byte[] sequence;
|
||||
private final static int UNASSIGNED_HASHCODE = -1;
|
||||
int cachedHashCode = UNASSIGNED_HASHCODE;
|
||||
|
||||
/**
|
||||
* Create a new sequence vertex with sequence
|
||||
*
|
||||
* This code doesn't copy sequence for efficiency reasons, so sequence should absolutely not be modified
|
||||
* in any way after passing this sequence to the BaseVertex
|
||||
*
|
||||
* @param sequence a non-null, non-empty sequence of bases contained in this vertex
|
||||
*/
|
||||
public BaseVertex(final byte[] sequence) {
|
||||
if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null");
|
||||
this.sequence = sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this vertex have an empty sequence?
|
||||
*
|
||||
* That is, is it a dummy node that's only present for structural reasons but doesn't actually
|
||||
* contribute to the sequence of the graph?
|
||||
*
|
||||
* @return true if sequence is empty, false otherwise
|
||||
*/
|
||||
public boolean isEmpty() {
|
||||
return length() == 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the length of this sequence
|
||||
* @return a positive integer >= 1
|
||||
*/
|
||||
public int length() {
|
||||
return sequence.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* For testing purposes only -- low performance
|
||||
* @param sequence the sequence as a string
|
||||
*/
|
||||
protected BaseVertex(final String sequence) {
|
||||
this(sequence.getBytes());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
BaseVertex that = (BaseVertex) o;
|
||||
|
||||
if (!Arrays.equals(sequence, that.sequence)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Are b and this equal according to their base sequences?
|
||||
*
|
||||
* @param b the vertex to compare ourselves to
|
||||
* @return true if b and this have the same sequence, regardless of other attributes that might differentiate them
|
||||
*/
|
||||
public boolean seqEquals(final BaseVertex b) {
|
||||
return Arrays.equals(this.getSequence(), b.getSequence());
|
||||
}
|
||||
|
||||
/**
|
||||
* necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
if ( cachedHashCode == UNASSIGNED_HASHCODE ) {
|
||||
cachedHashCode = Arrays.hashCode(sequence);
|
||||
}
|
||||
return cachedHashCode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getSequenceString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sequence of bases contained in this vertex
|
||||
*
|
||||
* Do not modify these bytes in any way!
|
||||
*
|
||||
* @return a non-null pointer to the bases contained in this vertex
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public byte[] getSequence() {
|
||||
return sequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a string representation of the bases in this vertex
|
||||
* @return a non-null String
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public String getSequenceString() {
|
||||
return new String(sequence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sequence unique to this vertex
|
||||
*
|
||||
* This function may not return the entire sequence stored in the vertex, as kmer graphs
|
||||
* really only provide 1 base of additional sequence (the last base of the kmer).
|
||||
*
|
||||
* The base implementation simply returns the sequence.
|
||||
*
|
||||
* @param source is this vertex a source vertex (i.e., no in nodes) in the graph
|
||||
* @return a byte[] of the sequence added by this vertex to the overall sequence
|
||||
*/
|
||||
public byte[] getAdditionalSequence(final boolean source) {
|
||||
return getSequence();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Split a collection of middle nodes in a graph into their shared prefix and suffix values
|
||||
*
|
||||
* This code performs the following transformation. Suppose I have a set of vertices V, such
|
||||
* that each vertex is composed of sequence such that
|
||||
*
|
||||
* Vi = prefix + seq_i + suffix
|
||||
*
|
||||
* where prefix and suffix are shared sequences across all vertices V. This replaces each
|
||||
* Vi with three nodes prefix, seq_i, and suffix connected in a simple chain.
|
||||
*
|
||||
* This operation can be performed in a very general case, without too much worry about the incoming
|
||||
* and outgoing edge structure of each Vi. The partner algorithm SharedSequenceMerger can
|
||||
* put these pieces back together in a smart way that maximizes the sharing of nodes
|
||||
* while respecting complex connectivity.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/22/13
|
||||
* Time: 8:31 AM
|
||||
*/
|
||||
public class CommonSuffixSplitter {
|
||||
/**
|
||||
* Create a new graph that contains the vertices in toMerge with their shared suffix and prefix
|
||||
* sequences extracted out.
|
||||
*
|
||||
*/
|
||||
public CommonSuffixSplitter() {}
|
||||
|
||||
/**
|
||||
* Simple single-function interface to split and then update a graph
|
||||
*
|
||||
* @param graph the graph containing the vertices in toMerge
|
||||
* @param v The bottom node whose incoming vertices we'd like to split
|
||||
* @return true if some useful splitting was done, false otherwise
|
||||
*/
|
||||
public boolean split(final SeqGraph graph, final SeqVertex v) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( v == null ) throw new IllegalArgumentException("v cannot be null");
|
||||
if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex v " + v);
|
||||
|
||||
final Collection<SeqVertex> toSplit = graph.incomingVerticesOf(v);
|
||||
if ( toSplit.size() < 2 )
|
||||
// Can only split at least 2 vertices
|
||||
return false;
|
||||
else if ( ! safeToSplit(graph, v, toSplit) ) {
|
||||
return false;
|
||||
} else {
|
||||
final SeqVertex suffixVTemplate = commonSuffix(toSplit);
|
||||
if ( suffixVTemplate.isEmpty() ) {
|
||||
return false;
|
||||
} else if ( wouldEliminateRefSource(graph, suffixVTemplate, toSplit) ) {
|
||||
return false;
|
||||
} else if ( allVerticesAreTheCommonSuffix(suffixVTemplate, toSplit) ) {
|
||||
return false;
|
||||
} else {
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
|
||||
// graph.printGraph(new File("split.pre_" + v.getSequenceString() + "." + counter + ".dot"), 0);
|
||||
for ( final SeqVertex mid : toSplit ) {
|
||||
// create my own copy of the suffix
|
||||
final SeqVertex suffixV = new SeqVertex(suffixVTemplate.getSequence());
|
||||
graph.addVertex(suffixV);
|
||||
final SeqVertex prefixV = mid.withoutSuffix(suffixV.getSequence());
|
||||
final BaseEdge out = graph.outgoingEdgeOf(mid);
|
||||
|
||||
final SeqVertex incomingTarget;
|
||||
if ( prefixV == null ) {
|
||||
// this node is entirely explained by suffix
|
||||
incomingTarget = suffixV;
|
||||
} else {
|
||||
incomingTarget = prefixV;
|
||||
graph.addVertex(prefixV);
|
||||
graph.addEdge(prefixV, suffixV, new BaseEdge(out.isRef(), 0));
|
||||
edgesToRemove.add(out);
|
||||
}
|
||||
|
||||
graph.addEdge(suffixV, graph.getEdgeTarget(out), new BaseEdge(out));
|
||||
|
||||
for ( final BaseEdge in : graph.incomingEdgesOf(mid) ) {
|
||||
graph.addEdge(graph.getEdgeSource(in), incomingTarget, new BaseEdge(in));
|
||||
edgesToRemove.add(in);
|
||||
}
|
||||
}
|
||||
|
||||
graph.removeAllVertices(toSplit);
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
// graph.printGraph(new File("split.post_" + v.getSequenceString() + "." + counter++ + ".dot"), 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Would factoring out this suffix result in elimating the reference source vertex?
|
||||
* @param graph the graph
|
||||
* @param commonSuffix the common suffix of all toSplits
|
||||
* @param toSplits the list of vertices we're are trying to split
|
||||
* @return true if toSplit contains the reference source and this ref source has all and only the bases of commonSuffix
|
||||
*/
|
||||
private boolean wouldEliminateRefSource(final SeqGraph graph, final SeqVertex commonSuffix, final Collection<SeqVertex> toSplits) {
|
||||
for ( final SeqVertex toSplit : toSplits ) {
|
||||
if ( graph.isRefSource(toSplit) )
|
||||
return toSplit.length() == commonSuffix.length();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// private static int counter = 0;
|
||||
|
||||
/**
|
||||
* Would all vertices that we'd split just result in the common suffix?
|
||||
*
|
||||
* That is, suppose we have prefix nodes ABC and ABC. After splitting all of the vertices would
|
||||
* just be ABC again, and we'd enter into an infinite loop.
|
||||
*
|
||||
* @param commonSuffix the common suffix of all vertices in toSplits
|
||||
* @param toSplits the collection of vertices we want to split
|
||||
* @return true if all of the vertices are equal to the common suffix
|
||||
*/
|
||||
private boolean allVerticesAreTheCommonSuffix(final SeqVertex commonSuffix, final Collection<SeqVertex> toSplits) {
|
||||
for ( final SeqVertex toSplit : toSplits ) {
|
||||
if ( toSplit.length() != commonSuffix.length() )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Can we safely split up the vertices in toMerge?
|
||||
*
|
||||
* @param graph a graph
|
||||
* @param bot a vertex whose incoming vertices we want to split
|
||||
* @param toMerge the set of vertices we'd be splitting up
|
||||
* @return true if we can safely split up toMerge
|
||||
*/
|
||||
private boolean safeToSplit(final SeqGraph graph, final SeqVertex bot, final Collection<SeqVertex> toMerge) {
|
||||
final Set<SeqVertex> outgoingOfBot = new HashSet<SeqVertex>(graph.outgoingVerticesOf(bot));
|
||||
for ( final SeqVertex m : toMerge ) {
|
||||
final Set<BaseEdge> outs = graph.outgoingEdgesOf(m);
|
||||
if ( m == bot || outs.size() != 1 || ! graph.outgoingVerticesOf(m).contains(bot) )
|
||||
// m == bot => don't allow self cycles in the graph
|
||||
return false;
|
||||
if ( outgoingOfBot.contains(m) )
|
||||
// forbid cycles from bottom -> mid
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the longest suffix of bases shared among all provided vertices
|
||||
*
|
||||
* For example, if the vertices have sequences AC, CC, and ATC, this would return
|
||||
* a single C. However, for ACC and TCC this would return CC. And for AC and TG this
|
||||
* would return null;
|
||||
*
|
||||
* @param middleVertices a non-empty set of vertices
|
||||
* @return a single vertex that contains the common suffix of all middle vertices
|
||||
*/
|
||||
@Requires("!middleVertices.isEmpty()")
|
||||
protected static SeqVertex commonSuffix(final Collection<SeqVertex> middleVertices) {
|
||||
final List<byte[]> kmers = GraphUtils.getKmers(middleVertices);
|
||||
final int min = GraphUtils.minKmerLength(kmers);
|
||||
final int suffixLen = GraphUtils.compSuffixLen(kmers, min);
|
||||
final byte[] kmer = kmers.get(0);
|
||||
final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length);
|
||||
return new SeqVertex(suffix);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,139 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A DeBruijn kmer graph
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 2/6/13
|
||||
*/
|
||||
public final class DeBruijnGraph extends BaseGraph<DeBruijnVertex> {
|
||||
/**
|
||||
* Create an empty DeBruijnGraph with default kmer size
|
||||
*/
|
||||
public DeBruijnGraph() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an empty DeBruijnGraph with kmer size
|
||||
* @param kmerSize kmer size, must be >= 1
|
||||
*/
|
||||
public DeBruijnGraph(int kmerSize) {
|
||||
super(kmerSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Pull kmers out of the given long sequence and throw them on in the graph
|
||||
* @param sequence byte array holding the sequence with which to build the assembly graph
|
||||
* @param KMER_LENGTH the desired kmer length to use
|
||||
* @param isRef if true the kmers added to the graph will have reference edges linking them
|
||||
*/
|
||||
public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) {
|
||||
if( sequence.length < KMER_LENGTH + 1 ) { throw new IllegalArgumentException("Provided sequence is too small for the given kmer length"); }
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add edge to assembly graph connecting the two kmers
|
||||
* @param kmer1 the source kmer for the edge
|
||||
* @param kmer2 the target kmer for the edge
|
||||
* @param isRef true if the added edge is a reference edge
|
||||
*/
|
||||
public void addKmersToGraph( final byte[] kmer1, final byte[] kmer2, final boolean isRef, final int multiplicity ) {
|
||||
if( kmer1 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
|
||||
if( kmer2 == null ) { throw new IllegalArgumentException("Attempting to add a null kmer to the graph."); }
|
||||
if( kmer1.length != kmer2.length ) { throw new IllegalArgumentException("Attempting to add a kmers to the graph with different lengths."); }
|
||||
|
||||
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1 );
|
||||
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2 );
|
||||
final BaseEdge toAdd = new BaseEdge(isRef, multiplicity);
|
||||
|
||||
addVertices(v1, v2);
|
||||
addOrUpdateEdge(v1, v2, toAdd);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert this kmer graph to a simple sequence graph.
|
||||
*
|
||||
* Each kmer suffix shows up as a distinct SeqVertex, attached in the same structure as in the kmer
|
||||
* graph. Nodes that are sources are mapped to SeqVertex nodes that contain all of their sequence
|
||||
*
|
||||
* @return a newly allocated SequenceGraph
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public SeqGraph convertToSequenceGraph() {
|
||||
final SeqGraph seqGraph = new SeqGraph(getKmerSize());
|
||||
final Map<DeBruijnVertex, SeqVertex> vertexMap = new HashMap<DeBruijnVertex, SeqVertex>();
|
||||
|
||||
// create all of the equivalent seq graph vertices
|
||||
for ( final DeBruijnVertex dv : vertexSet() ) {
|
||||
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
|
||||
vertexMap.put(dv, sv);
|
||||
seqGraph.addVertex(sv);
|
||||
}
|
||||
|
||||
// walk through the nodes and connect them to their equivalent seq vertices
|
||||
for( final BaseEdge e : edgeSet() ) {
|
||||
final SeqVertex seqOutV = vertexMap.get(getEdgeTarget(e));
|
||||
final SeqVertex seqInV = vertexMap.get(getEdgeSource(e));
|
||||
seqGraph.addEdge(seqInV, seqOutV, e);
|
||||
}
|
||||
|
||||
return seqGraph;
|
||||
}
|
||||
}
|
||||
|
|
@ -44,70 +44,82 @@
|
|||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Comparator;
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* simple node class for storing kmer sequences
|
||||
*
|
||||
* User: ebanks, mdepristo
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
|
||||
// simple edge class for connecting nodes in the graph
|
||||
public class DeBruijnEdge {
|
||||
|
||||
private int multiplicity;
|
||||
private boolean isRef;
|
||||
|
||||
public DeBruijnEdge() {
|
||||
multiplicity = 1;
|
||||
isRef = false;
|
||||
public final class DeBruijnVertex extends BaseVertex {
|
||||
private final static byte[][] sufficesAsByteArray = new byte[256][];
|
||||
static {
|
||||
for ( int i = 0; i < sufficesAsByteArray.length; i++ )
|
||||
sufficesAsByteArray[i] = new byte[]{(byte)(i & 0xFF)};
|
||||
}
|
||||
|
||||
public DeBruijnEdge( final boolean isRef ) {
|
||||
multiplicity = 1;
|
||||
this.isRef = isRef;
|
||||
public DeBruijnVertex( final byte[] sequence ) {
|
||||
super(sequence);
|
||||
}
|
||||
|
||||
public DeBruijnEdge( final boolean isRef, final int multiplicity ) {
|
||||
this.multiplicity = multiplicity;
|
||||
this.isRef = isRef;
|
||||
/**
|
||||
* For testing purposes only
|
||||
* @param sequence
|
||||
*/
|
||||
protected DeBruijnVertex( final String sequence ) {
|
||||
this(sequence.getBytes());
|
||||
}
|
||||
|
||||
public int getMultiplicity() {
|
||||
return multiplicity;
|
||||
/**
|
||||
* Get the kmer size for this DeBruijnVertex
|
||||
* @return integer >= 1
|
||||
*/
|
||||
@Ensures("result >= 1")
|
||||
public int getKmer() {
|
||||
return sequence.length;
|
||||
}
|
||||
|
||||
public void setMultiplicity( final int value ) {
|
||||
multiplicity = value;
|
||||
/**
|
||||
* Get the string representation of the suffix of this DeBruijnVertex
|
||||
* @return a non-null non-empty string
|
||||
*/
|
||||
@Ensures({"result != null", "result.length() >= 1"})
|
||||
public String getSuffixString() {
|
||||
return new String(getSuffixAsArray());
|
||||
}
|
||||
|
||||
public boolean isRef() {
|
||||
return isRef;
|
||||
/**
|
||||
* Get the suffix byte of this DeBruijnVertex
|
||||
*
|
||||
* The suffix byte is simply the last byte of the kmer sequence, so if this is holding sequence ACT
|
||||
* getSuffix would return T
|
||||
*
|
||||
* @return a byte
|
||||
*/
|
||||
public byte getSuffix() {
|
||||
return sequence[getKmer() - 1];
|
||||
}
|
||||
|
||||
public void setIsRef( final boolean isRef ) {
|
||||
this.isRef = isRef;
|
||||
/**
|
||||
* Optimized version that returns a byte[] for the single byte suffix of this graph without allocating memory.
|
||||
*
|
||||
* Should not be modified
|
||||
*
|
||||
* @return a byte[] that contains 1 byte == getSuffix()
|
||||
*/
|
||||
@Ensures({"result != null", "result.length == 1", "result[0] == getSuffix()"})
|
||||
private byte[] getSuffixAsArray() {
|
||||
return sufficesAsByteArray[getSuffix()];
|
||||
}
|
||||
|
||||
// For use when comparing edges pulled from the same graph
|
||||
public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge ) {
|
||||
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
// For use when comparing edges across graphs!
|
||||
public boolean equals( final DeBruijnAssemblyGraph graph, final DeBruijnEdge edge, final DeBruijnAssemblyGraph graph2 ) {
|
||||
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
public static class EdgeWeightComparator implements Comparator<DeBruijnEdge>, Serializable {
|
||||
@Override
|
||||
public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) {
|
||||
return edge1.multiplicity - edge2.multiplicity;
|
||||
}
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public byte[] getAdditionalSequence(boolean source) {
|
||||
return source ? super.getAdditionalSequence(source) : getSuffixAsArray();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Utility functions used in the graphs package
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/25/13
|
||||
* Time: 9:42 PM
|
||||
*/
|
||||
final class GraphUtils {
|
||||
private GraphUtils() {}
|
||||
|
||||
/**
|
||||
* Compute the maximum shared prefix length of list of bytes.
|
||||
*
|
||||
* @param listOfBytes a list of bytes with at least one element
|
||||
* @param minLength the min. length among all byte[] in listOfBytes
|
||||
* @return the number of shared bytes common at the start of all bytes
|
||||
*/
|
||||
@Requires({"listOfBytes.size() >= 1", "minLength >= 0"})
|
||||
@Ensures("result >= 0")
|
||||
protected static int compPrefixLen(final List<byte[]> listOfBytes, final int minLength) {
|
||||
for ( int i = 0; i < minLength; i++ ) {
|
||||
final byte b = listOfBytes.get(0)[i];
|
||||
for ( int j = 1; j < listOfBytes.size(); j++ ) {
|
||||
if ( b != listOfBytes.get(j)[i] )
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
return minLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute the maximum shared suffix length of list of bytes.
|
||||
*
|
||||
* @param listOfBytes a list of bytes with at least one element
|
||||
* @param minLength the min. length among all byte[] in listOfBytes
|
||||
* @return the number of shared bytes common at the end of all bytes
|
||||
*/
|
||||
@Requires({"listOfBytes.size() >= 1", "minLength >= 0"})
|
||||
@Ensures("result >= 0")
|
||||
protected static int compSuffixLen(final List<byte[]> listOfBytes, final int minLength) {
|
||||
for ( int suffixLen = 0; suffixLen < minLength; suffixLen++ ) {
|
||||
final byte b = listOfBytes.get(0)[listOfBytes.get(0).length - suffixLen - 1];
|
||||
for ( int j = 1; j < listOfBytes.size(); j++ ) {
|
||||
if ( b != listOfBytes.get(j)[listOfBytes.get(j).length - suffixLen - 1] )
|
||||
return suffixLen;
|
||||
}
|
||||
}
|
||||
return minLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of kmers as byte[] from the vertices in the graph
|
||||
*
|
||||
* @param vertices a collection of vertices
|
||||
* @return a list of their kmers in order of the iterator on vertices
|
||||
*/
|
||||
protected static List<byte[]> getKmers(final Collection<SeqVertex> vertices) {
|
||||
final List<byte[]> kmers = new ArrayList<byte[]>(vertices.size());
|
||||
for ( final SeqVertex v : vertices ) {
|
||||
kmers.add(v.getSequence());
|
||||
}
|
||||
return kmers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the minimum length of a collection of byte[]
|
||||
*
|
||||
* @param kmers a list of kmers whose .length min we want
|
||||
* @return the min of the kmers, if kmers is empty the result is 0
|
||||
*/
|
||||
protected static int minKmerLength(final Collection<byte[]> kmers) {
|
||||
if ( kmers == null ) throw new IllegalArgumentException("kmers cannot be null");
|
||||
|
||||
if ( kmers.isEmpty() ) return 0;
|
||||
int min = Integer.MAX_VALUE;
|
||||
for ( final byte[] kmer : kmers ) {
|
||||
min = Math.min(min, kmer.length);
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,185 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.common.collect.MinMaxPriorityQueue;
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
|
||||
* This is different from most graph traversals because we want to test paths from any source node to any sink node.
|
||||
*
|
||||
* User: ebanks, rpoplin, mdepristo
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
public class KBestPaths<T extends BaseVertex> {
|
||||
private final boolean allowCycles;
|
||||
|
||||
/**
|
||||
* Create a new KBestPaths finder that follows cycles in the graph
|
||||
*/
|
||||
public KBestPaths() {
|
||||
this(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new KBestPaths finder
|
||||
*
|
||||
* @param allowCycles should we allow paths that follow cycles in the graph?
|
||||
*/
|
||||
public KBestPaths(final boolean allowCycles) {
|
||||
this.allowCycles = allowCycles;
|
||||
}
|
||||
|
||||
protected static class MyInt { public int val = 0; }
|
||||
|
||||
/**
|
||||
* Compare paths such that paths with greater weight are earlier in a list
|
||||
*/
|
||||
protected static class PathComparatorTotalScore implements Comparator<Path>, Serializable {
|
||||
@Override
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path2.getScore() - path1.getScore();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getKBestPaths(BaseGraph, int) retriving the best 1000 paths
|
||||
*/
|
||||
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph ) {
|
||||
return getKBestPaths(graph, 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retriving the first 1000 paths
|
||||
* starting from all source vertices and ending with all sink vertices
|
||||
*/
|
||||
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final int k ) {
|
||||
return getKBestPaths(graph, k, graph.getSources(), graph.getSinks());
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000
|
||||
*/
|
||||
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final Set<T> sources, final Set<T> sinks ) {
|
||||
return getKBestPaths(graph, 1000, sources, sinks);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000
|
||||
*/
|
||||
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final T source, final T sink ) {
|
||||
return getKBestPaths(graph, 1000, source, sink);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets
|
||||
*/
|
||||
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final int k, final T source, final T sink ) {
|
||||
return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink));
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverse the graph and pull out the best k paths.
|
||||
* Paths are scored via their comparator function. The default being PathComparatorTotalScore()
|
||||
* @param graph the graph from which to pull paths
|
||||
* @param k the number of paths to find
|
||||
* @param sources a set of vertices we want to start paths with
|
||||
* @param sinks a set of vertices we want to end paths with
|
||||
* @return a list with at most k top-scoring paths from the graph
|
||||
*/
|
||||
@Ensures({"result != null", "result.size() <= k"})
|
||||
public List<Path<T>> getKBestPaths( final BaseGraph<T> graph, final int k, final Set<T> sources, final Set<T> sinks ) {
|
||||
if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); }
|
||||
|
||||
// a min max queue that will collect the best k paths
|
||||
final MinMaxPriorityQueue<Path<T>> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create();
|
||||
|
||||
// run a DFS for best paths
|
||||
for ( final T source : sources ) {
|
||||
final Path<T> startingPath = new Path<T>(source, graph);
|
||||
findBestPaths(startingPath, sinks, bestPaths, new MyInt());
|
||||
}
|
||||
|
||||
// the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result
|
||||
final List<Path<T>> toReturn = new ArrayList<Path<T>>(bestPaths);
|
||||
Collections.sort(toReturn, new PathComparatorTotalScore());
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursive algorithm to find the K best paths in the graph from the current path to any of the sinks
|
||||
* @param path the current path progress
|
||||
* @param sinks a set of nodes that are sinks. Will terminate and add a path if the last vertex of path is in this set
|
||||
* @param bestPaths a path to collect completed paths.
|
||||
* @param n used to limit the search by tracking the number of vertices visited across all paths
|
||||
*/
|
||||
private void findBestPaths( final Path<T> path, final Set<T> sinks, final Collection<Path<T>> bestPaths, final MyInt n ) {
|
||||
if ( sinks.contains(path.getLastVertex())) {
|
||||
bestPaths.add(path);
|
||||
} else if( n.val > 10000 ) {
|
||||
// do nothing, just return, as we've done too much work already
|
||||
} else {
|
||||
// recursively run DFS
|
||||
final ArrayList<BaseEdge> edgeArrayList = new ArrayList<BaseEdge>(path.getOutgoingEdgesOfLastVertex());
|
||||
Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator());
|
||||
for ( final BaseEdge edge : edgeArrayList ) {
|
||||
final T target = path.getGraph().getEdgeTarget(edge);
|
||||
// make sure the edge is not already in the path
|
||||
final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target);
|
||||
if ( ! alreadyVisited ) {
|
||||
final Path<T> newPath = new Path<T>(path, edge);
|
||||
n.val++;
|
||||
findBestPaths(newPath, sinks, bestPaths, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,445 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.Parameters;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
 * A path through a BaseGraph
|
||||
*
|
||||
* class to keep track of paths
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/19/13
|
||||
* Time: 2:34 PM
|
||||
*
|
||||
*/
|
||||
public class Path<T extends BaseVertex> {
|
||||
private final static int MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW = 20;
|
||||
|
||||
// the last vertex seen in the path
|
||||
private final T lastVertex;
|
||||
|
||||
// the list of edges comprising the path
|
||||
private Set<BaseEdge> edgesAsSet = null;
|
||||
private final LinkedList<BaseEdge> edgesInOrder;
|
||||
|
||||
// the scores for the path
|
||||
private final int totalScore;
|
||||
|
||||
// the graph from which this path originated
|
||||
private final BaseGraph<T> graph;
|
||||
|
||||
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
|
||||
// these values were chosen via optimization against the NA12878 knowledge base
|
||||
public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1);
|
||||
|
||||
private static final byte[] STARTING_SW_ANCHOR_BYTES = "XXXXXXXXX".getBytes();
|
||||
|
||||
/**
 * Build a path that has no edges yet and sits on a single vertex
 *
 * @param initialVertex the vertex the path starts on; must be non-null and contained in graph
 * @param graph the graph this path will follow; must be non-null
 * @throws IllegalArgumentException if an argument is null or initialVertex is not a vertex of graph
 */
public Path(final T initialVertex, final BaseGraph<T> graph) {
    if ( initialVertex == null )
        throw new IllegalArgumentException("initialVertex cannot be null");
    if ( graph == null )
        throw new IllegalArgumentException("graph cannot be null");
    if ( ! graph.containsVertex(initialVertex) )
        throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph);

    this.graph = graph;
    this.lastVertex = initialVertex;
    this.edgesInOrder = new LinkedList<BaseEdge>();
    this.totalScore = 0;
}
|
||||
|
||||
/**
|
||||
* Convenience constructor for testing that creates a path through vertices in graph
|
||||
*/
|
||||
protected static <T extends BaseVertex> Path<T> makePath(final List<T> vertices, final BaseGraph<T> graph) {
|
||||
Path<T> path = new Path<T>(vertices.get(0), graph);
|
||||
for ( int i = 1; i < vertices.size(); i++ )
|
||||
path = new Path<T>(path, graph.getEdge(path.lastVertex, vertices.get(i)));
|
||||
return path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Path extending p with edge
|
||||
*
|
||||
* @param p the path to extend
|
||||
* @param edge the edge to extend path by
|
||||
*/
|
||||
public Path(final Path<T> p, final BaseEdge edge) {
|
||||
if ( p == null ) throw new IllegalArgumentException("Path cannot be null");
|
||||
if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null");
|
||||
if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't");
|
||||
if ( ! p.graph.getEdgeSource(edge).equals(p.lastVertex) ) { throw new IllegalStateException("Edges added to path must be contiguous."); }
|
||||
|
||||
graph = p.graph;
|
||||
lastVertex = p.graph.getEdgeTarget(edge);
|
||||
edgesInOrder = new LinkedList<BaseEdge>(p.getEdges());
|
||||
edgesInOrder.add(edge);
|
||||
totalScore = p.totalScore + edge.getMultiplicity();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the collection of edges leaving the last vertex of this path
|
||||
* @return a non-null collection
|
||||
*/
|
||||
public Collection<BaseEdge> getOutgoingEdgesOfLastVertex() {
|
||||
return getGraph().outgoingEdgesOf(getLastVertex());
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this path contain the given edge
|
||||
* @param edge the given edge to test
|
||||
* @return true if the edge is found in this path
|
||||
*/
|
||||
public boolean containsEdge( final BaseEdge edge ) {
|
||||
if( edge == null ) { throw new IllegalArgumentException("Attempting to test null edge."); }
|
||||
if ( edgesInOrder.isEmpty() ) return false;
|
||||
|
||||
// initialize contains cache if necessary
|
||||
if ( edgesAsSet == null ) edgesAsSet = new HashSet<BaseEdge>(edgesInOrder);
|
||||
return edgesAsSet.contains(edge);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this path contain the given vertex?
|
||||
*
|
||||
* @param v a non-null vertex
|
||||
* @return true if v occurs within this path, false otherwise
|
||||
*/
|
||||
public boolean containsVertex(final T v) {
|
||||
if ( v == null ) throw new IllegalArgumentException("Vertex cannot be null");
|
||||
|
||||
// TODO -- warning this is expensive. Need to do vertex caching
|
||||
return getVertices().contains(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that two paths have the same edges and total score
|
||||
* @param path the other path we might be the same as
|
||||
* @return true if this and path are the same
|
||||
*/
|
||||
protected boolean pathsAreTheSame(Path<T> path) {
|
||||
return totalScore == path.totalScore && edgesInOrder.equals(path.edgesInOrder);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path=");
|
||||
boolean first = true;
|
||||
for ( final T v : getVertices() ) {
|
||||
if ( first ) {
|
||||
first = false;
|
||||
} else {
|
||||
b.append(" -> ");
|
||||
}
|
||||
b.append(v.getSequenceString());
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the graph of this path
|
||||
* @return a non-null graph
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public BaseGraph<T> getGraph() {
|
||||
return graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the edges of this path in order
|
||||
* @return a non-null list of edges
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public List<BaseEdge> getEdges() { return edgesInOrder; }
|
||||
|
||||
/**
|
||||
* Get the list of vertices in this path in order defined by the edges of the path
|
||||
* @return a non-null, non-empty list of vertices
|
||||
*/
|
||||
@Ensures({"result != null", "!result.isEmpty()"})
|
||||
public List<T> getVertices() {
|
||||
if ( getEdges().isEmpty() )
|
||||
return Collections.singletonList(lastVertex);
|
||||
else {
|
||||
final LinkedList<T> vertices = new LinkedList<T>();
|
||||
boolean first = true;
|
||||
for ( final BaseEdge e : getEdges() ) {
|
||||
if ( first ) {
|
||||
vertices.add(graph.getEdgeSource(e));
|
||||
first = false;
|
||||
}
|
||||
vertices.add(graph.getEdgeTarget(e));
|
||||
}
|
||||
return vertices;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the total score of this path (bigger is better)
|
||||
* @return a positive integer
|
||||
*/
|
||||
@Ensures("result >= 0")
|
||||
public int getScore() { return totalScore; }
|
||||
|
||||
/**
|
||||
* Get the final vertex of the path
|
||||
* @return a non-null vertex
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public T getLastVertex() { return lastVertex; }
|
||||
|
||||
/**
|
||||
* The base sequence for this path. Pull the full sequence for source nodes and then the suffix for all subsequent nodes
|
||||
* @return non-null sequence of bases corresponding to this path
|
||||
*/
|
||||
@Ensures({"result != null"})
|
||||
public byte[] getBases() {
|
||||
if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); }
|
||||
|
||||
byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst()));
|
||||
for( final BaseEdge e : edgesInOrder ) {
|
||||
bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
|
||||
}
|
||||
return bases;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the cigar string for this path using a bubble traversal of the assembly graph and running a Smith-Waterman alignment on each bubble
|
||||
* @return non-null Cigar string with reference length equal to the refHaplotype's reference length
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public Cigar calculateCigar() {
|
||||
final Cigar cigar = new Cigar();
|
||||
// special case for paths that start on reference but not at the reference source node
|
||||
if( edgesInOrder.getFirst().isRef() && !graph.isRefSource(edgesInOrder.getFirst()) ) {
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(null, null, graph.getEdgeSource(edgesInOrder.getFirst())).getCigarElements() ) {
|
||||
cigar.add(ce);
|
||||
}
|
||||
}
|
||||
|
||||
// reset the bubble state machine
|
||||
final BubbleStateMachine<T> bsm = new BubbleStateMachine<T>(cigar);
|
||||
|
||||
for( final BaseEdge e : getEdges() ) {
|
||||
if ( e.hasSameSourceAndTarget(graph, edgesInOrder.getFirst()) ) {
|
||||
advanceBubbleStateMachine( bsm, graph.getEdgeSource(e), null );
|
||||
}
|
||||
advanceBubbleStateMachine( bsm, graph.getEdgeTarget(e), e );
|
||||
}
|
||||
|
||||
// special case for paths that don't end on reference
|
||||
if( bsm.inBubble ) {
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, null).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
} else if( edgesInOrder.getLast().isRef() && !graph.isRefSink(edgesInOrder.getLast()) ) { // special case for paths that end of the reference but haven't completed the entire reference circuit
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, graph.getEdgeTarget(edgesInOrder.getLast()), null).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
}
|
||||
|
||||
return AlignmentUtils.consolidateCigar(bsm.cigar);
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the bubble state machine by incorporating the next node in the path.
|
||||
* @param bsm the current bubble state machine
|
||||
* @param node the node to be incorporated
|
||||
* @param e the edge which generated this node in the path
|
||||
*/
|
||||
@Requires({"bsm != null", "graph != null", "node != null"})
|
||||
private void advanceBubbleStateMachine( final BubbleStateMachine<T> bsm, final T node, final BaseEdge e ) {
|
||||
if( graph.isReferenceNode( node ) ) {
|
||||
if( !bsm.inBubble ) { // just add the ref bases as M's in the Cigar string, and don't do anything else
|
||||
if( e !=null && !e.isRef() ) {
|
||||
if( graph.referencePathExists( graph.getEdgeSource(e), node) ) {
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(null, graph.getEdgeSource(e), node).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
|
||||
} else if ( graph.getEdgeSource(e).equals(graph.getEdgeTarget(e)) ) { // alt edge at ref node points to itself
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.I) );
|
||||
} else {
|
||||
bsm.inBubble = true;
|
||||
bsm.bubbleBytes = null;
|
||||
bsm.lastSeenReferenceNode = graph.getEdgeSource(e);
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
}
|
||||
} else {
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
|
||||
}
|
||||
} else if( bsm.lastSeenReferenceNode != null && !graph.referencePathExists( bsm.lastSeenReferenceNode, node ) ) { // add bases to the bubble string until we get back to the reference path
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
} else { // close the bubble and use a local SW to determine the Cigar string
|
||||
for( final CigarElement ce : calculateCigarForCompleteBubble(bsm.bubbleBytes, bsm.lastSeenReferenceNode, node).getCigarElements() ) {
|
||||
bsm.cigar.add(ce);
|
||||
}
|
||||
bsm.inBubble = false;
|
||||
bsm.bubbleBytes = null;
|
||||
bsm.lastSeenReferenceNode = null;
|
||||
bsm.cigar.add( new CigarElement( graph.getAdditionalSequence(node).length, CigarOperator.M) );
|
||||
}
|
||||
} else { // non-ref vertex
|
||||
if( bsm.inBubble ) { // just keep accumulating until we get back to the reference path
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
} else { // open up a bubble
|
||||
bsm.inBubble = true;
|
||||
bsm.bubbleBytes = null;
|
||||
bsm.lastSeenReferenceNode = (e != null ? graph.getEdgeSource(e) : null );
|
||||
bsm.bubbleBytes = ArrayUtils.addAll( bsm.bubbleBytes, graph.getAdditionalSequence(node) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Now that we have a completed bubble run a Smith-Waterman alignment to determine the cigar string for this bubble
|
||||
* @param bubbleBytes the bytes that comprise the alternate allele path in this bubble
|
||||
* @param fromVertex the vertex that marks the beginning of the reference path in this bubble (null indicates ref source vertex)
|
||||
* @param toVertex the vertex that marks the end of the reference path in this bubble (null indicates ref sink vertex)
|
||||
* @return the cigar string generated by running a SW alignment between the reference and alternate paths in this bubble
|
||||
*/
|
||||
@Requires({"graph != null"})
|
||||
@Ensures({"result != null"})
|
||||
private Cigar calculateCigarForCompleteBubble( final byte[] bubbleBytes, final T fromVertex, final T toVertex ) {
|
||||
final byte[] refBytes = graph.getReferenceBytes(fromVertex == null ? graph.getReferenceSourceVertex() : fromVertex, toVertex == null ? graph.getReferenceSinkVertex() : toVertex, fromVertex == null, toVertex == null);
|
||||
|
||||
final Cigar returnCigar = new Cigar();
|
||||
|
||||
// add padding to anchor ref/alt bases in the SW matrix
|
||||
byte[] padding = STARTING_SW_ANCHOR_BYTES;
|
||||
boolean goodAlignment = false;
|
||||
SWPairwiseAlignment swConsensus = null;
|
||||
while( !goodAlignment && padding.length < 1000 ) {
|
||||
padding = ArrayUtils.addAll(padding, padding); // double the size of the padding each time
|
||||
final byte[] reference = ArrayUtils.addAll( ArrayUtils.addAll(padding, refBytes), padding );
|
||||
final byte[] alternate = ArrayUtils.addAll( ArrayUtils.addAll(padding, bubbleBytes), padding );
|
||||
swConsensus = new SWPairwiseAlignment( reference, alternate, NEW_SW_PARAMETERS );
|
||||
if( swConsensus.getAlignmentStart2wrt1() == 0 && !swConsensus.getCigar().toString().contains("S") && swConsensus.getCigar().getReferenceLength() == reference.length ) {
|
||||
goodAlignment = true;
|
||||
}
|
||||
}
|
||||
if( !goodAlignment ) {
|
||||
returnCigar.add(new CigarElement(1, CigarOperator.N));
|
||||
return returnCigar;
|
||||
}
|
||||
|
||||
final Cigar swCigar = swConsensus.getCigar();
|
||||
if( swCigar.numCigarElements() > MAX_CIGAR_ELEMENTS_BEFORE_FAILING_SW ) { // this bubble is too divergent from the reference
|
||||
returnCigar.add(new CigarElement(1, CigarOperator.N));
|
||||
} else {
|
||||
for( int iii = 0; iii < swCigar.numCigarElements(); iii++ ) {
|
||||
// now we need to remove the padding from the cigar string
|
||||
int length = swCigar.getCigarElement(iii).getLength();
|
||||
if( iii == 0 ) { length -= padding.length; }
|
||||
if( iii == swCigar.numCigarElements() - 1 ) { length -= padding.length; }
|
||||
if( length > 0 ) {
|
||||
returnCigar.add(new CigarElement(length, swCigar.getCigarElement(iii).getOperator()));
|
||||
}
|
||||
}
|
||||
if( (refBytes == null && returnCigar.getReferenceLength() != 0) || ( refBytes != null && returnCigar.getReferenceLength() != refBytes.length ) ) {
|
||||
throw new IllegalStateException("SmithWaterman cigar failure: " + (refBytes == null ? "-" : new String(refBytes)) + " against " + new String(bubbleBytes) + " = " + swConsensus.getCigar());
|
||||
}
|
||||
}
|
||||
|
||||
return returnCigar;
|
||||
}
|
||||
|
||||
// class to keep track of the bubble state machine
|
||||
private static class BubbleStateMachine<T extends BaseVertex> {
|
||||
public boolean inBubble = false;
|
||||
public byte[] bubbleBytes = null;
|
||||
public T lastSeenReferenceNode = null;
|
||||
public Cigar cigar = null;
|
||||
|
||||
public BubbleStateMachine( final Cigar initialCigar ) {
|
||||
inBubble = false;
|
||||
bubbleBytes = null;
|
||||
lastSeenReferenceNode = null;
|
||||
cigar = initialCigar;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that this and other have the same score and vertices in the same order with the same seq
|
||||
* @param other the other path to consider. Cannot be null
|
||||
* @return true if this and path are equal, false otherwise
|
||||
*/
|
||||
public boolean equalScoreAndSequence(final Path<T> other) {
|
||||
if ( other == null ) throw new IllegalArgumentException("other cannot be null");
|
||||
return getScore() == other.getScore() && equalSequence(other);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that this and other have the same vertices in the same order with the same seq
|
||||
* @param other the other path to consider. Cannot be null
|
||||
* @return true if this and path are equal, false otherwise
|
||||
*/
|
||||
public boolean equalSequence(final Path<T> other) {
|
||||
final List<T> mine = getVertices();
|
||||
final List<T> yours = other.getVertices();
|
||||
if ( mine.size() == yours.size() ) { // hehehe
|
||||
for ( int i = 0; i < mine.size(); i++ )
|
||||
if ( ! mine.get(i).seqEquals(yours.get(i)) )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,544 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A graph that contains base sequence at each node
|
||||
*
|
||||
* @author: depristo
|
||||
* @since 03/2013
|
||||
*/
|
||||
public final class SeqGraph extends BaseGraph<SeqVertex> {
|
||||
private final static boolean PRINT_SIMPLIFY_GRAPHS = false;
|
||||
|
||||
/**
|
||||
* The minimum number of common bp from the prefix (head merging) or suffix (tail merging)
|
||||
* required before we'll merge in such configurations. A large value here is critical to avoid
|
||||
* merging inappropriate head or tail nodes, which introduces large insertion / deletion events
|
||||
* as the merge operation creates a link among the non-linked sink / source vertices
|
||||
*/
|
||||
protected final static int MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES = 10;
|
||||
|
||||
/**
|
||||
* How many cycles of the graph simplifications algorithms will we run before
|
||||
* thinking something has gone wrong and throw an exception?
|
||||
*/
|
||||
private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100;
|
||||
|
||||
/**
 * Create a SeqGraph via the no-argument constructor of the superclass
 */
public SeqGraph() {
    super();
}
|
||||
|
||||
/**
 * Create a SeqGraph that records the kmer size it is associated with
 *
 * The kmer size is purely informational.  It is useful when converting a Debruijn graph -> SeqGraph
 * for us to track the kmer used to make the transformation.
 *
 * @param kmer the kmer size, passed through to the superclass constructor
 */
public SeqGraph(final int kmer) {
    super(kmer);
}
|
||||
|
||||
/**
|
||||
* Simplify this graph, merging vertices together and restructuring the graph in an
|
||||
* effort to minimize the number of overall vertices in the graph without changing
|
||||
* in any way the sequences implied by a complex enumeration of all paths through the graph.
|
||||
*/
|
||||
public void simplifyGraph() {
|
||||
simplifyGraph(Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
protected void simplifyGraph(final int maxCycles) {
|
||||
// start off with one round of zipping of chains for performance reasons
|
||||
zipLinearChains();
|
||||
|
||||
SeqGraph prevGraph = null;
|
||||
for( int i = 0; i < maxCycles; i++ ) {
|
||||
if ( i > MAX_REASONABLE_SIMPLIFICATION_CYCLES ) {
|
||||
logger.warn("Infinite loop detected in simpliciation routines. Writing current graph to debugMeMark.dot");
|
||||
printGraph(new File("debugMeMark.dot"), 0);
|
||||
throw new IllegalStateException("Infinite loop detected in simplification routines for kmer graph " + getKmerSize());
|
||||
}
|
||||
|
||||
final boolean didSomeWork = simplifyGraphOnce(i);
|
||||
if ( ! didSomeWork )
|
||||
// no simplification algorithm could run, so stop
|
||||
break;
|
||||
|
||||
// we get five cycles before we start looking for changes in the graph
|
||||
// by cloning ourselves and then checking for any changes
|
||||
if ( i > 5 ) {
|
||||
// the previous graph and this graph have the same structure, so the simplification
|
||||
// algorithms are looping endless between states. Just break and consider ourselves done
|
||||
if ( prevGraph != null && graphEquals(prevGraph, this) )
|
||||
break;
|
||||
|
||||
prevGraph = (SeqGraph)clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run one full cycle of the graph simplification algorithms
|
||||
* @return true if any algorithms said they did some simplification
|
||||
*/
|
||||
private boolean simplifyGraphOnce(final int iteration) {
|
||||
//logger.info("simplifyGraph iteration " + i);
|
||||
// iterate until we haven't don't anything useful
|
||||
boolean didSomeWork = false;
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".1.dot"), 0);
|
||||
didSomeWork |= new MergeDiamonds().transformUntilComplete();
|
||||
didSomeWork |= new MergeTails().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".2.diamonds_and_tails.dot"), 0);
|
||||
|
||||
didSomeWork |= new SplitCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".3.split_suffix.dot"), 0);
|
||||
didSomeWork |= new MergeCommonSuffices().transformUntilComplete();
|
||||
if ( PRINT_SIMPLIFY_GRAPHS ) printGraph(new File("simplifyGraph." + iteration + ".4.merge_suffix.dot"), 0);
|
||||
|
||||
didSomeWork |= new MergeHeadlessIncomingSources().transformUntilComplete();
|
||||
didSomeWork |= zipLinearChains();
|
||||
return didSomeWork;
|
||||
}
|
||||
|
||||
/**
|
||||
* Zip up all of the simple linear chains present in this graph.
|
||||
*
|
||||
* Merges together all pairs of vertices in the graph v1 -> v2 into a single vertex v' containing v1 + v2 sequence
|
||||
*
|
||||
* Only works on vertices where v1's only outgoing edge is to v2 and v2's only incoming edge is from v1.
|
||||
*
|
||||
* If such a pair of vertices is found, they are merged and the graph is update. Otherwise nothing is changed.
|
||||
*
|
||||
* @return true if any such pair of vertices could be found, false otherwise
|
||||
*/
|
||||
public boolean zipLinearChains() {
|
||||
// create the list of start sites [doesn't modify graph yet]
|
||||
final List<SeqVertex> zipStarts = new LinkedList<SeqVertex>();
|
||||
for ( final SeqVertex source : vertexSet() ) {
|
||||
if ( isLinearChainStart(source) )
|
||||
zipStarts.add(source);
|
||||
}
|
||||
|
||||
if ( zipStarts.isEmpty() ) // nothing to do, as nothing could start a chain
|
||||
return false;
|
||||
|
||||
// At this point, zipStarts contains all of the vertices in this graph that might start some linear
|
||||
// chain of vertices. We walk through each start, building up the linear chain of vertices and then
|
||||
// zipping them up with mergeLinearChain, if possible
|
||||
boolean mergedOne = false;
|
||||
for ( final SeqVertex zipStart : zipStarts ) {
|
||||
final LinkedList<SeqVertex> linearChain = traceLinearChain(zipStart);
|
||||
|
||||
// merge the linearized chain, recording if we actually did some useful work
|
||||
mergedOne |= mergeLinearChain(linearChain);
|
||||
}
|
||||
|
||||
return mergedOne;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is source vertex potentially a start of a linear chain of vertices?
|
||||
*
|
||||
* We are a start of a zip chain if our out degree is 1 and either the
|
||||
* the vertex has no incoming connections or 2 or more (we must start a chain) or
|
||||
* we have exactly one incoming vertex and that one has out-degree > 1 (i.e., source's incoming
|
||||
* vertex couldn't be a start itself
|
||||
*
|
||||
* @param source a non-null vertex
|
||||
* @return true if source might start a linear chain
|
||||
*/
|
||||
@Requires("source != null")
|
||||
private boolean isLinearChainStart(final SeqVertex source) {
|
||||
return outDegreeOf(source) == 1
|
||||
&& ( inDegreeOf(source) != 1
|
||||
|| outDegreeOf(incomingVerticesOf(source).iterator().next()) > 1 );
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all of the vertices in a linear chain of vertices starting at zipStart
|
||||
*
|
||||
* Build a list of vertices (in order) starting from zipStart such that each sequential pair of vertices
|
||||
* in the chain A and B can be zipped together.
|
||||
*
|
||||
* @param zipStart a vertex that starts a linear chain
|
||||
* @return a list of vertices that comprise a linear chain starting with zipStart. The resulting
|
||||
* list will always contain at least zipStart as the first element.
|
||||
*/
|
||||
@Requires("isLinearChainStart(zipStart)")
|
||||
@Ensures({"result != null", "result.size() >= 1"})
|
||||
private LinkedList<SeqVertex> traceLinearChain(final SeqVertex zipStart) {
|
||||
final LinkedList<SeqVertex> linearChain = new LinkedList<SeqVertex>();
|
||||
linearChain.add(zipStart);
|
||||
|
||||
boolean lastIsRef = isReferenceNode(zipStart); // remember because this calculation is expensive
|
||||
SeqVertex last = zipStart;
|
||||
while (true) {
|
||||
if ( outDegreeOf(last) != 1 )
|
||||
// cannot extend a chain from last if last has multiple outgoing branches
|
||||
break;
|
||||
|
||||
// there can only be one (outgoing edge of last) by contract
|
||||
final SeqVertex target = getEdgeTarget(outgoingEdgeOf(last));
|
||||
|
||||
if ( inDegreeOf(target) != 1 || last.equals(target) )
|
||||
// cannot zip up a target that has multiple incoming nodes or that's a cycle to the last node
|
||||
break;
|
||||
|
||||
final boolean targetIsRef = isReferenceNode(target);
|
||||
if ( lastIsRef != targetIsRef ) // both our isRef states must be equal
|
||||
break;
|
||||
|
||||
linearChain.add(target); // extend our chain by one
|
||||
|
||||
// update our last state to be the current state, and continue
|
||||
last = target;
|
||||
lastIsRef = targetIsRef;
|
||||
}
|
||||
|
||||
return linearChain;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge a linear chain of vertices into a single combined vertex, and update this graph to such that
|
||||
* the incoming edges into the first element of the linearChain and the outgoing edges from linearChain.getLast()
|
||||
* all point to this new combined vertex.
|
||||
*
|
||||
* @param linearChain a non-empty chain of vertices that can be zipped up into a single vertex
|
||||
* @return true if we actually merged at least two vertices together
|
||||
*/
|
||||
protected boolean mergeLinearChain(final LinkedList<SeqVertex> linearChain) {
|
||||
if ( linearChain.isEmpty() ) throw new IllegalArgumentException("BUG: cannot have linear chain with 0 elements but got " + linearChain);
|
||||
|
||||
final SeqVertex first = linearChain.getFirst();
|
||||
final SeqVertex last = linearChain.getLast();
|
||||
|
||||
if ( first == last ) return false; // only one element in the chain, cannot be extended
|
||||
|
||||
// create the combined vertex, and add it to the graph
|
||||
// TODO -- performance problem -- can be optimized if we want
|
||||
final List<byte[]> seqs = new LinkedList<byte[]>();
|
||||
for ( SeqVertex v : linearChain ) seqs.add(v.getSequence());
|
||||
final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{}));
|
||||
final SeqVertex addedVertex = new SeqVertex( seqsCat );
|
||||
addVertex(addedVertex);
|
||||
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(first);
|
||||
final Set<BaseEdge> outEdges = outgoingEdgesOf(last);
|
||||
|
||||
final int nEdges = inEdges.size() + outEdges.size();
|
||||
int sharedWeightAmongEdges = nEdges == 0 ? 0 : sumEdgeWeightAlongChain(linearChain) / nEdges;
|
||||
final BaseEdge inc = new BaseEdge(false, sharedWeightAmongEdges); // template to make .add function call easy
|
||||
|
||||
// update the incoming and outgoing edges to point to the new vertex
|
||||
for( final BaseEdge edge : outEdges ) { addEdge(addedVertex, getEdgeTarget(edge), new BaseEdge(edge).add(inc)); }
|
||||
for( final BaseEdge edge : inEdges ) { addEdge(getEdgeSource(edge), addedVertex, new BaseEdge(edge).add(inc)); }
|
||||
|
||||
removeAllVertices(linearChain);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sum of the edge weights on a linear chain of at least 2 elements
|
||||
*
|
||||
* @param chain a linear chain of vertices with at least 2 vertices
|
||||
* @return the sum of the multiplicities along all edges connecting vertices within the chain
|
||||
*/
|
||||
@Requires({"chain != null", "chain.size() >= 2"})
|
||||
private int sumEdgeWeightAlongChain(final LinkedList<SeqVertex> chain) {
|
||||
int sum = 0;
|
||||
SeqVertex prev = null;
|
||||
|
||||
for ( final SeqVertex v : chain ) {
|
||||
if ( prev != null ) {
|
||||
final BaseEdge e = getEdge(prev, v);
|
||||
if ( e == null ) throw new IllegalStateException("Something wrong with the linear chain, got a null edge between " + prev + " and " + v);
|
||||
sum += e.getMultiplicity();
|
||||
}
|
||||
prev = v;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Base class for transformation operations that need to iterate over proposed vertices, where
|
||||
* each proposed vertex is a seed vertex for a potential transformation.
|
||||
*
|
||||
* transformUntilComplete will iteratively apply the tryToTransform function on each vertex in the graph
|
||||
* until no vertex can be found that can be transformed.
|
||||
*
|
||||
* Note that in order to eventually terminate tryToTransform must transform the graph such that eventually
|
||||
* no vertices are candidates for further transformations.
|
||||
*/
|
||||
private abstract class VertexBasedTransformer {
|
||||
/**
|
||||
* For testing purposes we sometimes want to test that can be transformed capabilities are working
|
||||
* without actually modifying the graph */
|
||||
private boolean dontModifyGraphEvenIfPossible = false;
|
||||
|
||||
public boolean dontModifyGraphEvenIfPossible() { return dontModifyGraphEvenIfPossible; }
|
||||
public void setDontModifyGraphEvenIfPossible() { this.dontModifyGraphEvenIfPossible = true; }
|
||||
|
||||
/**
|
||||
* Merge until the graph has no vertices that are candidates for merging
|
||||
*/
|
||||
public boolean transformUntilComplete() {
|
||||
boolean didAtLeastOneTranform = false;
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
|
||||
for( final SeqVertex v : vertexSet() ) {
|
||||
foundNodesToMerge = tryToTransform(v);
|
||||
if ( foundNodesToMerge ) {
|
||||
didAtLeastOneTranform = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return didAtLeastOneTranform;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge, if possible, seeded on the vertex v
|
||||
* @param v the proposed seed vertex to merge
|
||||
* @return true if some useful merging happened, false otherwise
|
||||
*/
|
||||
abstract boolean tryToTransform(final SeqVertex v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge diamond configurations:
|
||||
*
|
||||
* Performance the transformation:
|
||||
*
|
||||
* { A -> x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { A -> x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*/
|
||||
protected class MergeDiamonds extends VertexBasedTransformer {
|
||||
@Override
|
||||
protected boolean tryToTransform(final SeqVertex top) {
|
||||
final Set<SeqVertex> middles = outgoingVerticesOf(top);
|
||||
if ( middles.size() <= 1 )
|
||||
// we can only merge if there's at least two middle nodes
|
||||
return false;
|
||||
|
||||
SeqVertex bottom = null;
|
||||
for ( final SeqVertex mi : middles ) {
|
||||
// all nodes must have at least 1 connection
|
||||
if ( outDegreeOf(mi) < 1 )
|
||||
return false;
|
||||
|
||||
// can only have 1 incoming node, the root vertex
|
||||
if ( inDegreeOf(mi) != 1 )
|
||||
return false;
|
||||
|
||||
// make sure that all outgoing vertices of mi go only to the bottom node
|
||||
for ( final SeqVertex mt : outgoingVerticesOf(mi) ) {
|
||||
if ( bottom == null )
|
||||
bottom = mt;
|
||||
else if ( ! bottom.equals(mt) )
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// bottom has some connections coming in from other nodes, don't allow
|
||||
if ( inDegreeOf(bottom) != middles.size() )
|
||||
return false;
|
||||
|
||||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
// actually do the merging, returning true if at least 1 base was successfully split
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, middles);
|
||||
if (splitter.meetsMinMergableSequenceForEitherPrefixOrSuffix(1))
|
||||
return splitter.splitAndUpdate(top, bottom);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge tail configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { A -> x + S_i + y }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { A -> x -> S_i -> y }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no bottom node is required
|
||||
*/
|
||||
protected class MergeTails extends VertexBasedTransformer {
|
||||
@Override
|
||||
protected boolean tryToTransform(final SeqVertex top) {
|
||||
final Set<SeqVertex> tails = outgoingVerticesOf(top);
|
||||
if ( tails.size() <= 1 )
|
||||
return false;
|
||||
|
||||
for ( final SeqVertex t : tails )
|
||||
if ( ! isSink(t) || inDegreeOf(t) > 1 )
|
||||
return false;
|
||||
|
||||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, tails);
|
||||
|
||||
if (splitter.meetsMinMergableSequenceForSuffix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES))
|
||||
return splitter.splitAndUpdate(top, null);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge headless configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { x + S_i -> y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { x -> S_i -> y + Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*/
|
||||
protected class MergeCommonSuffices extends VertexBasedTransformer {
|
||||
@Override
|
||||
boolean tryToTransform(final SeqVertex bottom) {
|
||||
return new SharedSequenceMerger().merge(SeqGraph.this, bottom);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no top node is required
|
||||
*/
|
||||
protected class SplitCommonSuffices extends VertexBasedTransformer {
|
||||
final Set<SeqVertex> alreadySplit = new HashSet<SeqVertex>();
|
||||
|
||||
@Override
|
||||
boolean tryToTransform(final SeqVertex bottom) {
|
||||
if ( alreadySplit.contains(bottom) )
|
||||
return false;
|
||||
else {
|
||||
alreadySplit.add(bottom);
|
||||
return new CommonSuffixSplitter().split(SeqGraph.this, bottom);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge headless configurations:
|
||||
*
|
||||
* Performs the transformation:
|
||||
*
|
||||
* { x + S_i + y -> Z }
|
||||
*
|
||||
* goes to:
|
||||
*
|
||||
* { x -> S_i -> y -> Z }
|
||||
*
|
||||
* for all nodes that match this configuration.
|
||||
*
|
||||
* Differs from the diamond transform in that no top node is required
|
||||
*/
|
||||
protected class MergeHeadlessIncomingSources extends VertexBasedTransformer {
|
||||
@Override
|
||||
boolean tryToTransform(final SeqVertex bottom) {
|
||||
final Set<SeqVertex> incoming = incomingVerticesOf(bottom);
|
||||
if ( incoming.size() <= 1 )
|
||||
return false;
|
||||
|
||||
for ( final SeqVertex inc : incoming )
|
||||
if ( ! isSource(inc) || outDegreeOf(inc) > 1 )
|
||||
return false;
|
||||
|
||||
if ( dontModifyGraphEvenIfPossible() ) return true;
|
||||
|
||||
final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(SeqGraph.this, incoming);
|
||||
if (splitter.meetsMinMergableSequenceForPrefix(MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES))
|
||||
return splitter.splitAndUpdate(null, bottom);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* A graph vertex containing a sequence of bases and a unique ID that
|
||||
* allows multiple distinct nodes in the graph to have the same sequence.
|
||||
*
|
||||
* This is essential when thinking about representing the actual sequence of a haplotype
|
||||
* in a graph. There can be many parts of the sequence that have the same sequence, but
|
||||
* are distinct elements in the graph because they have a different position in the graph. For example:
|
||||
*
|
||||
* A -> C -> G -> A -> T
|
||||
*
|
||||
* The two As are not the same, because they occur with different connections. In a kmer graph equals()
|
||||
* is based on the sequence itself, as each distinct kmer can only be represented once. But the transformation
|
||||
* of the kmer graph into a graph of base sequences, without their kmer prefixes, means that nodes that
|
||||
* were once unique including their prefix can become equal after shedding the prefix. So we need to
|
||||
* use some mechanism -- here a unique ID per node -- to separate nodes that have the same sequence
|
||||
* but are distinct elements of the graph.
|
||||
*
|
||||
* @author: depristo
|
||||
* @since 03/2013
|
||||
*/
|
||||
public final class SeqVertex extends BaseVertex {
|
||||
private static int idCounter = 0;
|
||||
public final int id;
|
||||
|
||||
/**
|
||||
* Create a new SeqVertex with sequence and the next available id
|
||||
* @param sequence our base sequence
|
||||
*/
|
||||
public SeqVertex(final byte[] sequence) {
|
||||
super(sequence);
|
||||
this.id = idCounter++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new SeqVertex having bases of sequence.getBytes()
|
||||
* @param sequence the string representation of our bases
|
||||
*/
|
||||
public SeqVertex(final String sequence) {
|
||||
super(sequence);
|
||||
this.id = idCounter++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a copy of toCopy
|
||||
* @param toCopy a SeqVertex to copy into this newly allocated one
|
||||
*/
|
||||
public SeqVertex(final SeqVertex toCopy) {
|
||||
super(toCopy.sequence);
|
||||
this.id = toCopy.id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the unique ID for this SeqVertex
|
||||
* @return a positive integer >= 0
|
||||
*/
|
||||
public int getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SeqVertex_id_" + id + "_seq_" + getSequenceString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Two SeqVertex are equal only if their ids are equal
|
||||
* @param o
|
||||
* @return
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
SeqVertex seqVertex = (SeqVertex) o;
|
||||
if (id != seqVertex.id) return false;
|
||||
|
||||
// note that we don't test for super equality here because the ids are unique
|
||||
//if (!super.equals(o)) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new SeqVertex derived from this one but not including the suffix bases
|
||||
*
|
||||
* @param suffix the suffix bases to remove from this vertex
|
||||
* @return a newly allocated SeqVertex with appropriate prefix, or null if suffix removes all bases from this node
|
||||
*/
|
||||
@Requires("Utils.endsWith(sequence, suffix)")
|
||||
public SeqVertex withoutSuffix(final byte[] suffix) {
|
||||
final int prefixSize = sequence.length - suffix.length;
|
||||
return prefixSize > 0 ? new SeqVertex(Arrays.copyOf(sequence, prefixSize)) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new SeqVertex derived from this one but not including prefix or suffix bases
|
||||
*
|
||||
* @param prefix the previx bases to remove
|
||||
* @param suffix the suffix bases to remove from this vertex
|
||||
* @return a newly allocated SeqVertex
|
||||
*/
|
||||
@Requires("Utils.endsWith(sequence, suffix)")
|
||||
public SeqVertex withoutPrefixAndSuffix(final byte[] prefix, final byte[] suffix) {
|
||||
final int start = prefix.length;
|
||||
final int length = sequence.length - suffix.length - prefix.length;
|
||||
final int stop = start + length;
|
||||
return length > 0 ? new SeqVertex(Arrays.copyOfRange(sequence, start, stop)) : null;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,138 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Merges the incoming vertices of a vertex V of a graph
|
||||
*
|
||||
* Looks at the vertices that are incoming to V (i.e., have an outgoing edge connecting to V). If
|
||||
* they all have the same sequence, merges them into the sequence of V, and updates the graph
|
||||
* as appropriate
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/22/13
|
||||
* Time: 8:31 AM
|
||||
*/
|
||||
public class SharedSequenceMerger {
|
||||
public SharedSequenceMerger() { }
|
||||
|
||||
/**
|
||||
* Attempt to merge the incoming vertices of v
|
||||
*
|
||||
* @param graph the graph containing the vertex v
|
||||
* @param v the vertex whose incoming vertices we want to merge
|
||||
* @return true if some useful merging was done, false otherwise
|
||||
*/
|
||||
public boolean merge(final SeqGraph graph, final SeqVertex v) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( ! graph.vertexSet().contains(v) ) throw new IllegalArgumentException("graph doesn't contain vertex " + v);
|
||||
|
||||
final Set<SeqVertex> prevs = graph.incomingVerticesOf(v);
|
||||
if ( ! canMerge(graph, v, prevs) )
|
||||
return false;
|
||||
else {
|
||||
// graph.printGraph(new File("csm." + counter + "." + v.getSequenceString() + "_pre.dot"), 0);
|
||||
|
||||
final List<BaseEdge> edgesToRemove = new LinkedList<BaseEdge>();
|
||||
final byte[] prevSeq = prevs.iterator().next().getSequence();
|
||||
final SeqVertex newV = new SeqVertex(ArrayUtils.addAll(prevSeq, v.getSequence()));
|
||||
graph.addVertex(newV);
|
||||
|
||||
for ( final SeqVertex prev : prevs ) {
|
||||
for ( final BaseEdge prevIn : graph.incomingEdgesOf(prev) ) {
|
||||
graph.addEdge(graph.getEdgeSource(prevIn), newV, new BaseEdge(prevIn));
|
||||
edgesToRemove.add(prevIn);
|
||||
}
|
||||
}
|
||||
|
||||
for ( final BaseEdge e : graph.outgoingEdgesOf(v) ) {
|
||||
graph.addEdge(newV, graph.getEdgeTarget(e), new BaseEdge(e));
|
||||
}
|
||||
|
||||
graph.removeAllVertices(prevs);
|
||||
graph.removeVertex(v);
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
|
||||
// graph.printGraph(new File("csm." + counter++ + "." + v.getSequenceString() + "_post.dot"), 0);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
//private static int counter = 0;
|
||||
|
||||
/**
|
||||
* Can we safely merge the incoming vertices of v
|
||||
*
|
||||
* @param graph the graph containing v and incomingVertices
|
||||
* @param v the vertex we want to merge into
|
||||
* @param incomingVertices the incoming vertices of v
|
||||
* @return true if we can safely merge incomingVertices
|
||||
*/
|
||||
private boolean canMerge(final SeqGraph graph, final SeqVertex v, final Collection<SeqVertex> incomingVertices) {
|
||||
if ( incomingVertices.isEmpty() )
|
||||
return false;
|
||||
|
||||
final SeqVertex first = incomingVertices.iterator().next();
|
||||
for ( final SeqVertex prev : incomingVertices) {
|
||||
if ( ! prev.seqEquals(first) )
|
||||
return false;
|
||||
final Collection<SeqVertex> prevOuts = graph.outgoingVerticesOf(prev);
|
||||
if ( prevOuts.size() != 1 )
|
||||
return false;
|
||||
if ( prevOuts.iterator().next() != v )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,329 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Split a collection of middle nodes in a graph into their shared prefix and suffix values
|
||||
*
|
||||
* This code performs the following transformation. Suppose I have a set of vertices V, such
|
||||
* that each vertex is composed of sequence such that
|
||||
*
|
||||
* Vi = prefix + seq_i + suffix
|
||||
*
|
||||
* where prefix and suffix are shared sequences across all vertices V
|
||||
*
|
||||
* This algorithm creates a new SeqGraph with the following configuration
|
||||
*
|
||||
* prefix -> has outgoing edges to all seq_i
|
||||
* suffix -> has incoming edges for all seq_i
|
||||
*
|
||||
* There are a few special cases that must be handled. First, Vi could be simply
|
||||
* == to the prefix or the suffix. These generate direct connections between
|
||||
* the prefix and suffix nodes, and they are handled internally by the algorithm.
|
||||
*
|
||||
* Note that for convenience, we will always create newTop and newBottom nodes, but
|
||||
* these may be empty nodes (i.e., they contain no sequence). That allows them to be
|
||||
* trivially merged, if desired, when the graph is incorporated into an overall
|
||||
* graph.
|
||||
*
|
||||
* The product of this operation is a SeqGraph that contains the split. There's a
|
||||
* function to reconnect this graph into the graph that contains the middle nodes
|
||||
*
|
||||
* The process guarantees a few things about the output:
|
||||
*
|
||||
* -- Preserves the paths and weights among all vertices
|
||||
*
|
||||
* It produces a graph that has some unusual properties
|
||||
*
|
||||
* -- May add nodes with no sequence (isEmpty() == true) to preserve connectivity among the graph
|
||||
* -- May introduce edges with no multiplicity to preserve paths through the graph
|
||||
*
|
||||
* The overall workflow of using this class is simple:
|
||||
*
|
||||
* find vertices V in graph that you want to split out
|
||||
* s = new SharedVertexSequenceSplitter(graph, V)
|
||||
* s.splitAndUpdate(top, bottom)
|
||||
*
|
||||
* to update the graph with the modifications created by this splitter
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 3/22/13
|
||||
* Time: 8:31 AM
|
||||
*/
|
||||
public class SharedVertexSequenceSplitter {
|
||||
final private SeqGraph outer;
|
||||
final protected SeqVertex prefixV, suffixV;
|
||||
final protected Collection<SeqVertex> toSplits;
|
||||
|
||||
// updated in split routine
|
||||
protected SeqGraph splitGraph = null;
|
||||
protected Collection<SeqVertex> newMiddles = null;
|
||||
protected List<BaseEdge> edgesToRemove = null;
|
||||
|
||||
/**
|
||||
* Create a new graph that contains the vertices in toSplitsArg with their shared suffix and prefix
|
||||
* sequences extracted out.
|
||||
*
|
||||
* @param graph the graph containing the vertices in toSplitsArg
|
||||
* @param toSplitsArg a collection of vertices to split. Must be contained within graph, and have only connections
|
||||
* from a single shared top and/or bottom node
|
||||
*/
|
||||
public SharedVertexSequenceSplitter(final SeqGraph graph, final Collection<SeqVertex> toSplitsArg) {
|
||||
if ( graph == null ) throw new IllegalArgumentException("graph cannot be null");
|
||||
if ( toSplitsArg == null ) throw new IllegalArgumentException("toSplitsArg cannot be null");
|
||||
if ( toSplitsArg.size() < 2 ) throw new IllegalArgumentException("Can only split at least 2 vertices but only got " + toSplitsArg);
|
||||
if ( ! graph.vertexSet().containsAll(toSplitsArg) ) throw new IllegalArgumentException("graph doesn't contain all of the vertices to split");
|
||||
|
||||
this.outer = graph;
|
||||
this.toSplits = toSplitsArg;
|
||||
|
||||
// all of the edges point to the same sink, so it's time to merge
|
||||
final Pair<SeqVertex, SeqVertex> prefixAndSuffix = commonPrefixAndSuffixOfVertices(toSplits);
|
||||
prefixV = prefixAndSuffix.getFirst();
|
||||
suffixV = prefixAndSuffix.getSecond();
|
||||
}
|
||||
|
||||
/**
|
||||
* Given sequencing that are all equal, does this splitter make those into prefix or suffix nodes?
|
||||
* @return true if we merge equal nodes into prefix nodes or suffix nodes
|
||||
*/
|
||||
protected static boolean prefersPrefixMerging() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple single-function interface to split and then update a graph
|
||||
*
|
||||
* @see #updateGraph(SeqVertex, SeqVertex) for a full description of top and bottom
|
||||
*
|
||||
* @param top the top vertex, may be null
|
||||
* @param bottom the bottom vertex, may be null
|
||||
* @return true if some useful splitting was done, false otherwise
|
||||
*/
|
||||
public boolean splitAndUpdate(final SeqVertex top, final SeqVertex bottom) {
|
||||
split();
|
||||
updateGraph(top, bottom);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does either the common suffix or prefix have at least minCommonSequence bases in it?
|
||||
* @param minCommonSequence a minimum length of the common sequence, must be >= 0
|
||||
* @return true if either suffix or prefix length >= minCommonSequence
|
||||
*/
|
||||
public boolean meetsMinMergableSequenceForEitherPrefixOrSuffix(final int minCommonSequence) {
|
||||
return meetsMinMergableSequenceForPrefix(minCommonSequence) || meetsMinMergableSequenceForSuffix(minCommonSequence);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the common prefix have at least minCommonSequence bases in it?
|
||||
* @param minCommonSequence a minimum length of the common sequence, must be >= 0
|
||||
* @return true if prefix length >= minCommonSequence
|
||||
*/
|
||||
public boolean meetsMinMergableSequenceForPrefix(final int minCommonSequence) {
|
||||
return prefixV.length() >= minCommonSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does the common suffix have at least minCommonSequence bases in it?
|
||||
* @param minCommonSequence a minimum length of the common sequence, must be >= 0
|
||||
* @return true if suffix length >= minCommonSequence
|
||||
*/
|
||||
public boolean meetsMinMergableSequenceForSuffix(final int minCommonSequence) {
|
||||
return suffixV.length() >= minCommonSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually do the splitting up of the vertices
|
||||
*
|
||||
* Must be called before calling updateGraph
|
||||
*/
|
||||
public void split() {
|
||||
splitGraph = new SeqGraph();
|
||||
newMiddles = new LinkedList<SeqVertex>();
|
||||
edgesToRemove = new LinkedList<BaseEdge>();
|
||||
|
||||
splitGraph.addVertices(prefixV, suffixV);
|
||||
|
||||
for ( final SeqVertex mid : toSplits ) {
|
||||
final BaseEdge toMid = processEdgeToRemove(mid, outer.incomingEdgeOf(mid));
|
||||
final BaseEdge fromMid = processEdgeToRemove(mid, outer.outgoingEdgeOf(mid));
|
||||
|
||||
final SeqVertex remaining = mid.withoutPrefixAndSuffix(prefixV.getSequence(), suffixV.getSequence());
|
||||
if ( remaining != null ) {
|
||||
// there's some sequence prefix + seq + suffix, so add the node and make edges
|
||||
splitGraph.addVertex(remaining);
|
||||
newMiddles.add(remaining);
|
||||
// update edge from top -> middle to be top -> without suffix
|
||||
splitGraph.addEdge(prefixV, remaining, toMid);
|
||||
splitGraph.addEdge(remaining, suffixV, fromMid);
|
||||
} else {
|
||||
// prefix + suffix completely explain this node
|
||||
splitGraph.addOrUpdateEdge(prefixV, suffixV, new BaseEdge(toMid).add(fromMid));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update graph outer, replacing the previous middle vertices that were split out with the new
|
||||
* graph structure of the split, linking this subgraph into the graph at top and bot (the
|
||||
* vertex connecting the middle nodes and the vertex outgoing of all middle node)
|
||||
*
|
||||
* @param top an optional top node that must have outgoing edges to all split vertices. If null, this subgraph
|
||||
* will be added without any incoming edges
|
||||
* @param bot an optional bottom node that must have incoming edges to all split vertices. If null, this subgraph
|
||||
* will be added without any outgoing edges to the rest of the graph
|
||||
*/
|
||||
public void updateGraph(final SeqVertex top, final SeqVertex bot) {
|
||||
if ( ! outer.vertexSet().containsAll(toSplits) ) throw new IllegalArgumentException("graph doesn't contain all of the original vertices to split");
|
||||
if ( top == null && bot == null ) throw new IllegalArgumentException("Cannot update graph without at least one top or bot vertex, but both were null");
|
||||
if ( top != null && ! outer.containsVertex(top) ) throw new IllegalArgumentException("top " + top + " not found in graph " + outer);
|
||||
if ( bot != null && ! outer.containsVertex(bot) ) throw new IllegalArgumentException("bot " + bot + " not found in graph " + outer);
|
||||
if ( splitGraph == null ) throw new IllegalStateException("Cannot call updateGraph until split() has been called");
|
||||
|
||||
outer.removeAllVertices(toSplits);
|
||||
outer.removeAllEdges(edgesToRemove);
|
||||
|
||||
outer.addVertices(newMiddles);
|
||||
|
||||
final boolean hasPrefixSuffixEdge = splitGraph.getEdge(prefixV, suffixV) != null;
|
||||
final boolean hasOnlyPrefixSuffixEdges = hasPrefixSuffixEdge && splitGraph.outDegreeOf(prefixV) == 1;
|
||||
final boolean needPrefixNode = ! prefixV.isEmpty() || (top == null && ! hasOnlyPrefixSuffixEdges);
|
||||
final boolean needSuffixNode = ! suffixV.isEmpty() || (bot == null && ! hasOnlyPrefixSuffixEdges);
|
||||
|
||||
// if prefix / suffix are needed, keep them
|
||||
final SeqVertex topForConnect = needPrefixNode ? prefixV : top;
|
||||
final SeqVertex botForConnect = needSuffixNode ? suffixV : bot;
|
||||
|
||||
if ( needPrefixNode ) {
|
||||
outer.addVertex(prefixV);
|
||||
if ( top != null ) outer.addEdge(top, prefixV, BaseEdge.orRef(splitGraph.outgoingEdgesOf(prefixV), 0));
|
||||
}
|
||||
|
||||
if ( needSuffixNode ) {
|
||||
outer.addVertex(suffixV);
|
||||
if ( bot != null ) outer.addEdge(suffixV, bot, BaseEdge.orRef(splitGraph.incomingEdgesOf(suffixV), 0));
|
||||
}
|
||||
|
||||
if ( topForConnect != null ) {
|
||||
for ( final BaseEdge e : splitGraph.outgoingEdgesOf(prefixV) ) {
|
||||
final SeqVertex target = splitGraph.getEdgeTarget(e);
|
||||
|
||||
if ( target == suffixV ) { // going straight from prefix -> suffix
|
||||
if ( botForConnect != null )
|
||||
outer.addEdge(topForConnect, botForConnect, e);
|
||||
} else {
|
||||
outer.addEdge(topForConnect, target, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( botForConnect != null ) {
|
||||
for ( final BaseEdge e : splitGraph.incomingEdgesOf(suffixV) ) {
|
||||
outer.addEdge(splitGraph.getEdgeSource(e), botForConnect, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the longest suffix of bases shared among all provided vertices
|
||||
*
|
||||
* For example, if the vertices have sequences AC, CC, and ATC, this would return
|
||||
* a single C. However, for ACC and TCC this would return CC. And for AC and TG this
|
||||
* would return null;
|
||||
*
|
||||
* @param middleVertices a non-empty set of vertices
|
||||
* @return
|
||||
*/
|
||||
@Requires("!middleVertices.isEmpty()")
|
||||
protected static Pair<SeqVertex, SeqVertex> commonPrefixAndSuffixOfVertices(final Collection<SeqVertex> middleVertices) {
|
||||
final List<byte[]> kmers = new ArrayList<byte[]>(middleVertices.size());
|
||||
|
||||
int min = Integer.MAX_VALUE;
|
||||
for ( final SeqVertex v : middleVertices ) {
|
||||
kmers.add(v.getSequence());
|
||||
min = Math.min(min, v.getSequence().length);
|
||||
}
|
||||
|
||||
final int prefixLen = GraphUtils.compPrefixLen(kmers, min);
|
||||
final int suffixLen = GraphUtils.compSuffixLen(kmers, min - prefixLen);
|
||||
|
||||
final byte[] kmer = kmers.get(0);
|
||||
final byte[] prefix = Arrays.copyOfRange(kmer, 0, prefixLen);
|
||||
final byte[] suffix = Arrays.copyOfRange(kmer, kmer.length - suffixLen, kmer.length);
|
||||
return new Pair<SeqVertex, SeqVertex>(new SeqVertex(prefix), new SeqVertex(suffix));
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function that returns an edge that we should use for splitting
|
||||
*
|
||||
* If e is null, creates a new 0 multiplicity edge, set to ref is any edges to V are ref
|
||||
* If e is not null, returns a new copy of e, and schedules e for removal
|
||||
*
|
||||
* @param e a non-null edge
|
||||
* @return a non-null edge
|
||||
*/
|
||||
@Requires("v != null")
|
||||
@Ensures("result != null")
|
||||
private BaseEdge processEdgeToRemove(final SeqVertex v, final BaseEdge e) {
|
||||
if ( e == null ) {
|
||||
// there's no edge, so we return a newly allocated one and don't schedule e for removal
|
||||
// the weight must be 0 to preserve sum through the diamond
|
||||
return new BaseEdge(outer.isReferenceNode(v), 0);
|
||||
} else {
|
||||
// schedule edge for removal, and return a freshly allocated one for our graph to use
|
||||
edgesToRemove.add(e);
|
||||
return new BaseEdge(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -47,7 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.indels;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
|
|||
|
|
@ -62,7 +62,8 @@ import org.broadinstitute.sting.gatk.walkers.BAQMode;
|
|||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.Parameters;
|
||||
import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -87,7 +88,7 @@ import java.io.IOException;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Performs local realignment of reads based on misalignments due to the presence of indels.
|
||||
* Performs local realignment of reads to correct misalignments due to the presence of indels.
|
||||
*
|
||||
* <p>
|
||||
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
||||
|
|
@ -100,39 +101,46 @@ import java.util.*;
|
|||
* indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
|
||||
* appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
|
||||
* specifically identify indels.
|
||||
* <p>
|
||||
* </p>
|
||||
* <ol>There are 2 steps to the realignment process:
|
||||
* <li>Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)</li>
|
||||
* <li>Running the realigner over those intervals (IndelRealigner)</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step.
|
||||
* <p>
|
||||
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
|
||||
* (or with reads from similar technologies).
|
||||
* For more details, see http://www.broadinstitute.org/gatk/guide/article?id=38
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* One or more aligned BAM files and optionally one or more lists of known indels.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A realigned version of your input BAM file(s).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Example</h3>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -I input.bam \
|
||||
* -R ref.fasta \
|
||||
* -T IndelRealigner \
|
||||
* -R ref.fasta \
|
||||
* -I input.bam \
|
||||
* -targetIntervals intervalListFromRTC.intervals \
|
||||
* -o realignedBam.bam \
|
||||
* [-known /path/to/indels.vcf] \
|
||||
* [-compress 0] (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
|
||||
* </pre>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
*
|
||||
* <ul><li>
|
||||
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step.
|
||||
* </li><li>
|
||||
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
|
||||
* (or with reads from similar technologies).
|
||||
* </li></ul>
|
||||
*
|
||||
* @author ebanks
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
|
||||
|
|
@ -168,7 +176,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
/**
|
||||
* The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s).
|
||||
*/
|
||||
@Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
|
||||
@Input(fullName="targetIntervals", shortName="targetIntervals", doc="Intervals file output from RealignerTargetCreator", required=true)
|
||||
protected IntervalBinding<Feature> intervalsFile = null;
|
||||
|
||||
/**
|
||||
|
|
@ -182,7 +190,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
/**
|
||||
* The realigned bam file.
|
||||
*/
|
||||
@Output(required=false, doc="Output bam")
|
||||
@Output(required=false, doc="Output bam", defaultToStdout=false)
|
||||
protected StingSAMFileWriter writer = null;
|
||||
protected ConstrainedMateFixingManager manager = null;
|
||||
protected SAMFileWriter writerToUse = null;
|
||||
|
|
@ -203,7 +211,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
* push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
|
||||
@Argument(fullName="entropyThreshold", shortName="entropy", doc="Percentage of mismatches at a locus to be considered having high entropy (0.0 < entropy <= 1.0)", required=false)
|
||||
protected double MISMATCH_THRESHOLD = 0.15;
|
||||
|
||||
/**
|
||||
|
|
@ -225,21 +233,21 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
* For expert users only!
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false)
|
||||
@Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="Maximum positional move in basepairs that a read can be adjusted during realignment", required=false)
|
||||
protected int MAX_POS_MOVE_ALLOWED = 200;
|
||||
|
||||
/**
|
||||
* For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false)
|
||||
@Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="Max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false)
|
||||
protected int MAX_CONSENSUSES = 30;
|
||||
|
||||
/**
|
||||
* For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
|
||||
@Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="Max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
|
||||
protected int MAX_READS_FOR_CONSENSUSES = 120;
|
||||
|
||||
/**
|
||||
|
|
@ -247,7 +255,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
* If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false)
|
||||
@Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="Max reads allowed at an interval for realignment", required=false)
|
||||
protected int MAX_READS = 20000;
|
||||
|
||||
@Advanced
|
||||
|
|
@ -263,7 +271,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
*
|
||||
* Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing).
|
||||
*/
|
||||
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
|
||||
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file (not compatible with -output)")
|
||||
protected String N_WAY_OUT = null;
|
||||
|
||||
@Hidden
|
||||
|
|
@ -288,15 +296,15 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
protected boolean KEEP_ALL_PG_RECORDS = false;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
|
||||
@Output(fullName="indelsFileForDebugging", shortName="indels", required=false, defaultToStdout=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
|
||||
protected String OUT_INDELS = null;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false)
|
||||
@Output(fullName="statisticsFileForDebugging", shortName="stats", doc="print out statistics (what does or doesn't get cleaned); FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false)
|
||||
protected String OUT_STATS = null;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false)
|
||||
@Output(fullName="SNPsFileForDebugging", shortName="snps", doc="print out whether mismatching columns do or don't get cleaned out; FOR DEBUGGING PURPOSES ONLY", required=false, defaultToStdout=false)
|
||||
protected String OUT_SNPS = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
|
|
@ -321,10 +329,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
// fraction of mismatches that need to no longer mismatch for a column to be considered cleaned
|
||||
private static final double MISMATCH_COLUMN_CLEANED_FRACTION = 0.75;
|
||||
|
||||
private static final double SW_MATCH = 30.0; // 1.0;
|
||||
private static final double SW_MISMATCH = -10.0; //-1.0/3.0;
|
||||
private static final double SW_GAP = -10.0; //-1.0-1.0/3.0;
|
||||
private static final double SW_GAP_EXTEND = -2.0; //-1.0/.0;
|
||||
private final static Parameters swParameters = new Parameters(30.0, -10.0, -10.0, -2.0);
|
||||
|
||||
// reference base padding size
|
||||
// TODO -- make this a command-line argument if the need arises
|
||||
|
|
@ -992,7 +997,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
private void createAndAddAlternateConsensus(final byte[] read, final Set<Consensus> altConsensesToPopulate, final byte[] reference) {
|
||||
|
||||
// do a pairwise alignment against the reference
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, swParameters);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read);
|
||||
if ( c != null )
|
||||
altConsensesToPopulate.add(c);
|
||||
|
|
@ -1009,7 +1014,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
}
|
||||
// do a pairwise alignment against the reference
|
||||
SWalignmentRuns++;
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read.getReadBases(), swParameters);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read.getReadBases());
|
||||
if ( c != null ) {
|
||||
altConsensesToPopulate.add(c);
|
||||
|
|
|
|||
|
|
@ -68,17 +68,17 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
* placed at multiple positions and still represent the same haplotype. While a standard convention is to place an
|
||||
* indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* A bam file to left-align.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A left-aligned bam.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* java -Xmx3g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
|
@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel {
|
|||
final ReferenceContext ref,
|
||||
final int eventLength,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
|
||||
final double downsamplingFraction,
|
||||
final PrintStream downsamplingLog) {
|
||||
final double downsamplingFraction) {
|
||||
final int numHaplotypes = haplotypeMap.size();
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
|
||||
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
|
||||
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
|
||||
|
||||
}
|
||||
|
|
@ -247,8 +245,13 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
else {
|
||||
final int refWindowStart = ref.getWindow().getStart();
|
||||
final int refWindowStop = ref.getWindow().getStop();
|
||||
// extra padding on candidate haplotypes to make sure reads are always strictly contained
|
||||
// in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly
|
||||
// in case bases at edge of reads have lower quality.
|
||||
final int trailingBases = 3;
|
||||
final int extraOffset = Math.abs(eventLength);
|
||||
final int refWindowStart = ref.getWindow().getStart()+(trailingBases+extraOffset);
|
||||
final int refWindowStop = ref.getWindow().getStop()-(trailingBases+extraOffset);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
|
||||
|
|
@ -257,10 +260,10 @@ public class PairHMMIndelErrorModel {
|
|||
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
|
||||
|
||||
if (!read.isEmpty() && (read.getSoftEnd() > refWindowStop && read.getSoftStart() < refWindowStop))
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, ref.getWindow().getStop());
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, refWindowStop);
|
||||
|
||||
if (!read.isEmpty() && (read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart))
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, ref.getWindow().getStart());
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, refWindowStart);
|
||||
|
||||
if (read.isEmpty())
|
||||
continue;
|
||||
|
|
@ -272,7 +275,6 @@ public class PairHMMIndelErrorModel {
|
|||
continue;
|
||||
|
||||
// get bases of candidate haplotypes that overlap with reads
|
||||
final int trailingBases = 3;
|
||||
final long readStart = read.getSoftStart();
|
||||
final long readEnd = read.getSoftEnd();
|
||||
|
||||
|
|
@ -288,7 +290,6 @@ public class PairHMMIndelErrorModel {
|
|||
final int numEndSoftClippedBases = softClips ? read.getSoftEnd()- read.getAlignmentEnd() : 0 ;
|
||||
final byte [] unclippedReadBases = read.getReadBases();
|
||||
final byte [] unclippedReadQuals = read.getBaseQualities();
|
||||
final int extraOffset = Math.abs(eventLength);
|
||||
|
||||
/**
|
||||
* Compute genomic locations that candidate haplotypes will span.
|
||||
|
|
@ -315,6 +316,7 @@ public class PairHMMIndelErrorModel {
|
|||
startLocationInRefForHaplotypes = ref.getWindow().getStop(); // read starts after haplotype: read will have to be clipped completely;
|
||||
}
|
||||
|
||||
// candidate haplotype cannot go beyond reference context
|
||||
if (stopLocationInRefForHaplotypes > ref.getWindow().getStop()) {
|
||||
stopLocationInRefForHaplotypes = ref.getWindow().getStop(); // check also if end of read will go beyond reference context
|
||||
}
|
||||
|
|
@ -349,7 +351,6 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
int j=0;
|
||||
|
||||
byte[] previousHaplotypeSeen = null;
|
||||
final byte[] contextLogGapOpenProbabilities = new byte[readBases.length];
|
||||
final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length];
|
||||
|
||||
|
|
@ -389,37 +390,30 @@ public class PairHMMIndelErrorModel {
|
|||
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
|
||||
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
|
||||
|
||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(),
|
||||
(int)indStart, (int)indStop);
|
||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
|
||||
|
||||
final int X_METRIC_LENGTH = readBases.length+2;
|
||||
final int Y_METRIC_LENGTH = haplotypeBases.length+2;
|
||||
// it's possible that the indel starts at the last base of the haplotypes
|
||||
if ( haplotypeBases.length == 0 ) {
|
||||
readLikelihood = -Double.MAX_VALUE;
|
||||
} else {
|
||||
if (firstHap) {
|
||||
//no need to reallocate arrays for each new haplotype, as length won't change
|
||||
pairHMM.initialize(readBases.length, haplotypeBases.length);
|
||||
firstHap = false;
|
||||
}
|
||||
|
||||
if (previousHaplotypeSeen == null) {
|
||||
//no need to reallocate arrays for each new haplotype, as length won't change
|
||||
pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
|
||||
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap);
|
||||
}
|
||||
|
||||
int startIndexInHaplotype = 0;
|
||||
if (previousHaplotypeSeen != null)
|
||||
startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen);
|
||||
previousHaplotypeSeen = haplotypeBases.clone();
|
||||
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
|
||||
baseInsertionQualities, baseDeletionQualities,
|
||||
contextLogGapContinuationProbabilities, startIndexInHaplotype, firstHap);
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("H:"+new String(haplotypeBases));
|
||||
System.out.println("R:"+new String(readBases));
|
||||
System.out.format("L:%4.2f\n",readLikelihood);
|
||||
System.out.format("StPos:%d\n", startIndexInHaplotype);
|
||||
}
|
||||
|
||||
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
|
||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||
firstHap = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue