Merge remote-tracking branch 'unstable/master'
This commit is contained in:
commit
70e2d21e12
12
build.xml
12
build.xml
|
|
@ -526,6 +526,7 @@
|
|||
<fileset dir="${lib.dir}">
|
||||
<include name="scala-compiler-*.jar"/>
|
||||
<include name="scala-library-*.jar"/>
|
||||
<include name="scala-reflect-*.jar"/>
|
||||
</fileset>
|
||||
</path>
|
||||
<taskdef resource="scala/tools/ant/antlib.xml">
|
||||
|
|
@ -537,7 +538,7 @@
|
|||
<target name="scala.compile" depends="init,resolve,gatk.compile,queue-extensions.generate,init.scala.compile" if="include.scala">
|
||||
<mkdir dir="${scala.classes}"/>
|
||||
<echo>Building Scala...</echo>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.classes}" classpathref="scala.dependencies" deprecation="yes" unchecked="yes">
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.classes}" classpathref="scala.dependencies" deprecation="yes" unchecked="yes" addparams="-feature">
|
||||
<src refid="scala.source.path" />
|
||||
<src path="${queue-extensions.source.dir}" />
|
||||
<include name="**/*.scala" />
|
||||
|
|
@ -1218,7 +1219,7 @@
|
|||
|
||||
<target name="test.scala.compile" depends="test.java.compile,scala.compile" if="include.scala">
|
||||
<echo message="Scala: Compiling test cases!"/>
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes">
|
||||
<scalac fork="true" jvmargs="-Xmx512m" destdir="${scala.test.classes}" deprecation="yes" unchecked="yes" addparams="-feature">
|
||||
<src refid="scala.test.source.path" />
|
||||
<classpath>
|
||||
<path refid="build.results"/>
|
||||
|
|
@ -1414,6 +1415,13 @@
|
|||
<run-test testtype="${pipetype}" outputdir="${report}/${pipetype}" runfailed="false"/>
|
||||
</target>
|
||||
|
||||
<target name="knowledgebasetest" depends="test.compile,test.init" description="Run knowledge base tests">
|
||||
<condition property="ktype" value="*KnowledgeBaseTest" else="${single}">
|
||||
<not><isset property="single"/></not>
|
||||
</condition>
|
||||
<run-test testtype="${ktype}" outputdir="${report}/${ktype}" runfailed="false"/>
|
||||
</target>
|
||||
|
||||
<target name="failed-unit" depends="test.compile,test.init">
|
||||
<run-test testtype="${report}/*UnitTest/testng-failed.xml" outputdir="${report}/failed_rerun" runfailed="true"/>
|
||||
</target>
|
||||
|
|
|
|||
4
ivy.xml
4
ivy.xml
|
|
@ -82,8 +82,8 @@
|
|||
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
|
||||
|
||||
<!-- Scala dependancies -->
|
||||
<dependency org="org.scala-lang" name="scala-compiler" rev="2.9.2"/>
|
||||
<dependency org="org.scala-lang" name="scala-library" rev="2.9.2"/>
|
||||
<dependency org="org.scala-lang" name="scala-compiler" rev="2.10.2"/>
|
||||
<dependency org="org.scala-lang" name="scala-library" rev="2.10.2"/>
|
||||
|
||||
<!-- testing and evaluation dependencies -->
|
||||
<dependency org="org.testng" name="testng" rev="6.8"/>
|
||||
|
|
|
|||
|
|
@ -112,18 +112,18 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
|
||||
private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) {
|
||||
|
||||
HashMap<Byte, Integer> alleleCounts = new HashMap<Byte, Integer>();
|
||||
for ( Allele allele : vc.getAlleles() )
|
||||
final HashMap<Byte, Integer> alleleCounts = new HashMap<>();
|
||||
for ( final Allele allele : vc.getAlleles() )
|
||||
alleleCounts.put(allele.getBases()[0], 0);
|
||||
|
||||
ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
for ( PileupElement p : pileup ) {
|
||||
final ReadBackedPileup pileup = stratifiedContext.getBasePileup();
|
||||
for ( final PileupElement p : pileup ) {
|
||||
if ( alleleCounts.containsKey(p.getBase()) )
|
||||
alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount());
|
||||
}
|
||||
|
||||
// we need to add counts in the correct order
|
||||
int[] counts = new int[alleleCounts.size()];
|
||||
final int[] counts = new int[alleleCounts.size()];
|
||||
counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
|
||||
for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
|
||||
counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);
|
||||
|
|
@ -141,7 +141,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa
|
|||
final HashMap<Allele, Integer> alleleCounts = new HashMap<>();
|
||||
for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); }
|
||||
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
for ( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles);
|
||||
if (! a.isInformative() ) continue; // read is non-informative
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
|
|
|
|||
|
|
@ -51,7 +51,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -68,11 +67,15 @@ import java.util.*;
|
|||
|
||||
|
||||
/**
|
||||
* The depth of coverage of each allele per sample
|
||||
* The depth of coverage for informative reads for each sample.
|
||||
*
|
||||
* the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot
|
||||
* differentiate between reads that align over the event but aren't informative vs. those that aren't even
|
||||
* close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP).
|
||||
* An informative read is defined as one from which the allele it carries can be easily distinguished. An example of a
|
||||
* case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear
|
||||
* whether the read contains the reference allele or e.g. an extra repeat.
|
||||
* The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can
|
||||
* only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator).
|
||||
* This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field
|
||||
* (i.e., sum(AD) = DP).
|
||||
*/
|
||||
public class DepthPerSampleHC extends GenotypeAnnotation {
|
||||
public void annotate(final RefMetaDataTracker tracker,
|
||||
|
|
@ -121,6 +124,6 @@ public class DepthPerSampleHC extends GenotypeAnnotation {
|
|||
}
|
||||
|
||||
public List<VCFFormatHeaderLine> getDescriptions() {
|
||||
return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
|
||||
return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
|
||||
}
|
||||
}
|
||||
|
|
@ -58,6 +58,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota
|
|||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.variant.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
|
|
@ -97,6 +99,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
if ( !vc.isVariant() )
|
||||
return null;
|
||||
|
||||
if ( vc.hasGenotypes() ) {
|
||||
final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() );
|
||||
if ( tableFromPerSampleAnnotations != null ) {
|
||||
return pValueForBestTable(tableFromPerSampleAnnotations, null);
|
||||
}
|
||||
}
|
||||
|
||||
if (vc.isSNP() && stratifiedContexts != null) {
|
||||
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1);
|
||||
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST);
|
||||
|
|
@ -117,6 +126,32 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together
|
||||
* @param genotypes the genotypes from which to pull out the per-sample strand bias annotation
|
||||
* @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation
|
||||
*/
|
||||
private int[][] getTableFromSamples( final GenotypesContext genotypes ) {
|
||||
if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); }
|
||||
|
||||
final int[] sbArray = {0,0,0,0}; // forward-reverse -by- alternate-reference
|
||||
boolean foundData = false;
|
||||
|
||||
for( final Genotype g : genotypes ) {
|
||||
if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) )
|
||||
continue;
|
||||
|
||||
foundData = true;
|
||||
final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME);
|
||||
final int[] data = encodeSBBS(sbbsString);
|
||||
for( int index = 0; index < sbArray.length; index++ ) {
|
||||
sbArray[index] += data[index];
|
||||
}
|
||||
}
|
||||
|
||||
return ( foundData ? decodeSBBS(sbArray) : null );
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
|
||||
*
|
||||
|
|
@ -148,12 +183,56 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
}
|
||||
|
||||
public List<String> getKeyNames() {
|
||||
return Arrays.asList(FS);
|
||||
return Collections.singletonList(FS);
|
||||
}
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() {
|
||||
return Arrays.asList(
|
||||
new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias"));
|
||||
return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to turn the FisherStrand table into the SB annotation array
|
||||
* @param table the table used by the FisherStrand annotation
|
||||
* @return the array used by the per-sample Strand Bias annotation
|
||||
*/
|
||||
public static int[] getContingencyArray( final int[][] table ) {
|
||||
if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
|
||||
if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); }
|
||||
final int[] array = new int[4]; // TODO - if we ever want to do something clever with multi-allelic sites this will need to change
|
||||
array[0] = table[0][0];
|
||||
array[1] = table[0][1];
|
||||
array[2] = table[1][0];
|
||||
array[3] = table[1][1];
|
||||
return array;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to parse the genotype annotation into the SB annotation array
|
||||
* @param string the string that is returned by genotype.getAnnotation("SB")
|
||||
* @return the array used by the per-sample Strand Bias annotation
|
||||
*/
|
||||
private static int[] encodeSBBS( final String string ) {
|
||||
final int[] array = new int[4];
|
||||
final StringTokenizer tokenizer = new StringTokenizer(string, ",", false);
|
||||
for( int index = 0; index < 4; index++ ) {
|
||||
array[index] = Integer.parseInt(tokenizer.nextToken());
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function to turn the SB annotation array into the FisherStrand table
|
||||
* @param array the array used by the per-sample Strand Bias annotation
|
||||
* @return the table used by the FisherStrand annotation
|
||||
*/
|
||||
private static int[][] decodeSBBS( final int[] array ) {
|
||||
if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); }
|
||||
final int[][] table = new int[2][2];
|
||||
table[0][0] = array[0];
|
||||
table[0][1] = array[1];
|
||||
table[1][0] = array[2];
|
||||
table[1][1] = array[3];
|
||||
return table;
|
||||
}
|
||||
|
||||
private Double pValueForContingencyTable(int[][] originalTable) {
|
||||
|
|
@ -284,13 +363,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat
|
|||
* allele2 # #
|
||||
* @return a 2x2 contingency table
|
||||
*/
|
||||
private static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
|
||||
public static int[][] getContingencyTable( final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) {
|
||||
if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); }
|
||||
if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); }
|
||||
|
||||
final Allele ref = vc.getReference();
|
||||
final Allele alt = vc.getAltAlleleWithHighestAlleleCount();
|
||||
int[][] table = new int[2][2];
|
||||
final int[][] table = new int[2][2];
|
||||
|
||||
for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
|
||||
for (Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
|
||||
for (final Map.Entry<GATKSAMRecord,Map<Allele,Double>> el : maps.getLikelihoodReadMap().entrySet()) {
|
||||
final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
|
||||
final GATKSAMRecord read = el.getKey();
|
||||
final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1;
|
||||
|
|
|
|||
|
|
@ -70,14 +70,16 @@ import java.util.*;
|
|||
*
|
||||
* A continuous generalization of the Hardy-Weinberg test for disequilibrium that works
|
||||
* well with limited coverage per sample. See the 1000 Genomes Phase I release for
|
||||
* more information. Note that the Inbreeding Coefficient will not be calculated for files
|
||||
* with fewer than a minimum (generally 10) number of samples.
|
||||
* more information. Note that the Inbreeding Coefficient can only be calculated for
|
||||
* cohorts containing at least 10 founder samples.
|
||||
*/
|
||||
public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
|
||||
|
||||
private static final int MIN_SAMPLES = 10;
|
||||
private static final String INBREEDING_COEFFICIENT_KEY_NAME = "InbreedingCoeff";
|
||||
private Set<String> founderIds;
|
||||
|
||||
@Override
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
final ReferenceContext ref,
|
||||
|
|
@ -92,15 +94,15 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
|
||||
private Map<String, Object> calculateIC(final VariantContext vc) {
|
||||
final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds);
|
||||
if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant())
|
||||
if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant())
|
||||
return null;
|
||||
|
||||
int idxAA = 0, idxAB = 1, idxBB = 2;
|
||||
|
||||
if (!vc.isBiallelic()) {
|
||||
// for non-bliallelic case, do test with most common alt allele.
|
||||
// Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB.
|
||||
int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount());
|
||||
// Get then corresponding indices in GL vectors to retrieve GL of AA,AB and BB.
|
||||
final int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount());
|
||||
idxAA = idxVector[0];
|
||||
idxAB = idxVector[1];
|
||||
idxBB = idxVector[2];
|
||||
|
|
@ -132,12 +134,12 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno
|
|||
final double q = 1.0 - p; // expected alternative allele frequency
|
||||
final double F = 1.0 - ( hetCount / ( 2.0 * p * q * (double)N ) ); // inbreeding coefficient
|
||||
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
map.put(getKeyNames().get(0), String.format("%.4f", F));
|
||||
return map;
|
||||
return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F));
|
||||
}
|
||||
|
||||
public List<String> getKeyNames() { return Arrays.asList("InbreedingCoeff"); }
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Collections.singletonList(INBREEDING_COEFFICIENT_KEY_NAME); }
|
||||
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("InbreedingCoeff", 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); }
|
||||
@Override
|
||||
public List<VCFInfoHeaderLine> getDescriptions() { return Collections.singletonList(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY_NAME, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); }
|
||||
}
|
||||
|
|
@ -54,6 +54,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverage;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
|
@ -94,19 +96,20 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
if ( !genotype.isHet() && !genotype.isHomVar() )
|
||||
continue;
|
||||
|
||||
if (stratifiedContexts!= null) {
|
||||
AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) {
|
||||
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
|
||||
if ( context == null )
|
||||
continue;
|
||||
depth += context.getBasePileup().depthOfCoverage();
|
||||
|
||||
}
|
||||
else if (perReadAlleleLikelihoodMap != null) {
|
||||
PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName());
|
||||
} else if (perReadAlleleLikelihoodMap != null) {
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName());
|
||||
if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty())
|
||||
continue;
|
||||
|
||||
depth += perReadAlleleLikelihoods.getNumberOfStoredElements();
|
||||
} else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed
|
||||
depth += genotype.getDP();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -116,7 +119,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati
|
|||
final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc);
|
||||
double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength);
|
||||
QD = fixTooHighQD(QD);
|
||||
Map<String, Object> map = new HashMap<String, Object>();
|
||||
Map<String, Object> map = new HashMap<>();
|
||||
map.put(getKeyNames().get(0), String.format("%.2f", QD));
|
||||
return map;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
final Map<String, AlignmentContext> stratifiedContexts,
|
||||
final VariantContext vc,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedPerReadAlleleLikelihoodMap) {
|
||||
// either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
|
||||
// either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null
|
||||
|
||||
final GenotypesContext genotypes = vc.getGenotypes();
|
||||
if (genotypes == null || genotypes.size() == 0)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,100 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLineType;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias
|
||||
* User: rpoplin
|
||||
* Date: 8/28/13
|
||||
*/
|
||||
|
||||
public class StrandBiasBySample extends GenotypeAnnotation implements ExperimentalAnnotation {
|
||||
|
||||
public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB";
|
||||
|
||||
@Override
|
||||
public void annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
final ReferenceContext ref,
|
||||
final AlignmentContext stratifiedContext,
|
||||
final VariantContext vc,
|
||||
final Genotype g,
|
||||
final GenotypeBuilder gb,
|
||||
final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
|
||||
if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) )
|
||||
return;
|
||||
|
||||
if (alleleLikelihoodMap == null )
|
||||
throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller");
|
||||
|
||||
final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc);
|
||||
|
||||
gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table));
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); }
|
||||
|
||||
@Override
|
||||
public List<VCFFormatHeaderLine> getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); }
|
||||
|
||||
}
|
||||
|
|
@ -84,10 +84,10 @@ import java.util.List;
|
|||
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
|
||||
*
|
||||
* <p>
|
||||
* This walker will generated reduced versions of the BAM files that still follow the BAM spec
|
||||
* and contain all the information necessary for the GSA variant calling pipeline. Some options
|
||||
* allow you to tune in how much compression you want to achieve. The default values have been
|
||||
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
|
||||
* This tool will generate reduced versions of the BAM files that still follow the BAM specification
|
||||
* and contain all the information necessary to call variants according to the GATK Best Practices recommendations.
|
||||
* Some options allow you to tune how much compression you want to achieve. The default values have been
|
||||
* shown to reduce a typical whole exome BAM file by 100x. The higher the coverage, the bigger the
|
||||
* savings in file size and performance of the downstream tools.
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
|
|
@ -121,25 +121,25 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
private SAMFileWriter writerToUse = null;
|
||||
|
||||
/**
|
||||
* The number of bases to keep around mismatches (potential variation)
|
||||
*
|
||||
*/
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "The number of bases to keep around mismatches (potential variation)", required = false)
|
||||
public int contextSize = 10;
|
||||
|
||||
/**
|
||||
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
|
||||
* Reads that have
|
||||
* mapping quality below this threshold will not be counted towards consensus, but are still counted
|
||||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "The minimum mapping quality to be considered for the consensus synthetic read", required = false)
|
||||
public int minMappingQuality = 20;
|
||||
|
||||
/**
|
||||
* The minimum base quality to be considered for the consensus synthetic read. Reads that have
|
||||
* Reads that have
|
||||
* base quality below this threshold will not be counted towards consensus, but are still counted
|
||||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "The minimum base quality to be considered for the consensus synthetic read", required = false)
|
||||
public byte minBaseQual = 15;
|
||||
|
||||
/**
|
||||
|
|
@ -160,81 +160,77 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
|
||||
* and read group).
|
||||
* This strips away all extra information of the read -- anything other than bases, quals
|
||||
* and read group.
|
||||
*/
|
||||
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
|
||||
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "Do not simplify read", required = false)
|
||||
public boolean DONT_SIMPLIFY_READS = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
|
||||
* The program will behave correctly in those cases.
|
||||
* Note that it is not necessary to turn this on for reads that are not mate paired.
|
||||
* The program will behave correctly by default in those cases.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
|
||||
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "Do not hard clip adaptor sequences", required = false)
|
||||
public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
|
||||
* This option overrides the argument of minimum tail
|
||||
* quality.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
|
||||
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "Do not hard clip the low quality tails of the reads", required = false)
|
||||
public boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
|
||||
/**
|
||||
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
|
||||
* By default, ReduceReads will hard clip away any low quality soft clipped
|
||||
* base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant
|
||||
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
|
||||
*/
|
||||
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
|
||||
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "Do not use high quality soft-clipped bases", required = false)
|
||||
public boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
|
||||
/**
|
||||
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
|
||||
* By default, ReduceReads will compress read names to numbers and guarantee
|
||||
* uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather
|
||||
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
|
||||
*/
|
||||
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
|
||||
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "Do not compress read names", required = false)
|
||||
public boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
|
||||
/**
|
||||
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
|
||||
* border.
|
||||
* The hard clips will happen exactly at the interval border.
|
||||
*/
|
||||
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
|
||||
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "Hard clip all incoming reads to the desired intervals", required = false)
|
||||
public boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
|
||||
/**
|
||||
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
|
||||
* Anything below this will be
|
||||
* considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that
|
||||
* this value is used only regions with high coverage.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "Minimum proportion of mismatches in a site to trigger a variant region", required = false)
|
||||
public double minAltProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region.
|
||||
* Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to
|
||||
* trigger polyploid compression). Note that this value is used only regions with low coverage.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false)
|
||||
@Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region", required = false)
|
||||
public double minAltPValueToTriggerVariant = 0.01;
|
||||
|
||||
/**
|
||||
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
* Anything below this will be considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
|
||||
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "Minimum proportion of indels in a site to trigger a variant region", required = false)
|
||||
public double minIndelProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* The number of reads emitted per sample in a variant region can be downsampled for better compression.
|
||||
* This level of downsampling only happens after the region has been evaluated, therefore it can
|
||||
* be combined with the engine level downsampling.
|
||||
* A value of 0 turns downsampling off.
|
||||
*/
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "Downsample the number of reads emitted per sample in a variant region for better compression", required = false)
|
||||
public int downsampleCoverage = 250;
|
||||
|
||||
/**
|
||||
|
|
@ -243,27 +239,27 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
* To prevent users from unintentionally running the tool in a less than ideal manner, we require them
|
||||
* to explicitly enable multi-sample analysis with this argument.
|
||||
*/
|
||||
@Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "enable multi-samples reduction for cancer analysis", required = false)
|
||||
@Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "Enable multi-sample reduction for cancer analysis", required = false)
|
||||
public boolean ALLOW_MULTIPLE_SAMPLES = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false)
|
||||
@Argument(fullName = "nwayout", shortName = "nw", doc = "Generate separate output files per input file", required = false)
|
||||
public boolean nwayout = false;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
|
||||
@Argument(fullName = "", shortName = "dl", doc = "Debug level", required = false)
|
||||
public int debugLevel = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
|
||||
@Argument(fullName = "", shortName = "dr", doc = "Debug read", required = false)
|
||||
public String debugRead = "";
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
|
||||
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "Downsampling strategy", required = false)
|
||||
public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
|
||||
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags", required = false)
|
||||
public boolean NO_PG_TAG = false;
|
||||
|
||||
public enum DownsampleStrategy {
|
||||
|
|
@ -297,7 +293,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes");
|
||||
|
||||
if ( nwayout && out != null )
|
||||
throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other");
|
||||
throw new UserException.CommandLineException("--out and --nwayout cannot be used simultaneously; please use one or the other");
|
||||
|
||||
if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 )
|
||||
throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
|
@ -306,7 +302,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
||||
if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES )
|
||||
throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis");
|
||||
throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis. If that is what you want to do, use the -cancer_mode flag.");
|
||||
|
||||
if ( known.isEmpty() )
|
||||
knownSnpPositions = null;
|
||||
|
|
|
|||
|
|
@ -1033,7 +1033,7 @@ public class SlidingWindow {
|
|||
protected void actuallyUpdateHeaderForRead(final LinkedList<HeaderElement> header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) {
|
||||
|
||||
final Iterator<HeaderElement> headerElementIterator = header.listIterator(startIndex);
|
||||
final byte mappingQuality = (byte) read.getMappingQuality();
|
||||
final int mappingQuality = read.getMappingQuality();
|
||||
final boolean isNegativeStrand = read.getReadNegativeStrandFlag();
|
||||
|
||||
// iterator variables
|
||||
|
|
@ -1062,14 +1062,15 @@ public class SlidingWindow {
|
|||
|
||||
break;
|
||||
case D:
|
||||
// deletions are added to the baseCounts with the read mapping quality as it's quality score
|
||||
// deletions are added to the baseCounts with the read mapping quality as its quality score
|
||||
final int nDeletionBases = cigarElement.getLength();
|
||||
final byte MQbyte = mappingQuality > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)mappingQuality;
|
||||
for ( int i = 0; i < nDeletionBases; i++ ) {
|
||||
headerElement = headerElementIterator.next();
|
||||
if (removeRead)
|
||||
headerElement.removeBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand);
|
||||
headerElement.removeBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand);
|
||||
else
|
||||
headerElement.addBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand);
|
||||
headerElement.addBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand);
|
||||
}
|
||||
locationIndex += nDeletionBases;
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -101,11 +101,24 @@ public class FindCoveredIntervals extends ActiveRegionWalker<GenomeLoc, Long> {
|
|||
@Argument(fullName = "coverage_threshold", shortName = "cov", doc = "The minimum allowable coverage to be considered covered", required = false)
|
||||
private int coverageThreshold = 20;
|
||||
|
||||
@Argument(fullName = "minBaseQuality", shortName = "minBQ", doc = "The minimum allowable base quality score to be counted for coverage",required = false)
|
||||
private int minBaseQuality = 0;
|
||||
|
||||
@Argument(fullName = "minMappingQuality", shortName = "minMQ", doc = "The minimum allowable mapping quality score to be counted for coverage",required = false)
|
||||
private int minMappingQuality = 0;
|
||||
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
// Look to see if the region has sufficient coverage
|
||||
public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
|
||||
|
||||
int depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage();
|
||||
int depth;
|
||||
if(minBaseQuality == 0 && minMappingQuality == 0)
|
||||
depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage();
|
||||
else
|
||||
depth = context.getBasePileup().getBaseAndMappingFilteredPileup(minBaseQuality,minMappingQuality).depthOfCoverage();
|
||||
|
||||
// note the linear probability scale
|
||||
return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1));
|
||||
|
|
|
|||
|
|
@ -89,7 +89,12 @@ import java.util.*;
|
|||
* <p/>
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* A modified VCF detailing each interval by sample
|
||||
* A modified VCF detailing each interval by sample and information for each interval according to the thresholds used.
|
||||
* Interval information includes GC Content, average interval depth, callable status among others.
|
||||
*
|
||||
* If you use the --missing option, you can get as a second output a intervals file with the loci that have missing data.
|
||||
* This file can then be used as input to QualifyMissingIntervals for full qualification and interpretation of why
|
||||
* the data is missing.
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h3>Examples</h3>
|
||||
|
|
@ -117,6 +122,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
private static final String AVG_INTERVAL_DP_KEY = "IDP";
|
||||
private static final String LOW_COVERAGE_LOCI = "LL";
|
||||
private static final String ZERO_COVERAGE_LOCI = "ZL";
|
||||
private static final String GC_CONTENT_KEY = "GC";
|
||||
|
||||
|
||||
@Output(doc = "File to which interval statistics should be written")
|
||||
|
|
@ -161,7 +167,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
|
||||
// at this point, all intervals in intervalMap overlap with this locus, so update all of them
|
||||
for (IntervalStratification intervalStratification : intervalMap.values())
|
||||
intervalStratification.addLocus(context);
|
||||
intervalStratification.addLocus(context, ref);
|
||||
|
||||
return 1L;
|
||||
}
|
||||
|
|
@ -276,6 +282,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
|
||||
attributes.put(VCFConstants.END_KEY, interval.getStop());
|
||||
attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size()));
|
||||
attributes.put(GC_CONTENT_KEY, stats.gcContent());
|
||||
|
||||
vcb = vcb.attributes(attributes);
|
||||
vcb = vcb.genotypes(genotypes);
|
||||
|
|
@ -391,6 +398,7 @@ public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
|||
// INFO fields for overall data
|
||||
headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY));
|
||||
headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size."));
|
||||
headerLines.add(new VCFInfoHeaderLine(GC_CONTENT_KEY, 1, VCFHeaderLineType.Float, "GC Content of the interval"));
|
||||
headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
|
||||
|
||||
// FORMAT fields for each genotype
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -57,9 +58,13 @@ final class IntervalStratification extends AbstractStratification {
|
|||
private final Map<String, AbstractStratification> samples;
|
||||
private final GenomeLoc interval;
|
||||
private List<CallableStatus> callableStatuses;
|
||||
private long gcCount = 0;
|
||||
|
||||
public IntervalStratification(Set<String> samples, GenomeLoc interval, ThresHolder thresholds) {
|
||||
super(thresholds);
|
||||
|
||||
assert interval != null && interval.size() > 0; // contracts
|
||||
|
||||
this.interval = interval;
|
||||
this.samples = new HashMap<String, AbstractStratification>(samples.size());
|
||||
for (String sample : samples)
|
||||
|
|
@ -83,8 +88,11 @@ final class IntervalStratification extends AbstractStratification {
|
|||
* This takes the input and manages passing the data to the SampleStatistics and Locus Statistics
|
||||
*
|
||||
* @param context The alignment context given from the walker
|
||||
* @param ref The reference context given from the walker
|
||||
*/
|
||||
public void addLocus(AlignmentContext context) {
|
||||
public void addLocus(final AlignmentContext context, final ReferenceContext ref) {
|
||||
assert ref != null; // contracts
|
||||
|
||||
ReadBackedPileup pileup = context.getBasePileup();
|
||||
|
||||
Map<String, ReadBackedPileup> samplePileups = pileup.getPileupsForSamples(samples.keySet());
|
||||
|
|
@ -99,7 +107,11 @@ final class IntervalStratification extends AbstractStratification {
|
|||
|
||||
sampleStratification.addLocus(context.getLocation(), samplePileup);
|
||||
}
|
||||
gcCount += (ref.getBase() == 'G' || ref.getBase() == 'C') ? 1 : 0;
|
||||
}
|
||||
|
||||
public double gcContent() {
|
||||
return (double) gcCount / interval.size();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -77,7 +77,7 @@ final class ThresHolder {
|
|||
* If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE
|
||||
*/
|
||||
@Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false)
|
||||
public int maximumCoverage = 700;
|
||||
public int maximumCoverage = Integer.MAX_VALUE / 2;
|
||||
|
||||
/**
|
||||
* If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE
|
||||
|
|
|
|||
|
|
@ -47,29 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
|
||||
|
||||
/**
|
||||
* Short one line description of the walker.
|
||||
* <p/>
|
||||
* <p>
|
||||
* [Long description of the walker]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Description of the Input]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Description of the Output]
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T [walker name]
|
||||
* </pre>
|
||||
* Metrics class for the QualifyMissingInterval walker
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 5/1/13
|
||||
|
|
@ -81,6 +59,8 @@ final class Metrics {
|
|||
private int reads;
|
||||
private int refs;
|
||||
|
||||
public Metrics() {}
|
||||
|
||||
void reads(int reads) {this.reads = reads;}
|
||||
void refs(int refs) {this.refs = refs;}
|
||||
|
||||
|
|
@ -108,4 +88,13 @@ final class Metrics {
|
|||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Test related constructor and methods
|
||||
protected Metrics(double gccontent, double baseQual, double mapQual, int reads, int refs) {
|
||||
this.gccontent = gccontent;
|
||||
this.baseQual = baseQual;
|
||||
this.mapQual = mapQual;
|
||||
this.reads = reads;
|
||||
this.refs = refs;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,16 +47,15 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Gather;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.NanoSchedulable;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportGatherer;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
|
@ -76,10 +75,12 @@ import java.util.List;
|
|||
* <ul>
|
||||
* <li>Average Base Quality</li>
|
||||
* <li>Average Mapping Quality</li>
|
||||
* <li>Average Depth</li>
|
||||
* <li>GC Content</li>
|
||||
* <li>Position in the target</li>
|
||||
* <li>Coding Sequence / Intron</li>
|
||||
* <li>Length of the uncovered area</li>
|
||||
* <li>Position in the target (Integer.MIN_VALUE if no overlap)</li>
|
||||
* <li>Length of the overlapping target (zero if no overlap)</li>
|
||||
* <li>Coding Sequence / Intron (optional)</li>
|
||||
* <li>Length of the uncovered interval</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
|
|
@ -89,7 +90,7 @@ import java.util.List;
|
|||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* GC content calculations per interval.
|
||||
* GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Example</h3>
|
||||
|
|
@ -107,19 +108,82 @@ import java.util.List;
|
|||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
@By(DataSource.REFERENCE)
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
public final class QualifyMissingIntervals extends LocusWalker<Metrics, Metrics> implements NanoSchedulable {
|
||||
/**
|
||||
* A single GATKReport table with the qualifications on why the intervals passed by the -L argument were missing.
|
||||
*/
|
||||
@Gather(GATKReportGatherer.class)
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
/**
|
||||
* List of targets used in the experiment. This file will be used to calculate the distance your missing
|
||||
* intervals are to the targets (usually exons). Typically this is your hybrid selection targets file
|
||||
* (e.g. Agilent exome target list)
|
||||
*/
|
||||
@Argument(shortName = "targets", required = true)
|
||||
public String targetsFile;
|
||||
|
||||
@Argument(shortName = "cds", required = false)
|
||||
public String cdsFile = null;
|
||||
/**
|
||||
* List of baits to distinguish untargeted intervals from those that are targeted but not covered
|
||||
*/
|
||||
@Argument(shortName = "baits", required = false)
|
||||
public String baitsFile = null;
|
||||
|
||||
/**
|
||||
* This value will be used to determine whether or not an interval had too high or too low GC content to be
|
||||
* sequenced. This is only applied if there was not enough data in the interval.
|
||||
*/
|
||||
@Argument(doc = "upper and lower bound for an interval to be considered high/low GC content",
|
||||
shortName = "gc", required = false)
|
||||
public double gcThreshold = 0.3;
|
||||
|
||||
/**
|
||||
* The coverage of a missing interval may determine whether or not an interval is sequenceable. A low coverage will
|
||||
* trigger gc content, mapping, base qualities and other checks to figure out why this interval was deemed
|
||||
* unsequenceable.
|
||||
*/
|
||||
@Argument(doc = "minimum coverage to be considered sequenceable",
|
||||
shortName = "cov", required = false)
|
||||
public int coverageThreshold = 20;
|
||||
|
||||
/**
|
||||
* An average mapping quality above this value will determine the interval to be mappable.
|
||||
*/
|
||||
@Argument(doc = "minimum mapping quality for it to be considered usable",
|
||||
shortName = "mmq", required = false)
|
||||
public byte mappingThreshold = 20;
|
||||
|
||||
/**
|
||||
* An average base quality above this value will rule out the possibility of context specific problems with the
|
||||
* sequencer.
|
||||
*/
|
||||
@Argument(doc = "minimum base quality for it to be considered usable",
|
||||
shortName = "mbq", required = false)
|
||||
public byte qualThreshold = 20;
|
||||
|
||||
/**
|
||||
* Intervals that are too small generate biased analysis. For example an interval of size 1 will have GC content
|
||||
* 1 or 0. To avoid misinterpreting small intervals, all intervals below this threshold will be ignored in the
|
||||
* interpretation.
|
||||
*/
|
||||
@Argument(doc = "minimum interval length to be considered",
|
||||
shortName = "size", required = false)
|
||||
public byte intervalSizeThreshold = 10;
|
||||
|
||||
enum Interpretation {
|
||||
UNKNOWN,
|
||||
UNMAPPABLE,
|
||||
UNSEQUENCEABLE,
|
||||
GCCONTENT,
|
||||
NO_DATA,
|
||||
SMALL_INTERVAL
|
||||
}
|
||||
|
||||
GATKReport simpleReport;
|
||||
GenomeLocSortedSet target;
|
||||
GenomeLocSortedSet cds;
|
||||
GenomeLocSortedSet targets;
|
||||
GenomeLocSortedSet baits;
|
||||
|
||||
public boolean isReduceByInterval() {
|
||||
return true;
|
||||
|
|
@ -127,13 +191,13 @@ public final class QualifyMissingIntervals extends LocusWalker<Metrics, Metrics>
|
|||
|
||||
public void initialize() {
|
||||
// if cds file is not provided, just use the targets file (no harm done)
|
||||
if (cdsFile == null)
|
||||
cdsFile = targetsFile;
|
||||
if (baitsFile == null)
|
||||
baitsFile = targetsFile;
|
||||
|
||||
simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "DP", "TP", "CD", "LN");
|
||||
simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "INTERVAL", "GC", "BQ", "MQ", "DP", "POS_IN_TARGET", "TARGET_SIZE", "BAITED", "MISSING_SIZE", "INTERPRETATION");
|
||||
final GenomeLocParser parser = getToolkit().getGenomeLocParser();
|
||||
target = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile));
|
||||
cds = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, cdsFile));
|
||||
targets = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile));
|
||||
baits = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, baitsFile));
|
||||
}
|
||||
|
||||
public Metrics reduceInit() {
|
||||
|
|
@ -174,29 +238,90 @@ public final class QualifyMissingIntervals extends LocusWalker<Metrics, Metrics>
|
|||
|
||||
public void onTraversalDone(List<Pair<GenomeLoc, Metrics>> results) {
|
||||
for (Pair<GenomeLoc, Metrics> r : results) {
|
||||
GenomeLoc interval = r.getFirst();
|
||||
Metrics metrics = r.getSecond();
|
||||
final GenomeLoc interval = r.getFirst();
|
||||
final Metrics metrics = r.getSecond();
|
||||
final List<GenomeLoc> overlappingIntervals = targets.getOverlapping(interval);
|
||||
|
||||
simpleReport.addRow(
|
||||
interval.toString(),
|
||||
metrics.gccontent(),
|
||||
metrics.baseQual(),
|
||||
metrics.mapQual(),
|
||||
metrics.depth(),
|
||||
getPositionInTarget(interval),
|
||||
cds.overlaps(interval),
|
||||
interval.size()
|
||||
getPositionInTarget(interval, overlappingIntervals),
|
||||
getTargetSize(overlappingIntervals),
|
||||
baits.overlaps(interval),
|
||||
interval.size(),
|
||||
interpret(metrics, interval)
|
||||
);
|
||||
}
|
||||
simpleReport.print(out);
|
||||
out.close();
|
||||
}
|
||||
|
||||
private int getPositionInTarget(GenomeLoc interval) {
|
||||
final List<GenomeLoc> hits = target.getOverlapping(interval);
|
||||
int result = 0;
|
||||
for (GenomeLoc hit : hits) {
|
||||
result = interval.getStart() - hit.getStart(); // if there are multiple hits, we'll get the last one.
|
||||
protected static int getPositionInTarget(final GenomeLoc interval, final List<GenomeLoc> targets) {
|
||||
if (targets.size() > 0) {
|
||||
final GenomeLoc target = targets.get(0);
|
||||
|
||||
// interval is larger on both ends than the target -- return the maximum distance to either side as a negative number. (min of 2 negative numbers)
|
||||
if (interval.getStart() < target.getStart() && interval.getStop() > target.getStop())
|
||||
return Math.min(target.getStart() - interval.getStart(), target.getStop() - interval.getStop());
|
||||
|
||||
// interval is a left overlap -- return a negative number representing the distance between the two starts
|
||||
else if (interval.getStart() < target.getStart())
|
||||
return interval.getStart() - target.getStart();
|
||||
|
||||
// interval is a right overlap -- return a negative number representing the distance between the two stops
|
||||
else if (interval.getStop() > target.getStop())
|
||||
return target.getStop() - interval.getStop();
|
||||
|
||||
// interval is fully contained -- return the smallest distance to the edge of the target (left or right) as a positive number
|
||||
return Math.min(interval.getStart() - target.getStart(), target.getStop() - interval.getStop());
|
||||
}
|
||||
return result;
|
||||
// if there is no overlapping interval, return int min value.
|
||||
return Integer.MIN_VALUE;
|
||||
}
|
||||
|
||||
private int getTargetSize(final List<GenomeLoc> overlappingIntervals) {
|
||||
return overlappingIntervals.size() > 0 ? overlappingIntervals.get(0).size() : -1;
|
||||
}
|
||||
|
||||
String interpret(final Metrics metrics, final GenomeLoc interval) {
|
||||
if (interval.size() < intervalSizeThreshold) {
|
||||
return Interpretation.SMALL_INTERVAL.toString();
|
||||
}
|
||||
else if (metrics.depth() == 0.0) {
|
||||
return Interpretation.NO_DATA.toString();
|
||||
}
|
||||
return trim(checkMappability(metrics) + checkGCContent(metrics) + checkContext(metrics));
|
||||
}
|
||||
|
||||
String checkMappability(Metrics metrics) {
|
||||
return metrics.depth() >= coverageThreshold && metrics.mapQual() < mappingThreshold ?
|
||||
Interpretation.UNMAPPABLE + ", " : "";
|
||||
}
|
||||
|
||||
String checkGCContent(Metrics metrics) {
|
||||
return metrics.depth() < coverageThreshold && (metrics.gccontent() < gcThreshold || metrics.gccontent() > 1.0-gcThreshold) ?
|
||||
Interpretation.GCCONTENT + ", " : "";
|
||||
}
|
||||
|
||||
String checkContext(Metrics metrics) {
|
||||
return metrics.depth() < coverageThreshold && metrics.baseQual() < qualThreshold ?
|
||||
Interpretation.UNSEQUENCEABLE + ", " : "";
|
||||
}
|
||||
|
||||
String trim (String s) {
|
||||
if (s.isEmpty())
|
||||
return Interpretation.UNKNOWN.toString();
|
||||
|
||||
s = s.trim();
|
||||
if (s.endsWith(","))
|
||||
s = s.substring(0, s.length() - 1);
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -76,16 +76,13 @@ public class ConsensusAlleleCounter {
|
|||
private final int minIndelCountForGenotyping;
|
||||
private final boolean doMultiAllelicCalls;
|
||||
private final double minFractionInOneSample;
|
||||
private final GenomeLocParser locParser;
|
||||
|
||||
public ConsensusAlleleCounter(final GenomeLocParser locParser,
|
||||
final boolean doMultiAllelicCalls,
|
||||
public ConsensusAlleleCounter(final boolean doMultiAllelicCalls,
|
||||
final int minIndelCountForGenotyping,
|
||||
final double minFractionInOneSample) {
|
||||
this.minIndelCountForGenotyping = minIndelCountForGenotyping;
|
||||
this.doMultiAllelicCalls = doMultiAllelicCalls;
|
||||
this.minFractionInOneSample = minFractionInOneSample;
|
||||
this.locParser = locParser;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -289,7 +286,7 @@ public class ConsensusAlleleCounter {
|
|||
if (vcs.isEmpty())
|
||||
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
|
||||
|
||||
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
|
||||
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false);
|
||||
return mergedVC.getAlleles();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,7 +108,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
|
|||
final List<Allele> allAllelesToUse){
|
||||
|
||||
|
||||
List<Allele> alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
|
||||
List<Allele> alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, UAC,true);
|
||||
|
||||
if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
|
||||
alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
|
||||
|
|
|
|||
|
|
@ -89,9 +89,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
protected static List<Allele> computeConsensusAlleles(final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenomeLocParser locParser,
|
||||
final UnifiedArgumentCollection UAC) {
|
||||
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
|
||||
ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE);
|
||||
return counter.computeConsensusAlleles(ref, contexts, contextType);
|
||||
}
|
||||
|
||||
|
|
@ -113,7 +112,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
// starting a new site: clear allele list
|
||||
haplotypeMap.clear();
|
||||
perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods
|
||||
alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels);
|
||||
alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels);
|
||||
if (alleleList.isEmpty())
|
||||
return null;
|
||||
}
|
||||
|
|
@ -212,7 +211,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenomeLocParser locParser,
|
||||
final UnifiedArgumentCollection UAC,
|
||||
final boolean ignoreSNPAllelesWhenGenotypingIndels) {
|
||||
|
||||
|
|
@ -244,7 +242,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
}
|
||||
|
||||
} else {
|
||||
alleles = computeConsensusAlleles(ref, contexts, contextType, locParser, UAC);
|
||||
alleles = computeConsensusAlleles(ref, contexts, contextType, UAC);
|
||||
}
|
||||
return alleles;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -95,7 +95,11 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
|
|||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
|
||||
public int MIN_BASE_QUALTY_SCORE = 17;
|
||||
|
||||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
|
||||
/**
|
||||
* If the fraction of reads with deletions spanning a locus is greater than this value, the site will not be considered callable and will be skipped.
|
||||
* To disable the use of this parameter, set its value to >1.
|
||||
*/
|
||||
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable", required = false)
|
||||
public Double MAX_DELETION_FRACTION = 0.05;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -374,7 +374,7 @@ public class UnifiedGenotyperEngine {
|
|||
final VariantContext vc,
|
||||
final GenotypeLikelihoodsCalculationModel.Model model,
|
||||
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
|
||||
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap);
|
||||
return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -86,6 +86,7 @@ class ActiveRegionTrimmer {
|
|||
if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping);
|
||||
if ( parser == null ) throw new IllegalArgumentException("parser cannot be null");
|
||||
|
||||
logger.debug("Trimmer created with parameters " + logTrimming + " " + snpPadding + " " + nonSnpPadding + " " + maxDistanceInExtensionForGenotyping);
|
||||
this.logTrimming = logTrimming;
|
||||
this.snpPadding = snpPadding;
|
||||
this.nonSnpPadding = nonSnpPadding;
|
||||
|
|
@ -101,28 +102,35 @@ class ActiveRegionTrimmer {
|
|||
*
|
||||
* @param region our full active region
|
||||
* @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position
|
||||
* @param emitReferenceConfidence are we going to estimate the reference confidence with this active region?
|
||||
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully
|
||||
*/
|
||||
public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet<VariantContext> allVariantsWithinExtendedRegion) {
|
||||
public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet<VariantContext> allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) {
|
||||
|
||||
if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region
|
||||
return null;
|
||||
|
||||
final List<VariantContext> withinActiveRegion = new LinkedList<VariantContext>();
|
||||
int pad = snpPadding;
|
||||
final List<VariantContext> withinActiveRegion = new LinkedList<>();
|
||||
boolean foundNonSnp = false;
|
||||
GenomeLoc trimLoc = null;
|
||||
for ( final VariantContext vc : allVariantsWithinExtendedRegion ) {
|
||||
final GenomeLoc vcLoc = parser.createGenomeLoc(vc);
|
||||
if ( region.getLocation().overlapsP(vcLoc) ) {
|
||||
if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding
|
||||
pad = nonSnpPadding;
|
||||
foundNonSnp = true;
|
||||
trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc);
|
||||
withinActiveRegion.add(vc);
|
||||
}
|
||||
}
|
||||
final int pad = ( emitReferenceConfidence || foundNonSnp ? nonSnpPadding : snpPadding );
|
||||
|
||||
// we don't actually have anything in the region after removing variants that don't overlap the region's full location
|
||||
if ( trimLoc == null ) return null;
|
||||
|
||||
// final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping);
|
||||
// Try to have one kmer before and after any event.
|
||||
|
||||
final GenomeLoc regionLoc = region.getLocation();
|
||||
final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping);
|
||||
final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad);
|
||||
final GenomeLoc finalSpan = maxSpan.intersect(idealSpan);
|
||||
|
|
@ -130,6 +138,7 @@ class ActiveRegionTrimmer {
|
|||
final ActiveRegion trimmedRegion = region.trim(finalSpan);
|
||||
if ( logTrimming ) {
|
||||
logger.info("events : " + withinActiveRegion);
|
||||
logger.info("region : " + regionLoc);
|
||||
logger.info("trimLoc : " + trimLoc);
|
||||
logger.info("pad : " + pad);
|
||||
logger.info("idealSpan : " + idealSpan);
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
|
||||
|
||||
/**
|
||||
* Result of assembling, with the resulting graph and status
|
||||
|
|
@ -57,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
|||
*/
|
||||
public class AssemblyResult {
|
||||
private final Status status;
|
||||
private ReadThreadingGraph threadingGraph;
|
||||
private final SeqGraph graph;
|
||||
|
||||
/**
|
||||
|
|
@ -72,9 +74,25 @@ public class AssemblyResult {
|
|||
this.graph = graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the threading-graph associated with this assembly-result.
|
||||
*/
|
||||
public void setThreadingGraph(final ReadThreadingGraph threadingGraph) {
|
||||
this.threadingGraph = threadingGraph;
|
||||
}
|
||||
|
||||
public ReadThreadingGraph getThreadingGraph() {
|
||||
return threadingGraph;
|
||||
}
|
||||
|
||||
public Status getStatus() { return status; }
|
||||
public SeqGraph getGraph() { return graph; }
|
||||
|
||||
public int getKmerSize() {
|
||||
return graph.getKmerSize();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Status of the assembly result
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,466 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
|
||||
import java.io.PrintWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Collection of read assembly using several kmerSizes.
|
||||
*
|
||||
* <p>
|
||||
* There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those
|
||||
* assemblies.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one.
|
||||
* </p>
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.com>
|
||||
*/
|
||||
public class AssemblyResultSet {
|
||||
|
||||
private final Map<Integer,AssemblyResult> assemblyResultByKmerSize;
|
||||
private final Set<Haplotype> haplotypes;
|
||||
private final Map<Haplotype,AssemblyResult> assemblyResultByHaplotype;
|
||||
private ActiveRegion regionForGenotyping;
|
||||
private byte[] fullReferenceWithPadding;
|
||||
private GenomeLoc paddedReferenceLoc;
|
||||
private boolean variationPresent;
|
||||
private Haplotype refHaplotype;
|
||||
private boolean wasTrimmed = false;
|
||||
private final CountSet kmerSizes;
|
||||
|
||||
/**
|
||||
* Constructs a new empty assembly result set.
|
||||
*/
|
||||
public AssemblyResultSet() {
|
||||
assemblyResultByKmerSize = new LinkedHashMap<>(4);
|
||||
haplotypes = new LinkedHashSet<>(10);
|
||||
assemblyResultByHaplotype = new LinkedHashMap<>(10);
|
||||
kmerSizes = new CountSet(4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims an assembly result set down based on a new set of trimmed haplotypes.
|
||||
*
|
||||
* @param originalByTrimmedHaplotypes map from trimmed to original haplotypes.
|
||||
* @param trimmedActiveRegion the trimmed down active region.
|
||||
*
|
||||
* @throws NullPointerException if any argument in {@code null} or
|
||||
* if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys.
|
||||
* @throws IllegalArgumentException if there is no reference haplotype amongst the trimmed ones.
|
||||
*
|
||||
*
|
||||
* @return never {@code null}, a new trimmed assembly result set.
|
||||
*/
|
||||
public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion,
|
||||
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes) {
|
||||
if (refHaplotype == null) throw new IllegalStateException();
|
||||
if (trimmedActiveRegion == null) throw new NullPointerException();
|
||||
final AssemblyResultSet result = new AssemblyResultSet();
|
||||
|
||||
for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) {
|
||||
final Haplotype original = originalByTrimmedHaplotypes.get(trimmed);
|
||||
if (original == null)
|
||||
throw new NullPointerException("all trimmed haplotypes must have an original one");
|
||||
final AssemblyResult as = assemblyResultByHaplotype.get(original);
|
||||
if (as == null) result.add(trimmed); else result.add(trimmed, as);
|
||||
}
|
||||
|
||||
result.setRegionForGenotyping(trimmedActiveRegion);
|
||||
result.setFullReferenceWithPadding(this.fullReferenceWithPadding);
|
||||
result.setPaddedReferenceLoc(this.paddedReferenceLoc);
|
||||
if (result.refHaplotype == null)
|
||||
throw new IllegalStateException("missing reference haplotype in the trimmed set");
|
||||
result.wasTrimmed = true;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Query the reference haplotype in the result set.
|
||||
* @return {@code null} if none wasn't yet added, otherwise a reference haplotype.
|
||||
*/
|
||||
public Haplotype getReferenceHaplotype() {
|
||||
return refHaplotype;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether there is any variation present in the assembly result set.
|
||||
*
|
||||
* <p>
|
||||
* This is equivalent to whether there is more than one haplotype.
|
||||
* </p>
|
||||
*
|
||||
* @return {@code true} if there is variation present, {@code false} otherwise.
|
||||
*/
|
||||
public boolean isVariationPresent() {
|
||||
return variationPresent && haplotypes.size() > 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Dumps debugging information into a print-writer.
|
||||
*
|
||||
* @param pw where to dump the information.
|
||||
*
|
||||
* @throws NullPointerException if {@code pw} is {@code null}.
|
||||
*/
|
||||
public void debugDump(final PrintWriter pw) {
|
||||
if (getHaplotypeList().size() == 0) {
|
||||
return;
|
||||
}
|
||||
pw.println("Active Region " + this.regionForGenotyping.getLocation());
|
||||
pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc());
|
||||
pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation());
|
||||
pw.println("Haplotype count " + haplotypes.size());
|
||||
final Map<Integer,Integer> kmerSizeToCount = new HashMap<>();
|
||||
|
||||
for (final Map.Entry<Haplotype,AssemblyResult> e : assemblyResultByHaplotype.entrySet()) {
|
||||
final AssemblyResult as = e.getValue();
|
||||
final int kmerSize = as.getGraph().getKmerSize();
|
||||
if (kmerSizeToCount.containsKey(kmerSize)) {
|
||||
kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1);
|
||||
} else {
|
||||
kmerSizeToCount.put(kmerSize,1);
|
||||
}
|
||||
}
|
||||
pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() );
|
||||
Integer[] kmerSizes = new Integer[kmerSizeToCount.size()];
|
||||
kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes);
|
||||
Arrays.sort(kmerSizes);
|
||||
pw.println("Kmer sizes values " + Arrays.toString(kmerSizes));
|
||||
for (int size : kmerSizes) {
|
||||
pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a haplotype to the result set without indicating a generating assembly result.
|
||||
*
|
||||
* <p>
|
||||
* It is possible to call this method with the same haplotype several times. In that the second and further
|
||||
* calls won't have any effect (thus returning {@code false}).
|
||||
* </p>
|
||||
*
|
||||
* @param h the haplotype to add to the assembly result set.
|
||||
*
|
||||
* @throws NullPointerException if {@code h} is {@code null}
|
||||
* @throws IllegalArgumentException if {@code h} does not have a genome location.
|
||||
*
|
||||
* @return {@code true} if the assembly result set has been modified as a result of this call.
|
||||
*/
|
||||
public boolean add(final Haplotype h) {
|
||||
if (h == null) throw new NullPointerException("input haplotype cannot be null");
|
||||
if (h.getGenomeLocation() == null)
|
||||
throw new IllegalArgumentException("the haplotype provided must have a genomic location");
|
||||
if (haplotypes.contains(h))
|
||||
return false;
|
||||
haplotypes.add(h);
|
||||
updateReferenceHaplotype(h);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds simultaneously a haplotype and the generating assembly-result.
|
||||
*
|
||||
* <p>
|
||||
* Haplotypes and their assembly-result can be added multiple times although just the first call will have
|
||||
* any effect (return value is {@code true}).
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* @param h haplotype to add.
|
||||
* @param ar assembly-result that is assumed to have given rise to that haplotype.
|
||||
*
|
||||
* @throws NullPointerException if {@code h} or {@code ar} is {@code null}.
|
||||
* @throws IllegalArgumentException if {@code h} has not defined genome location.
|
||||
*
|
||||
* @return {@code true} iff this called changes the assembly result set.
|
||||
*/
|
||||
public boolean add(final Haplotype h, final AssemblyResult ar) {
|
||||
if (h == null) throw new NullPointerException("input haplotype cannot be null");
|
||||
if (ar == null) throw new NullPointerException("input assembly-result cannot be null");
|
||||
if (h.getGenomeLocation() == null)
|
||||
throw new IllegalArgumentException("the haplotype provided must have a genomic location");
|
||||
|
||||
final boolean assemblyResultAdditionReturn = add(ar);
|
||||
|
||||
if (haplotypes.contains(h)) {
|
||||
final AssemblyResult previousAr = assemblyResultByHaplotype.get(h);
|
||||
if (previousAr == null) {
|
||||
assemblyResultByHaplotype.put(h, ar);
|
||||
return true;
|
||||
} else if (!previousAr.equals(ar))
|
||||
throw new IllegalStateException("there is already a different assembly result for the input haplotype");
|
||||
else
|
||||
return assemblyResultAdditionReturn;
|
||||
} else {
|
||||
haplotypes.add(h);
|
||||
assemblyResultByHaplotype.put(h,ar);
|
||||
updateReferenceHaplotype(h);
|
||||
if (h.isNonReference()) variationPresent = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a assembly-result object.
|
||||
*
|
||||
* @param ar the assembly result to add.
|
||||
*
|
||||
* @throws NullPointerException if {@code ar} is {@code null}.
|
||||
* @throws IllegalStateException if there is an assembly result with the same kmerSize.
|
||||
* @return {@code true} iff this addition changed the assembly result set.
|
||||
*/
|
||||
public boolean add(final AssemblyResult ar) {
|
||||
if (ar == null)
|
||||
throw new NullPointerException();
|
||||
final int kmerSize = ar.getKmerSize();
|
||||
if (assemblyResultByKmerSize.containsKey(kmerSize)) {
|
||||
if (!assemblyResultByKmerSize.get(kmerSize).equals(ar))
|
||||
throw new IllegalStateException("a different assembly result with the same kmerSize was already added");
|
||||
return false;
|
||||
} else {
|
||||
assemblyResultByKmerSize.put(kmerSize, ar);
|
||||
kmerSizes.add(kmerSize);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current region for genotyping.
|
||||
*
|
||||
* @return might be {@code null}.
|
||||
*/
|
||||
public ActiveRegion getRegionForGenotyping() {
|
||||
return regionForGenotyping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the region for genotyping.
|
||||
*
|
||||
* @param regionForGenotyping the new value.
|
||||
*/
|
||||
public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) {
|
||||
this.regionForGenotyping = regionForGenotyping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current full reference with padding.
|
||||
*
|
||||
* @return might be {@code null}.
|
||||
*/
|
||||
public byte[] getFullReferenceWithPadding() {
|
||||
return fullReferenceWithPadding;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the full reference with padding base sequence.
|
||||
*
|
||||
* @param fullReferenceWithPadding the new value.
|
||||
*/
|
||||
public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) {
|
||||
this.fullReferenceWithPadding = fullReferenceWithPadding;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the padded reference location.
|
||||
*
|
||||
* @return might be {@code null}
|
||||
*/
|
||||
public GenomeLoc getPaddedReferenceLoc() {
|
||||
return paddedReferenceLoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Changes the padded reference location.
|
||||
* @param paddedReferenceLoc the new value.
|
||||
*/
|
||||
public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) {
|
||||
this.paddedReferenceLoc = paddedReferenceLoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of haplotypes in the assembly result set.
|
||||
* @return {@code 0} or greater.
|
||||
*/
|
||||
public int getHaplotypeCount() {
|
||||
return haplotypes.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the haplotypes as a list.
|
||||
*
|
||||
* <p>
|
||||
* The result is unmodifiable.
|
||||
* </p>
|
||||
*
|
||||
* @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly.
|
||||
*/
|
||||
public List<Haplotype> getHaplotypeList() {
|
||||
return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()]));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the maximum kmerSize available.
|
||||
*
|
||||
* @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize.
|
||||
*
|
||||
* @return greater than 0.
|
||||
*/
|
||||
public int getMaximumKmerSize() {
|
||||
if (kmerSizes.size() == 0)
|
||||
throw new IllegalStateException("there is yet no kmerSize in this assembly result set");
|
||||
return kmerSizes.max();
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether there are more than one kmerSize in the set.
|
||||
*
|
||||
* @return {@code true} iff there is more than one kmerSize assembly in the set.
|
||||
*/
|
||||
public boolean hasMultipleKmerSizes() {
|
||||
return kmerSizes.size() > 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the minimum kmerSize available.
|
||||
*
|
||||
* @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize.
|
||||
*
|
||||
* @return greater than 0.
|
||||
*/
|
||||
public int getMinimumKmerSize() {
|
||||
if (kmerSizes.size() == 0)
|
||||
throw new IllegalStateException("there is yet no kmerSize in this assembly result set");
|
||||
return kmerSizes.min();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a read-threading graph in the assembly set that has a particular kmerSize.
|
||||
*
|
||||
* @param kmerSize the requested kmerSize.
|
||||
*
|
||||
* @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize.
|
||||
*/
|
||||
public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) {
|
||||
final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize);
|
||||
if (assemblyResult == null) return null;
|
||||
return assemblyResult.getThreadingGraph();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether this assembly result set was trimmed.
|
||||
*
|
||||
* @return {@code true} iff this assembly result set was trimmed.
|
||||
*/
|
||||
public boolean wasTrimmed() {
|
||||
return wasTrimmed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Marks the assembly as not having variation even if it has more than one haplotype.
|
||||
*/
|
||||
public void resetVariationPresent() {
|
||||
variationPresent = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Dumps debugging information into a logger.
|
||||
*
|
||||
* @param logger where to dump the information.
|
||||
*
|
||||
* @throws NullPointerException if {@code logger} is {@code null}.
|
||||
*/
|
||||
public void debugDump(final Logger logger) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
final PrintWriter pw = new PrintWriter(sw);
|
||||
debugDump(pw);
|
||||
final String str = sw.toString();
|
||||
final String[] lines = str.split("\n");
|
||||
for (final String line : lines) {
|
||||
if (line.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
logger.debug(line);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given whether a new haplotype that has been already added to {@link #haplotypes} collection is the
|
||||
* reference haplotype and updates {@link #refHaplotype} accordingly.
|
||||
*
|
||||
* <p>
|
||||
* This method assumes that the colling code has verified that the haplotype was not already in {@link #haplotypes}
|
||||
* I.e. that it is really a new one. Otherwise it will result in an exception if it happen to be a reference
|
||||
* haplotype and this has already be set. This is the case even if the new haplotypes and the current reference
|
||||
* are equal.
|
||||
* </p>
|
||||
*
|
||||
* @param newHaplotype the new haplotype.
|
||||
* @throws NullPointerException if {@code newHaplotype} is {@code null}.
|
||||
* @throws IllegalStateException if there is already a reference haplotype.
|
||||
*/
|
||||
private void updateReferenceHaplotype(final Haplotype newHaplotype) {
|
||||
if (!newHaplotype.isReference()) return;
|
||||
if (refHaplotype == null)
|
||||
refHaplotype = newHaplotype;
|
||||
else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality.
|
||||
throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Represents an event block in the graph.
|
||||
*
|
||||
* <p>
|
||||
* Event block is defined as the non-trivial section of the haplotype-graph between two vertices along the
|
||||
* reference route, that has at least one alternative route between those two vertices.
|
||||
* </p>
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public class EventBlock {
|
||||
|
||||
private final HaplotypeGraph graph;
|
||||
|
||||
private final MultiDeBruijnVertex source;
|
||||
|
||||
private final int sourcePosition;
|
||||
|
||||
private final MultiDeBruijnVertex sink;
|
||||
|
||||
private final int sinkPosition;
|
||||
|
||||
private Set<Route<MultiDeBruijnVertex,MultiSampleEdge>> routesAcross;
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a event block given the base haplotype graph and the source and sink vertice (both included in the block)
|
||||
* @param graph the base haplotype graph.
|
||||
* @param source the starting vertex.
|
||||
* @param sink the ending vertex.
|
||||
*
|
||||
* @throws NullPointerException if any of the input is {@code null}.
|
||||
* @throws IllegalArgumentException if {@code source} or {@code sink} are not part of the graphs reference route,
|
||||
* such a route does not exists, any of the vertices is not part of such a route or they are out of order.
|
||||
*/
|
||||
public EventBlock(final HaplotypeGraph graph, final MultiDeBruijnVertex source, final MultiDeBruijnVertex sink) {
|
||||
if (graph == null) throw new NullPointerException("the graph cannot be null");
|
||||
if (source == null) throw new NullPointerException("the source vertex is null");
|
||||
if (sink == null) throw new NullPointerException("the sink node is null");
|
||||
this.graph = graph;
|
||||
this.source = source;
|
||||
this.sink = sink;
|
||||
final HaplotypeRoute route = graph.getReferenceRoute();
|
||||
if (route == null)
|
||||
throw new IllegalArgumentException("there is reference route in the graph");
|
||||
this.sourcePosition = route.getVertexPosition(source);
|
||||
this.sinkPosition = route.getVertexPosition(sink);
|
||||
if (sourcePosition == -1)
|
||||
throw new IllegalArgumentException("the source vertex does not belong to the reference route");
|
||||
if (sinkPosition == -1)
|
||||
throw new IllegalArgumentException("the sink vertex does not belong to the reference route");
|
||||
if (sourcePosition > sinkPosition)
|
||||
throw new IllegalArgumentException("source and sink vertices are out of order in reference route");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a reference to the event block graph.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public HaplotypeGraph getGraph() {
|
||||
return graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a reference to the block starting vertex.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public MultiDeBruijnVertex getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the reference ot the end block vertex.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public MultiDeBruijnVertex getSink() {
|
||||
return sink;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns all possible routes between the event block start and end vertices.
|
||||
* @return never {@code null}, and unmodifiable route set.
|
||||
*/
|
||||
public Set<Route<MultiDeBruijnVertex,MultiSampleEdge>> getRoutesAcross() {
|
||||
// catching:
|
||||
if (routesAcross != null) return routesAcross;
|
||||
|
||||
final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> result = new HashSet<>(10); // 10 is rather generous.
|
||||
|
||||
// bread-first iterative search for all paths.
|
||||
final Queue<Route<MultiDeBruijnVertex, MultiSampleEdge>> queue = new LinkedList<>();
|
||||
|
||||
queue.add(new Route<>(source, graph)); // the seed is the empty route at the start vertex.
|
||||
|
||||
final HaplotypeRoute referenceRoute = graph.getReferenceRoute();
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
final Route<MultiDeBruijnVertex, MultiSampleEdge> route = queue.remove();
|
||||
final MultiDeBruijnVertex routeEndVertex = route.getLastVertex();
|
||||
|
||||
if (routeEndVertex == sink) // bingo!!!
|
||||
result.add(route);
|
||||
else { // only queue promising extension of this route.
|
||||
final int routeEndPosition = referenceRoute.getVertexPosition(routeEndVertex);
|
||||
if (routeEndPosition == -1 || (routeEndPosition >= sourcePosition && routeEndPosition < sinkPosition))
|
||||
for (final MultiSampleEdge e : graph.outgoingEdgesOf(routeEndVertex))
|
||||
queue.add(new Route<>(route, e));
|
||||
}
|
||||
}
|
||||
return routesAcross = Collections.unmodifiableSet(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,287 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Encapsulates the graph traversals needed to find event-blocks.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public class EventBlockFinder {
|
||||
|
||||
private final HaplotypeGraph graph;
|
||||
|
||||
private final Map<Pair<MultiDeBruijnVertex,MultiDeBruijnVertex>,EventBlock> eventBlockCache;
|
||||
|
||||
/**
 * Constructs a new engine.
 *
 * @param graph the base haplotype graph to iterate over.
 * @throws NullPointerException if {@code graph} is {@code null}.
 */
public EventBlockFinder(final HaplotypeGraph graph) {
    // Provide a message on the NPE, consistent with the rest of the package's precondition checks.
    if (graph == null) throw new NullPointerException("the graph cannot be null");
    this.graph = graph;
    eventBlockCache = new HashMap<>(20);
}
|
||||
|
||||
/**
|
||||
* Create a new traversal object based on a read anchoring.
|
||||
* @param anchoring
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public Traversal traversal(final ReadAnchoring anchoring) {
|
||||
if (anchoring == null) throw new NullPointerException();
|
||||
return new Traversal(anchoring);
|
||||
}
|
||||
|
||||
|
||||
public class Traversal implements Iterable<EventBlock> {
|
||||
|
||||
private final ReadAnchoring anchoring;
|
||||
|
||||
private EventBlock lastEventBlock;
|
||||
|
||||
|
||||
private Traversal(final ReadAnchoring anchoring) {
|
||||
this.anchoring = anchoring;
|
||||
lastEventBlock = findLastEventBlock(anchoring);
|
||||
}
|
||||
|
||||
@Override
|
||||
public java.util.Iterator<EventBlock> iterator() {
|
||||
return lastEventBlock == null ? Collections.EMPTY_SET.iterator() : new Iterator();
|
||||
}
|
||||
|
||||
private class Iterator implements java.util.Iterator<EventBlock> {
|
||||
|
||||
private MultiDeBruijnVertex currentVertex;
|
||||
|
||||
private Iterator() {
|
||||
currentVertex = anchoring.leftAnchorVertex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return currentVertex != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public EventBlock next() {
|
||||
final EventBlock result;
|
||||
if (currentVertex == null)
|
||||
throw new NoSuchElementException("going beyond last event block");
|
||||
else if (currentVertex == lastEventBlock.getSource()) {
|
||||
result = lastEventBlock;
|
||||
currentVertex = null;
|
||||
} else {
|
||||
final EventBlock candidate = findEventBlock(anchoring,false,currentVertex,lastEventBlock.getSource());
|
||||
if (candidate == null) {
|
||||
result = findEventBlock(anchoring,false,currentVertex,anchoring.rightAnchorVertex);
|
||||
currentVertex = null;
|
||||
} else {
|
||||
result = candidate;
|
||||
currentVertex = candidate.getSink();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the last event block.
|
||||
* <p>
|
||||
* It can do it forward or backwards.
|
||||
* </p>
|
||||
*
|
||||
* @param anchoring target read anchoring information.
|
||||
* @return {@code null} if there is no event block, depending on {@code backwards} before or after current
|
||||
*/
|
||||
private EventBlock findLastEventBlock(
|
||||
final ReadAnchoring anchoring) {
|
||||
return findEventBlock(anchoring,true,anchoring.leftAnchorVertex,anchoring.rightAnchorVertex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds an event block forward or backwards along the reference route.
|
||||
* @param anchoring the read anchoring information.
|
||||
* @param backwards true if the block should be constructed from right to left.
|
||||
* @param leftVertex the left vertex
|
||||
* @param rightVertex the right vertex
|
||||
* @return {@code null} if there is no such a event block between these coordinates.
|
||||
*/
|
||||
private EventBlock findEventBlock(
|
||||
final ReadAnchoring anchoring, final boolean backwards,
|
||||
final MultiDeBruijnVertex leftVertex, final MultiDeBruijnVertex rightVertex) {
|
||||
|
||||
MultiDeBruijnVertex currentVertex = backwards ? rightVertex : leftVertex;
|
||||
boolean foundEvent = false;
|
||||
final CountSet pathSizes = new CountSet(10); // typically more than enough.
|
||||
pathSizes.setTo(0);
|
||||
|
||||
// Map between reference vertices where there is some expected open alternative path rejoining and the
|
||||
// predicted length of paths rejoining at that point counting from the beginning of the block.
|
||||
final Map<MultiDeBruijnVertex, CountSet> expectedAlternativePathRejoins = new HashMap<>(4);
|
||||
|
||||
// Keeps record of possible left-clipping veritces; those that are located before any event path furcation
|
||||
// has been found. The value indicates the blockLength at the time we traverse that node.
|
||||
final Deque<Pair<MultiDeBruijnVertex, Integer>> possibleClippingPoints = new LinkedList<>();
|
||||
|
||||
// We keep the distance from the beggining of the block (leftVertex).
|
||||
int blockLength = 0;
|
||||
while (currentVertex != null) {
|
||||
int openingDegree = backwards ? graph.outDegreeOf(currentVertex) : graph.inDegreeOf(currentVertex);
|
||||
if (openingDegree > 1) {
|
||||
final CountSet joiningPathLengths = expectedAlternativePathRejoins.remove(currentVertex);
|
||||
if (joiningPathLengths != null)
|
||||
pathSizes.addAll(joiningPathLengths);
|
||||
}
|
||||
final boolean isValidBlockEnd = isValidBlockEnd(anchoring, currentVertex, expectedAlternativePathRejoins);
|
||||
if (foundEvent && isValidBlockEnd) // !gotcha we found a valid block end.
|
||||
break;
|
||||
else if (!foundEvent && isValidBlockEnd) // if no event has been found yet, still is a good clipping point.
|
||||
possibleClippingPoints.addLast(new Pair<>(currentVertex, blockLength));
|
||||
|
||||
// We reached the end:
|
||||
if (currentVertex == (backwards ? leftVertex : rightVertex))
|
||||
break;
|
||||
|
||||
// process next vertices, the next one on the reference and also possible start of alternative paths,
|
||||
// updates traversal structures accordingly.
|
||||
currentVertex = advanceOnReferencePath(anchoring, backwards, currentVertex, pathSizes, expectedAlternativePathRejoins);
|
||||
foundEvent |= expectedAlternativePathRejoins.size() > 0;
|
||||
pathSizes.incAll(1);
|
||||
blockLength++;
|
||||
}
|
||||
|
||||
// we have not found an event, thus there is no block to report:
|
||||
if (!foundEvent)
|
||||
return null;
|
||||
|
||||
// We try to clip off as much as we can from the beginning of the block before any event, but at least
|
||||
// leaving enough block length to meet the shortest path unless all paths have the same size (SNPs only)
|
||||
final int maxClipping = pathSizes.size() <= 1 ? blockLength : pathSizes.min();
|
||||
MultiDeBruijnVertex clippingEnd = backwards ? anchoring.rightAnchorVertex : anchoring.leftAnchorVertex;
|
||||
while (!possibleClippingPoints.isEmpty()) {
|
||||
final Pair<MultiDeBruijnVertex, Integer> candidate = possibleClippingPoints.removeLast();
|
||||
if (candidate.getSecond() <= maxClipping) {
|
||||
clippingEnd = candidate.getFirst();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return resolveEventBlock(backwards ? new Pair<>(currentVertex, clippingEnd) : new Pair<>(clippingEnd, currentVertex));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets or constructs a event-block through the cache.
|
||||
* @param borders the source and sink vertex pair for the requested event block.
|
||||
* @return never {@code null}
|
||||
*/
|
||||
@Requires("borders != null && border.getFirst() != null && border.getSecond() != null")
|
||||
private EventBlock resolveEventBlock(final Pair<MultiDeBruijnVertex,MultiDeBruijnVertex> borders) {
|
||||
EventBlock result = eventBlockCache.get(borders);
|
||||
if (result == null)
|
||||
eventBlockCache.put(borders,result = new EventBlock(graph, borders.getFirst(),borders.getSecond()));
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Move on vertex along the reference path checking for the presence of new opening alternative paths.
|
||||
*
|
||||
* @param anchoring anchoring information on the targeted read.
|
||||
* @param backwards whether we are extending the block backwards or forwards.
|
||||
* @param currentVertex the current vertex.
|
||||
* @param pathSizes current block path sizes.
|
||||
* @param expectedAlternativePathRejoins information about location of vertices along the reference path where open alternative paths will rejoin.
|
||||
* @return the next current-vertex, never {@code null} unless there is a bug.
|
||||
*/
|
||||
private MultiDeBruijnVertex advanceOnReferencePath(final ReadAnchoring anchoring, final boolean backwards, final MultiDeBruijnVertex currentVertex, final CountSet pathSizes, final Map<MultiDeBruijnVertex, CountSet> expectedAlternativePathRejoins) {
|
||||
final Set<MultiSampleEdge> nextEdges = backwards ? graph.incomingEdgesOf(currentVertex) : graph.outgoingEdgesOf(currentVertex);
|
||||
MultiDeBruijnVertex nextReferenceVertex = null;
|
||||
for (final MultiSampleEdge e : nextEdges) {
|
||||
final MultiDeBruijnVertex nextVertex = backwards ? graph.getEdgeSource(e) : graph.getEdgeTarget(e);
|
||||
if (e.isRef())
|
||||
nextReferenceVertex = nextVertex;
|
||||
else {
|
||||
final CountSet pathSizesPlusOne = pathSizes.clone();
|
||||
pathSizesPlusOne.incAll(1);
|
||||
graph.calculateRejoins(nextVertex, expectedAlternativePathRejoins, anchoring.referenceWithinAnchorsMap.keySet(), pathSizesPlusOne, true, backwards);
|
||||
}
|
||||
}
|
||||
return nextReferenceVertex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether the current vertex is a valid block end.
|
||||
*
|
||||
* @param anchoring reads anchoring information necessary to make the evaluation.
|
||||
* @param currentVertex target potential block end
|
||||
* @param expectedAlternativePathRejoins traversal states regarding open alternative paths.
|
||||
*
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
private boolean isValidBlockEnd(final ReadAnchoring anchoring, final MultiDeBruijnVertex currentVertex, final Map<MultiDeBruijnVertex, CountSet> expectedAlternativePathRejoins) {
|
||||
final boolean isUniqueKmer = anchoring.uniqueKmerOffsets.containsKey(currentVertex);
|
||||
final boolean isAnchorable = graph.getAnchorableVertices().contains(currentVertex) && isUniqueKmer && expectedAlternativePathRejoins.size() == 0;
|
||||
return isUniqueKmer && isAnchorable;
|
||||
}
|
||||
}
|
||||
|
|
@ -58,6 +58,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.EventMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
|
|
@ -182,11 +183,12 @@ public class GenotypingEngine {
|
|||
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {
|
||||
throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles.");
|
||||
// this is possible in GGA mode when the same event is represented in multiple input records
|
||||
throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. You will need to remove one of the records in order to proceed with your input file(s).");
|
||||
}
|
||||
final Map<VariantContext, Allele> mergeMap = new LinkedHashMap<>();
|
||||
mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele
|
||||
|
|
@ -335,7 +337,7 @@ public class GenotypingEngine {
|
|||
for( final String sample : alleleReadMap.keySet() ) {
|
||||
final int numHaplotypes = mergedVC.getAlleles().size();
|
||||
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
|
||||
final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true);
|
||||
final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true);
|
||||
int glIndex = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,158 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.FastLoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Read likelihood calculation engine base on applying heuristic on the assembly graph.
|
||||
*/
|
||||
public class GraphBasedLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
|
||||
|
||||
private static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngine.class);
|
||||
|
||||
/**
|
||||
* Gap extension penalty in Phred scale.
|
||||
*/
|
||||
private byte gcpHMM;
|
||||
|
||||
/**
|
||||
* Fast-hmm implementation reused across active regions.
|
||||
*/
|
||||
private FlexibleHMM hmm;
|
||||
|
||||
/**
|
||||
* The worst reference vs best-alternative haplotype ratio for any read. The reference haplotype likelihood
|
||||
* is changes to meet this maximum is needed.
|
||||
*/
|
||||
private double log10GlobalReadMismappingRate;
|
||||
|
||||
/**
|
||||
* How we resolve cases in where we have haplotypes coming from different kmer sizes.
|
||||
*/
|
||||
private HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution;
|
||||
|
||||
private enum DebugMode { NONE, DEBUG, EXTRA_DEBUG };
|
||||
|
||||
private DebugMode debugMode;
|
||||
|
||||
/**
|
||||
* Creates a new likelihood engine.
|
||||
*
|
||||
* @param gapExtensionPenalty the gap extension penalty Phred scale.
|
||||
* @param log10GlobalReadMismappingRate the global read mismapping rate.
|
||||
* @param heterogeneousKmerSizeResolution who to resolve assembly with haplotypes generated from different kmerSizes.
|
||||
* @param debug whether to output some debug messages.
|
||||
* @param debugHaplotypeGraphAndLikelihoods whether to generate haplotype graph and likelihood files, please only use with small intervals.
|
||||
*/
|
||||
public GraphBasedLikelihoodCalculationEngine(final int gapExtensionPenalty, final double log10GlobalReadMismappingRate,
|
||||
final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution,
|
||||
final boolean debug, final boolean debugHaplotypeGraphAndLikelihoods) {
|
||||
gcpHMM = (byte) gapExtensionPenalty;
|
||||
hmm = new FastLoglessPairHMM(gcpHMM);
|
||||
this.log10GlobalReadMismappingRate = log10GlobalReadMismappingRate;
|
||||
this.heterogeneousKmerSizeResolution = heterogeneousKmerSizeResolution;
|
||||
debugMode = debugHaplotypeGraphAndLikelihoods ? DebugMode.EXTRA_DEBUG : debug ? DebugMode.DEBUG : DebugMode.NONE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
|
||||
final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine =
|
||||
new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet,
|
||||
hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution);
|
||||
final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList();
|
||||
final List<Haplotype> supportedHaplotypes = graphLikelihoodEngine.getHaplotypeList();
|
||||
if (supportedHaplotypes.size() != haplotypes.size()) logger.warn("Some haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size());
|
||||
final Map<String,PerReadAlleleLikelihoodMap> result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes,
|
||||
perSampleReadList );
|
||||
if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* A few debug messages associated with the GraphBased likelihoods engine.
|
||||
*/
|
||||
private void graphLikelihoodDebugDumps(final ActiveRegion originalActiveRegion, final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> result) {
|
||||
if (graphLikelihoodEngine.hasCycles())
|
||||
logger.debug("Resulting haplotype graph combining several kmer sizes has cycles");
|
||||
else if (graphLikelihoodEngine.haplotypeGraph.hasNonReferenceEnds())
|
||||
logger.debug("Resulting haplotype graph has ends that do not belong to the reference: " + originalActiveRegion.getLocation());
|
||||
else if (!graphLikelihoodEngine.hasVariation())
|
||||
logger.debug("Resulting haplotype graph does not contain any alternative haplotype path");
|
||||
if (debugMode == DebugMode.EXTRA_DEBUG) {
|
||||
graphLikelihoodEngine.printGraph(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeGraph.dot");
|
||||
final SeqGraph sq = graphLikelihoodEngine.haplotypeGraph.convertToSequenceGraph();
|
||||
sq.simplifyGraph();
|
||||
sq.printGraph(new File(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeSeqGraph.dot"), 10000);
|
||||
try {
|
||||
FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt"));
|
||||
PrintWriter pw = new PrintWriter(fw);
|
||||
//Note: we only output the first sample likelihoods, perhaps should output all of them but for debugging this is normally what is needed.
|
||||
pw.println(result.entrySet().iterator().next().getValue().toString());
|
||||
pw.close();
|
||||
fw.close();
|
||||
} catch (Exception ex) {
|
||||
throw new StingException("", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,911 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.CountSet;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Fast pseudo-likelihood calculation engine based on the assembly haplotype graph.
|
||||
*
|
||||
* <p>
|
||||
* An instance is good for active region. {@link GraphBasedLikelihoodCalculationEngine} instance them on demand
|
||||
* as requested by the {@code HaplotypeCaller} code.
|
||||
* </p>
|
||||
*/
|
||||
public class GraphBasedLikelihoodCalculationEngineInstance {
|
||||
|
||||
private final static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngineInstance.class);


/**
 * Unified kmer size used for the Haplotype graph.
 *
 * <p>Chosen in the constructor as the min/max assembly kmer size (per the
 * {@code HeterogeneousKmerSizeResolution}) capped by the shortest haplotype length.</p>
 */
protected final int kmerSize;

/**
 * Reference to the haplotype graph built from the assembly haplotypes.
 */
protected final HaplotypeGraph haplotypeGraph;

/**
 * Haplotypes included in the haplotype graph, sorted alphanumerically.
 */
private final List<Haplotype> haplotypes;

/**
 * Whether there is some variation present in the haplotype assembly
 * (i.e. more than one haplotype and no cycles in the graph).
 */
private final boolean hasVariation;


/**
 * Count of reads that anchored somewhere on the graph.
 *
 * <p>Used for debugging purposes.</p>
 */
private int anchoredReads = 0;

/**
 * Count of reads that didn't anchor anywhere on the graph.
 *
 * <p>Used for debugging purposes.</p>
 */
private int nonAnchoredReads = 0;

/**
 * Pair-hmm implementation to use to calculate read likelihoods.
 */
private final FlexibleHMM hmm;

/**
 * Maximum likelihood difference between the reference haplotype and the best alternative haplotype.
 *
 * <p>If the difference is greater for a read, the reference haplotype likelihood is increased in order
 * to not go beyond this limit.</p>
 */
protected final double log10globalReadMismappingRate;

// Finds event blocks (variant clusters plus their crossing routes) on the haplotype graph;
// null when the region has no usable variation (cycles or no reference route).
protected final EventBlockFinder eventBlockSearchEngine;
||||
/**
 * Constructs a new engine based on the results of the assembly.
 *
 * @param assemblyResultSet assembly-result set.
 * @param hmm fast-hmm implementation to use.
 * @param log10globalReadMismappingRate maximum cost for the reference haplotype vs the best alternative available;
 *                                      must be strictly negative (log10 scale).
 * @param heterogeneousKmerSizeResolution multi-kmersize dataset resolution policy.
 * @throws NullPointerException if any argument is null.
 * @throws IllegalArgumentException if log10globalReadMismappingRate &gt;= 0.
 */
public GraphBasedLikelihoodCalculationEngineInstance(final AssemblyResultSet assemblyResultSet, final FlexibleHMM hmm, final double log10globalReadMismappingRate, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) {
    if (heterogeneousKmerSizeResolution == null) throw new NullPointerException("the kmerSize resolution cannot be null");
    if (assemblyResultSet == null) throw new NullPointerException("the assembly result set cannot be null");
    if (hmm == null) throw new NullPointerException("the fast-hmm component cannot be null");
    if (log10globalReadMismappingRate >= 0)
        throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero");

    this.hmm = hmm;
    this.log10globalReadMismappingRate = log10globalReadMismappingRate;

    // Sort a private copy so downstream processing order is deterministic.
    haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList());
    Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);

    // make sure that kmerSize is not bigger than the smallest haplotype. It can well happen when there are cycles and kmerSize inflates.
    final Haplotype referenceHaplotype = assemblyResultSet.getReferenceHaplotype();
    int minHaplotypeLength = referenceHaplotype.length();
    for (final Haplotype h : haplotypes)
        if (minHaplotypeLength > h.length())
            minHaplotypeLength = h.length();

    // Determine the kmerSize to use for the unified haplotype assembly graph.
    kmerSize = Math.min(minHaplotypeLength,
            heterogeneousKmerSizeResolution.useMaximum() ? assemblyResultSet.getMaximumKmerSize() : assemblyResultSet.getMinimumKmerSize());

    haplotypeGraph = new HaplotypeGraph(kmerSize,haplotypes);


    if (haplotypeGraph.hasCycles())
        Utils.warnUser(logger, "cycle caused at merging haplotypes with different kmerSizes: active region " + assemblyResultSet.getRegionForGenotyping() + " will be skipped");

    //TODO haplpotypeGraph.getReferenceSourceVertex() == null
    //TODO Is a quick patch to ignore cases where the trimming has rendered kmerSize so big that is bigger than the haplotype
    //TODO and reduction to the minimum haplotype size result in no unique kmers.
    //TODO the actual solution: we need to impose a maximum trimming at least for Graph-based HC runs as we are loosing
    //TODO a bit of sensitivity as trimming results in lack of unique kmers.
    if (haplotypeGraph.hasCycles() || haplotypeGraph.getReferenceHaplotype() == null) {
        // Degenerate region: mark as no-variation and leave the event-block engine unset.
        hasVariation = false;
        eventBlockSearchEngine = null;
        return;
    }

    haplotypeGraph.mergeCommonChains();
    //TODO recover dangling ends. Did not work the last time I tried but may be worth to retry.
    //haplotypeGraph.recoverDanglingTails(-1);
    logger.debug("using haplotype graph with kmerSize " + haplotypeGraph.getKmerSize());

    // Variation requires an acyclic graph and at least one alternative haplotype besides the reference.
    hasVariation = !haplotypeGraph.hasCycles() && haplotypeGraph.getHaplotypes().size() > 1;

    eventBlockSearchEngine = new EventBlockFinder(haplotypeGraph);
}
|
||||
/**
|
||||
* Determines whether based on result from assembly and the relevant user options we can reuse th existing
|
||||
*
|
||||
* @param assemblyResultSet assembly result set.
|
||||
* @param kmerSize intended kmerSize for the haplotype graph.
|
||||
* @param heterogeneousKmerSizeResolution user instruction as to how to resolve situation where we have haplotypes comming from different kmer sizes.
|
||||
* @return {@code true} iff we can reuse an existing read-threading graph with that kmerSize in the assembly result set.
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
private static boolean canReuseReadThreadingGraphAsHaplotypeGraph(final AssemblyResultSet assemblyResultSet, final int kmerSize, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) {
|
||||
return !assemblyResultSet.wasTrimmed() && (!assemblyResultSet.hasMultipleKmerSizes() || heterogeneousKmerSizeResolution.combinesKmerSizes()) &&
|
||||
assemblyResultSet.getUniqueReadThreadingGraph(kmerSize) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the underlying haplotype graph assembly contains any variation worth analyzing.
|
||||
*
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean hasVariation() {
|
||||
return hasVariation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the
|
||||
* active region assembly process.
|
||||
*
|
||||
* @param haplotypes to evaluate.
|
||||
* @param perSampleReadList the input read sets stratified per sample.
|
||||
*
|
||||
* @throws NullPointerException if either parameter is {@code null}.
|
||||
*
|
||||
* @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}.
|
||||
* The value maps can be potentially empty though.
|
||||
*/
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(
|
||||
final List<Haplotype> haplotypes,
|
||||
final Map<String, List<GATKSAMRecord>> perSampleReadList) {
|
||||
// General preparation on the input haplotypes:
|
||||
Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);
|
||||
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(haplotypes.size());
|
||||
for (final Haplotype haplotype : haplotypes)
|
||||
alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference()));
|
||||
|
||||
// The actual work:
|
||||
final HashMap<String, PerReadAlleleLikelihoodMap> result = new HashMap<>(perSampleReadList.size());
|
||||
for (final Map.Entry<String, List<GATKSAMRecord>> e : perSampleReadList.entrySet()) {
|
||||
final String sample = e.getKey();
|
||||
final List<GATKSAMRecord> reads = e.getValue();
|
||||
final Set<GATKSAMRecord> mayNeedAdjustment = new HashSet<>(reads.size());
|
||||
// Get the cost/likelihood of each read at relevant subpaths on the tree:
|
||||
final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment);
|
||||
// Create the resulting per-read maps:
|
||||
final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions);
|
||||
result.put(sample, prallm);
|
||||
}
|
||||
logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + "");
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Prints a graph into a dot file.
|
||||
*
|
||||
* @param fileName name of the output file.
|
||||
*/
|
||||
public void printGraph(final String fileName) {
|
||||
if (haplotypeGraph != null)
|
||||
haplotypeGraph.printGraph(fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the kmerSize the engine is using to match read vs graph kmers thus reducing computation.
|
||||
*
|
||||
* @return greater than 0.
|
||||
*/
|
||||
public int getKmerSize() {
|
||||
return kmerSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tells whether the underlying haplotype graph contained cycles.
|
||||
*
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean hasCycles() {
|
||||
// It is set to null if it contained cycles.
|
||||
return haplotypeGraph == null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Builds the result per-read allele likelihood map.
|
||||
*
|
||||
* @param haplotypes haplotypes to process.
|
||||
* @param costsEndingByVertex Read vs haplotype graph subpaths cost indexed by ending vertex.
|
||||
* @param alleleVersions map between haplotypes and the corresponding allele.
|
||||
* @return never {@code null} although perhaps empty.
|
||||
*/
|
||||
protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap(
|
||||
final Collection<Haplotype> haplotypes,
|
||||
final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsEndingByVertex, final Map<Haplotype, Allele> alleleVersions) {
|
||||
|
||||
final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap();
|
||||
if (haplotypeGraph == null)
|
||||
return result;
|
||||
final Map<GATKSAMRecord, Double> maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10);
|
||||
final Set<Haplotype> supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes());
|
||||
supportedHaplotypes.retainAll(haplotypes);
|
||||
for (final Haplotype haplotype : supportedHaplotypes)
|
||||
calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex);
|
||||
|
||||
//TODO Does not seem to be needed in practice:
|
||||
//TODO furhter testing/evaluation required before removing it completely.
|
||||
//makeLikelihoodAdjustment(alleleVersions, result, maxAlternativeAlleleLogLk.keySet(), maxAlternativeAlleleLogLk);
|
||||
applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Work done per haplotype to build the result per-read allele likelihood map.
 *
 * <p>
 * Basically for each haplotype we go through its path in the graph collecting all the read costs that we find
 * on the way. For each read present we add up all its costs resulting in a single value per read, i.e. its
 * "likelihood".
 * </p>
 *
 * @param haplotype the target haplotype.
 * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output.
 * @param result target where to add the read-vs-haplotype likelihoods.
 * @param maxAlleleLogLk where to place the maximum likelihood achieved on any haplotype for each read.
 * @param costsEndingByVertex read costs sorted by their end vertex.
 */
private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype,
                                                                    final Map<Haplotype, Allele> alleleVersions,
                                                                    final PerReadAlleleLikelihoodMap result,
                                                                    final Map<GATKSAMRecord, Double> maxAlleleLogLk,
                                                                    final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsEndingByVertex) {
    final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype);
    final Set<MultiDeBruijnVertex> haplotypeVertices = haplotypeRoute.vertexSet();
    // Accumulated cost per read found along this haplotype's route.
    final Map<GATKSAMRecord, ReadCost> readCostByRead = new HashMap<>();
    final Set<MultiDeBruijnVertex> visitedVertices = new HashSet<>(haplotypeVertices.size());
    final List<MultiSampleEdge> edgeList = haplotypeRoute.getEdges();
    MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex();
    Route<MultiDeBruijnVertex, MultiSampleEdge> pathSoFar = new Route<>(currentVertex, haplotypeGraph);
    final Iterator<MultiSampleEdge> edgeIterator = edgeList.iterator();
    // Walk the haplotype route vertex by vertex; at each vertex fold in the costs of
    // read segments that end there (when their path is a suffix of the path walked so far).
    while (true) {
        visitedVertices.add(currentVertex);
        final Set<ReadSegmentCost> finishingAtElementCostSet = costsEndingByVertex.get(currentVertex);
        updateReadCosts(readCostByRead, visitedVertices, pathSoFar, finishingAtElementCostSet);
        if (!edgeIterator.hasNext()) break;
        final MultiSampleEdge nextEdge = edgeIterator.next();
        pathSoFar = new Route<>(pathSoFar, nextEdge);
        currentVertex = pathSoFar.getLastVertex();
    }

    // Deterministic ordering before pushing results into the likelihood map.
    final List<ReadCost> readCosts = new ArrayList<>(readCostByRead.values());
    Collections.sort(readCosts, ReadCost.COMPARATOR);
    for (final ReadCost rc : readCosts)
        result.add(rc.read, alleleVersions.get(haplotype), rc.cost);

    // Update the per-read running maximum across all haplotypes processed so far.
    for (final ReadCost rc : readCosts) {
        final Double currentMax = maxAlleleLogLk.get(rc.read);
        if (currentMax == null || currentMax < rc.cost)
            maxAlleleLogLk.put(rc.read, rc.cost);
    }
}
|
||||
/**
|
||||
* Update the read cost based on the path cost found at a vertex.
|
||||
*
|
||||
* @param readCosts collection of read costs so far
|
||||
* @param visitedVertices visited vertices collection.
|
||||
* @param pathSoFar the haplotype path visited so far.
|
||||
* @param finishingAtElementCostSet collection of path cost to process
|
||||
*/
|
||||
private void updateReadCosts(final Map<GATKSAMRecord, ReadCost> readCosts,
|
||||
final Set<MultiDeBruijnVertex> visitedVertices,
|
||||
final Route<MultiDeBruijnVertex, MultiSampleEdge> pathSoFar,
|
||||
final Set<ReadSegmentCost> finishingAtElementCostSet) {
|
||||
if (finishingAtElementCostSet != null) {
|
||||
for (final ReadSegmentCost pc : finishingAtElementCostSet) {
|
||||
if (!visitedVertices.contains(pc.path.getFirstVertex()))
|
||||
continue;
|
||||
if (!pathSoFar.isSuffix(pc.path))
|
||||
continue;
|
||||
ReadCost rc = readCosts.get(pc.read);
|
||||
if (rc == null)
|
||||
readCosts.put(pc.read, rc = new ReadCost(pc.read));
|
||||
rc.cost += pc.cost;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Likelihood penalty applied (in {@link #makeLikelihoodAdjustment}) to a haplotype that has no
 * reported likelihood for a read, relative to the worst reported one.
 */
private static final int UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY = -3;
|
||||
/**
 * Re-scales all haplotype vs read likelihoods so that for each read the best haplotype has likelihood 0.
 *
 * <p>Haplotypes with no reported likelihood for a read receive the worst reported (finite) one minus
 * {@link #UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY}.</p>
 *
 * <p>Currently unused; kept pending further evaluation (see caller's TODO).</p>
 *
 * @param alleleVersions map between input haplotypes and output alleles.
 * @param result where to change the likelihoods.
 * @param mayNeedAdjustment set of reads that might need adjustment. Others might be ignored.
 * @param maxAlternative output map from each read to the maximum alternative haplotype likelihood.
 */
@SuppressWarnings("unused")
private void makeLikelihoodAdjustment(final Map<Haplotype, Allele> alleleVersions,
                                      final PerReadAlleleLikelihoodMap result,
                                      final Set<GATKSAMRecord> mayNeedAdjustment,
                                      final Map<GATKSAMRecord, Double> maxAlternative) {
    final Map<GATKSAMRecord, Map<Allele, Double>> map = result.getLikelihoodReadMap();

    for (final GATKSAMRecord read : mayNeedAdjustment) {
        final Map<Allele, Double> existingLikelihoods = map.get(read);
        if (existingLikelihoods != null) {
            // First pass: find both the best and the worst finite likelihood across reported alleles.
            Allele bestAllele = null;
            double worstRelativeLikelihood = 0;
            double bestRelativeLikelihood = Double.NEGATIVE_INFINITY;
            for (final Map.Entry<Allele, Double> entry : map.get(read).entrySet()) {
                final double candidateRelativeLikelihood = entry.getValue();
                if (candidateRelativeLikelihood > bestRelativeLikelihood) {
                    bestAllele = entry.getKey();
                    bestRelativeLikelihood = candidateRelativeLikelihood;
                }
                // -Infinity entries are excluded from the "worst" so the penalty base stays finite.
                if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood)
                    worstRelativeLikelihood = candidateRelativeLikelihood;
            }

            // Unreported haplotypes are assigned the worst likelihood plus a fixed penalty.
            worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY;
            if (bestAllele == null)
                throw new IllegalStateException("No best allele for read " + read.getReadName());
            final double bestLikelihood = 0.0; // the best becomes zero.
            maxAlternative.put(read, bestLikelihood);
            // Second pass: shift every allele's likelihood so the best one lands on bestLikelihood (0).
            for (final Map.Entry<Haplotype, Allele> entry : alleleVersions.entrySet()) {
                final Allele a = entry.getValue();
                final Double relativeLikelihoodO = existingLikelihoods.get(a);
                final double relativeLikelihood = relativeLikelihoodO == null ? worstRelativeLikelihood : relativeLikelihoodO;
                final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood;
                // A positive log10 likelihood would mean probability > 1 — an invariant violation.
                if (likelihood > 0)
                    throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName());
                existingLikelihoods.put(a, likelihood);
            }
        }
    }
}
|
||||
/**
|
||||
* Makes sure that the reference allele likelihood is not too much smaller that the best alternative allele.
|
||||
* The justification of this constraint is explained in
|
||||
* {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}.
|
||||
*
|
||||
* @param alleleVersions correspondence between input haplotypes and output alleles.
|
||||
* @param result the target result map.
|
||||
* @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele.
|
||||
*/
|
||||
private void applyGlobalReadMismappingRate(final Map<Haplotype, Allele> alleleVersions,
|
||||
final PerReadAlleleLikelihoodMap result,
|
||||
final Map<GATKSAMRecord, Double> maxAlleleLogLk) {
|
||||
if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) {
|
||||
final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype());
|
||||
for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : result.getLikelihoodReadMap().entrySet()) {
|
||||
final GATKSAMRecord read = entry.getKey();
|
||||
final Map<Allele, Double> likelihoods = entry.getValue();
|
||||
final Double maxLogLk = maxAlleleLogLk.get(read);
|
||||
if (maxAlleleLogLk == null) continue;
|
||||
final Double referenceLogLk = likelihoods.get(referenceAllele);
|
||||
final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate;
|
||||
if (referenceLogLk == null || referenceLogLk < minReferenceLogLk)
|
||||
likelihoods.put(referenceAllele, minReferenceLogLk);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates path costs for a set of reads.
|
||||
* <p/>
|
||||
* <p>
|
||||
* The resulting map has one entry per read, where the read is the key and the value list of path-cost sets.
|
||||
* Each element in that list corresponds to an event block. Each path cost in one of those sets indicate the
|
||||
* likelihood (cost) of traversing a possible path across the event block using that read.
|
||||
* </p>
|
||||
*
|
||||
* @param reads reads to analyze.
|
||||
* @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
protected Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> calculatePathCostsByRead(
|
||||
final List<GATKSAMRecord> reads, final Set<GATKSAMRecord> mayNeedAdjustment) {
|
||||
final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> result = new HashMap<>(reads.size());
|
||||
if (!hasVariation)
|
||||
return Collections.emptyMap();
|
||||
for (final GATKSAMRecord r : reads) {
|
||||
calculatePathCostsByRead(r, mayNeedAdjustment, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Calculates path costs for a single read.
 *
 * <p>Reads that cannot be anchored anywhere on the graph fall back to a full classic
 * pair-HMM comparison against every haplotype.</p>
 *
 * @param read target read.
 * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment.
 * @param result map where to add the result, indexed by segment-ending vertex.
 */
private void calculatePathCostsByRead(final GATKSAMRecord read, final Set<GATKSAMRecord> mayNeedAdjustment,
                                      final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> result) {

    final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph);
    // The hmm must have the read loaded before any likelihood calculation below.
    hmm.loadRead(read);
    if (!anchoring.isAnchoredSomewhere()) {
        // Cannot anchor, so go the traditional pair-HMM way against full haplotypes.
        defaultToRegularPairHMM(anchoring, result);
        nonAnchoredReads++;
        return;
    }

    calculateReadSegmentCosts(anchoring, hmm, result);

    // Imperfect anchoring leaves dangling read ends that need their own cost calculation.
    if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result);
    mayNeedAdjustment.add(read);
    anchoredReads++;
}
|
||||
/**
|
||||
* Calculates read vs haplotype likelihoods using the classic PairHMM approach.
|
||||
* <p/>
|
||||
* <p>
|
||||
* It basically compares the read with each haplotype full path without short cuts.
|
||||
* </p>
|
||||
*
|
||||
* @param anchoring anchoring information of the read.
|
||||
* @param destination where to leave the results indexed by ending veretex.
|
||||
*/
|
||||
private void defaultToRegularPairHMM(final ReadAnchoring anchoring, final Map<MultiDeBruijnVertex,Set<ReadSegmentCost>> destination) {
|
||||
|
||||
for (final Map.Entry<Haplotype, HaplotypeRoute> entry : haplotypeGraph.getHaplotypeRouteMap().entrySet()) {
|
||||
if (entry.getValue() == null) continue;
|
||||
final byte[] haplotypeBases = entry.getKey().getBases();
|
||||
hmm.loadHaplotypeBases(haplotypeBases);
|
||||
final double cost = hmm.calculateLocalLikelihood(0, anchoring.read.getReadLength(), 0, haplotypeBases.length, false);
|
||||
final ReadSegmentCost readSegmentCost = new ReadSegmentCost(anchoring.read, entry.getValue(), cost);
|
||||
addReadSegmentCost(destination, readSegmentCost);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a new read-segment-cost to an ending vertex indexed map.
|
||||
* @param destination where to add the read-segment-cost.
|
||||
* @param cost the read-segment-cost to add.
|
||||
*/
|
||||
private void addReadSegmentCost(final Map<MultiDeBruijnVertex,Set<ReadSegmentCost>> destination, final ReadSegmentCost cost) {
|
||||
final MultiDeBruijnVertex endVertex = cost.path.getLastVertex();
|
||||
Set<ReadSegmentCost> vpcSet = destination.get(endVertex);
|
||||
if (vpcSet == null)
|
||||
destination.put(endVertex, vpcSet = new HashSet<>(10));
|
||||
vpcSet.add(cost);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the likelihood cost of path section of a read across the graph.
|
||||
* <p/>
|
||||
* <p>
|
||||
* Given a read, its anchors and other unique kmer mapable to the reference path we can divide the graph
|
||||
* into event blocks: a set of one or more variations and the possible path across that block.
|
||||
* </p>
|
||||
* <p/>
|
||||
* <p>
|
||||
* The result value will have one element fo reach block. Each element is the set of all path costs (likelihoods)
|
||||
* to traverse the block using all possible paths (different haplotypes).
|
||||
* </p>
|
||||
* <p/>
|
||||
* <p>
|
||||
* The current implementation has some added complexity in order to avoid a situation in where the last part
|
||||
* of the anchored section of the read is thrown out. We first determine the last event block boundaries and we
|
||||
* make sure that we won't run over its left limit when covering for earlier event blocks.
|
||||
* </p>
|
||||
*
|
||||
* @param anchoring target read graph anchoring information.
|
||||
* @param hmm the pair-hmm calculation engine. It must have been loaded with the same {@code read} already.
|
||||
* @param destination where to add the costs.
|
||||
*/
|
||||
private void calculateReadSegmentCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> destination) {
|
||||
|
||||
final EventBlockFinder.Traversal traversal = eventBlockSearchEngine.traversal(anchoring);
|
||||
|
||||
for (final EventBlock eventBlock : traversal) {
|
||||
|
||||
// final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> acrossBlockPaths =
|
||||
// calculateAllPathsBetweenVertices(anchoring,
|
||||
// eventBlock.getSource(), eventBlock.getSink());//eventBlock.getRoutesAcross();
|
||||
|
||||
final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> acrossBlockPaths = eventBlock.getRoutesAcross();
|
||||
|
||||
int leftBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSource());
|
||||
int rightBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSink());
|
||||
calculateCostForPathSet(anchoring.read, acrossBlockPaths, hmm, leftBlockBoundaryIndex, rightBlockBoundaryIndex, true, false, null, null, destination);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Calculate path cost for a set of paths across an event block.
 *
 * @param read the target read.
 * @param acrossBlockPaths event block paths to evaluate.
 * @param hmm pair-hmm engine to use to calculate likelihoods; assumed already loaded with {@code read}.
 * @param beforeBlockReadOffset kmer offset on the read for the vertex kmer before the block.
 * @param afterBlockReadOffset kmer offset on the read for the vertex kmer after the block.
 * @param doClipping whether to perform any clipping in order to save cpu time.
 * @param includePathEnds whether to include or exclude the vertices at the very end or beginning of the paths.
 * @param prependVertex if not null, each resulting cost path will be prepended with this vertex.
 * @param appendVertex if not null, each resulting cost path will be appended with this vertex.
 * @param destination where to add the resulting read-segment costs.
 */
private void calculateCostForPathSet(
        final GATKSAMRecord read, final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> acrossBlockPaths,
        final FlexibleHMM hmm, final int beforeBlockReadOffset, final int afterBlockReadOffset,
        final boolean doClipping, final boolean includePathEnds,
        final MultiDeBruijnVertex prependVertex,
        final MultiDeBruijnVertex appendVertex,
        final Map<MultiDeBruijnVertex,Set<ReadSegmentCost>> destination) {

    // Sorted so that downstream iteration order is deterministic.
    final Set<ReadSegmentCost> readSegmentCosts = new TreeSet<>(ReadSegmentComparator.INSTANCE);

    // Translate kmer offsets into base coordinates on the read; readEnd never precedes readStart.
    final int readStart = beforeBlockReadOffset + kmerSize;
    final int readEnd = Math.max(readStart, afterBlockReadOffset + kmerSize - 1);
    final byte[][] pathBases = new byte[acrossBlockPaths.size()][];
    final CountSet pathSizes = new CountSet(acrossBlockPaths.size());
    int nextPath = 0;

    // Complete the read segment cost with the corresponding path bases
    for (final Route<MultiDeBruijnVertex, MultiSampleEdge> p : acrossBlockPaths) {
        final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN);
        pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds);
        pathSizes.add(readSegmentCost.bases.length);
        readSegmentCosts.add(readSegmentCost);
    }

    // Add the read 'path size'.
    pathSizes.add(readEnd - readStart);

    final byte[] readBases = hmm.getReadBases();

    // Perform right clipping of bases that are common to all paths and read.
    int rightClipping = !doClipping ? 0 : calculateRightClipping(readEnd, pathBases, readBases,pathSizes);

    // Calculate the costs: one local-likelihood evaluation per candidate path.
    for (final ReadSegmentCost readSegmentCost : readSegmentCosts) {
        hmm.loadHaplotypeBases(readSegmentCost.bases);
        readSegmentCost.cost = hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false);
        // Optionally extend the recorded path with the flanking anchor vertices.
        if (prependVertex != null)
            readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path);
        if (appendVertex != null)
            readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex);
        addReadSegmentCost(destination,readSegmentCost);
    }
}
|
||||
|
||||
/**
|
||||
* Determines how much we can clip away from the right side of a set of path without loosing accuracy when comparing
|
||||
* likelihood vs the read.
|
||||
*
|
||||
* @param readEnd exclusive position right after the last one of the region considered.
|
||||
* @param pathBases bases of possible path in the same event block.
|
||||
* @param readBases full length read bases.
|
||||
* @param pathSizes path size set.
|
||||
*
|
||||
* @return 0 or greater.
|
||||
*/
|
||||
private int calculateRightClipping(final int readEnd, final byte[][] pathBases,
|
||||
final byte[] readBases, final CountSet pathSizes) {
|
||||
final int maxClipping = pathSizes.size() > 1 ? 0 : Math.min(pathSizes.min(), kmerSize - 1);
|
||||
int rightClipping = 0;
|
||||
while (rightClipping < maxClipping) {
|
||||
final byte readBase = readBases[readEnd - rightClipping - 1];
|
||||
boolean dontGoFurther = false;
|
||||
for (int i = 0; !dontGoFurther && i < pathBases.length; i++)
|
||||
if (pathBases[i][pathBases[i].length - rightClipping - 1] != readBase)
|
||||
dontGoFurther = true;
|
||||
if (dontGoFurther)
|
||||
break;
|
||||
rightClipping++;
|
||||
}
|
||||
return rightClipping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates a graph path bases.
|
||||
* <p/>
|
||||
* <p>
|
||||
* When the path starts on a source vertex, all its sequence is considered as part of the path bases. For regular
|
||||
* vertices start only the suffix (last) base is considered.
|
||||
* </p>
|
||||
*
|
||||
* @param path the targeted path.
|
||||
* @param includePathEnds whether the bases included in the first and last vertex of the path should be included or excluded.
|
||||
* @return never {@code null} but perhaps a zero-length base array if the final requested path length is zero.
|
||||
*/
|
||||
//TODO this method could be moved to the Path class, but require consider how to make the API more concise.
|
||||
private byte[] eventBlockPathBases(final Path<MultiDeBruijnVertex, MultiSampleEdge> path,
|
||||
final boolean includePathEnds) {
|
||||
// We first calculate the size of the return.
|
||||
final List<MultiDeBruijnVertex> vertices = path.getVertices();
|
||||
final boolean pathStartsAtSource = haplotypeGraph.isSource(path.getFirstVertex());
|
||||
final int resultLength = includePathEnds
|
||||
? vertices.size() + (pathStartsAtSource ? path.getFirstVertex().getSequence().length - 1 : 0)
|
||||
: vertices.size() - 2;
|
||||
// Trivial empty return cases:
|
||||
if (resultLength <= 0)
|
||||
return new byte[0];
|
||||
final byte[] result = new byte[resultLength];
|
||||
if (result.length == 0) {
|
||||
return result;
|
||||
}
|
||||
// General return cases:
|
||||
final ListIterator<MultiDeBruijnVertex> it = vertices.listIterator(includePathEnds ? 0 : 1); // skip the vertex (exclusive)
|
||||
for (int i = 0; i < resultLength; i++) { // i < resultLength implicitly skips the last vertex (exclusive).
|
||||
final MultiDeBruijnVertex vertex = it.next();
|
||||
if (i == 0 && includePathEnds && pathStartsAtSource) {
|
||||
System.arraycopy(vertex.getSequence(), 0, result, 0, kmerSize);
|
||||
i = kmerSize - 1;
|
||||
} else
|
||||
result[i] = vertex.getSuffix();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the path cost of dangling ends.
|
||||
* <p/>
|
||||
* <p>
|
||||
* A dangling end is the section of the read that falls before the left anchor or after the right anchor.
|
||||
* </p>
|
||||
*
|
||||
* @param anchoring anchoring information of the read vs the haplotype assembly graph.
|
||||
* @param hmm the PairHMM engine to use to calculate likelihoods.
|
||||
* @param destination cost destination.
|
||||
*/
|
||||
private void danglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map<MultiDeBruijnVertex,Set<ReadSegmentCost>> destination) {
|
||||
if (anchoring.leftAnchorIndex > 0 || anchoring.leftAnchorIndex == 0
|
||||
&& anchoring.leftAnchorVertex.hasAmbiguousSequence())
|
||||
leftDanglingEndPathCosts(anchoring, hmm,destination);
|
||||
|
||||
if (anchoring.rightAnchorIndex < anchoring.read.getReadLength() - kmerSize)
|
||||
rightDanglingEndPathCosts(anchoring, hmm, destination);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates all relevant right dangling end path costs.
|
||||
*
|
||||
* @param anchoring the anchoring information for the read under analysis.
|
||||
* @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with
|
||||
* the same read as {@code anchoring} refers to.
|
||||
* @param destination where the place the resulting read-segment-costs.
|
||||
*/
|
||||
private void rightDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm,
|
||||
final Map<MultiDeBruijnVertex,Set<ReadSegmentCost>> destination) {
|
||||
final int readStart = anchoring.rightAnchorIndex;
|
||||
final int readEnd = anchoring.read.getReadLength() - kmerSize + 1;
|
||||
final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> haplotypeRoutes =
|
||||
extendsHaplotypeRoutesForwards(anchoring.rightAnchorVertex);
|
||||
if (haplotypeRoutes.size() >= 2)
|
||||
calculateCostForPathSet(anchoring.read,
|
||||
haplotypeRoutes, hmm, readStart, readEnd, false, true,anchoring.rightAnchorVertex,null,destination);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates all relevant left dangling end path costs.
|
||||
*
|
||||
* @param anchoring the anchoring information for the read under analysis.
|
||||
* @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with
|
||||
* the same read as {@code anchoring} refers to.
|
||||
* @param destination where the place the resulting read-segment-costs.
|
||||
*/
|
||||
private void leftDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm,
|
||||
final Map<MultiDeBruijnVertex,Set<ReadSegmentCost>> destination) {
|
||||
final int readStart = -kmerSize;
|
||||
final int readEnd = anchoring.leftAnchorIndex;
|
||||
final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> haplotypeRoutes =
|
||||
extendsHaplotypeRoutesBackwards(anchoring.leftAnchorVertex);
|
||||
if (haplotypeRoutes.size() >= 2) // if there is just one haplotype route there is no relevant variation in the dangling end.
|
||||
calculateCostForPathSet(anchoring.read, haplotypeRoutes, hmm,
|
||||
readStart, readEnd, false, true, null, anchoring.leftAnchorVertex, destination);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct haplotype routes prefixes to an anchor vertex.
|
||||
* <p/>
|
||||
* <p>
|
||||
* The output should contain a route for each haplotype that includes the input anchor vertex.
|
||||
* This route would be the prefix of the haplotype that finishes at that vertex.
|
||||
* </p>
|
||||
*
|
||||
* @param anchorVertex the target anchor vertex.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
private Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> extendsHaplotypeRoutesBackwards(
|
||||
final MultiDeBruijnVertex anchorVertex) {
|
||||
final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> result = new HashSet<>(haplotypes.size());
|
||||
for (final MultiDeBruijnVertex parent : haplotypeGraph.incomingVerticesOf(anchorVertex))
|
||||
extendsHaplotypeRoutesFrom(parent, result, false);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct haplotype routes suffix from an anchor vertex.
|
||||
* <p/>
|
||||
* <p>
|
||||
* The output should contain a route for each haplotype that includes the input anchor vertex.
|
||||
* This route would be the suffix of the haplotype that starts at that vertex.
|
||||
* </p>
|
||||
*
|
||||
* @param anchorVertex the target anchor vertex.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
private Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> extendsHaplotypeRoutesForwards(
|
||||
final MultiDeBruijnVertex anchorVertex) {
|
||||
final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> result = new HashSet<>(haplotypes.size());
|
||||
for (final MultiDeBruijnVertex parent : haplotypeGraph.outgoingVerticesOf(anchorVertex))
|
||||
extendsHaplotypeRoutesFrom(parent, result, true);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Extends from a vertex considering path furcations that are part of some valid haplotype.
 * <p/>
 * <p>
 * In other words, it will ignore subpaths that are not a valid part of an assembled haplotype.
 * </p>
 *
 * @param start start seed vertex.
 * @param result destination for found extensions.
 * @param forward whether to traverse edges forward or backwards.
 */
private void extendsHaplotypeRoutesFrom(final MultiDeBruijnVertex start, final Set<Route<MultiDeBruijnVertex, MultiSampleEdge>> result, final boolean forward) {
    // Haplotype routes that pass through the seed vertex; nothing to do if there are none.
    final Set<HaplotypeRoute> validHaplotypeRoutes = haplotypeGraph.getEnclosingHaplotypeRoutes(start);
    if (validHaplotypeRoutes.size() == 0) return;
    // BFS over (partial route, haplotype routes still compatible with it) pairs.
    final Deque<Pair<Route<MultiDeBruijnVertex, MultiSampleEdge>, Set<HaplotypeRoute>>> queue = new LinkedList<>();
    queue.add(new Pair<>(new Route<>(start, haplotypeGraph), validHaplotypeRoutes));
    while (!queue.isEmpty()) {
        final Pair<Route<MultiDeBruijnVertex, MultiSampleEdge>, Set<HaplotypeRoute>> current = queue.remove();
        final Route<MultiDeBruijnVertex, MultiSampleEdge> path = current.getFirst();
        // The growing tip of the route: last vertex going forward, first vertex going backwards.
        final MultiDeBruijnVertex vertex = forward ? path.getLastVertex() : path.getFirstVertex();
        final Set<HaplotypeRoute> validRoutes = current.getSecond();
        // If the tip reaches the end of any compatible haplotype route, the partial route is a result.
        for (final HaplotypeRoute hr : validRoutes) {
            final MultiDeBruijnVertex routeEndVertex = forward ? hr.getLastVertex() : hr.getFirstVertex();
            if (vertex.equals(routeEndVertex)) {
                result.add(path);
                break;
            }
        }
        // Try to extend the route through every neighboring vertex in the traversal direction.
        final Set<MultiDeBruijnVertex> nextVertices = forward ? haplotypeGraph.outgoingVerticesOf(vertex) :
                haplotypeGraph.incomingVerticesOf(vertex);
        for (final MultiDeBruijnVertex candidate : nextVertices) {
            extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(forward, queue, path, validRoutes, candidate);
        }
    }
}
|
||||
|
||||
/**
|
||||
* Check on an candidate vertice to exted a path.
|
||||
*
|
||||
* <p>
|
||||
* This method updates the traversal queue accordingly.
|
||||
* </p>
|
||||
*
|
||||
* @param forward whether the extension is forward, or backwards.
|
||||
* @param queue queue with open paths yet to be explored.
|
||||
* @param path path extension to evaluate.
|
||||
* @param validRoutes collection of valid haplotype routes used to discard non-informative extensions.
|
||||
* @param candidate the candidate extending vertex.
|
||||
*/
|
||||
private void extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(
|
||||
final boolean forward,
|
||||
final Deque<Pair<Route<MultiDeBruijnVertex, MultiSampleEdge>, Set<HaplotypeRoute>>> queue,
|
||||
final Route<MultiDeBruijnVertex, MultiSampleEdge> path,
|
||||
final Set<HaplotypeRoute> validRoutes, final MultiDeBruijnVertex candidate) {
|
||||
final Set<HaplotypeRoute> parentValidHaplotypes = haplotypeGraph.getEnclosingHaplotypeRoutes(candidate);
|
||||
switch (parentValidHaplotypes.size()) {
|
||||
case 0:
|
||||
return;
|
||||
case 1:
|
||||
if (validRoutes.containsAll(parentValidHaplotypes))
|
||||
queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes));
|
||||
else
|
||||
return;
|
||||
break;
|
||||
default:
|
||||
if (parentValidHaplotypes.size() == validRoutes.size() && parentValidHaplotypes.containsAll(validRoutes)) {
|
||||
queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes));
|
||||
} else {
|
||||
final Set<HaplotypeRoute> newValidHaplotypeRoutes = new HashSet<>(validRoutes.size());
|
||||
for (final HaplotypeRoute hr : validRoutes)
|
||||
if (parentValidHaplotypes.contains(hr))
|
||||
newValidHaplotypeRoutes.add(hr);
|
||||
if (newValidHaplotypeRoutes.size() == 0)
|
||||
return;
|
||||
queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), newValidHaplotypeRoutes));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<Haplotype> getHaplotypeList() {
|
||||
return new ArrayList<>(haplotypeGraph.getHaplotypes());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the haplotype graph associated with this instance.
|
||||
* @return never {@code null}
|
||||
*/
|
||||
public HaplotypeGraph getHaplotypeGraph() {
|
||||
return haplotypeGraph;
|
||||
}
|
||||
}
|
||||
|
|
@ -92,6 +92,7 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
|
|||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
|
||||
|
|
@ -154,6 +155,17 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Output(doc="File to which variants should be written")
|
||||
protected VariantContextWriter vcfWriter = null;
|
||||
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine",
|
||||
doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false)
|
||||
protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM;
|
||||
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false)
|
||||
protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN;
|
||||
|
||||
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false)
|
||||
protected PrintStream graphWriter = null;
|
||||
|
||||
|
|
@ -200,6 +212,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
*/
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
private double log10GlobalReadMismappingRate;
|
||||
|
||||
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
|
||||
|
||||
/**
|
||||
|
|
@ -290,7 +304,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
* B <= X < C
|
||||
* X >= C
|
||||
*
|
||||
* The default bands give the following GQ blocks:
|
||||
* The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks:
|
||||
*
|
||||
* [0, 0]
|
||||
* (0, 10]
|
||||
|
|
@ -304,7 +318,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false)
|
||||
protected List<Integer> GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50);
|
||||
protected List<Integer> GVCFGQBands = Arrays.asList(5, 20, 60);
|
||||
|
||||
/**
|
||||
* This parameter determines the maximum size of an indel considered as potentially segregating in the
|
||||
|
|
@ -321,6 +335,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// general advanced arguments to control haplotype caller behavior
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Users should be aware that this argument can really affect the results of the variant calling and should exercise caution.
|
||||
* Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the
|
||||
* calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values
|
||||
* tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher
|
||||
* depth to produce calls).
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false)
|
||||
protected int MIN_PRUNE_FACTOR = 2;
|
||||
|
|
@ -475,10 +496,11 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
/**
|
||||
* Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base
|
||||
* insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator).
|
||||
* VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false)
|
||||
public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
|
||||
public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// done with Haplotype caller parameters
|
||||
|
|
@ -497,10 +519,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// the genotyping engine
|
||||
private GenotypingEngine genotypingEngine = null;
|
||||
|
||||
private VariantAnnotatorEngine annotationEngine = null;
|
||||
|
||||
// fasta reference reader to supplement the edges of the reference sequence
|
||||
private CachingIndexedFastaSequenceFile referenceReader;
|
||||
protected CachingIndexedFastaSequenceFile referenceReader;
|
||||
|
||||
// reference base padding size
|
||||
private static final int REFERENCE_PADDING = 500;
|
||||
|
|
@ -524,6 +544,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
ReferenceConfidenceModel referenceConfidenceModel = null;
|
||||
|
||||
// as determined experimentally Nov-Dec 2013
|
||||
protected final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR;
|
||||
protected final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000;
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
|
|
@ -541,7 +565,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
samplesList.addAll( samples );
|
||||
// initialize the UnifiedGenotyper Engine which is used to call into the exact model
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
|
||||
// HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this?
|
||||
// HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine
|
||||
UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
|
@ -553,16 +577,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
simpleUAC.CONTAMINATION_FRACTION = 0.0;
|
||||
simpleUAC.CONTAMINATION_FRACTION_FILE=null;
|
||||
simpleUAC.CONTAMINATION_FRACTION_FILE = null;
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
if( UAC.CONTAMINATION_FRACTION_FILE !=null) {
|
||||
if( UAC.CONTAMINATION_FRACTION_FILE != null ) {
|
||||
UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger));
|
||||
}
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
||||
Set<VCFHeaderLine> headerInfo = new HashSet<>();
|
||||
|
||||
|
|
@ -589,6 +613,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently");
|
||||
headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines());
|
||||
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) {
|
||||
// a kluge to enforce the use of this indexing strategy
|
||||
if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE ||
|
||||
getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) {
|
||||
throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
}
|
||||
|
||||
try {
|
||||
vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands);
|
||||
} catch ( IllegalArgumentException e ) {
|
||||
|
|
@ -623,7 +653,6 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1;
|
||||
|
||||
// configure the global mismapping rate
|
||||
final double log10GlobalReadMismappingRate;
|
||||
if ( phredScaledGlobalReadMismappingRate < 0 ) {
|
||||
log10GlobalReadMismappingRate = - Double.MAX_VALUE;
|
||||
} else {
|
||||
|
|
@ -632,7 +661,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// create our likelihood calculation engine
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
|
||||
likelihoodCalculationEngine = createLikelihoodCalculationEngine();
|
||||
|
||||
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();
|
||||
|
||||
|
|
@ -650,6 +679,26 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
getToolkit().getGenomeLocParser());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates the appropriate likelihood calculation engine.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
private LikelihoodCalculationEngine createLikelihoodCalculationEngine() {
|
||||
switch (likelihoodEngineImplementation) {
|
||||
case PairHMM:
|
||||
return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
|
||||
case GraphBased:
|
||||
return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations);
|
||||
case Random:
|
||||
return new RandomLikelihoodCalculationEngine();
|
||||
default:
|
||||
//Note: we do not include in the error message list as it is of no grand public interest.
|
||||
throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine +
|
||||
"'. Please use one of the following instead: 'PairHMM' and 'GraphBased'.");
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// isActive
|
||||
|
|
@ -747,7 +796,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// run the local assembler, getting back a collection of information on how we should proceed
|
||||
final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
|
||||
final AssemblyResultSet assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype);
|
||||
final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping();
|
||||
|
||||
// abort early if something is out of the acceptable range
|
||||
if( ! assemblyResult.isVariationPresent() ) {
|
||||
|
|
@ -757,17 +807,26 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
if (dontGenotype) return NO_CALLS; // user requested we not proceed
|
||||
|
||||
// filter out reads from genotyping which fail mapping quality based criteria
|
||||
final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping );
|
||||
final Collection<GATKSAMRecord> filteredReads = filterNonPassingReads( regionForGenotyping );
|
||||
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
|
||||
if( assemblyResult.regionForGenotyping.size() == 0 ) {
|
||||
if( regionForGenotyping.size() == 0 ) {
|
||||
// no reads remain after filtering so nothing else to do!
|
||||
return referenceModelForNoVariation(originalActiveRegion, false);
|
||||
}
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
//logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads");
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) );
|
||||
final List<Haplotype> haplotypes = assemblyResult.getHaplotypeList();
|
||||
final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() );
|
||||
|
||||
// Calculate the likelihoods: CPU intesive part.
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap =
|
||||
likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
|
||||
// was a bad interaction between that selection and the marginalization that happens over each event when computing
|
||||
|
|
@ -776,12 +835,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
|
||||
|
||||
final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine,
|
||||
assemblyResult.haplotypes,
|
||||
haplotypes,
|
||||
stratifiedReadMap,
|
||||
perSampleFilteredReadList,
|
||||
assemblyResult.fullReferenceWithPadding,
|
||||
assemblyResult.paddedReferenceLoc,
|
||||
assemblyResult.regionForGenotyping.getLocation(),
|
||||
assemblyResult.getFullReferenceWithPadding(),
|
||||
assemblyResult.getPaddedReferenceLoc(),
|
||||
regionForGenotyping.getLocation(),
|
||||
getToolkit().getGenomeLocParser(),
|
||||
metaDataTracker,
|
||||
activeAllelesToGenotype );
|
||||
|
|
@ -789,9 +848,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// TODO -- must disable if we are doing NCT, or set the output type of ! presorted
|
||||
if ( bamWriter != null ) {
|
||||
haplotypeBAMWriter.writeReadsAlignedToHaplotypes(
|
||||
assemblyResult.haplotypes,
|
||||
assemblyResult.paddedReferenceLoc,
|
||||
assemblyResult.haplotypes,
|
||||
haplotypes,
|
||||
assemblyResult.getPaddedReferenceLoc(),
|
||||
haplotypes,
|
||||
calledHaplotypes.getCalledHaplotypes(),
|
||||
stratifiedReadMap);
|
||||
}
|
||||
|
|
@ -803,50 +862,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// no called all of the potential haplotypes
|
||||
return referenceModelForNoVariation(originalActiveRegion, false);
|
||||
} else
|
||||
return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(),
|
||||
calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping,
|
||||
return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
|
||||
calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
|
||||
stratifiedReadMap, calledHaplotypes.getCalls());
|
||||
} else {
|
||||
return calledHaplotypes.getCalls();
|
||||
}
|
||||
}
|
||||
|
||||
private final static class AssemblyResult {
|
||||
final List<Haplotype> haplotypes;
|
||||
final ActiveRegion regionForGenotyping;
|
||||
final byte[] fullReferenceWithPadding;
|
||||
final GenomeLoc paddedReferenceLoc;
|
||||
final boolean variationPresent;
|
||||
final Haplotype refHaplotype;
|
||||
|
||||
private AssemblyResult(List<Haplotype> haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) {
|
||||
this.haplotypes = haplotypes;
|
||||
this.regionForGenotyping = regionForGenotyping;
|
||||
this.fullReferenceWithPadding = fullReferenceWithPadding;
|
||||
this.paddedReferenceLoc = paddedReferenceLoc;
|
||||
this.variationPresent = variationPresent;
|
||||
|
||||
Haplotype firstRefHaplotype = null;
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
if ( h.isReference() ) {
|
||||
if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h);
|
||||
firstRefHaplotype = h;
|
||||
}
|
||||
}
|
||||
|
||||
if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes);
|
||||
this.refHaplotype = firstRefHaplotype;
|
||||
}
|
||||
|
||||
public Haplotype getRefHaplotype() {
|
||||
return refHaplotype;
|
||||
}
|
||||
|
||||
public boolean isVariationPresent() {
|
||||
return variationPresent && haplotypes.size() > 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* High-level function that runs the assembler on the active region reads,
|
||||
* returning a data structure with the resulting information needed
|
||||
|
|
@ -856,7 +879,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
* @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty)
|
||||
* @return the AssemblyResult describing how to proceed with genotyping
|
||||
*/
|
||||
protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List<VariantContext> activeAllelesToGenotype) {
|
||||
protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List<VariantContext> activeAllelesToGenotype) {
|
||||
// Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails
|
||||
|
||||
|
|
@ -867,17 +890,23 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
// Create ReadErrorCorrector object if requested - will be used within assembly engine.
|
||||
ReadErrorCorrector readErrorCorrector = null;
|
||||
if (errorCorrectReads)
|
||||
readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG,fullReferenceWithPadding);
|
||||
readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding);
|
||||
|
||||
try {
|
||||
final List<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
|
||||
if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) {
|
||||
return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc);
|
||||
} else {
|
||||
// we don't want to trim active regions, so go ahead and use the old one
|
||||
return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
}
|
||||
} catch ( Exception e ) {
|
||||
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector );
|
||||
assemblyResultSet.debugDump(logger);
|
||||
|
||||
if ( ! dontTrimActiveRegions ) {
|
||||
final ActiveRegion trimmedActiveRegion = trimActiveRegion(assemblyResultSet,activeAllelesToGenotype);
|
||||
if (trimmedActiveRegion != null)
|
||||
return trimAssemblyResultSet(assemblyResultSet, trimmedActiveRegion);
|
||||
else {
|
||||
assemblyResultSet.resetVariationPresent();
|
||||
return assemblyResultSet;
|
||||
}
|
||||
} else
|
||||
return assemblyResultSet;
|
||||
} catch ( final Exception e ) {
|
||||
// Capture any exception that might be thrown, and write out the assembly failure BAM if requested
|
||||
if ( captureAssemblyFailureBAM ) {
|
||||
final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5);
|
||||
|
|
@ -947,73 +976,89 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* Trim down the active region to just enough to properly genotype the events among the haplotypes
|
||||
*
|
||||
* @param originalActiveRegion our full active region
|
||||
* @param haplotypes the list of haplotypes we've created from assembly
|
||||
* @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty)
|
||||
* @param fullReferenceWithPadding the reference bases over the full padded location
|
||||
* @param paddedReferenceLoc the span of the reference bases
|
||||
* @return an AssemblyResult containing the trimmed active region with all of the reads we should use
|
||||
* trimmed down as well, and a revised set of haplotypes. If trimming down the active region results
|
||||
* in only the reference haplotype over the non-extended active region, returns null.
|
||||
*/
|
||||
private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion,
|
||||
final List<Haplotype> haplotypes,
|
||||
final List<VariantContext> activeAllelesToGenotype,
|
||||
final byte[] fullReferenceWithPadding,
|
||||
final GenomeLoc paddedReferenceLoc) {
|
||||
if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes");
|
||||
|
||||
EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG);
|
||||
final TreeSet<VariantContext> allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes);
|
||||
private ActiveRegion trimActiveRegion(final AssemblyResultSet resultSet, final Collection<VariantContext> activeAllelesToGenotype) {
|
||||
if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes");
|
||||
final List<Haplotype> haplotypeList = resultSet.getHaplotypeList();
|
||||
final ActiveRegion originalGenotypingRegion = resultSet.getRegionForGenotyping();
|
||||
EventMap.buildEventMapsForHaplotypes(haplotypeList, resultSet.getFullReferenceWithPadding(), resultSet.getPaddedReferenceLoc(), DEBUG);
|
||||
final TreeSet<VariantContext> allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypeList);
|
||||
allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype);
|
||||
final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion);
|
||||
|
||||
final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalGenotypingRegion, allVariantsWithinFullActiveRegion,false);
|
||||
if ( trimmedActiveRegion == null ) {
|
||||
// there were no variants found within the active region itself, so just return null
|
||||
if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)");
|
||||
return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, false);
|
||||
return null;
|
||||
}
|
||||
|
||||
// trim down the haplotypes
|
||||
final Set<Haplotype> haplotypeSet = new HashSet<>(haplotypes.size());
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
|
||||
if ( trimmed != null ) {
|
||||
haplotypeSet.add(trimmed);
|
||||
} else if ( DEBUG ) {
|
||||
logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc());
|
||||
}
|
||||
}
|
||||
|
||||
// create the final list of trimmed haplotypes
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(haplotypeSet);
|
||||
|
||||
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
|
||||
Collections.sort( trimmedHaplotypes, new HaplotypeSizeAndBaseComparator() );
|
||||
|
||||
if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size());
|
||||
if ( DEBUG ) {
|
||||
for ( final Haplotype remaining: trimmedHaplotypes ) {
|
||||
logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// trim down the reads and add them to the trimmed active region
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size());
|
||||
for( final GATKSAMRecord read : originalActiveRegion.getReads() ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
|
||||
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) {
|
||||
final List<GATKSAMRecord> trimmedReads = new ArrayList<>(originalGenotypingRegion.getReads().size());
|
||||
for( final GATKSAMRecord read : originalGenotypingRegion.getReads() ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read,
|
||||
trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() );
|
||||
if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 )
|
||||
trimmedReads.add(clippedRead);
|
||||
}
|
||||
}
|
||||
trimmedActiveRegion.clearReads();
|
||||
trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads));
|
||||
|
||||
return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true);
|
||||
return trimmedActiveRegion;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Trims a assembly result set according to the active-region trimming.
|
||||
*
|
||||
* @param resultSet the original assembly result set.
|
||||
* @param trimmedActiveRegion the trimmed active region to trim to.
|
||||
* @return the assembly result set trimmed.
|
||||
*/
|
||||
private AssemblyResultSet trimAssemblyResultSet(final AssemblyResultSet resultSet, final ActiveRegion trimmedActiveRegion) {
|
||||
if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes");
|
||||
|
||||
final List<Haplotype> haplotypeList = resultSet.getHaplotypeList();
|
||||
|
||||
// trim down the haplotypes
|
||||
final Map<Haplotype,Haplotype> originalByTrimmedHaplotypes = new HashMap<>();
|
||||
|
||||
for ( final Haplotype h : haplotypeList ) {
|
||||
final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc());
|
||||
|
||||
if ( trimmed != null ) {
|
||||
if (originalByTrimmedHaplotypes.containsKey(trimmed)) {
|
||||
if (trimmed.isReference()) {
|
||||
originalByTrimmedHaplotypes.remove(trimmed);
|
||||
originalByTrimmedHaplotypes.put(trimmed, h);
|
||||
}
|
||||
} else
|
||||
originalByTrimmedHaplotypes.put(trimmed,h);
|
||||
} else if (h.isReference())
|
||||
throw new IllegalStateException("trimming eliminates the reference haplotype");
|
||||
else if ( DEBUG ) {
|
||||
logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() +
|
||||
" because it starts with or ends with an insertion or deletion when trimmed to " +
|
||||
trimmedActiveRegion.getExtendedLoc());
|
||||
}
|
||||
}
|
||||
|
||||
// create the final list of trimmed haplotypes
|
||||
final List<Haplotype> trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet());
|
||||
|
||||
// resort the trimmed haplotypes.
|
||||
Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator());
|
||||
final Map<Haplotype,Haplotype> sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size());
|
||||
for (final Haplotype trimmed : trimmedHaplotypes)
|
||||
sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed));
|
||||
|
||||
|
||||
if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " +
|
||||
trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " +
|
||||
haplotypeList.size() + " to only " + trimmedHaplotypes.size());
|
||||
if ( DEBUG )
|
||||
for ( final Haplotype remaining: trimmedHaplotypes )
|
||||
logger.info("Remains: " + remaining + " cigar " + remaining.getCigar());
|
||||
|
||||
return resultSet.trimTo(trimmedActiveRegion,sortedOriginalByTrimmedHaplotypes);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -1039,7 +1084,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
public void onTraversalDone(Integer result) {
|
||||
if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it
|
||||
referenceConfidenceModel.close();
|
||||
likelihoodCalculationEngine.close();
|
||||
//TODO remove the need to call close here for debugging, the likelihood output stream should be managed
|
||||
//TODO (open & close) at the walker, not the engine.
|
||||
//likelihoodCalculationEngine.close();
|
||||
logger.info("Ran local assembly on " + result + " active regions");
|
||||
}
|
||||
|
||||
|
|
@ -1050,6 +1097,8 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void finalizeActiveRegion( final ActiveRegion activeRegion ) {
|
||||
if (activeRegion.isFinalized()) return;
|
||||
|
||||
if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); }
|
||||
|
||||
// Loop through the reads hard clipping the adaptor and low quality tails
|
||||
|
|
@ -1094,9 +1143,10 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
|
||||
activeRegion.clearReads();
|
||||
activeRegion.addAll(downsampledReads);
|
||||
activeRegion.setFinalized(true);
|
||||
}
|
||||
|
||||
private Set<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
private Set<GATKSAMRecord> filterNonPassingReads( final ActiveRegion activeRegion ) {
|
||||
final Set<GATKSAMRecord> readsToRemove = new LinkedHashSet<>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
|
|
@ -1107,7 +1157,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
return readsToRemove;
|
||||
}
|
||||
|
||||
private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) {
|
||||
final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1);
|
||||
final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength());
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,129 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Graph route that represent an haplotype on the haplotype assembly graph.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.com>
|
||||
*/
|
||||
public class HaplotypeRoute extends Route<MultiDeBruijnVertex,MultiSampleEdge> {
|
||||
|
||||
protected final Set<MultiDeBruijnVertex> vertexSet;
|
||||
|
||||
protected final Map<MultiDeBruijnVertex,Integer> vertexOrder;
|
||||
|
||||
protected final Set<MultiDeBruijnVertex> forkAndJoins;
|
||||
|
||||
/**
|
||||
* Constructs a HaplotypeRoute given its route.
|
||||
*
|
||||
* @param route the haplotype route.
|
||||
*/
|
||||
public HaplotypeRoute(final Route<MultiDeBruijnVertex, MultiSampleEdge> route) {
|
||||
super(route);
|
||||
vertexOrder = new LinkedHashMap<>(route.length() + 1);
|
||||
int nextOrder = 0;
|
||||
vertexOrder.put(getFirstVertex(),nextOrder++);
|
||||
for (final MultiSampleEdge edge : edgesInOrder)
|
||||
vertexOrder.put(graph.getEdgeTarget(edge), nextOrder++);
|
||||
Route<MultiDeBruijnVertex,MultiSampleEdge> currentRoute = this;
|
||||
forkAndJoins = new HashSet<>(route.length());
|
||||
while (currentRoute != null) {
|
||||
if (currentRoute.lastVertexIsForkOrJoin())
|
||||
forkAndJoins.add(currentRoute.getLastVertex());
|
||||
currentRoute = currentRoute.getPrefixRouteWithLastVertexThatIsForkOrJoin();
|
||||
}
|
||||
vertexSet = Collections.unmodifiableSet(new HashSet<>(vertexOrder.keySet()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public Route<MultiDeBruijnVertex,MultiSampleEdge> subRoute(final MultiDeBruijnVertex start, final MultiDeBruijnVertex end) {
|
||||
final Integer startOrder = vertexOrder.get(start);
|
||||
final Integer endOrder = vertexOrder.get(end);
|
||||
if (startOrder == null || endOrder == null)
|
||||
return null;
|
||||
else if (startOrder > endOrder)
|
||||
return null;
|
||||
else {
|
||||
Route<MultiDeBruijnVertex,MultiSampleEdge> result = new Route<>(start,graph);
|
||||
for (final MultiSampleEdge edge : edgesInOrder.subList(startOrder,endOrder))
|
||||
result = new Route(result,edge);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of vertex on the route.
|
||||
* @return read only, never {@code null} vertex set.
|
||||
*/
|
||||
public Set<MultiDeBruijnVertex> vertexSet() {
|
||||
return vertexSet;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the position of the vertex in the route.
|
||||
*
|
||||
* @param vertex the query vertex.
|
||||
*
|
||||
* @throws NullPointerException if {@code vertex} is {@code null}.
|
||||
*
|
||||
* @return -1 if there is no such a vertex in the route, otherwise a number between 0 and {@link #length()} - 1.
|
||||
*/
|
||||
public int getVertexPosition(final MultiDeBruijnVertex vertex) {
|
||||
final Integer result = vertexOrder.get(vertex);
|
||||
return result == null ? -1 : result;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
/**
 * How to resolve the haplotype graph when haplotypes were generated from a mixture of different kmerSizes.
 */
public enum HeterogeneousKmerSizeResolution {

    /**
     * Combine haplotypes using a haplotype graph with the largest kmerSize amongst the ones that generated some haplotype.
     */
    COMBO_MAX,

    /**
     * Combine haplotypes using a haplotype graph with the smallest kmerSize amongst the ones that generated some haplotype.
     * (FIX: javadoc previously said "largest", copy-pasted from {@link #COMBO_MAX}.)
     */
    COMBO_MIN,

    /**
     * Take just the haplotypes from the largest kmerSize that generated any.
     */
    MAX_ONLY,

    /**
     * Take just the haplotypes from the smallest kmerSize that generated any.
     */
    @SuppressWarnings("unused")
    MIN_ONLY;

    /**
     * Indicates whether we should use the maximum kmerSize for the haplotypeGraph or not.
     *
     * @return true if we need to use the maximum, false otherwise.
     */
    public boolean useMaximum() {
        switch (this) {
            case COMBO_MAX:
            case MAX_ONLY:
                return true;
            default:
                return false;
        }
    }

    /**
     * Indicates whether we should use the minimum kmerSize for the haplotypeGraph or not.
     *
     * @return true if we need to use the minimum, false otherwise.
     */
    @SuppressWarnings("unused")
    public boolean useMinimum() {
        return ! useMaximum();
    }

    /**
     * Tells whether this policy combines kmer-sizes or not.
     *
     * @return true iff it does.
     */
    public boolean combinesKmerSizes() {
        switch (this) {
            case COMBO_MAX:
            case COMBO_MIN:
                return true;
            default:
                return false;
        }
    }
}
|
||||
|
|
@ -49,8 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Fast wrapper for byte[] kmers
|
||||
|
|
@ -68,7 +66,7 @@ import java.util.Map;
|
|||
*/
|
||||
public class Kmer {
|
||||
// this values may be updated in the course of interacting with this kmer
|
||||
private byte[] bases;
|
||||
protected byte[] bases;
|
||||
protected int start;
|
||||
|
||||
// two constants
|
||||
|
|
@ -126,6 +124,16 @@ public class Kmer {
|
|||
this.hash = kmer.hash;
|
||||
}
|
||||
|
||||
/**
 * Creates the kmer obtained by dropping the first base of {@code kmer} and
 * appending {@code nextChar} at the end (a one-base shift).
 *
 * @param kmer     the source kmer; its bases are copied, not shared.
 * @param nextChar the base appended at the end of the new kmer.
 */
public Kmer(final Kmer kmer, final byte nextChar) {
    final byte[] shifted = new byte[kmer.length];
    // Copy everything except the source's first base, then append the new one.
    System.arraycopy(kmer.bases, kmer.start + 1, shifted, 0, kmer.length - 1);
    shifted[kmer.length - 1] = nextChar;
    bases = shifted;
    start = 0;
    length = kmer.length;
    hash = myHashCode(bases, start, length);
}
|
||||
|
||||
/**
|
||||
* Create a derived shallow kmer that starts at newStart and has newLength bases
|
||||
* @param newStart the new start of kmer, where 0 means that start of the kmer, 1 means skip the first base
|
||||
|
|
@ -144,6 +152,7 @@ public class Kmer {
|
|||
* @return a non-null byte[] containing length() bases of this kmer, regardless of how this kmer was created
|
||||
*/
|
||||
public byte[] bases() {
|
||||
|
||||
if ( start != 0 || bases.length != length ) {
|
||||
// update operation. Rip out the exact byte[] and update start so we don't ever do this again
|
||||
bases = Arrays.copyOfRange(bases, start, start + length);
|
||||
|
|
@ -153,6 +162,44 @@ public class Kmer {
|
|||
return bases;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Copies kmer bytes into a byte array.
|
||||
*
|
||||
* @param start first position of the kmer to copy
|
||||
* @param dest what array to copy into
|
||||
* @param offset what position the first byte to copy should go into the destination array.
|
||||
* @param length how many bytes to copy
|
||||
*
|
||||
* @throws IllegalArgumentException if <code>start</code> is negative or combined with <code>length</code> it goes
|
||||
* beyond the end of the kmer. Also if <code>length</code> is negative.
|
||||
* @throws NullPointerException if dest is <code>null</code>
|
||||
* @throws ArrayIndexOutOfBoundsException if dest does not have capacity to received the data.
|
||||
*/
|
||||
public void copyTo(final int start, final byte[] dest, final int offset, final int length) {
|
||||
if (start + length > this.length) {
|
||||
throw new IllegalArgumentException("request goes beyond end of kmer");
|
||||
}
|
||||
if (length < 0) {
|
||||
throw new IllegalArgumentException("requested length cannot be negative");
|
||||
}
|
||||
System.arraycopy(bases,this.start + start,dest,offset,length);
|
||||
}
|
||||
|
||||
/**
 * Copies all of this kmer's bases into a byte array.
 *
 * @param dest what array to copy into
 * @param offset what position the first byte to copy should go into the destination array.
 *
 * @throws NullPointerException if dest is <code>null</code>
 * @throws ArrayIndexOutOfBoundsException if {@code dest} lacks capacity at
 *         {@code offset} for the kmer's bases (no explicit bounds check is
 *         performed here; {@code System.arraycopy} raises it).
 */
public void copyTo(final byte[] dest, final int offset) {
    System.arraycopy(bases,start,dest,offset,length);
}
|
||||
|
||||
/**
|
||||
* Backdoor method for fast base peeking: avoids copying like bases() and doesn't modify internal state.
|
||||
* Intended to be used for fast computation of neighboring kmers
|
||||
|
|
@ -219,13 +266,13 @@ public class Kmer {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Kmer{" + new String(bases()) + "}";
|
||||
return "Kmer{" + new String(bases,start,length) + "}";
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
if (o == null || !Kmer.class.isAssignableFrom(o.getClass())) return false;
|
||||
|
||||
final Kmer kmer = (Kmer) o;
|
||||
|
||||
|
|
@ -264,4 +311,23 @@ public class Kmer {
|
|||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Returns the base at offset {@code i} within this kmer.
 *
 * @param i 0-based offset from the start of the kmer; only the backing
 *          array's own bounds are enforced.
 * @return the base byte at that position.
 */
public byte base(final int i) {
    return bases[start + i];
}
|
||||
|
||||
public Kmer shift(final byte nextChar) {
|
||||
if (bases.length > start + length && bases[start + length] == nextChar) {
|
||||
return new Kmer(bases,start + 1,length);
|
||||
} else {
|
||||
final byte[] newBases = new byte[length];
|
||||
System.arraycopy(bases,start + 1,newBases,0,length - 1);
|
||||
newBases[length - 1] = nextChar;
|
||||
return new Kmer(newBases,0,length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the last base of this kmer.
 *
 * @return the byte at offset {@code length - 1} of the kmer.
 */
public byte lastBase() {
    return bases[start + length - 1];
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,461 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
|
||||
import com.sun.istack.internal.NotNull;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Represent a sequence of kmers where any two consecutive kmers overlap in kmer length - 1 elements.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.com>
|
||||
*/
|
||||
public class KmerSequence implements List<Kmer> {
|
||||
private final byte[] sequence;
|
||||
private final int start;
|
||||
private final int size;
|
||||
private final int kmerSize;
|
||||
private final int rawLength;
|
||||
|
||||
/**
 * Creates a kmer sequence from a read's base sequence.
 *
 * @param read the read to represent as a sequence of kmers.
 * @param kmerSize the kmer size.
 */
public KmerSequence(final SAMRecord read, final int kmerSize) {
    this(read.getReadBases(), kmerSize);
}
|
||||
|
||||
/**
 * Creates a kmer sequence from a haplotype's base sequence.
 *
 * @param hap the haplotype to represent as a sequence of kmers.
 * @param kmerSize the kmer size.
 */
public KmerSequence(final Haplotype hap, final int kmerSize) {
    this(hap.getBases(), kmerSize);
}
|
||||
|
||||
/**
 * Creates a kmer sequence out of a byte sequence.
 *
 * @param sequence the byte array to represent as a kmer sequence.
 * @param kmerSize the kmer size.
 */
public KmerSequence(final byte[] sequence, final int kmerSize) {
    // Math.max guards inputs shorter than kmerSize: they yield an empty sequence.
    this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length);
}
|
||||
|
||||
|
||||
/**
 * Creates a kmer sequence out of a range of a byte array.
 *
 * @param sequence the input array; must not be {@code null}.
 * @param start inclusive first position of the array that maps to the first position in the first kmer.
 * @param size number of kmers in the output.
 * @param kmerSize kmer length in bases.
 * @param rawLength the length of the range in bases.
 *
 * @throws IllegalArgumentException if {@code sequence} is {@code null}, the raw range
 *         does not fit in the array, {@code size} or {@code start} is negative, or the
 *         kmer arithmetic is inconsistent with {@code rawLength}.
 */
protected KmerSequence(final byte[] sequence, final int start, final int size, final int kmerSize, final int rawLength) {
    if (sequence == null) {
        // FIX: the message previously read "start must be 0 or greater",
        // copy-pasted from the start check below.
        throw new IllegalArgumentException("sequence cannot be null");
    }
    if (rawLength > sequence.length - start) {
        throw new IllegalArgumentException("the raw sequence length goes beyond the array capacity");
    }
    if (size < 0) {
        throw new IllegalArgumentException("the length cannot be negative");
    }
    if (start < 0) {
        throw new IllegalArgumentException("start must be 0 or greater");
    }
    if (size > 0 && size + kmerSize - 1 > rawLength) {
        throw new IllegalArgumentException(
                String.format("the kmerSize (%d) + size (%d) - 1 cannot be larger than rawLength (%d)",kmerSize,size,rawLength) );
    }
    this.sequence = sequence;
    this.start = start;
    this.size = size;
    this.kmerSize = kmerSize;
    this.rawLength = rawLength;
}
|
||||
|
||||
/**
 * Length in bases of each kmer in this sequence.
 *
 * @return the kmer size this sequence was built with.
 */
public int kmerSize() {
    return kmerSize;
}
|
||||
|
||||
public KmerSequence subsequence(final int from, final int to) {
|
||||
if (from < 0 || from > to) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (to > size) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
return new KmerSequence(sequence,this.start + from,to - from,kmerSize,rawLength - from - (size - to));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Number of kmers in the sequence.
 *
 * @return 0 or greater.
 */
@Override
public int size() {
    return size;
}
|
||||
|
||||
/**
 * Tells whether this sequence contains no kmers.
 *
 * @return {@code true} iff {@link #size()} is 0.
 */
@Override
public boolean isEmpty() {
    return size == 0;
}
|
||||
|
||||
@Override
|
||||
public boolean contains(final Object o) {
|
||||
if (o instanceof Kmer) {
|
||||
if (o instanceof MyKmer) {
|
||||
final MyKmer k = (MyKmer) o;
|
||||
if (k.bases == sequence && k.start >= start && k.length == kmerSize && k.start < start + size) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
final Kmer k = (Kmer) o;
|
||||
if (k.length != kmerSize) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < size; i++) {
|
||||
int j;
|
||||
for (j = 0; j < kmerSize; j++) {
|
||||
if (sequence[start + i + j] != k.bases[k.start + j]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == kmerSize) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@NotNull
|
||||
public Iterator<Kmer> iterator() {
|
||||
return new Iterator<Kmer>() {
|
||||
|
||||
private int offset = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return offset < size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Kmer next() {
|
||||
return new Kmer(sequence,start + offset,kmerSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Returns all kmers of this sequence in a freshly allocated array.
 *
 * @return never {@code null}.
 */
@NotNull
@Override
public Object[] toArray() {
    return toArray(new Kmer[size()]);
}
|
||||
|
||||
@Override
|
||||
@NotNull
|
||||
@SuppressWarnings("unchecked")
|
||||
public <T> T[] toArray(@NotNull final T[] a) {
|
||||
if (a == null) {
|
||||
throw new IllegalArgumentException();
|
||||
} else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) {
|
||||
throw new IllegalArgumentException();
|
||||
} else {
|
||||
T[] result;
|
||||
if (a.length < size) {
|
||||
result = (T[]) Array.newInstance(a.getClass().getComponentType(), size);
|
||||
} else {
|
||||
result = a;
|
||||
}
|
||||
for (int i = 0; i < size; i++) {
|
||||
result[i] = (T) new Kmer(sequence,start + i,kmerSize);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public boolean add(final Kmer kmer) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public boolean remove(final Object o) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
@Override
|
||||
public boolean containsAll(final Collection<?> c) {
|
||||
for (final Object o : c)
|
||||
if (!contains(o))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public boolean addAll(final Collection<? extends Kmer> c) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public boolean addAll(final int index, @NotNull final Collection<? extends Kmer> c) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public boolean removeAll(@NotNull final Collection<?> c) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public boolean retainAll(@NotNull final Collection<?> c) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public void clear() {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
@Override
|
||||
public Kmer get(final int index) {
|
||||
if (index < 0 || index >= size) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
return new Kmer(sequence,start + index,kmerSize);
|
||||
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public Kmer set(final int index, final Kmer element) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public void add(final int index, final Kmer element) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
/**
 * Unsupported; kmer sequences are immutable.
 *
 * @throws UnsupportedOperationException always.
 */
@Override
public Kmer remove(final int index) {
    throw new UnsupportedOperationException();
}
|
||||
|
||||
@Override
|
||||
public int indexOf(final Object o) {
|
||||
if (o instanceof Kmer) {
|
||||
final Kmer k = (Kmer) o;
|
||||
if (k.length != kmerSize) {
|
||||
return -1;
|
||||
}
|
||||
for (int i = 0; i < size; i++) {
|
||||
int j;
|
||||
for (j = 0; j < kmerSize; j++) {
|
||||
if (sequence[start + i + j] != k.bases[k.start + j]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == kmerSize) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int lastIndexOf(final Object o) {
|
||||
if (o instanceof Kmer) {
|
||||
final Kmer k = (Kmer) o;
|
||||
if (k.length != kmerSize) {
|
||||
return -1;
|
||||
}
|
||||
for (int i = size - 1; i >= 0; i--) {
|
||||
int j;
|
||||
for (j = kmerSize - 1; j >= 0; j--) {
|
||||
if (sequence[start + i + j] != k.bases[k.start + j]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@NotNull
|
||||
public ListIterator<Kmer> listIterator() {
|
||||
return new MyListIterator(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
@NotNull
|
||||
public ListIterator<Kmer> listIterator(final int index) {
|
||||
return new MyListIterator(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
@NotNull
|
||||
public List<Kmer> subList(final int fromIndex, final int toIndex) {
|
||||
return subsequence(fromIndex,toIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the byte array representation of the kmer sequence.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
@NotNull
|
||||
public byte[] getBytes() {
|
||||
if (start == 0 && rawLength == sequence.length)
|
||||
return sequence;
|
||||
else
|
||||
return Arrays.copyOfRange(sequence, start, rawLength + start);
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal class that implements the {@link Kmer} more efficiently
|
||||
* making reference to the sequence's own byte array.
|
||||
*/
|
||||
protected class MyKmer extends Kmer {
|
||||
|
||||
/**
|
||||
* Create a new instance give the offset in the byte array.
|
||||
* @param start the start base offset for the kmer.
|
||||
*/
|
||||
public MyKmer(final int start) {
|
||||
super(sequence,start,kmerSize);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterator implementation of Kmer elements.
|
||||
*/
|
||||
private class MyListIterator implements ListIterator<Kmer> {
|
||||
|
||||
private int i = 0;
|
||||
|
||||
/**
|
||||
* Creates a iterator at certain offset in the sequence.
|
||||
* @param idx the start position or kmer offset.
|
||||
*/
|
||||
private MyListIterator(final int idx) {
|
||||
i = idx;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Kmer next() {
|
||||
return new Kmer(sequence,start + i++,kmerSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPrevious() {
|
||||
return i > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Kmer previous() {
|
||||
return new Kmer(sequence,start + --i,kmerSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextIndex() {
|
||||
return i;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previousIndex() {
|
||||
return i - 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(final Kmer kmer) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(final Kmer kmer) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,209 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.BaseEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.BaseVertex;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.KmerSearchableGraph;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Collections;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Contains information as to how a kmer sequence maps to an (assembly) graph.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.com>
|
||||
*/
|
||||
public class KmerSequenceGraphMap<V extends BaseVertex,E extends BaseEdge> {
|
||||
|
||||
protected final KmerSequence sequence;
|
||||
protected final KmerSearchableGraph<V,E> graph;
|
||||
protected final int kmerSize;
|
||||
|
||||
private List<V> vertexList;
|
||||
|
||||
private List<V> vertexMatchOnlyList;
|
||||
|
||||
private Set<V> vertexSet;
|
||||
|
||||
private Map<V,Integer> vertexOffset;
|
||||
|
||||
//private List<VertexSegment<V>> vertexSegmentList;
|
||||
|
||||
/**
|
||||
* Constructs a new Kmer sequence graph map give the graph and sequence.
|
||||
*
|
||||
* @param g the graph to map to.
|
||||
* @param s the sequence to map.
|
||||
* @throws NullPointerException if either the graph or the sequence is <code>null</code>.
|
||||
* @throws IllegalArgumentException if the kmer sizes of the input graph and sequence are not the same.
|
||||
*/
|
||||
public KmerSequenceGraphMap(final KmerSearchableGraph<V, E> g, final KmerSequence s) {
|
||||
if (s.kmerSize() != g.getKmerSize()) {
|
||||
throw new IllegalArgumentException("kmer size for the graph (" + g.getKmerSize() + ") and the sequence (" + s.kmerSize() + ") are different");
|
||||
}
|
||||
sequence = s;
|
||||
graph = g;
|
||||
kmerSize = s.kmerSize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Vertices that form part of the kmer sequence path along the graph.
|
||||
*
|
||||
* <p>The ith position in the resulting list corresponds to the ith kmer in the sequence</p>.
|
||||
*
|
||||
* <p>
|
||||
* The resulting list will contain null values for those kmers where there is no unique kmer match in the
|
||||
* graph.
|
||||
* </p>
|
||||
*
|
||||
* @return never {@code null}
|
||||
*/
|
||||
public List<V> vertexList() {
|
||||
if (vertexList == null)
|
||||
buildVertexCollections();
|
||||
return vertexList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Vertices that form part of the kmer sequence path along the graph.
|
||||
*
|
||||
* <p> Only contains unique kmer vertices where the non-unique ones have been sliced out from the list</p>
|
||||
*
|
||||
* @return never {@code null}
|
||||
*/
|
||||
public List<V> vertexMatchOnlyList() {
|
||||
if (vertexMatchOnlyList == null) {
|
||||
buildVertexCollections();
|
||||
}
|
||||
return vertexMatchOnlyList;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a map from vertices to their kmer offset in the kmer sequence.
|
||||
* @return never {@code null}
|
||||
*/
|
||||
public Map<V,Integer> vertexOffset() {
|
||||
if (vertexOffset == null) {
|
||||
buildVertexCollections();
|
||||
}
|
||||
return vertexOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set of all vertices with unique kmers in the kmer sequence.
|
||||
* <p>
|
||||
* This structure is more appropriate to query whether a vertex belong or not to such a set.
|
||||
* </p>
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public Set<V> vertexSet() {
|
||||
if (vertexSet == null) {
|
||||
buildVertexCollections();
|
||||
}
|
||||
return vertexSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates vertex structures.
|
||||
*/
|
||||
protected void buildVertexCollections() {
|
||||
@SuppressWarnings("unchecked")
|
||||
final V[] result = (V[]) new BaseVertex[sequence.size()];
|
||||
final Set<V> set = new HashSet<>(sequence.size());
|
||||
final Map<V,Integer> posMap = new HashMap<>(sequence.size());
|
||||
@SuppressWarnings("unchecked")
|
||||
final V[] matchOnly = (V[]) new BaseVertex[sequence.size()];
|
||||
int next = 0;
|
||||
int matchOnlyNext = 0;
|
||||
for (int i = 0; i < sequence.size(); i++) {
|
||||
final Kmer k = sequence.get(i);
|
||||
final V v = graph.findKmer(k);
|
||||
if (v != null) {
|
||||
set.add(v);
|
||||
posMap.put(v,i);
|
||||
matchOnly[matchOnlyNext++] = v;
|
||||
}
|
||||
result[next++] = v;
|
||||
}
|
||||
vertexList = Arrays.asList(result);
|
||||
vertexMatchOnlyList = Arrays.asList(Arrays.copyOf(matchOnly,matchOnlyNext));
|
||||
vertexSet = Collections.unmodifiableSet(set);
|
||||
vertexOffset = Collections.unmodifiableMap(posMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of kmers in the sequence that do not have a unique mapping on the graph.
|
||||
* @return never {@code null}
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public List<Kmer> missingKmers() {
|
||||
if (vertexList == null) {
|
||||
buildVertexCollections();
|
||||
}
|
||||
if (vertexList.size() == vertexMatchOnlyList.size()) {
|
||||
return Collections.emptyList();
|
||||
} else {
|
||||
final List<Kmer> result = new ArrayList<>(vertexList.size() - vertexMatchOnlyList.size());
|
||||
final int size = sequence.size();
|
||||
for (int i = 0; i < vertexList.size(); i++) {
|
||||
if (vertexList.get(i) == null) {
|
||||
result.add(sequence.get(i));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -46,535 +46,47 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator;
|
||||
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
|
||||
/**
|
||||
* Common interface for assembly-haplotype vs reads likelihood engines.
|
||||
*/
|
||||
public interface LikelihoodCalculationEngine {
|
||||
|
||||
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
|
||||
enum Implementation {
|
||||
/**
|
||||
* Classic full pair-hmm all haplotypes vs all reads.
|
||||
*/
|
||||
PairHMM,
|
||||
|
||||
private final byte constantGCP;
|
||||
private final double log10globalReadMismappingRate;
|
||||
private final boolean DEBUG;
|
||||
/**
|
||||
* Graph-base likelihoods.
|
||||
*/
|
||||
GraphBased,
|
||||
|
||||
private final PairHMM.HMM_IMPLEMENTATION hmmType;
|
||||
private final boolean noFpga;
|
||||
|
||||
private final ThreadLocal<PairHMM> pairHMM = new ThreadLocal<PairHMM>() {
|
||||
@Override
|
||||
protected PairHMM initialValue() {
|
||||
switch (hmmType) {
|
||||
case EXACT: return new Log10PairHMM(true);
|
||||
case ORIGINAL: return new Log10PairHMM(false);
|
||||
case LOGLESS_CACHING:
|
||||
if (noFpga || !CnyPairHMM.isAvailable())
|
||||
return new LoglessPairHMM();
|
||||
else
|
||||
return new CnyPairHMM();
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
|
||||
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
|
||||
private final PrintStream likelihoodsStream;
|
||||
|
||||
public enum PCR_ERROR_MODEL {
|
||||
/** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */
|
||||
NONE,
|
||||
/** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */
|
||||
AGGRESSIVE,
|
||||
/** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */
|
||||
CONSERVATIVE
|
||||
/**
|
||||
* Random likelihoods, used to establish a baseline benchmark for other meaningful implementations.
|
||||
*/
|
||||
Random
|
||||
}
|
||||
|
||||
private final PCR_ERROR_MODEL pcrErrorModel;
|
||||
|
||||
/**
|
||||
* The expected rate of random sequencing errors for a read originating from its true haplotype.
|
||||
* Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the
|
||||
* active region assembly process.
|
||||
*
|
||||
* For example, if this is 0.01, then we'd expect 1 error per 100 bp.
|
||||
* @param assemblyResultSet the input assembly results.
|
||||
* @param perSampleReadList the input read sets stratified per sample.
|
||||
*
|
||||
* @throws NullPointerException if either parameter is {@code null}.
|
||||
*
|
||||
* @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}.
|
||||
* The value maps can be potentially empty though.
|
||||
*/
|
||||
private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
|
||||
|
||||
/**
|
||||
* Create a new LikelihoodCalculationEngine using provided parameters and hmm to do its calculations
|
||||
*
|
||||
* @param constantGCP the gap continuation penalty to use with the PairHMM
|
||||
* @param debug should we emit debugging information during the calculation?
|
||||
* @param hmmType the type of the HMM to use
|
||||
* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of
|
||||
* -3 means that the chance that a read doesn't actually belong at this
|
||||
* location in the genome is 1 in 1000. The effect of this parameter is
|
||||
* to cap the maximum likelihood difference between the reference haplotype
|
||||
* and the best alternative haplotype by -3 log units. So if the best
|
||||
* haplotype is at -10 and this parameter has a value of -3 then even if the
|
||||
* reference haplotype gets a score of -100 from the pairhmm it will be
|
||||
* assigned a likelihood of -13.
|
||||
* @param noFpga disable FPGA acceleration
|
||||
*/
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
|
||||
this.hmmType = hmmType;
|
||||
this.constantGCP = constantGCP;
|
||||
this.DEBUG = debug;
|
||||
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
|
||||
this.noFpga = noFpga;
|
||||
this.pcrErrorModel = pcrErrorModel;
|
||||
|
||||
initializePCRErrorModel();
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
try {
|
||||
likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME)));
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
likelihoodsStream = null;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if ( likelihoodsStream != null ) likelihoodsStream.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
|
||||
*
|
||||
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
|
||||
* against the set of haplotypes
|
||||
*
|
||||
* @param haplotypes a non-null list of haplotypes
|
||||
* @param perSampleReadList a mapping from sample -> reads
|
||||
*/
|
||||
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
for( final GATKSAMRecord read : sample.getValue() ) {
|
||||
final int readLength = read.getReadLength();
|
||||
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
|
||||
}
|
||||
}
|
||||
int Y_METRIC_LENGTH = 0;
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final int haplotypeLength = h.getBases().length;
|
||||
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
|
||||
}
|
||||
|
||||
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
}
|
||||
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
|
||||
// configure the HMM
|
||||
initializePairHMM(haplotypes, perSampleReadList);
|
||||
|
||||
// Add likelihoods for each sample's reads to our stratifiedReadMap
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
|
||||
|
||||
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), map);
|
||||
}
|
||||
|
||||
return stratifiedReadMap;
|
||||
}
|
||||
|
||||
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
|
||||
// first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time)
|
||||
final BatchPairHMM batchPairHMM = (pairHMM.get() instanceof BatchPairHMM) ? (BatchPairHMM)pairHMM.get() : null;
|
||||
final Vector<GATKSAMRecord> batchedReads = new Vector<GATKSAMRecord>(reads.size());
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(numHaplotypes);
|
||||
Allele refAllele = null;
|
||||
for ( final Haplotype haplotype : haplotypes ) {
|
||||
final Allele allele = Allele.create(haplotype, true);
|
||||
alleleVersions.put(haplotype, allele);
|
||||
if ( haplotype.isReference() ) refAllele = allele;
|
||||
}
|
||||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
|
||||
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
|
||||
final byte[] readQuals = read.getBaseQualities().clone();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
|
||||
|
||||
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
|
||||
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
|
||||
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
|
||||
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
|
||||
}
|
||||
|
||||
if ( batchPairHMM != null ) {
|
||||
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
|
||||
batchedReads.add(read);
|
||||
continue;
|
||||
}
|
||||
|
||||
// keep track of the reference likelihood and the best non-ref likelihood
|
||||
double refLog10l = Double.NEGATIVE_INFINITY;
|
||||
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
|
||||
|
||||
// iterate over all haplotypes, calculating the likelihood of the read for each haplotype
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
final boolean isFirstHaplotype = jjj == 0;
|
||||
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
|
||||
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
|
||||
haplotype.getBaseString(),
|
||||
new String(readBases),
|
||||
SAMUtils.phredToFastq(readQuals),
|
||||
SAMUtils.phredToFastq(readInsQuals),
|
||||
SAMUtils.phredToFastq(readDelQuals),
|
||||
SAMUtils.phredToFastq(overallGCP),
|
||||
log10l);
|
||||
}
|
||||
|
||||
if ( haplotype.isNonReference() )
|
||||
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
|
||||
else
|
||||
refLog10l = log10l;
|
||||
|
||||
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
|
||||
}
|
||||
|
||||
// ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
|
||||
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
|
||||
// that are very divergent from reference, but are supported by only one read. In effect
|
||||
// we capping how badly scoring the reference can be for any read by the chance that the read
|
||||
// itself just doesn't belong here
|
||||
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
|
||||
if ( refLog10l < (worstRefLog10Allowed) ) {
|
||||
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
|
||||
}
|
||||
}
|
||||
|
||||
if ( batchPairHMM != null ) {
|
||||
for( final GATKSAMRecord read : batchedReads ) {
|
||||
double refLog10l = Double.NEGATIVE_INFINITY;
|
||||
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
|
||||
final double[] likelihoods = batchPairHMM.batchGetResult();
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
final double log10l = likelihoods[jjj];
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
|
||||
final byte[] readQuals = read.getBaseQualities().clone();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities();
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
|
||||
// TODO -- why is Q18 hard-coded here???
|
||||
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
}
|
||||
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
|
||||
haplotype.getBaseString(),
|
||||
new String(read.getReadBases()),
|
||||
SAMUtils.phredToFastq(readQuals),
|
||||
SAMUtils.phredToFastq(readInsQuals),
|
||||
SAMUtils.phredToFastq(readDelQuals),
|
||||
SAMUtils.phredToFastq(overallGCP),
|
||||
log10l);
|
||||
}
|
||||
|
||||
if ( haplotype.isNonReference() )
|
||||
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
|
||||
else
|
||||
refLog10l = log10l;
|
||||
|
||||
|
||||
perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l);
|
||||
}
|
||||
|
||||
final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
|
||||
if ( refLog10l < (worstRefLog10Allowed) ) {
|
||||
perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return perReadAlleleLikelihoodMap;
|
||||
}
|
||||
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering,
|
||||
final boolean normalize ) {
|
||||
return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize);
|
||||
}
|
||||
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering,
|
||||
final boolean normalize) {
|
||||
|
||||
final int numHaplotypes = alleleOrdering.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
final Allele iii_allele = alleleOrdering.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
final Allele jjj_allele = alleleOrdering.get(jjj);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix;
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
|
||||
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
|
||||
final int numHaplotypes = likelihoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
|
||||
int index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj];
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
|
||||
index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++];
|
||||
}
|
||||
}
|
||||
return likelihoodMatrix;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// System to compute the best N haplotypes for genotyping
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
|
||||
* @param map an annoying map object that moves us between the allele and haplotype representation
|
||||
* @param haplotypeAsAllele the allele version of the haplotype
|
||||
* @return the haplotype version, with its score incremented by 1 if its non-reference
|
||||
*/
|
||||
private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
|
||||
final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
|
||||
if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
|
||||
return h;
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the best N haplotypes and return them as a list
|
||||
*
|
||||
* Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
|
||||
* as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
|
||||
* order of score (so higher score haplotypes are preferred). The N we take is determined by
|
||||
*
|
||||
* N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
|
||||
*
|
||||
* where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is
|
||||
* bounded by maxNumHaplotypesInPopulation as that number can grow without bound
|
||||
*
|
||||
* @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
|
||||
* @param nSamples the number of samples used to select the haplotypes
|
||||
* @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
|
||||
* @return a list of N or fewer haplotypes, with the reference haplotype first
|
||||
*/
|
||||
private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
|
||||
final List<Haplotype> selectedHaplotypesList = new ArrayList<Haplotype>(selectedHaplotypes);
|
||||
Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
|
||||
final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
|
||||
final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
|
||||
final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
|
||||
if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Select the best haplotypes for genotyping the samples in stratifiedReadMap
|
||||
*
|
||||
* Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
|
||||
* haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
|
||||
* all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
|
||||
* one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
|
||||
* the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
|
||||
* haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
|
||||
*
|
||||
* @param haplotypes a list of all haplotypes we're considering
|
||||
* @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
|
||||
* @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
|
||||
* @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
|
||||
*/
|
||||
public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
|
||||
if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
|
||||
|
||||
if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
|
||||
|
||||
// all of the haplotypes that at least one sample called as one of the most likely
|
||||
final Set<Haplotype> selectedHaplotypes = new HashSet<>();
|
||||
selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
|
||||
|
||||
// our annoying map from allele -> haplotype
|
||||
final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
|
||||
for ( final Haplotype h : haplotypes ) {
|
||||
h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
|
||||
allele2Haplotype.put(Allele.create(h, h.isReference()), h);
|
||||
}
|
||||
|
||||
// for each sample, compute the most likely pair of haplotypes
|
||||
for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
|
||||
// get the two most likely haplotypes under a diploid model for this sample
|
||||
final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
|
||||
|
||||
if ( mla != null ) { // there was something to evaluate in this sample
|
||||
// note that there must be at least 2 haplotypes
|
||||
final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
|
||||
final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
|
||||
|
||||
// if ( DEBUG ) {
|
||||
// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
|
||||
// }
|
||||
|
||||
// add these two haplotypes to the set of haplotypes that have been selected
|
||||
selectedHaplotypes.add(best);
|
||||
selectedHaplotypes.add(second);
|
||||
|
||||
// we've already selected all of our haplotypes, and we don't need to prune them down
|
||||
if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// take the best N haplotypes forward, in order of the number of samples that choose them
|
||||
final int nSamples = stratifiedReadMap.size();
|
||||
final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
|
||||
|
||||
if ( DEBUG ) {
|
||||
logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
|
||||
for ( final Haplotype h : bestHaplotypes ) {
|
||||
logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
|
||||
}
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
|
||||
* @param haplotypes non-null list of haplotypes
|
||||
* @return the reference haplotype
|
||||
*/
|
||||
private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
if( h.isReference() ) return h;
|
||||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Experimental attempts at PCR error rate modeling
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
protected static final int MAX_STR_UNIT_LENGTH = 8;
|
||||
protected static final int MAX_REPEAT_LENGTH = 20;
|
||||
protected static final int MIN_ADJUSTED_QSCORE = 10;
|
||||
protected static final double INITIAL_QSCORE = 40.0;
|
||||
|
||||
private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1];
|
||||
private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
|
||||
|
||||
private void initializePCRErrorModel() {
|
||||
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
|
||||
return;
|
||||
|
||||
repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH);
|
||||
|
||||
pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1];
|
||||
|
||||
final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0;
|
||||
|
||||
for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ )
|
||||
pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor);
|
||||
}
|
||||
|
||||
protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {
|
||||
return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 ));
|
||||
}
|
||||
|
||||
protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {
|
||||
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
|
||||
return;
|
||||
|
||||
for ( int iii = 1; iii < readBases.length; iii++ ) {
|
||||
final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond();
|
||||
readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
|
||||
readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(AssemblyResultSet assemblyResultSet,
|
||||
Map<String, List<GATKSAMRecord>> perSampleReadList);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -124,15 +124,16 @@ public abstract class LocalAssemblyEngine {
|
|||
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
|
||||
* @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode
|
||||
* @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used.
|
||||
* @return a non-empty list of all the haplotypes that are produced during assembly
|
||||
* @return the resulting assembly-result-set
|
||||
*/
|
||||
public List<Haplotype> runLocalAssembly(final ActiveRegion activeRegion,
|
||||
public AssemblyResultSet runLocalAssembly(final ActiveRegion activeRegion,
|
||||
final Haplotype refHaplotype,
|
||||
final byte[] fullReferenceWithPadding,
|
||||
final GenomeLoc refLoc,
|
||||
final List<VariantContext> activeAllelesToGenotype,
|
||||
final ReadErrorCorrector readErrorCorrector) {
|
||||
if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); }
|
||||
if( activeRegion.getExtendedLoc() == null ) { throw new IllegalArgumentException("Active region must have an extended location."); }
|
||||
if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); }
|
||||
if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); }
|
||||
if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); }
|
||||
|
|
@ -153,26 +154,32 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
final List<SeqGraph> nonRefGraphs = new LinkedList<>();
|
||||
final AssemblyResultSet resultSet = new AssemblyResultSet();
|
||||
resultSet.setRegionForGenotyping(activeRegion);
|
||||
resultSet.setFullReferenceWithPadding(fullReferenceWithPadding);
|
||||
resultSet.setPaddedReferenceLoc(refLoc);
|
||||
final GenomeLoc activeRegionExtendedLocation = activeRegion.getExtendedLoc();
|
||||
refHaplotype.setGenomeLocation(activeRegionExtendedLocation);
|
||||
resultSet.add(refHaplotype);
|
||||
final Map<SeqGraph,AssemblyResult> assemblyResultByGraph = new HashMap<>();
|
||||
// create the graphs by calling our subclass assemble method
|
||||
for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) {
|
||||
if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) {
|
||||
// do some QC on the graph
|
||||
sanityCheckGraph(result.getGraph(), refHaplotype);
|
||||
// add it to graphs with meaningful non-reference features
|
||||
assemblyResultByGraph.put(result.getGraph(),result);
|
||||
nonRefGraphs.add(result.getGraph());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
findBestPaths (nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet);
|
||||
|
||||
// print the graphs if the appropriate debug option has been turned on
|
||||
if ( graphWriter != null ) { printGraphs(nonRefGraphs); }
|
||||
|
||||
if ( nonRefGraphs.isEmpty() ) {
|
||||
// we couldn't assemble any meaningful graphs, so return just the reference haplotype
|
||||
return Collections.singletonList(refHaplotype);
|
||||
} else {
|
||||
// find the best paths in the graphs and return them as haplotypes
|
||||
return findBestPaths( nonRefGraphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() );
|
||||
}
|
||||
return resultSet;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -198,8 +205,10 @@ public abstract class LocalAssemblyEngine {
|
|||
return new ArrayList<>(returnHaplotypes);
|
||||
}
|
||||
|
||||
|
||||
@Ensures({"result.contains(refHaplotype)"})
|
||||
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow) {
|
||||
protected List<Haplotype> findBestPaths(final List<SeqGraph> graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow,
|
||||
final Map<SeqGraph,AssemblyResult> assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) {
|
||||
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
|
||||
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
|
||||
returnHaplotypes.add( refHaplotype );
|
||||
|
|
@ -235,7 +244,9 @@ public abstract class LocalAssemblyEngine {
|
|||
h.setCigar(cigar);
|
||||
h.setAlignmentStartHapwrtRef(activeRegionStart);
|
||||
h.setScore(path.getScore());
|
||||
h.setGenomeLocation(activeRegionWindow);
|
||||
returnHaplotypes.add(h);
|
||||
assemblyResultSet.add(h, assemblyResultByGraph.get(graph));
|
||||
|
||||
if ( debug )
|
||||
logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize());
|
||||
|
|
@ -243,8 +254,6 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
}
|
||||
|
||||
// add genome locs to the haplotypes
|
||||
for ( final Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow);
|
||||
|
||||
if ( returnHaplotypes.size() < returnHaplotypes.size() )
|
||||
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc);
|
||||
|
|
@ -262,8 +271,8 @@ public abstract class LocalAssemblyEngine {
|
|||
}
|
||||
|
||||
return new ArrayList<>(returnHaplotypes);
|
||||
}
|
||||
|
||||
}
|
||||
/**
|
||||
* We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal
|
||||
* @param c the cigar to test
|
||||
|
|
@ -326,7 +335,6 @@ public abstract class LocalAssemblyEngine {
|
|||
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
|
||||
}
|
||||
printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot"));
|
||||
|
||||
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,834 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pairhmm.*;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.vcf.VCFConstants;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
    private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class);

    private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual

    // Gap continuation penalty passed to the PairHMM for every read base
    private final byte constantGCP;
    // Global per-read mismapping probability, in log10 units; used to floor the reference
    // haplotype's likelihood relative to the best non-ref haplotype
    private final double log10globalReadMismappingRate;
    private final boolean DEBUG;

    // Which PairHMM implementation each thread should instantiate
    private final PairHMM.HMM_IMPLEMENTATION hmmType;
    // When true, never use the FPGA-backed CnyPairHMM even when it reports itself available
    private final boolean noFpga;
|
||||
|
||||
    // One PairHMM instance per thread: PairHMM implementations keep mutable internal matrices,
    // so a single shared instance would not be thread-safe.
    private final ThreadLocal<PairHMM> pairHMMThreadLocal = new ThreadLocal<PairHMM>() {
        @Override
        protected PairHMM initialValue() {
            switch (hmmType) {
                case EXACT: return new Log10PairHMM(true);
                case ORIGINAL: return new Log10PairHMM(false);
                case LOGLESS_CACHING:
                    // prefer the FPGA implementation when it is permitted and available
                    if (noFpga || !CnyPairHMM.isAvailable())
                        return new LoglessPairHMM();
                    else
                        return new CnyPairHMM();
                case ARRAY_LOGLESS:
                    if (noFpga || !CnyPairHMM.isAvailable())
                        return new ArrayLoglessPairHMM();
                    else
                        return new CnyPairHMM();
                default:
                    throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS.");
            }
        }
    };
    // Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException
    // private final PairHMM pairHMM = pairHMMThreadLocal.get();

    // Debug-only dump of every (haplotype, read, log10 likelihood) triple evaluated by the HMM
    private final static boolean WRITE_LIKELIHOODS_TO_FILE = false;
    private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
    private final PrintStream likelihoodsStream; // null unless WRITE_LIKELIHOODS_TO_FILE is true
|
||||
|
||||
    /** Controls how aggressively insertion/deletion qualities are discounted in tandem-repeat context. */
    public enum PCR_ERROR_MODEL {
        /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */
        NONE,
        /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */
        AGGRESSIVE,
        /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */
        CONSERVATIVE
    }

    private final PCR_ERROR_MODEL pcrErrorModel;

    /**
     * The expected rate of random sequencing errors for a read originating from its true haplotype.
     *
     * For example, if this is 0.02, then we'd expect 1 error per 50 bp.
     */
    private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02;
|
||||
|
||||
/**
|
||||
* Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations
|
||||
*
|
||||
* @param constantGCP the gap continuation penalty to use with the PairHMM
|
||||
* @param debug should we emit debugging information during the calculation?
|
||||
* @param hmmType the type of the HMM to use
|
||||
* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of
|
||||
* -3 means that the chance that a read doesn't actually belong at this
|
||||
* location in the genome is 1 in 1000. The effect of this parameter is
|
||||
* to cap the maximum likelihood difference between the reference haplotype
|
||||
* and the best alternative haplotype by -3 log units. So if the best
|
||||
* haplotype is at -10 and this parameter has a value of -3 then even if the
|
||||
* reference haplotype gets a score of -100 from the pairhmm it will be
|
||||
* assigned a likelihood of -13.
|
||||
* @param noFpga disable FPGA acceleration
|
||||
*/
|
||||
public PairHMMLikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
|
||||
this.hmmType = hmmType;
|
||||
this.constantGCP = constantGCP;
|
||||
this.DEBUG = debug;
|
||||
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
|
||||
this.noFpga = noFpga;
|
||||
this.pcrErrorModel = pcrErrorModel;
|
||||
|
||||
initializePCRErrorModel();
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
try {
|
||||
likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME)));
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} else {
|
||||
likelihoodsStream = null;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if ( likelihoodsStream != null ) likelihoodsStream.close();
|
||||
}
|
||||
|
||||
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
|
||||
haplotype.getBaseString(),
|
||||
new String(processedRead.getReadBases() ),
|
||||
SAMUtils.phredToFastq(processedRead.getBaseQualities() ),
|
||||
SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
|
||||
SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
|
||||
SAMUtils.phredToFastq(constantGCP),
|
||||
log10l);
|
||||
}
|
||||
}
|
||||
|
||||
private Map<Allele, Haplotype> createAlleleMap(List<Haplotype> haplotypes){
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Map<Allele, Haplotype> alleleMap = new LinkedHashMap<>(numHaplotypes);
|
||||
for ( final Haplotype haplotype : haplotypes ) {
|
||||
final Allele allele = Allele.create(haplotype, true);
|
||||
alleleMap.put(allele, haplotype);
|
||||
}
|
||||
return alleleMap;
|
||||
}
|
||||
|
||||
private Map<GATKSAMRecord, byte[]> fillGCPArrays(List<GATKSAMRecord> reads){
|
||||
final Map<GATKSAMRecord, byte []> GCPArrayMap = new LinkedHashMap<>();
|
||||
for (GATKSAMRecord read: reads){
|
||||
byte [] GCPArray = new byte[read.getReadBases().length];
|
||||
Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
GCPArrayMap.put(read, GCPArray);
|
||||
}
|
||||
return GCPArrayMap;
|
||||
}
|
||||
|
||||
private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
|
||||
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
|
||||
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Pre-processing of the reads to be evaluated at the current location from the current sample.
|
||||
* We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read.
|
||||
* Modified copies of reads are packed into a new list, while original reads are retained for downstream use
|
||||
*
|
||||
* @param reads The original list of unmodified reads
|
||||
* @return processedReads. A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding
|
||||
*/
|
||||
private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) {
|
||||
List<GATKSAMRecord> processedReads = new LinkedList<>();
|
||||
for ( GATKSAMRecord read : reads ) {
|
||||
|
||||
final byte[] readBases = read.getReadBases();
|
||||
|
||||
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
|
||||
final byte[] readQuals = read.getBaseQualities().clone();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
|
||||
|
||||
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
|
||||
capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals);
|
||||
|
||||
// Create a new copy of the read and sets its base qualities to the modified versions.
|
||||
// Pack this into a new list for return
|
||||
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals);
|
||||
processedReads.add(processedRead);
|
||||
}
|
||||
return processedReads;
|
||||
}
|
||||
|
||||
/**
 * Post-processing of the read/allele likelihoods.
 *
 * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads.
 * We wish to return a map containing the original, unmodified reads.
 *
 * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate.
 * This protects us from the case where the assembly has produced haplotypes
 * that are very divergent from reference, but are supported by only one read. In effect
 * we cap how badly scoring the reference can be for any read by the chance that the read
 * itself just doesn't belong here
 *
 * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls
 * @param reads Our original, unmodified reads
 * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls
 * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype
 *
 * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls
 */
private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List<GATKSAMRecord> reads, List<GATKSAMRecord> processedReads, Map<Allele, Haplotype> alleleHaplotypeMap){

    // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls
    final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();

    // NOTE(review): refAllele is declared outside the read loop and re-assigned during each read's
    // pass over the alleles. If alleleHaplotypeMap contains no reference haplotype it stays null and
    // the capping add() below would receive a null allele -- presumably a reference haplotype is
    // always present; confirm with callers.
    Allele refAllele = null;
    final int numReads = reads.size();
    for (int readIndex = 0; readIndex < numReads; readIndex++) {

        // Get the original and quality-modified read from their respective lists
        // Note that this requires both lists to have reads in the same order
        final GATKSAMRecord originalRead = reads.get(readIndex);
        final GATKSAMRecord processedRead = processedReads.get(readIndex);

        // keep track of the reference likelihood and the best non-ref likelihood
        double refLog10l = Double.NEGATIVE_INFINITY;
        double bestNonReflog10L = Double.NEGATIVE_INFINITY;

        for ( Allele allele : alleleHaplotypeMap.keySet() ) {
            final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
            final Haplotype haplotype = alleleHaplotypeMap.get(allele);
            if ( haplotype.isNonReference() )
                bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
            else {
                refAllele = allele;
                refLog10l = log10l;
            }
            writeDebugLikelihoods(processedRead, haplotype, log10l);

            // add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l
            processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l);
        }

        // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global
        // mismapping rate. This protects us from the case where the assembly has produced haplotypes
        // that are very divergent from reference, but are supported by only one read. In effect
        // we capping how badly scoring the reference can be for any read by the chance that the read
        // itself just doesn't belong here
        final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate;
        if ( refLog10l < (worstRefLog10Allowed) ) {
            // NOTE(review): relies on add() replacing the reference entry already inserted for this
            // read above -- confirm PerReadAlleleLikelihoodMap.add's overwrite semantics
            processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed);
        }
    }
    return processedReadAlleleLikelihoodMap;
}
|
||||
|
||||
/**
|
||||
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
|
||||
*
|
||||
* After calling this routine the PairHMM will be configured to best evaluate all reads in the samples
|
||||
* against the set of haplotypes
|
||||
*
|
||||
* @param haplotypes a non-null list of haplotypes
|
||||
* @param perSampleReadList a mapping from sample -> reads
|
||||
*/
|
||||
private void initializePairHMM(final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
for( final GATKSAMRecord read : sample.getValue() ) {
|
||||
final int readLength = read.getReadLength();
|
||||
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
|
||||
}
|
||||
}
|
||||
int Y_METRIC_LENGTH = 0;
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final int haplotypeLength = h.getBases().length;
|
||||
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
|
||||
}
|
||||
|
||||
// initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
|
||||
|
||||
final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList();
|
||||
// configure the HMM
|
||||
initializePairHMM(haplotypes, perSampleReadList);
|
||||
|
||||
// Add likelihoods for each sample's reads to our stratifiedReadMap
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
|
||||
|
||||
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), map);
|
||||
}
|
||||
|
||||
return stratifiedReadMap;
|
||||
}
|
||||
|
||||
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final List<Haplotype> haplotypes, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
|
||||
|
||||
// Add likelihoods for each sample's reads to our stratifiedReadMap
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>();
|
||||
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
|
||||
|
||||
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), map);
|
||||
}
|
||||
|
||||
return stratifiedReadMap;
|
||||
}
|
||||
|
||||
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
|
||||
|
||||
// Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities
|
||||
List<GATKSAMRecord> processedReads = modifyReadQualities(reads);
|
||||
|
||||
// Get alleles corresponding to our haplotypees
|
||||
Map<Allele, Haplotype> alleleHaplotypeMap = createAlleleMap(haplotypes);
|
||||
|
||||
// Get an array containing the constantGCP for each read in our modified read list
|
||||
Map<GATKSAMRecord,byte[]> GCPArrayMap = fillGCPArrays(processedReads);
|
||||
|
||||
// Run the PairHMM to calculate the log10 likelihood of each (processed) reads' arising from each haplotype
|
||||
PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap);
|
||||
|
||||
// Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate
|
||||
|
||||
return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap);
|
||||
}
|
||||
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering,
|
||||
final boolean normalize ) {
|
||||
return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize);
|
||||
}
|
||||
|
||||
@Requires({"alleleOrdering.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap,
|
||||
final List<Allele> alleleOrdering,
|
||||
final boolean normalize) {
|
||||
|
||||
final int numHaplotypes = alleleOrdering.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
final Allele iii_allele = alleleOrdering.get(iii);
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
final Allele jjj_allele = alleleOrdering.get(jjj);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) *
|
||||
( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix;
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
|
||||
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
|
||||
final int numHaplotypes = likelihoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
|
||||
int index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj];
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
|
||||
index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++];
|
||||
}
|
||||
}
|
||||
return likelihoodMatrix;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// System to compute the best N haplotypes for genotyping
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// /**
|
||||
// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
|
||||
// * @param map an annoying map object that moves us between the allele and haplotype representation
|
||||
// * @param haplotypeAsAllele the allele version of the haplotype
|
||||
// * @return the haplotype version, with its score incremented by 1 if its non-reference
|
||||
// */
|
||||
// private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
|
||||
// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
|
||||
// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
|
||||
// return h;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Take the best N haplotypes and return them as a list
|
||||
// *
|
||||
// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
|
||||
// * as its preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
|
||||
// * order of score (so higher score haplotypes are preferred). The N we take is determined by
|
||||
// *
|
||||
// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
|
||||
// *
|
||||
// * where 2 * nSamples + 1 is the number of chromosomes in nSamples diploid samples plus the reference, and our workload is
|
||||
// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
|
||||
// *
|
||||
// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
|
||||
// * @param nSamples the number of samples used to select the haplotypes
|
||||
// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
|
||||
// * @return a list of N or fewer haplotypes, with the reference haplotype first
|
||||
// */
|
||||
// private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
|
||||
// final List<Haplotype> selectedHaplotypesList = new ArrayList<>(selectedHaplotypes);
|
||||
// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
|
||||
// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
|
||||
// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
|
||||
// final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
|
||||
// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
|
||||
// return bestHaplotypes;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Select the best haplotypes for genotyping the samples in stratifiedReadMap
|
||||
// *
|
||||
// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
|
||||
// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
|
||||
// * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get
|
||||
// * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation
|
||||
// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
|
||||
// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
|
||||
// *
|
||||
// * @param haplotypes a list of all haplotypes we're considering
|
||||
// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
|
||||
// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
|
||||
// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
|
||||
// */
|
||||
// public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
|
||||
// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
|
||||
//
|
||||
// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
|
||||
//
|
||||
// // all of the haplotypes that at least one sample called as one of the most likely
|
||||
// final Set<Haplotype> selectedHaplotypes = new HashSet<>();
|
||||
// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
|
||||
//
|
||||
// // our annoying map from allele -> haplotype
|
||||
// final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
|
||||
// for ( final Haplotype h : haplotypes ) {
|
||||
// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
|
||||
// allele2Haplotype.put(Allele.create(h, h.isReference()), h);
|
||||
// }
|
||||
//
|
||||
// // for each sample, compute the most likely pair of haplotypes
|
||||
// for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
|
||||
// // get the two most likely haplotypes under a diploid model for this sample
|
||||
// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
|
||||
//
|
||||
// if ( mla != null ) { // there was something to evaluate in this sample
|
||||
// // note that there must be at least 2 haplotypes
|
||||
// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
|
||||
// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
|
||||
//
|
||||
//// if ( DEBUG ) {
|
||||
//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
|
||||
//// }
|
||||
//
|
||||
// // add these two haplotypes to the set of haplotypes that have been selected
|
||||
// selectedHaplotypes.add(best);
|
||||
// selectedHaplotypes.add(second);
|
||||
//
|
||||
// // we've already selected all of our haplotypes, and we don't need to prune them down
|
||||
// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // take the best N haplotypes forward, in order of the number of samples that choose them
|
||||
// final int nSamples = stratifiedReadMap.size();
|
||||
// final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
|
||||
//
|
||||
// if ( DEBUG ) {
|
||||
// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
|
||||
// for ( final Haplotype h : bestHaplotypes ) {
|
||||
// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
|
||||
// }
|
||||
// }
|
||||
// return bestHaplotypes;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found
|
||||
// * @param haplotypes non-null list of haplotypes
|
||||
// * @return the reference haplotype
|
||||
// */
|
||||
// private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
|
||||
// for( final Haplotype h : haplotypes ) {
|
||||
// if( h.isReference() ) return h;
|
||||
// }
|
||||
// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
// }
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Experimental attempts at PCR error rate modeling
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
// Longest short-tandem-repeat unit handed to the repeat covariate
protected static final int MAX_STR_UNIT_LENGTH = 8;
// Longest repeat length cached by the error model (presumably RepeatCovariate clamps to this --
// confirm; applyPCRErrorModel indexes the cache directly with its result)
protected static final int MAX_REPEAT_LENGTH = 20;
// Floor for the PCR-model-adjusted quality score
protected static final int MIN_ADJUSTED_QSCORE = 10;
// Starting quality before the repeat-length penalty is subtracted
protected static final double INITIAL_QSCORE = 40.0;

// NOTE(review): this initial size (MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1) is discarded --
// initializePCRErrorModel() reallocates the cache with MAX_REPEAT_LENGTH + 1 entries;
// the two sizes should probably agree.
private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1];
private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
|
||||
|
||||
private void initializePCRErrorModel() {
|
||||
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
|
||||
return;
|
||||
|
||||
repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH);
|
||||
|
||||
pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1];
|
||||
|
||||
final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0;
|
||||
|
||||
for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ )
|
||||
pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor);
|
||||
}
|
||||
|
||||
protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {
|
||||
return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 ));
|
||||
}
|
||||
|
||||
protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {
|
||||
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
|
||||
return;
|
||||
|
||||
for ( int iii = 1; iii < readBases.length; iii++ ) {
|
||||
final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond();
|
||||
readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
|
||||
readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Posterior GL calculations
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Compute posterior genotype likelihoods for every sample in vc1, using allele counts gathered
 * from the supplied resource VariantContexts (and optionally vc1's own samples) as a Dirichlet
 * allele-frequency prior.
 *
 * @param vc1 the variant context whose genotypes are re-evaluated
 * @param resources variant contexts whose allele counts feed the prior
 * @param numRefSamplesFromMissingResources extra count added to vc1's reference allele,
 *        representing reference samples absent from the resources
 * @param globalFrequencyPriorDirichlet pseudo-count added to every allele's observed count
 * @param useInputSamples if true, vc1's own allele counts also contribute to the prior
 * @param useEM unsupported; must be false (throws IllegalArgumentException otherwise)
 * @param useAC forwarded to addAlleleCounts -- presumably selects AC-attribute vs per-genotype
 *        counting; confirm against addAlleleCounts (defined elsewhere in this file)
 * @return a copy of vc1 whose genotypes carry posterior-based assignments plus a
 *         GENOTYPE_POSTERIORS_KEY attribute, and whose site carries the prior in attribute "PG"
 */
public static VariantContext calculatePosteriorGLs(final VariantContext vc1,
                                                   final Collection<VariantContext> resources,
                                                   final int numRefSamplesFromMissingResources,
                                                   final double globalFrequencyPriorDirichlet,
                                                   final boolean useInputSamples,
                                                   final boolean useEM,
                                                   final boolean useAC) {
    if ( useEM )
        throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented");

    // accumulate observed allele counts across all resources
    final Map<Allele,Integer> totalAlleleCounts = new HashMap<>();
    for ( final VariantContext resource : resources ) {
        addAlleleCounts(totalAlleleCounts,resource,useAC);
    }

    if ( useInputSamples ) {
        addAlleleCounts(totalAlleleCounts,vc1,useAC);
    }

    // fold in reference samples that were not represented in the resources
    // NOTE(review): assumes vc1's reference allele is already a key in totalAlleleCounts;
    // a missing key would NPE on unboxing -- confirm addAlleleCounts always seeds it
    totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources);

    // now extract the counts of the alleles present within vc1, and in order
    final double[] alleleCounts = new double[vc1.getNAlleles()];
    int alleleIndex = 0;
    for ( final Allele allele : vc1.getAlleles() ) {

        // Dirichlet pseudo-count plus the observed count (0 when the allele was never seen)
        alleleCounts[alleleIndex++] = globalFrequencyPriorDirichlet + ( totalAlleleCounts.containsKey(allele) ?
                totalAlleleCounts.get(allele) : 0 );
    }

    // gather each sample's likelihood vector (null when a genotype has no likelihoods)
    final List<double[]> likelihoods = new ArrayList<>(vc1.getNSamples());
    for ( final Genotype genotype : vc1.getGenotypes() ) {
        likelihoods.add(genotype.hasLikelihoods() ? genotype.getLikelihoods().getAsVector() : null );
    }

    // delegate the per-sample posterior computation to the List<double[]> overload (defined elsewhere)
    final List<double[]> posteriors = calculatePosteriorGLs(likelihoods,alleleCounts,vc1.getMaxPloidy(2));

    // rebuild the genotypes, attaching posterior-based assignments and PL-encoded posteriors
    final GenotypesContext newContext = GenotypesContext.create();
    for ( int genoIdx = 0; genoIdx < vc1.getNSamples(); genoIdx ++ ) {
        final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx));
        if ( posteriors.get(genoIdx) != null ) {
            GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder,
                    GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles());
            builder.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY,
                    Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs()));

        }
        newContext.add(builder.make());
    }

    // record the prior itself (PL-encoded) as a site-level attribute
    final List<Integer> priors = Utils.listFromPrimitives(
            GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2))).getAsPLs());

    return new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG",priors).make();
}
|
||||
|
||||
/**
|
||||
* Given genotype likelihoods and known allele counts, calculate the posterior likelihoods
|
||||
* over the genotype states
|
||||
* @param genotypeLikelihoods - the genotype likelihoods for the individual
|
||||
* @param knownAlleleCountsByAllele - the known allele counts in the population. For AC=2 AN=12 site, this is {10,2}
|
||||
* @param ploidy - the ploidy to assume
|
||||
* @return - the posterior genotype likelihoods
|
||||
*/
|
||||
protected static List<double[]> calculatePosteriorGLs(final List<double[]> genotypeLikelihoods,
|
||||
final double[] knownAlleleCountsByAllele,
|
||||
final int ploidy) {
|
||||
if ( ploidy != 2 ) {
|
||||
throw new IllegalStateException("Genotype posteriors not yet implemented for ploidy != 2");
|
||||
}
|
||||
|
||||
final double[] genotypePriorByAllele = getDirichletPrior(knownAlleleCountsByAllele,ploidy);
|
||||
final List<double[]> posteriors = new ArrayList<>(genotypeLikelihoods.size());
|
||||
for ( final double[] likelihoods : genotypeLikelihoods ) {
|
||||
double[] posteriorLikelihoods = null;
|
||||
|
||||
if ( likelihoods != null ) {
|
||||
if ( likelihoods.length != genotypePriorByAllele.length ) {
|
||||
throw new IllegalStateException(String.format("Likelihoods not of correct size: expected %d, observed %d",
|
||||
knownAlleleCountsByAllele.length*(knownAlleleCountsByAllele.length+1)/2,likelihoods.length));
|
||||
}
|
||||
|
||||
posteriorLikelihoods = new double[genotypePriorByAllele.length];
|
||||
for ( int genoIdx = 0; genoIdx < likelihoods.length; genoIdx ++ ) {
|
||||
posteriorLikelihoods[genoIdx] = likelihoods[genoIdx] + genotypePriorByAllele[genoIdx];
|
||||
}
|
||||
|
||||
posteriorLikelihoods = MathUtils.toLog10(MathUtils.normalizeFromLog10(posteriorLikelihoods));
|
||||
|
||||
}
|
||||
|
||||
posteriors.add(posteriorLikelihoods);
|
||||
}
|
||||
|
||||
return posteriors;
|
||||
}
|
||||
|
||||
// convenience function for a single genotypelikelihoods array. Just wraps.
|
||||
protected static double[] calculatePosteriorGLs(final double[] genotypeLikelihoods,
|
||||
final double[] knownAlleleCountsByAllele,
|
||||
final int ploidy) {
|
||||
return calculatePosteriorGLs(Arrays.asList(genotypeLikelihoods),knownAlleleCountsByAllele,ploidy).get(0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Given known allele counts (whether external, from the sample, or both), calculate the prior distribution
|
||||
* over genotype states. This assumes
|
||||
* 1) Random sampling of alleles (known counts are unbiased, and frequency estimate is Dirichlet)
|
||||
* 2) Genotype states are independent (Hardy-Weinberg)
|
||||
* These assumptions give rise to a Dirichlet-Multinomial distribution of genotype states as a prior
|
||||
* (the "number of trials" for the multinomial is simply the ploidy)
|
||||
* @param knownCountsByAllele - the known counts per allele. For an AC=2, AN=12 site this is {10,2}
|
||||
* @param ploidy - the number of chromosomes in the sample. For now restricted to 2.
|
||||
* @return - the Dirichlet-Multinomial distribution over genotype states
|
||||
*/
|
||||
protected static double[] getDirichletPrior(final double[] knownCountsByAllele, final int ploidy) {
|
||||
if ( ploidy != 2 ) {
|
||||
throw new IllegalStateException("Genotype priors not yet implemented for ploidy != 2");
|
||||
}
|
||||
|
||||
// multi-allelic format is
|
||||
// AA AB BB AC BC CC AD BD CD DD ...
|
||||
final double sumOfKnownCounts = MathUtils.sum(knownCountsByAllele);
|
||||
final double[] priors = new double[knownCountsByAllele.length*(knownCountsByAllele.length+1)/2];
|
||||
int priorIndex = 0;
|
||||
for ( int allele2 = 0; allele2 < knownCountsByAllele.length; allele2++ ) {
|
||||
for ( int allele1 = 0; allele1 <= allele2; allele1++) {
|
||||
final int[] counts = new int[knownCountsByAllele.length];
|
||||
counts[allele1] += 1;
|
||||
counts[allele2] += 1;
|
||||
priors[priorIndex++] = MathUtils.dirichletMultinomial(knownCountsByAllele,sumOfKnownCounts,counts,ploidy);
|
||||
}
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
private static void addAlleleCounts(final Map<Allele,Integer> counts, final VariantContext context, final boolean useAC) {
|
||||
final int[] ac;
|
||||
if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! useAC ) {
|
||||
ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY));
|
||||
} else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
|
||||
ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY));
|
||||
} else {
|
||||
ac = new int[context.getAlternateAlleles().size()];
|
||||
int idx = 0;
|
||||
for ( final Allele allele : context.getAlternateAlleles() ) {
|
||||
ac[idx++] = context.getCalledChrCount(allele);
|
||||
}
|
||||
}
|
||||
|
||||
for ( final Allele allele : context.getAlleles() ) {
|
||||
final int count;
|
||||
if ( allele.isReference() ) {
|
||||
if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) {
|
||||
count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac);
|
||||
} else {
|
||||
count = context.getCalledChrCount() - (int) MathUtils.sum(ac);
|
||||
}
|
||||
} else {
|
||||
count = ac[context.getAlternateAlleles().indexOf(allele)];
|
||||
}
|
||||
if ( ! counts.containsKey(allele) ) {
|
||||
counts.put(allele,0);
|
||||
}
|
||||
counts.put(allele,count + counts.get(allele));
|
||||
}
|
||||
}
|
||||
|
||||
public static int[] extractInts(final Object integerListContainingVCField) {
|
||||
List<Integer> mleList = null;
|
||||
if ( integerListContainingVCField instanceof List ) {
|
||||
if ( ((List) integerListContainingVCField).get(0) instanceof String ) {
|
||||
mleList = new ArrayList<>(((List) integerListContainingVCField).size());
|
||||
for ( Object s : ((List)integerListContainingVCField)) {
|
||||
mleList.add(Integer.parseInt((String) s));
|
||||
}
|
||||
} else {
|
||||
mleList = (List<Integer>) integerListContainingVCField;
|
||||
}
|
||||
} else if ( integerListContainingVCField instanceof Integer ) {
|
||||
mleList = Arrays.asList((Integer) integerListContainingVCField);
|
||||
} else if ( integerListContainingVCField instanceof String ) {
|
||||
mleList = Arrays.asList(Integer.parseInt((String)integerListContainingVCField));
|
||||
}
|
||||
if ( mleList == null )
|
||||
throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+
|
||||
VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY));
|
||||
|
||||
final int[] mle = new int[mleList.size()];
|
||||
|
||||
if ( ! ( mleList.get(0) instanceof Integer ) ) {
|
||||
throw new IllegalStateException("BUG: The AC values should be an Integer, but was "+mleList.get(0).getClass().getCanonicalName());
|
||||
}
|
||||
|
||||
for ( int idx = 0; idx < mle.length; idx++) {
|
||||
mle[idx] = mleList.get(idx);
|
||||
}
|
||||
|
||||
return mle;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Random likelihoods generator, used for testing/benchmarking purposes.
|
||||
*/
|
||||
public class RandomLikelihoodCalculationEngine implements LikelihoodCalculationEngine {
|
||||
|
||||
@Override
|
||||
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map<String, List<GATKSAMRecord>> reads) {
|
||||
final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList();
|
||||
final Map<String,PerReadAlleleLikelihoodMap> result = new HashMap<>(reads.size());
|
||||
final Map<Haplotype,Allele> alleles = new HashMap<>(haplotypes.size());
|
||||
for (final Haplotype haplotype : haplotypes)
|
||||
alleles.put(haplotype,Allele.create(haplotype,false));
|
||||
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
|
||||
for (final String sample : reads.keySet()) {
|
||||
final PerReadAlleleLikelihoodMap pralm = new PerReadAlleleLikelihoodMap();
|
||||
for (final GATKSAMRecord read : reads.get(sample))
|
||||
for (final Haplotype haplotype : haplotypes )
|
||||
pralm.add(read,alleles.get(haplotype),-Math.abs(rnd.nextDouble()));
|
||||
result.put(sample,pralm);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,366 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Collects information as to how a read maps into the haplotype graph that is needed to calculate its likelihood
|
||||
* using the graph-based approach.
|
||||
*/
|
||||
public class ReadAnchoring {
|
||||
|
||||
private static final Logger logger = Logger.getLogger(ReadAnchoring.class);
|
||||
|
||||
/** Holds a reference to the read itself */
|
||||
protected final GATKSAMRecord read;
|
||||
protected final Map<MultiDeBruijnVertex,Integer> uniqueKmerOffsets;
|
||||
|
||||
/**
|
||||
* Kmer offset on the read of the left anchor
|
||||
* <p>
|
||||
* {@code -1} if there is no left anchor.
|
||||
* </p>
|
||||
*/
|
||||
protected int leftAnchorIndex;
|
||||
|
||||
/**
|
||||
* Vertex in the graph where the left anchor falls.
|
||||
* <p>
|
||||
* {@code null} if there is no left anchor.
|
||||
* </p>
|
||||
*/
|
||||
protected MultiDeBruijnVertex leftAnchorVertex;
|
||||
|
||||
/**
|
||||
* Kmer offset on the read of the right anchor.
|
||||
*
|
||||
* <p>
|
||||
* {@code -1} if there is no right anchor.
|
||||
* </p>
|
||||
*/
|
||||
protected int rightAnchorIndex;
|
||||
|
||||
/**
|
||||
* Vertex in the graph where the right anchor falls.
|
||||
*
|
||||
* <p>
|
||||
* {@code null} if there is no right anchor.
|
||||
* </p>
|
||||
*/
|
||||
protected MultiDeBruijnVertex rightAnchorVertex;
|
||||
|
||||
/**
|
||||
* Kmer sequence mapping information for the read sequence.
|
||||
*
|
||||
* never {@code null}.
|
||||
*/
|
||||
protected final KmerSequenceGraphMap<MultiDeBruijnVertex, MultiSampleEdge> graphMap;
|
||||
|
||||
/**
|
||||
* Alignment of read kmers on the reference haplotype kmers.
|
||||
*
|
||||
* <p>
|
||||
* There is one entry for each base in the read.
|
||||
*
|
||||
* </p>
|
||||
* <p>
|
||||
* The i-th entry indicates what kmer in the reference haplotype correspond to the kmer on the read starting
|
||||
* at is i-th base.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* {@code -1} means that there is no match.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* The last kmerSize - 1 entry of the array are {@code -1}
|
||||
* </p>
|
||||
*/
|
||||
protected final int[] referenceAlignment;
|
||||
|
||||
/**
|
||||
* Maps between reference path vertex that are found between anchors and the kmer offset they map uniquely to
|
||||
* on the read.
|
||||
*/
|
||||
protected final Map<MultiDeBruijnVertex, Integer> referenceWithinAnchorsMap;
|
||||
|
||||
/**
 * Creates the read's anchoring information for the haplotype-graph.
 *
 * <p>Maps the read's kmers onto the graph, aligns them against the reference route, and
 * locates the leftmost and rightmost read kmers that both align to the reference and
 * correspond to anchorable graph vertices.</p>
 *
 * @param read the targeted read.
 * @param haplotypeGraph the targeted graph.
 *
 * @throws NullPointerException if any argument is {@code null}.
 * @throws IllegalArgumentException if elements in {@code anchorableVertices} are not vertex in {@code haplotypeGraph}
 */
public ReadAnchoring(final GATKSAMRecord read, final HaplotypeGraph haplotypeGraph) {
    this.read = read;
    final byte[] readBases = read.getReadBases();
    // Decompose the read into kmers of the graph's kmer size and map them onto the graph.
    final KmerSequence readKmers = new KmerSequence(read, haplotypeGraph.getKmerSize());
    graphMap = new KmerSequenceGraphMap<>(haplotypeGraph, readKmers);
    final Map<MultiDeBruijnVertex, Integer> vertexOffset = graphMap.vertexOffset();
    // Align each read kmer to the reference route; -1 entries mean "no unique match".
    referenceAlignment = calculateUniqueKmerAlignment(0, readBases.length, haplotypeGraph.getReferenceRoute(), vertexOffset, haplotypeGraph.getKmerSize());
    // Left anchor: the leftmost read kmer that aligns to the reference AND whose kmer
    // resolves to an anchorable vertex in the graph. -1 / null if none exists.
    leftAnchorIndex = -1;
    leftAnchorVertex = null;
    for (int i = 0; i < readBases.length - haplotypeGraph.getKmerSize() + 1; i++) {
        if (referenceAlignment[i] == -1) continue; // kmer has no reference alignment
        final MultiDeBruijnVertex candidate = haplotypeGraph.findKmer(readKmers.get(i));
        if (candidate != null && haplotypeGraph.getAnchorableVertices().contains(candidate)) {
            leftAnchorIndex = i;
            leftAnchorVertex = candidate;
            break;
        }
    }
    // Right anchor: scan from the read's last kmer back toward (but excluding) the left
    // anchor. When no distinct right anchor is found, it coincides with the left anchor.
    rightAnchorIndex = leftAnchorIndex;
    rightAnchorVertex = leftAnchorVertex;
    if (leftAnchorIndex != -1) { // no left anchor implies no right anchor either
        for (int i = readBases.length - haplotypeGraph.getKmerSize(); i > leftAnchorIndex; i--) {
            if (referenceAlignment[i] == -1) continue; // kmer has no reference alignment
            final MultiDeBruijnVertex candidate = haplotypeGraph.findKmer(readKmers.get(i));
            if (candidate != null && haplotypeGraph.getAnchorableVertices().contains(candidate)) {
                rightAnchorIndex = i;
                rightAnchorVertex = candidate;
                break;
            }
        }
    }
    // Derived lookup structures for the region between the two anchors (see field docs).
    referenceWithinAnchorsMap = buildReferenceWithinBoundariesMap(read, haplotypeGraph,
            vertexOffset, leftAnchorVertex, rightAnchorVertex);
    uniqueKmerOffsets = buildReadUniqueKmerOffsets(haplotypeGraph);
}
|
||||
|
||||
    /**
     * For a given read, returns the set of reference path vertices that fall between the two anchor
     * vertices (inclusive).
     *
     * <p>
     * The resulting map has as keys the reference vertices between those two boundaries (inclusive) and
     * as value each vertex's offset (in vertices) from the left anchor along the reference path.
     * </p>
     *
     * @param read the target read; only used to compose the warning message on failure.
     * @param haplotypeGraph the graph whose reference path is walked.
     * @param readVertexKmerOffset map between vertices and their kmer offset on the read.
     * @param leftAnchorVertex left anchor vertex; {@code null} indicates there is no anchoring at all.
     * @param rightAnchorVertex right anchor vertex.
     * @return never {@code null}, but empty if the anchors are out of order in the reference.
     */
    private Map<MultiDeBruijnVertex, Integer> buildReferenceWithinBoundariesMap(
            final GATKSAMRecord read, final HaplotypeGraph haplotypeGraph,
            final Map<MultiDeBruijnVertex, Integer> readVertexKmerOffset,
            final MultiDeBruijnVertex leftAnchorVertex, final MultiDeBruijnVertex rightAnchorVertex) {
        // No left anchor implies no anchors at all.
        if (leftAnchorVertex == null)
            return Collections.emptyMap();

        final Map<MultiDeBruijnVertex, Integer> result = new HashMap<>();
        MultiDeBruijnVertex nextVertex = leftAnchorVertex;

        // Walk the reference path from the left anchor, recording each vertex's distance from it,
        // until the right anchor is reached. Note: vertices are compared by identity (==), which
        // presumes the graph exposes a single instance per reference vertex -- as the rest of
        // this class does.
        int leftAnchorOffset = 0;
        while (nextVertex != null) {
            result.put(nextVertex, leftAnchorOffset++);
            if (nextVertex == rightAnchorVertex)
                break;
            nextVertex = haplotypeGraph.getNextReferenceVertex(nextVertex);
        }
        // Falling off the end of the reference path means the right anchor was never reached, i.e.
        // the anchors are out of order on the reference: warn and report no valid anchoring.
        if (nextVertex == null) {
            logger.warn("unexpected event kmers out of order between read anchor kmers: " + read.getReadString()
                    + " Offending kmer offsets: " + readVertexKmerOffset.get(leftAnchorVertex) + " " + readVertexKmerOffset.get(rightAnchorVertex)
                    + " sequences: " +
                    read.getReadString().substring(readVertexKmerOffset.get(leftAnchorVertex), haplotypeGraph.getKmerSize() + readVertexKmerOffset.get(leftAnchorVertex)) +
                    " " + read.getReadString().substring(readVertexKmerOffset.get(rightAnchorVertex), haplotypeGraph.getKmerSize() + readVertexKmerOffset.get(rightAnchorVertex)) +
                    " Reference haplotype: " + haplotypeGraph.getReferenceHaplotype().getBaseString());
            return Collections.emptyMap();
        }
        return result;
    }
|
||||
|
||||
/**
|
||||
* Builds a map between unique kmers in the reference path and their kmer offset in the read.
|
||||
*
|
||||
* @param haplotypeGraph the anchoring graph.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
private Map<MultiDeBruijnVertex,Integer> buildReadUniqueKmerOffsets(final HaplotypeGraph haplotypeGraph) {
|
||||
if (!hasValidAnchors())
|
||||
return Collections.emptyMap();
|
||||
final Map<MultiDeBruijnVertex, Integer> vertexOffset = graphMap.vertexOffset();
|
||||
final Set<MultiDeBruijnVertex> readUniqueKmerVertices = new HashSet<>(vertexOffset.size());
|
||||
readUniqueKmerVertices.add(leftAnchorVertex);
|
||||
readUniqueKmerVertices.add(rightAnchorVertex);
|
||||
for (int i = leftAnchorIndex + 1; i < rightAnchorIndex; i++) {
|
||||
if (referenceAlignment[i] != -1) {
|
||||
readUniqueKmerVertices.add(haplotypeGraph.findKmer(graphMap.sequence.get(i)));
|
||||
}
|
||||
}
|
||||
final Map<MultiDeBruijnVertex, Integer> validVertexOffset = new HashMap<>(graphMap.vertexOffset());
|
||||
validVertexOffset.keySet().retainAll(readUniqueKmerVertices);
|
||||
return validVertexOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether it has some anchoring kmer and these are valid, i.e. the left anchor is the same or preceedes the right anchor in the reference path.
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean hasValidAnchors() {
|
||||
return referenceWithinAnchorsMap.size() >= 1;
|
||||
}
|
||||
|
||||
    /**
     * Calculates an array indicating for each kmer in the read what is the offset of that kmer in a path.
     *
     * <p>
     * The result has one entry per read position in {@code [readStart, readEnd)}. The i-th entry is the
     * offset in {@code path} of the vertex matching the read kmer that starts at position
     * {@code readStart + i}. Non matching positions hold {@code -1} instead.
     * </p>
     *
     * @param readStart inclusive first position of the read to consider.
     * @param readEnd exclusive position after last to be considered.
     * @param path the path to which to align against.
     * @param readUniqueKmerOffset map of vertices to the kmer offset with the read.
     * @param kmerSize the kmer length; positions whose kmer would run past {@code readEnd} stay unmatched.
     * @return never {@code null}.
     */
    private int[] calculateUniqueKmerAlignment(final int readStart, final int readEnd, final Path<MultiDeBruijnVertex, MultiSampleEdge> path,
                                               final Map<MultiDeBruijnVertex, Integer> readUniqueKmerOffset, final int kmerSize) {

        final int[] result = new int[readEnd - readStart];
        Arrays.fill(result, -1);   // -1 == "no match" until proven otherwise.
        int i = 0;                 // offset of the current vertex within the path.
        for (final MultiDeBruijnVertex v : path.getVertices()) {
            final Integer kmerReadOffset = readUniqueKmerOffset.get(v);
            if (kmerReadOffset != null) {
                final int kro = kmerReadOffset;
                // Record the match only when a whole kmer starting at kro fits in [readStart, readEnd).
                if (kro >= readStart && kro < readEnd - kmerSize + 1) {
                    result[kro - readStart] = i;
                }
            }
            i++;
        }
        // Now we remove conflicting mappings:
        // a conflicting mapping is when two kmer mappings suggest that
        // the same read position maps to two different bases in the path.
        maskOutConflictingKmerAlignments(result,kmerSize);
        return result;
    }
|
||||
|
||||
    /**
     * Marks with {@code -1} those kmer matches that result in read base mapping conflicts.
     *
     * <p>
     * Two matched kmers conflict when they overlap on the path (their path offsets are less than a
     * kmer apart) but their read distance differs from their path distance, so they would assign
     * the same read base to two different path bases. Both sides of a detected conflict are wiped:
     * forward until the first match clear of the earlier kmer, and backward until the first match
     * clear of the later kmer.
     * </p>
     *
     * @param result in/out changed in-situ; entries are path offsets or {@code -1} for no match.
     * @param kmerSize the kmer length used to decide whether two matches overlap.
     */
    @Requires("result != null")
    private void maskOutConflictingKmerAlignments(final int[] result, final int kmerSize) {
        int i;
        int lastKmer = -1;    // path offset of the last surviving match seen; -1 if none yet.
        int lastKmerPos = -1; // read position of that surviving match.
        for (i = 0; i < result.length; i++) {
            final int kmer = result[i];
            if (kmer == -1)
                continue;
            if (lastKmer == -1) {
                // First match so far: just remember it as the reference point.
                lastKmer = kmer;
                lastKmerPos = i;
            } else if (lastKmer + kmerSize - 1 >= kmer && (i - lastKmerPos) != (kmer - lastKmer)) { // kmer overlap with inconsistent spacing: fix by eliminating the offending kmer alignments.
                int iSkip = result.length; // iSkip will contain the next position minus 1 to visit in the next iteration of the enclosing loop.
                // Forward pass: clear every match still overlapping the earlier (last surviving) kmer.
                for (int j = i; j < result.length; j++)
                    if (result[j] != -1) {
                        if (lastKmer + kmerSize - 1 >= result[j])
                            result[j] = -1;
                        else {
                            iSkip = j;
                            break;
                        }
                    }
                // then backwards and do the same against the later kmer of the conflict,
                // re-seeding lastKmer/lastKmerPos with the nearest surviving match, if any.
                int j = lastKmerPos;
                lastKmer = -1;
                lastKmerPos = -1;
                for (; j >= 0; j--)
                    if (result[j] != -1) {
                        if (result[j] + kmerSize - 1 >= kmer)
                            result[j] = -1;
                        else {
                            lastKmer = result[j];
                            lastKmerPos = j;
                            break;
                        }
                    }
                i = iSkip;
            } else {
                // Non-overlapping (or consistently spaced) match: it becomes the new reference point.
                lastKmer = kmer;
                lastKmerPos = i;
            }
        }
    }
|
||||
|
||||
/**
|
||||
* Checks whether it is anchored at all.
|
||||
*
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean isAnchoredSomewhere() {
|
||||
return hasValidAnchors();
|
||||
//return hasValidAnchors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether the read is anchored perfectly, there are no non-aligned bases.
|
||||
*
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean isPerfectAnchoring() {
|
||||
return hasValidAnchors() && leftAnchorIndex == 0 && rightAnchorIndex == read.getReadLength() - graphMap.kmerSize &&
|
||||
!leftAnchorVertex.hasAmbiguousSequence();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* A pair read-likelihood (cost).
|
||||
*/
|
||||
public class ReadCost {
|
||||
public final GATKSAMRecord read;
|
||||
|
||||
/**
|
||||
* Holds the cost value. Public for convenience, please use with care.
|
||||
*/
|
||||
public double cost;
|
||||
|
||||
public ReadCost(final GATKSAMRecord r) {
|
||||
read = r;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Comparator used to sort ReadCosts
|
||||
*/
|
||||
public static final Comparator<ReadCost> COMPARATOR = new Comparator<ReadCost>() {
|
||||
@Override
|
||||
public int compare(final ReadCost o1, final ReadCost o2) {
|
||||
final String s1 = o1.read.getReadName() + (o1.read.getReadPairedFlag() ? (o1.read.getFirstOfPairFlag() ? "/1" : "/2") : "");
|
||||
final String s2 = o2.read.getReadName() + (o2.read.getReadPairedFlag() ? (o2.read.getFirstOfPairFlag() ? "/1" : "/2") : "");
|
||||
return s1.compareTo(s2);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,86 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* Sorts path costs.
|
||||
* <p>
|
||||
* Path costs are first sorted by their path base sequence in alphanumerical order.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* When these are the same, we consider their unique ids {@link ReadSegmentCost#uniqueId()} to break the tie.
|
||||
* </p>
|
||||
*
|
||||
*/
|
||||
class ReadSegmentComparator implements Comparator<ReadSegmentCost> {
|
||||
|
||||
public static final Comparator<? super ReadSegmentCost> INSTANCE = new ReadSegmentComparator();
|
||||
|
||||
@Override
|
||||
public int compare(final ReadSegmentCost o1, final ReadSegmentCost o2) {
|
||||
int minLength = Math.min(o1.bases.length, o2.bases.length);
|
||||
for (int i = 0; i < minLength; i++) {
|
||||
if (o1.bases[i] == o2.bases[i])
|
||||
continue;
|
||||
else if (o1.bases[i] < o2.bases[i]) {
|
||||
return -1;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
if (o1.bases.length < o2.bases.length) {
|
||||
return -1;
|
||||
} else if (o1.bases.length > o2.bases.length) {
|
||||
return 1;
|
||||
} else {
|
||||
return Long.compare(o1.uniqueId(),o2.uniqueId());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
|
||||
* Path cost indicate the cost (alignment likelihood) of traversing a section of the graph using a segement of a read.
|
||||
*
|
||||
* <p>A path can be a whole haplotype path as well as just a smaller haplotype segment</p>.
|
||||
*
|
||||
* <p>We would generate many of this objects for each read. The final likelihood of a read vs each haplotype
|
||||
* would be the summation of the path-cost of that read along the corresponding haplotype path.</p>
|
||||
*/
|
||||
class ReadSegmentCost {
|
||||
|
||||
public Route<MultiDeBruijnVertex, MultiSampleEdge> path;
|
||||
public GATKSAMRecord read;
|
||||
|
||||
/**
|
||||
* Holds the cost value. It public and non-final for convenience.
|
||||
*/
|
||||
protected double cost;
|
||||
|
||||
/**
|
||||
* Caches the path bases (the haplotype segment bases).
|
||||
*/
|
||||
protected byte[] bases;
|
||||
|
||||
/**
|
||||
* Construct a new path cost.
|
||||
* @param read the corresponding read.
|
||||
* @param path the corresponding path.
|
||||
* @param cost initial cost estimate. Might be updated later.
|
||||
*/
|
||||
@Requires("route != null")
|
||||
public ReadSegmentCost(final GATKSAMRecord read,
|
||||
final Route<MultiDeBruijnVertex, MultiSampleEdge> path, double cost) {
|
||||
this.read = read;
|
||||
this.path = path;
|
||||
this.cost = cost;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to generate unique identifiers for path cost object.
|
||||
*/
|
||||
private static final AtomicLong pathCostUniqueIdGenerator = new AtomicLong();
|
||||
|
||||
/**
|
||||
* Holds the path cost unique identifier.
|
||||
*/
|
||||
private Long uniqueId;
|
||||
|
||||
/**
|
||||
* Returns the this path-cost unique identifier.
|
||||
* @return
|
||||
*/
|
||||
public long uniqueId() {
|
||||
if (uniqueId == null)
|
||||
uniqueId = pathCostUniqueIdGenerator.incrementAndGet();
|
||||
return uniqueId;
|
||||
}
|
||||
}
|
||||
|
|
@ -68,5 +68,13 @@ final class RefVsAnyResult {
|
|||
/**
|
||||
* @return Get the DP (sum of AD values)
|
||||
*/
|
||||
public int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; }
|
||||
protected int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; }
|
||||
|
||||
/**
|
||||
* Cap the het and hom var likelihood values by the hom ref likelihood.
|
||||
*/
|
||||
protected void capByHomRefLikelihood() {
|
||||
genotypeLikelihoods[1] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[1]);
|
||||
genotypeLikelihoods[2] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[2]);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
|||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.*;
|
||||
import org.broadinstitute.variant.vcf.VCFFormatHeaderLine;
|
||||
import org.broadinstitute.variant.vcf.VCFHeaderLine;
|
||||
|
|
@ -81,10 +82,9 @@ import java.util.*;
|
|||
* Time: 12:52 PM
|
||||
*/
|
||||
public class ReferenceConfidenceModel {
|
||||
public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF";
|
||||
public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site
|
||||
|
||||
public final static String INDEL_INFORMATIVE_DEPTH = "CD";
|
||||
//public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now
|
||||
public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele
|
||||
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
private final Set<String> samples;
|
||||
|
|
@ -94,6 +94,8 @@ public class ReferenceConfidenceModel {
|
|||
private final static boolean WRITE_DEBUGGING_BAM = false;
|
||||
private final SAMFileWriter debuggingWriter;
|
||||
|
||||
private final static byte REF_MODEL_DELETION_QUAL = (byte) 30;
|
||||
|
||||
/**
|
||||
* Create a new ReferenceConfidenceModel
|
||||
*
|
||||
|
|
@ -124,6 +126,8 @@ public class ReferenceConfidenceModel {
|
|||
} else {
|
||||
debuggingWriter = null;
|
||||
}
|
||||
|
||||
initializeIndelPLCache();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -132,8 +136,9 @@ public class ReferenceConfidenceModel {
|
|||
*/
|
||||
public Set<VCFHeaderLine> getVCFHeaderLines() {
|
||||
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>();
|
||||
headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
|
||||
headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
|
||||
// TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles?
|
||||
headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location"));
|
||||
//headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize));
|
||||
return headerLines;
|
||||
}
|
||||
|
||||
|
|
@ -161,7 +166,7 @@ public class ReferenceConfidenceModel {
|
|||
* @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes
|
||||
* @param variantCalls calls made in this region. The return result will contain any variant call in this list in the
|
||||
* correct order by genomic position, and any variant in this list will stop us emitting a ref confidence
|
||||
* under any position is covers (for snps that 1 bp, but for deletion its the entire ref span)
|
||||
* under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span)
|
||||
* @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence
|
||||
* contexts as well as calls from variantCalls if any were provided
|
||||
*/
|
||||
|
|
@ -181,7 +186,7 @@ public class ReferenceConfidenceModel {
|
|||
if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different");
|
||||
|
||||
final GenomeLoc refSpan = activeRegion.getLocation();
|
||||
final List<ReadBackedPileup> refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap);
|
||||
final List<ReadBackedPileup> refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap);
|
||||
final byte[] ref = refHaplotype.getBases();
|
||||
final List<VariantContext> results = new ArrayList<>(refSpan.size());
|
||||
final String sampleName = stratifiedReadMap.keySet().iterator().next();
|
||||
|
|
@ -201,9 +206,10 @@ public class ReferenceConfidenceModel {
|
|||
final int refOffset = offset + globalRefOffset;
|
||||
final byte refBase = ref[refOffset];
|
||||
final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null);
|
||||
homRefCalc.capByHomRefLikelihood();
|
||||
|
||||
final Allele refAllele = Allele.create(refBase, true);
|
||||
final List<Allele> refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE);
|
||||
final List<Allele> refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), curPos.getStart(), curPos.getStart(), refSiteAlleles);
|
||||
final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele));
|
||||
gb.AD(homRefCalc.AD_Ref_Any);
|
||||
|
|
@ -224,7 +230,7 @@ public class ReferenceConfidenceModel {
|
|||
|
||||
gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF)));
|
||||
gb.PL(leastConfidenceGLs.getAsPLs());
|
||||
gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
|
||||
//gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads);
|
||||
|
||||
vcb.genotypes(gb.make());
|
||||
results.add(vcb.make());
|
||||
|
|
@ -252,14 +258,21 @@ public class ReferenceConfidenceModel {
|
|||
* @return non-null GenotypeLikelihoods given N
|
||||
*/
|
||||
protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) {
|
||||
// TODO -- optimization -- this could easily be optimized with some caching
|
||||
final double homRef = 0.0;
|
||||
final double het = - LOG10_2 * nInformativeReads;
|
||||
final double homVar = INDEL_ERROR_RATE * nInformativeReads;
|
||||
return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar});
|
||||
return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads];
|
||||
}
|
||||
|
||||
protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway
|
||||
private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1];
|
||||
private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
|
||||
|
||||
private void initializeIndelPLCache() {
|
||||
for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) {
|
||||
final double homRef = 0.0;
|
||||
final double het = MathUtils.LOG_ONE_HALF * nInformativeReads;
|
||||
final double homVar = INDEL_ERROR_RATE * nInformativeReads;
|
||||
indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar});
|
||||
}
|
||||
}
|
||||
private final static double LOG10_2 = Math.log10(2);
|
||||
private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
|
||||
|
||||
/**
|
||||
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
|
||||
|
|
@ -274,8 +287,8 @@ public class ReferenceConfidenceModel {
|
|||
final RefVsAnyResult result = new RefVsAnyResult();
|
||||
|
||||
for( final PileupElement p : pileup ) {
|
||||
final byte qual = p.getQual();
|
||||
if( p.isDeletion() || qual > minBaseQual) {
|
||||
final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
|
||||
if( p.isDeletion() || qual > minBaseQual ) {
|
||||
int AA = 0; final int AB = 1; int BB = 2;
|
||||
if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
|
||||
AA = 2;
|
||||
|
|
@ -283,9 +296,9 @@ public class ReferenceConfidenceModel {
|
|||
if( hqSoftClips != null && p.isNextToSoftClip() ) {
|
||||
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
|
||||
}
|
||||
result.AD_Ref_Any[1]++;
|
||||
result.AD_Ref_Any[1] += p.getRepresentativeCount();
|
||||
} else {
|
||||
result.AD_Ref_Any[0]++;
|
||||
result.AD_Ref_Any[0] += p.getRepresentativeCount();
|
||||
}
|
||||
result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual);
|
||||
result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
|
||||
|
|
@ -302,20 +315,37 @@ public class ReferenceConfidenceModel {
|
|||
private List<ReadBackedPileup> getPileupsOverReference(final Haplotype refHaplotype,
|
||||
final Collection<Haplotype> calledHaplotypes,
|
||||
final GenomeLoc paddedReferenceLoc,
|
||||
final ActiveRegion activeRegion,
|
||||
final GenomeLoc activeRegionSpan,
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) {
|
||||
final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO");
|
||||
final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest);
|
||||
writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves
|
||||
writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap);
|
||||
final List<GATKSAMRecord> realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads());
|
||||
|
||||
if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
|
||||
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
|
||||
if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype");
|
||||
if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null");
|
||||
if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null");
|
||||
if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null");
|
||||
if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size());
|
||||
|
||||
List<GATKSAMRecord> realignedReads;
|
||||
|
||||
if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner
|
||||
realignedReads = activeRegion.getReads();
|
||||
} else {
|
||||
final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO");
|
||||
final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest);
|
||||
writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves
|
||||
writer.setOnlyRealignInformativeReads(true);
|
||||
writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap);
|
||||
realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads());
|
||||
}
|
||||
|
||||
if ( debuggingWriter != null )
|
||||
for ( final GATKSAMRecord read : realignedReads )
|
||||
debuggingWriter.addAlignment(read);
|
||||
|
||||
final LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING,
|
||||
false, genomeLocParser, samples, false);
|
||||
true, genomeLocParser, samples, false);
|
||||
|
||||
final List<ReadBackedPileup> pileups = new LinkedList<>();
|
||||
final int startPos = activeRegionSpan.getStart();
|
||||
|
|
@ -378,7 +408,7 @@ public class ReferenceConfidenceModel {
|
|||
final byte refBase = refBases[refStart + i];
|
||||
if ( readBase != refBase ) {
|
||||
sum += readQuals[readStart + i];
|
||||
if ( sum > maxSum )
|
||||
if ( sum > maxSum ) // abort early
|
||||
return sum;
|
||||
}
|
||||
}
|
||||
|
|
@ -403,7 +433,10 @@ public class ReferenceConfidenceModel {
|
|||
final byte[] refBases,
|
||||
final int refStart,
|
||||
final int maxIndelSize) {
|
||||
// todo -- fast exit when n bases left < maxIndelSize
|
||||
// fast exit when n bases left < maxIndelSize
|
||||
if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE);
|
||||
|
||||
|
|
@ -445,12 +478,16 @@ public class ReferenceConfidenceModel {
|
|||
final int offset = p.getOffset();
|
||||
|
||||
// doesn't count as evidence
|
||||
if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() )
|
||||
if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() )
|
||||
continue;
|
||||
|
||||
// todo -- this code really should handle CIGARs directly instead of relying on the above tests
|
||||
if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize))
|
||||
nInformative++;
|
||||
if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) {
|
||||
nInformative += p.getRepresentativeCount();
|
||||
if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) {
|
||||
return MAX_N_INDEL_INFORMATIVE_READS;
|
||||
}
|
||||
}
|
||||
}
|
||||
return nInformative;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ import java.util.*;
|
|||
@Invariant("!this.isAllowingMultipleEdges()")
|
||||
public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends DefaultDirectedGraph<V, E> {
|
||||
protected final static Logger logger = Logger.getLogger(BaseGraph.class);
|
||||
private final int kmerSize;
|
||||
protected final int kmerSize;
|
||||
|
||||
/**
|
||||
* Construct a TestGraph with kmerSize
|
||||
|
|
@ -95,10 +95,13 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
*/
|
||||
public boolean isReferenceNode( final V v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final BaseEdge e : edgesOf(v) ) {
|
||||
if( e.isRef() ) { return true; }
|
||||
|
||||
for ( final BaseEdge e : edgesOf(v) ) {
|
||||
if ( e.isRef() ) { return true; }
|
||||
}
|
||||
return false;
|
||||
|
||||
// edge case: if the graph only has one node then it's a ref node, otherwise it's not
|
||||
return (vertexSet().size() == 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -154,62 +157,46 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
return v.getAdditionalSequence(isSource(v));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param e the edge to test
|
||||
* @return true if this edge is a reference source edge
|
||||
*/
|
||||
public boolean isRefSource( final E e ) {
|
||||
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
|
||||
for( final E edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference source
|
||||
*/
|
||||
public boolean isRefSource( final V v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final E edgeToTest : incomingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
|
||||
// confirm that no incoming edges are reference edges
|
||||
for ( final E edgeToTest : incomingEdgesOf(v) ) {
|
||||
if ( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
|
||||
// confirm that there is an outgoing reference edge
|
||||
for ( final E edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if ( edgeToTest.isRef() ) { return true; }
|
||||
}
|
||||
|
||||
// edge case: if the graph only has one node then it's a ref sink, otherwise it's not
|
||||
return (vertexSet().size() == 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param e the edge to test
|
||||
* @return true if this edge is a reference sink edge
|
||||
*/
|
||||
public boolean isRefSink( final E e ) {
|
||||
if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); }
|
||||
for( final E edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* // TODO -- the logic of this test is just wrong
|
||||
* @param v the vertex to test
|
||||
* @return true if this vertex is a reference sink
|
||||
*/
|
||||
public boolean isRefSink( final V v ) {
|
||||
if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); }
|
||||
for( final E edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this both a refsink node and a reference node
|
||||
* @param v a non-null vertex
|
||||
* @return true if v is both a sink and a reference node
|
||||
*/
|
||||
public boolean isRefNodeAndRefSink(final V v) {
|
||||
return isRefSink(v) && isReferenceNode(v);
|
||||
// confirm that no outgoing edges are reference edges
|
||||
for ( final E edgeToTest : outgoingEdgesOf(v) ) {
|
||||
if ( edgeToTest.isRef() ) { return false; }
|
||||
}
|
||||
|
||||
// confirm that there is an incoming reference edge
|
||||
for ( final E edgeToTest : incomingEdgesOf(v) ) {
|
||||
if ( edgeToTest.isRef() ) { return true; }
|
||||
}
|
||||
|
||||
// edge case: if the graph only has one node then it's a ref source, otherwise it's not
|
||||
return (vertexSet().size() == 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -217,7 +204,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
*/
|
||||
public V getReferenceSourceVertex( ) {
|
||||
for( final V v : vertexSet() ) {
|
||||
if( isReferenceNode(v) && isRefSource(v) ) {
|
||||
if( isRefSource(v) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
|
@ -229,7 +216,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
*/
|
||||
public V getReferenceSinkVertex( ) {
|
||||
for( final V v : vertexSet() ) {
|
||||
if( isReferenceNode(v) && isRefSink(v) ) {
|
||||
if( isRefSink(v) ) {
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
|
@ -490,7 +477,7 @@ public class BaseGraph<V extends BaseVertex, E extends BaseEdge> extends Default
|
|||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<V> verticesToRemove = new LinkedList<>();
|
||||
for( final V v : vertexSet() ) {
|
||||
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
|
||||
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 && !isRefSource(v) ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
|
|
@ -181,7 +183,7 @@ public class BaseVertex {
|
|||
|
||||
/**
|
||||
* Set additional debugging information for this vertex
|
||||
* @param info
|
||||
* @param info the new info value.
|
||||
*/
|
||||
public void setAdditionalInfo(final String info) {
|
||||
if ( info == null ) throw new IllegalArgumentException("info cannot be null");
|
||||
|
|
@ -192,4 +194,32 @@ public class BaseVertex {
|
|||
* @return the additional information for display about this vertex
|
||||
*/
|
||||
public String additionalInfo() { return additionalInfo; }
|
||||
|
||||
/**
|
||||
* Checks whether the vertex sequence is ambiguous or not.
|
||||
*
|
||||
* <p>
|
||||
* Ambiguity may come about as a result of either:
|
||||
* <ul>
|
||||
* <li>by construction as the generating sequence (read or haplotype) had ambiguous bases</li>
|
||||
* <li>or because this vertex is the result of merging two or more vertices with some variation upstream
|
||||
* no more than kmerSize bases away (e.g. by executing {@link HaplotypeGraph#mergeCommonChains}</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean hasAmbiguousSequence() {
|
||||
for (final byte base : sequence)
|
||||
switch (Character.toUpperCase(base)) {
|
||||
case 'A' :
|
||||
case 'T' :
|
||||
case 'G' :
|
||||
case 'C' :
|
||||
continue;
|
||||
default :
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ public class DeBruijnVertex extends BaseVertex {
|
|||
* @return integer >= 1
|
||||
*/
|
||||
@Ensures("result >= 1")
|
||||
public int getKmer() {
|
||||
public int getKmerSize() {
|
||||
return sequence.length;
|
||||
}
|
||||
|
||||
|
|
@ -100,7 +100,7 @@ public class DeBruijnVertex extends BaseVertex {
|
|||
* @return a byte
|
||||
*/
|
||||
public byte getSuffix() {
|
||||
return sequence[getKmer() - 1];
|
||||
return sequence[getKmerSize() - 1];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer;
|
||||
|
||||
/**
|
||||
* Common interface for those graphs that implement vertex by kmer look-up.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public interface KmerSearchableGraph<V extends BaseVertex, E extends BaseEdge> {
|
||||
|
||||
/**
|
||||
* Returns the vertex that represents or contains the last base of a given kmer.
|
||||
* @param k the query kmer.
|
||||
*
|
||||
* @throws NullPointerException if {@code k} is {@code null}.
|
||||
* @return {@code null} if there is no such a kmer in the graph or it is not unique.
|
||||
*/
|
||||
V findKmer(Kmer k);
|
||||
|
||||
/**
|
||||
* The kmer-size of indexed kmers.
|
||||
*
|
||||
* @return greater than 0.
|
||||
*/
|
||||
int getKmerSize();
|
||||
|
||||
}
|
||||
|
|
@ -72,17 +72,17 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
private final static Logger logger = Logger.getLogger(Path.class);
|
||||
|
||||
// the last vertex seen in the path
|
||||
private final T lastVertex;
|
||||
protected final T lastVertex;
|
||||
|
||||
// the list of edges comprising the path
|
||||
private Set<E> edgesAsSet = null;
|
||||
private final LinkedList<E> edgesInOrder;
|
||||
protected final ArrayList<E> edgesInOrder;
|
||||
|
||||
// the scores for the path
|
||||
private final int totalScore;
|
||||
protected final int totalScore;
|
||||
|
||||
// the graph from which this path originated
|
||||
private final BaseGraph<T, E> graph;
|
||||
protected final BaseGraph<T, E> graph;
|
||||
|
||||
// used in the bubble state machine to apply Smith-Waterman to the bubble sequence
|
||||
// these values were chosen via optimization against the NA12878 knowledge base
|
||||
|
|
@ -99,7 +99,7 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
if ( ! graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph);
|
||||
|
||||
lastVertex = initialVertex;
|
||||
edgesInOrder = new LinkedList<E>();
|
||||
edgesInOrder = new ArrayList<>(0);
|
||||
totalScore = 0;
|
||||
this.graph = graph;
|
||||
}
|
||||
|
|
@ -114,11 +114,29 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
return path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new path with the same field values.
|
||||
*
|
||||
* @param p the template path.
|
||||
*
|
||||
* @throws NullPointerException if {@code p} is {@code null}.
|
||||
*/
|
||||
protected Path(final Path<T,E> p) {
|
||||
this.edgesInOrder = p.edgesInOrder;
|
||||
this.lastVertex = p.lastVertex;
|
||||
this.edgesAsSet = p.edgesAsSet;
|
||||
this.totalScore = p.totalScore;
|
||||
this.graph = p.graph;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Path extending p with edge
|
||||
*
|
||||
* @param p the path to extend
|
||||
* @param edge the edge to extend path by
|
||||
* @param p the path to extend.
|
||||
* @param edge the edge to extend path with.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is
|
||||
* not part of {@code p}'s graph, or {@code edge} does not have as a source the last vertex in {@code p}.
|
||||
*/
|
||||
public Path(final Path<T,E> p, final E edge) {
|
||||
if ( p == null ) throw new IllegalArgumentException("Path cannot be null");
|
||||
|
|
@ -128,11 +146,43 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
|
||||
graph = p.graph;
|
||||
lastVertex = p.graph.getEdgeTarget(edge);
|
||||
edgesInOrder = new LinkedList<E>(p.getEdges());
|
||||
edgesInOrder = new ArrayList<>(p.length() + 1);
|
||||
edgesInOrder.addAll(p.edgesInOrder);
|
||||
edgesInOrder.add(edge);
|
||||
totalScore = p.totalScore + edge.getMultiplicity();
|
||||
}
|
||||
|
||||
/**
|
||||
* Length of the path in edges.
|
||||
*
|
||||
* @return {@code 0} or greater.
|
||||
*/
|
||||
public int length() {
|
||||
return edgesInOrder.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepend a path with an edge.
|
||||
*
|
||||
* @param edge the extending edge.
|
||||
* @param p the original path.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is
|
||||
* not part of {@code p}'s graph, or {@code edge} does not have as a target the first vertex in {@code p}.
|
||||
*/
|
||||
public Path(final E edge, final Path<T,E> p) {
|
||||
if ( p == null ) throw new IllegalArgumentException("Path cannot be null");
|
||||
if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null");
|
||||
if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't");
|
||||
if ( ! p.graph.getEdgeTarget(edge).equals(p.getFirstVertex())) { throw new IllegalStateException("Edges added to path must be contiguous."); }
|
||||
graph = p.graph;
|
||||
lastVertex = p.lastVertex;
|
||||
edgesInOrder = new ArrayList<>(p.length() + 1);
|
||||
edgesInOrder.add(edge);
|
||||
edgesInOrder.addAll(p.getEdges());
|
||||
totalScore = p.totalScore + edge.getMultiplicity();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the collection of edges leaving the last vertex of this path
|
||||
* @return a non-null collection
|
||||
|
|
@ -168,6 +218,27 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
return getVertices().contains(v);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether a given path is a suffix of this path.
|
||||
*
|
||||
* @param other the path to compare against.
|
||||
* @throws IllegalArgumentException if <code>other</code> is <code>null</code>, or the come from
|
||||
* different graphs.
|
||||
* @return true if <code>other</code> is a suffix of this path.
|
||||
*/
|
||||
public boolean isSuffix(final Path<T, E> other) {
|
||||
if ( other == null ) throw new IllegalArgumentException("path cannot be null");
|
||||
if (other.getGraph() != this.getGraph()) throw new IllegalArgumentException("the other path most belong to the same path");
|
||||
if (!lastVertex.equals(other.lastVertex))
|
||||
return false;
|
||||
final ListIterator<E> myIt = edgesInOrder.listIterator(edgesInOrder.size());
|
||||
final ListIterator<E> otherIt = other.edgesInOrder.listIterator(other.edgesInOrder.size());
|
||||
while (myIt.hasPrevious() && otherIt.hasPrevious())
|
||||
if (otherIt.previous() != myIt.previous())
|
||||
return false;
|
||||
return !otherIt.hasPrevious();
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that two paths have the same edges and total score
|
||||
* @param path the other path we might be the same as
|
||||
|
|
@ -182,13 +253,13 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path=");
|
||||
boolean first = true;
|
||||
for ( final T v : getVertices() ) {
|
||||
if ( first ) {
|
||||
if ( first )
|
||||
first = false;
|
||||
} else {
|
||||
else
|
||||
b.append(" -> ");
|
||||
}
|
||||
b.append(v.getSequenceString());
|
||||
}
|
||||
b.append('}');
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
|
|
@ -249,7 +320,11 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
* @return a non-null vertex
|
||||
*/
|
||||
public T getFirstVertex() {
|
||||
return getGraph().getEdgeSource(edgesInOrder.pollFirst());
|
||||
if (edgesInOrder.size() == 0) {
|
||||
return lastVertex;
|
||||
} else {
|
||||
return getGraph().getEdgeSource(edgesInOrder.get(0));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -260,7 +335,7 @@ public class Path<T extends BaseVertex, E extends BaseEdge> {
|
|||
public byte[] getBases() {
|
||||
if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); }
|
||||
|
||||
byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst()));
|
||||
byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.get(0)));
|
||||
for( final E e : edgesInOrder ) {
|
||||
bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e)));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,285 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* Represents a route or path through a graph.
|
||||
* <p>
|
||||
* In contrast with a {@link Path}, a route keeps track of the
|
||||
* path taken at furcations in order to speed up some path comparisions like the
|
||||
* one implemented by {@link #isSuffix}.
|
||||
* </p>
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public class Route<V extends BaseVertex, E extends BaseEdge> extends Path<V,E> {
|
||||
|
||||
protected final Route<V,E> previousRouteWithLastVertexThatIsForkOrJoin;
|
||||
protected final boolean lastVertexIsForkOrJoin;
|
||||
|
||||
/**
|
||||
* Create a zero length route with a start in a particular vertex:
|
||||
*
|
||||
* @param initialVertex the first vertex of the route.
|
||||
* @param graph the new route's graph.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code initialVertex} or {@code graph} are {@code null}.
|
||||
* or if {@code initialVertex} does not belong to {@code graph}.
|
||||
*/
|
||||
public Route(final V initialVertex, final BaseGraph<V, E> graph) {
|
||||
super(initialVertex, graph);
|
||||
previousRouteWithLastVertexThatIsForkOrJoin = null;
|
||||
lastVertexIsForkOrJoin = graph.inDegreeOf(initialVertex) > 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if (other == null) return false;
|
||||
if (other == this) return true;
|
||||
if (! (other instanceof Route)) return false;
|
||||
@SuppressWarnings("unchecked")
|
||||
final Route<V,E> otherRoute = (Route<V,E>) other;
|
||||
return otherRoute.length() == this.length() && isSuffix(otherRoute);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extends a route into a new instance.
|
||||
*
|
||||
* @param prefix the route to extend.
|
||||
* @param nextVertex the vertex to extend the route to.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code prefix} is {@code null} or {@code nextVertex} is {@code null}
|
||||
* or {@code nextVertex} does not belong to {@code prefix}'s graph or there is no edge that in the graph
|
||||
* that would connect {@code prefix}'s last vertex with {@code nextVertex} directly.
|
||||
*/
|
||||
public Route(final Route<V,E> prefix, final V nextVertex) {
|
||||
this(prefix,resolveSuffixEdge(prefix,nextVertex));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Extends a route into a new instance.
|
||||
*
|
||||
* @param prevVertex the vertex to extend the route to.
|
||||
* @param suffix the route to extend.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code suffix} is {@code null} or {@code prevVertex} is {@code null}
|
||||
* or {@code prevVertex} does not belong to {@code suffix}'s graph or there is no edge that in the graph
|
||||
* that would connect {@code suffix}'s first vertex with {@code prevVertex} directly.
|
||||
*/
|
||||
public Route(final V prevVertex, final Route<V,E> suffix) {
|
||||
this(resolvePrefixEdge(prevVertex, suffix),suffix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves the prefix edge as required by {@link Route(V,Route)}.
|
||||
*/
|
||||
private static <V extends BaseVertex,E extends BaseEdge> E resolvePrefixEdge(final V prevVertex, final Route<V, E> suffix) {
|
||||
if (prevVertex == null) throw new NullPointerException();
|
||||
if (!suffix.getGraph().containsVertex(prevVertex)) throw new IllegalArgumentException();
|
||||
final E result = suffix.getGraph().getEdge(prevVertex,suffix.getFirstVertex());
|
||||
if (result == null)
|
||||
throw new IllegalArgumentException("there is no such edge in the graph");
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolves the suffix edge as required by {@link Route(Route,V)}
|
||||
*/
|
||||
private static <V extends BaseVertex,E extends BaseEdge> E resolveSuffixEdge(final Route<V,E> prefix, final V nextVertex) {
|
||||
if (nextVertex == null) throw new IllegalArgumentException();
|
||||
if (!prefix.getGraph().containsVertex(nextVertex)) throw new IllegalArgumentException();
|
||||
final E result = prefix.getGraph().getEdge(prefix.getLastVertex(),nextVertex);
|
||||
if (result == null)
|
||||
throw new IllegalArgumentException("there is no such edge in the graph");
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extends a route by prefixing an edge.
|
||||
*
|
||||
* @param initialEdge the extending edge.
|
||||
* @param suffix the original path.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code suffix} or {@code initialEdge} are {@code null}, or {@code initialEdge} is
|
||||
* not part of {@code suffix}'s graph, or {@code initialEdge} does not have as a target the first vertex in {@code suffix}.
|
||||
*/
|
||||
public Route(final E initialEdge, final Route<V,E> suffix) {
|
||||
super(initialEdge,suffix);
|
||||
final V firstVertex = getFirstVertex();
|
||||
if(suffix.length() == 0) {
|
||||
lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin || graph.outDegreeOf(firstVertex) > 1;
|
||||
previousRouteWithLastVertexThatIsForkOrJoin = graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null;
|
||||
} else {
|
||||
lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin;
|
||||
if (suffix.previousRouteWithLastVertexThatIsForkOrJoin != null)
|
||||
previousRouteWithLastVertexThatIsForkOrJoin = new Route<>(initialEdge,suffix.previousRouteWithLastVertexThatIsForkOrJoin);
|
||||
else
|
||||
previousRouteWithLastVertexThatIsForkOrJoin = graph.outDegreeOf(firstVertex) > 1 ?
|
||||
new Route<>(new Route<>(firstVertex,graph),edgesInOrder.get(0)) :
|
||||
graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create copy of an existing route.
|
||||
* @param route the route to copy
|
||||
*
|
||||
* @throws NullPointerException if {@code route} is {@code null}.
|
||||
*/
|
||||
protected Route(final Route<V, E> route) {
|
||||
super(route);
|
||||
lastVertexIsForkOrJoin = route.lastVertexIsForkOrJoin;
|
||||
previousRouteWithLastVertexThatIsForkOrJoin = route.previousRouteWithLastVertexThatIsForkOrJoin;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Route extending another one with an edge
|
||||
*
|
||||
* @param route the route to extend.
|
||||
* @param edge the edge to extend the route with.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code route} or {@code edge} are {@code null}, or {@code edge} is
|
||||
* not part of {@code route}'s graph, or {@code edge} does not have as a source the last vertex in {@code route}.
|
||||
*/
|
||||
public Route(final Route<V, E> route, final E edge) {
|
||||
super(route, edge);
|
||||
lastVertexIsForkOrJoin = graph.outDegreeOf(route.lastVertex) > 1 || graph.inDegreeOf(lastVertex) > 1;
|
||||
previousRouteWithLastVertexThatIsForkOrJoin = route.lastVertexIsForkOrJoin ? route : route.previousRouteWithLastVertexThatIsForkOrJoin;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isSuffix(final Path<V,E> other) {
|
||||
if (other == this)
|
||||
return true;
|
||||
else if (other == null)
|
||||
throw new IllegalArgumentException("other path must not be null");
|
||||
else if (getGraph() != other.getGraph())
|
||||
throw new IllegalArgumentException("other path must be part of the same graph");
|
||||
else if (other instanceof Route)
|
||||
return isRouteSuffix((Route<V,E>)other);
|
||||
else
|
||||
return super.isSuffix(other);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.toString().replace("Path{", "Route{");
|
||||
}
|
||||
|
||||
/**
|
||||
* Faster version when comparing with a route.
|
||||
*/
|
||||
protected boolean isRouteSuffix(final Route<V,E> other) {
|
||||
if (other.getGraph() != this.getGraph())
|
||||
throw new IllegalArgumentException("you cannot compare routes on different graphs");
|
||||
else if (lastVertex != other.lastVertex) // obvious case.
|
||||
return false;
|
||||
else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null
|
||||
&& other.previousRouteWithLastVertexThatIsForkOrJoin != null) // I am shorter or different path for sure.
|
||||
return false;
|
||||
else if (this.edgesInOrder.size() < other.edgesInOrder.size()) // I am shorter regardless of path, no way Jose!
|
||||
return false;
|
||||
else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null || other.previousRouteWithLastVertexThatIsForkOrJoin == null) {
|
||||
final ListIterator<E> myEdges = edgesInOrder.listIterator(edgesInOrder.size());
|
||||
final ListIterator<E> otherEdges = other.edgesInOrder.listIterator(other.edgesInOrder.size());
|
||||
while (otherEdges.hasPrevious())
|
||||
if (myEdges.previous() != otherEdges.previous())
|
||||
return false;
|
||||
return true;
|
||||
} else
|
||||
return (other.previousRouteWithLastVertexThatIsForkOrJoin == this.previousRouteWithLastVertexThatIsForkOrJoin)
|
||||
|| (previousRouteWithLastVertexThatIsForkOrJoin.lastVertex == other.previousRouteWithLastVertexThatIsForkOrJoin.lastVertex
|
||||
&& previousRouteWithLastVertexThatIsForkOrJoin.isRouteSuffix(other.previousRouteWithLastVertexThatIsForkOrJoin));
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the last vertex in the route is a fork or a joining vertex.
|
||||
* @return {@code true} iff so.
|
||||
*/
|
||||
public boolean lastVertexIsForkOrJoin() {
|
||||
return lastVertexIsForkOrJoin;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the longest prefix route that has as a last vertex a join or furcation vertex.
|
||||
*
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public Route<V,E> getPrefixRouteWithLastVertexThatIsForkOrJoin() {
|
||||
return previousRouteWithLastVertexThatIsForkOrJoin;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Splice out the first few vertices of the route.
|
||||
*
|
||||
* @param length how many vertices to splice out
|
||||
* @return a new route without those spliced vertices.
|
||||
*
|
||||
* @throws IllegalArgumentException if {@code length} is equal to the route's length or greater or if it is negative.
|
||||
* Notice that non-vertex route are no legal routes.
|
||||
*/
|
||||
public Route<V,E> splicePrefix(final int length) {
|
||||
if (length == 0)
|
||||
return this;
|
||||
if (length >= length())
|
||||
throw new IllegalArgumentException("prefix slicing to long");
|
||||
if (length < 0)
|
||||
throw new IllegalArgumentException("prefix cannot be negative");
|
||||
|
||||
final List<E> resultEdges = getEdges().subList(length,length());
|
||||
Route<V,E> result = new Route<>(graph.getEdgeSource(resultEdges.get(0)),this);
|
||||
for (final E edge : resultEdges)
|
||||
result = new Route<>(result,edge);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KmerSequence;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
||||
import java.util.Stack;
|
||||
|
||||
/**
|
||||
* A collection of route building methods.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public class RouteFinder {
|
||||
|
||||
|
||||
/**
|
||||
* Completes a path backwards in the graph that would explain the sequence if bytes ending in the vertex provided.
|
||||
*
|
||||
* @param graph the graph to build the path upon.
|
||||
* @param sequence contains the sequence to backtrack.
|
||||
* @param start inclusive start position of the sequence to backtrack.
|
||||
* @param end exclusive end position of the sequence to backtrack.
|
||||
* @param vertex final vertex of the resulting path.
|
||||
* @return {@code null} if there is not such path, otherwise a path such that vertex is the last vertex of it
|
||||
* and its sequence is squence[start to end] + v.getSuffix();
|
||||
*/
|
||||
private static <V extends BaseVertex, E extends BaseEdge> Route<V,E> extendRouteBackwards(final BaseGraph<V, E> graph,
|
||||
final byte[] sequence,
|
||||
final int start,
|
||||
final int end,
|
||||
final V vertex) {
|
||||
final Route<V,E> emptyPath = new Route<>(vertex,graph);
|
||||
if (end <= start) // trivial case.
|
||||
return emptyPath;
|
||||
final int kmerSize = graph.getKmerSize();
|
||||
final Stack<Pair<Route<V,E>,Integer>> stack = new Stack<>();
|
||||
stack.ensureCapacity(end - start + 1);
|
||||
stack.push(new Pair<>(emptyPath,end));
|
||||
while (!stack.isEmpty()) {
|
||||
final Pair<Route<V,E>,Integer> next = stack.pop();
|
||||
final Route<V,E> nextRoute = next.getFirst();
|
||||
final int nextEnd = next.getSecond();
|
||||
if (nextEnd <= start) {
|
||||
return nextRoute.splicePrefix(kmerSize - 1); // gotcha!!!
|
||||
}
|
||||
final V nextFirstVertex = nextRoute.getFirstVertex();
|
||||
if (graph.isSource(nextFirstVertex)) {
|
||||
final byte[] fullFirstVertexSequence = nextFirstVertex.getSequence();
|
||||
if (nextEnd - start != fullFirstVertexSequence.length - 1) {
|
||||
continue; // you need to have the right length to accept a source vertex.
|
||||
}
|
||||
boolean mismatchFound = false;
|
||||
for (int i = 0; i < fullFirstVertexSequence.length - 1; i++) {
|
||||
if (fullFirstVertexSequence[i] != sequence[i + start]) {
|
||||
mismatchFound = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!mismatchFound)
|
||||
return nextRoute;
|
||||
} else {
|
||||
final Integer newNextEnd = nextEnd - 1;
|
||||
for (final E edge : graph.incomingEdgesOf(nextFirstVertex)) {
|
||||
final V prevVertex = graph.getEdgeSource(edge);
|
||||
final byte[] prevSequence = prevVertex.getSequence();
|
||||
final byte prevByte = prevSequence[prevSequence.length - 1];
|
||||
if (prevByte == sequence[newNextEnd]) {
|
||||
stack.push(new Pair<>(new Route<>(edge,nextRoute),newNextEnd));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Completes a path forward in the graph that would explain the sequence if bytes starting by the prefix provided.
|
||||
*
|
||||
* @param sequence missing sequence we want to
|
||||
* @param start inclusive first position in {@code sequence} that starts the extension
|
||||
* @param end exclusive position after the last of bases to be added to the extension.
|
||||
* @param prefix the seed prefix of the path.
|
||||
* @return {@code null} if there is not such path, otherwise a path such that vertex is the last vertex of it
|
||||
* and its sequence is prefix.getBases() + sequence[start to end];
|
||||
*/
|
||||
private static <V extends BaseVertex, E extends BaseEdge> Route<V,E> extendRouteForwards(
|
||||
final BaseGraph<V, E> graph, final byte[] sequence, final int start, final int end,
|
||||
final Route<V, E> prefix) {
|
||||
if (end <= start) // trivial case.
|
||||
return prefix;
|
||||
|
||||
final Stack<Pair<Route<V,E>,Integer>> stack = new Stack<>();
|
||||
stack.ensureCapacity(end - start + 1);
|
||||
stack.push(new Pair<>(prefix,start));
|
||||
while (!stack.isEmpty()) {
|
||||
final Pair<Route<V,E>,Integer> next = stack.pop();
|
||||
final Route<V,E> nextRoute = next.getFirst();
|
||||
final int nextStart = next.getSecond();
|
||||
if (end <= nextStart)
|
||||
return nextRoute; // gotcha!!!
|
||||
final V lastVertex = nextRoute.getLastVertex();
|
||||
final Integer newNextStart = nextStart + 1;
|
||||
for (final E edge : graph.outgoingEdgesOf(lastVertex)) {
|
||||
final V nextVertex = graph.getEdgeTarget(edge);
|
||||
final byte[] nextSequence = nextVertex.getSequence();
|
||||
final byte nextByte = nextSequence[nextSequence.length - 1];
|
||||
if (nextByte == sequence[nextStart]) {
|
||||
stack.push(new Pair<>(new Route<>(nextRoute,edge),newNextStart));
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new route object give a sequence using unique kmer mappings.
|
||||
*
|
||||
* @param sequence base sequence.
|
||||
* @return {@code null} if there is no way such route on the graph or the start kmer is not unique.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <V extends BaseVertex, E extends BaseEdge> Route<V,E> findRoute(final BaseGraph<V,E> graph,
|
||||
final byte[] sequence) {
|
||||
if (graph == null)
|
||||
throw new NullPointerException();
|
||||
if (!(graph instanceof KmerSearchableGraph))
|
||||
throw new IllegalArgumentException("the input graph must implement " + KmerSearchableGraph.class.getName());
|
||||
|
||||
final int kmerSize = graph.getKmerSize();
|
||||
final KmerSequence haplotypeKmers = new KmerSequence(sequence,kmerSize);
|
||||
|
||||
if (haplotypeKmers.kmerSize() != graph.getKmerSize())
|
||||
throw new IllegalArgumentException("incompatible kmer sizes " + graph.getKmerSize() + " != " + haplotypeKmers.kmerSize());
|
||||
|
||||
V vertex = null;
|
||||
int i;
|
||||
for (i = 0; i < haplotypeKmers.size(); i++)
|
||||
if ((vertex = ((KmerSearchableGraph<V,E>)graph).findKmer(haplotypeKmers.get(i))) != null)
|
||||
break;
|
||||
if (vertex == null)
|
||||
return null;
|
||||
if (!graph.containsVertex(vertex))
|
||||
throw new IllegalStateException("vertex does not belong to graph.");
|
||||
Route<V,E> result = i == 0 ? new Route<>(vertex,graph) :
|
||||
extendRouteBackwards(graph, sequence, 0, i + kmerSize - 1, vertex);
|
||||
if (result == null)
|
||||
return null;
|
||||
result = extendRouteForwards(graph, sequence, i + kmerSize, sequence.length, result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -62,7 +62,7 @@ import java.util.Set;
|
|||
* @author: depristo
|
||||
* @since 03/2013
|
||||
*/
|
||||
public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
||||
public class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
||||
/**
|
||||
* Edge factory that creates non-reference multiplicity 1 edges
|
||||
*/
|
||||
|
|
@ -89,13 +89,6 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
*/
|
||||
private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100;
|
||||
|
||||
/**
|
||||
* Construct an empty SeqGraph
|
||||
*/
|
||||
public SeqGraph() {
|
||||
this(11);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct an empty SeqGraph where we'll add nodes based on a kmer size of kmer
|
||||
*
|
||||
|
|
@ -294,10 +287,8 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
|
||||
// create the combined vertex, and add it to the graph
|
||||
// TODO -- performance problem -- can be optimized if we want
|
||||
final List<byte[]> seqs = new LinkedList<byte[]>();
|
||||
for ( SeqVertex v : linearChain ) seqs.add(v.getSequence());
|
||||
final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{}));
|
||||
final SeqVertex addedVertex = new SeqVertex( seqsCat );
|
||||
|
||||
final SeqVertex addedVertex = mergeLinearChainVertices(linearChain);
|
||||
addVertex(addedVertex);
|
||||
|
||||
final Set<BaseEdge> inEdges = incomingEdgesOf(first);
|
||||
|
|
@ -315,6 +306,13 @@ public final class SeqGraph extends BaseGraph<SeqVertex, BaseEdge> {
|
|||
return true;
|
||||
}
|
||||
|
||||
protected SeqVertex mergeLinearChainVertices(final List<SeqVertex> vertices) {
|
||||
final List<byte[]> seqs = new LinkedList<byte[]>();
|
||||
for ( SeqVertex v : vertices ) seqs.add(v.getSequence());
|
||||
final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{}));
|
||||
return new SeqVertex( seqsCat );
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the sum of the edge weights on a linear chain of at least 2 elements
|
||||
*
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
|
|
|
|||
|
|
@ -189,7 +189,7 @@ public class SharedVertexSequenceSplitter {
|
|||
* Must be called before calling updateGraph
|
||||
*/
|
||||
public void split() {
|
||||
splitGraph = new SeqGraph();
|
||||
splitGraph = new SeqGraph(outer.getKmerSize());
|
||||
newMiddles = new LinkedList<SeqVertex>();
|
||||
edgesToRemove = new LinkedList<BaseEdge>();
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs;
|
||||
|
||||
/**
|
||||
* Indicate a SeqGraph vertex topological order between to vertices.
|
||||
*/
|
||||
public enum VertexOrder {
|
||||
BEFORE, AFTER, SAME, PARALLEL;
|
||||
|
||||
public VertexOrder inverse() {
|
||||
switch (this) {
|
||||
case BEFORE: return AFTER;
|
||||
case AFTER: return BEFORE;
|
||||
default: return this;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -64,7 +64,7 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||
* Date: 4/17/13
|
||||
* Time: 3:20 PM
|
||||
*/
|
||||
final class MultiDeBruijnVertex extends DeBruijnVertex {
|
||||
public final class MultiDeBruijnVertex extends DeBruijnVertex {
|
||||
private final static boolean KEEP_TRACK_OF_READS = false;
|
||||
|
||||
// Note that using an AtomicInteger is critical to allow multi-threaded HaplotypeCaller
|
||||
|
|
@ -116,6 +116,10 @@ final class MultiDeBruijnVertex extends DeBruijnVertex {
|
|||
|
||||
@Override
|
||||
public String additionalInfo() {
|
||||
return KEEP_TRACK_OF_READS ? (! reads.contains("ref") ? "__" + Utils.join(",", reads) : "") : "";
|
||||
return super.additionalInfo() + (KEEP_TRACK_OF_READS ? (! reads.contains("ref") ? "__" + Utils.join(",", reads) : "") : "");
|
||||
}
|
||||
|
||||
int getId() {
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
return null;
|
||||
}
|
||||
|
||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot"));
|
||||
printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot"));
|
||||
|
||||
// go through and prune all of the chains where all edges have <= pruneFactor. This must occur
|
||||
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
|
||||
|
|
@ -195,20 +195,23 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine {
|
|||
// remove all heading and trailing paths
|
||||
if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef();
|
||||
|
||||
printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot"));
|
||||
printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot"));
|
||||
|
||||
final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph();
|
||||
if (debugGraphTransformations) initialSeqGraph.printGraph(new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"),10000);
|
||||
|
||||
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
|
||||
if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph);
|
||||
|
||||
if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||
printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot"));
|
||||
if (debug) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
|
||||
printDebugGraphTransform(initialSeqGraph, new File( "" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot"));
|
||||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction
|
||||
|
||||
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
|
||||
final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus();
|
||||
return new AssemblyResult(status, cleaned.getGraph());
|
||||
final AssemblyResult result = new AssemblyResult(status, cleaned.getGraph());
|
||||
result.setThreadingGraph(rtgraph);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -65,8 +65,11 @@ import org.jgrapht.alg.CycleDetector;
|
|||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> implements KmerSearchableGraph<MultiDeBruijnVertex,MultiSampleEdge> {
|
||||
|
||||
public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSampleEdge> {
|
||||
/**
|
||||
* Edge factory that encapsulates the numPruningSamples assembly parameter
|
||||
*/
|
||||
|
|
@ -108,17 +111,17 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
/**
|
||||
* A set of non-unique kmers that cannot be used as merge points in the graph
|
||||
*/
|
||||
private Set<Kmer> nonUniqueKmers;
|
||||
protected Set<Kmer> nonUniqueKmers;
|
||||
|
||||
/**
|
||||
* A map from kmers -> their corresponding vertex in the graph
|
||||
*/
|
||||
private Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<>();
|
||||
protected Map<Kmer, MultiDeBruijnVertex> uniqueKmers = new LinkedHashMap<>();
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
final int kmerSize;
|
||||
|
||||
final boolean debugGraphTransformations;
|
||||
final byte minBaseQualityToUseInAssembly;
|
||||
|
||||
|
|
@ -129,16 +132,42 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
// state variables, initialized in resetToInitialState()
|
||||
// --------------------------------------------------------------------------------
|
||||
private Kmer refSource;
|
||||
private boolean alreadyBuilt;
|
||||
|
||||
public ReadThreadingGraph() {
|
||||
this(25, false, (byte)6, 1);
|
||||
}
|
||||
protected boolean alreadyBuilt;
|
||||
|
||||
/**
|
||||
* Constructs an empty read-threading-grpah provided the kmerSize.
|
||||
* @param kmerSize 1 or greater.
|
||||
*
|
||||
* @throw IllegalArgumentException if (@code kmerSize) < 1.
|
||||
*/
|
||||
public ReadThreadingGraph(final int kmerSize) {
|
||||
this(kmerSize, false, (byte)6, 1);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return the collection of outgoing vertices that expand this vertex with a particular base.
|
||||
*
|
||||
* @param v original vertex.
|
||||
* @param b expanding base.
|
||||
* @return never null, but perhaps an empty set. You cannot assume that you can modify the result.
|
||||
*/
|
||||
protected Set<MultiDeBruijnVertex> getNextVertices(final MultiDeBruijnVertex v, final byte b) {
|
||||
if (v == null) throw new IllegalArgumentException("the input vertex cannot be null");
|
||||
if (!vertexSet().contains(v)) throw new IllegalArgumentException("the vertex must be present in the graph");
|
||||
final List<MultiDeBruijnVertex> result = new LinkedList<>();
|
||||
for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) {
|
||||
if (w.getSuffix() == b)
|
||||
result.add(w);
|
||||
}
|
||||
switch (result.size()) {
|
||||
case 0: return Collections.emptySet();
|
||||
case 1: return Collections.singleton(result.get(0));
|
||||
default:
|
||||
return new HashSet<>(result);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ReadThreadingAssembler using kmerSize for matching
|
||||
* @param kmerSize must be >= 1
|
||||
|
|
@ -147,7 +176,6 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
super(kmerSize, new MyEdgeFactory(numPruningSamples));
|
||||
|
||||
if ( kmerSize < 1 ) throw new IllegalArgumentException("bad minkKmerSize " + kmerSize);
|
||||
this.kmerSize = kmerSize;
|
||||
this.debugGraphTransformations = debugGraphTransformations;
|
||||
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
|
||||
|
||||
|
|
@ -350,6 +378,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 1 : 0);
|
||||
|
||||
addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -457,6 +486,33 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
// clear
|
||||
pending.clear();
|
||||
alreadyBuilt = true;
|
||||
for (final MultiDeBruijnVertex v : uniqueKmers.values())
|
||||
v.setAdditionalInfo(v.additionalInfo() + "+");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean removeVertex(MultiDeBruijnVertex V) {
|
||||
final boolean result = super.removeVertex(V);
|
||||
if (result) {
|
||||
final byte[] sequence = V.getSequence();
|
||||
final Kmer kmer = new Kmer(sequence);
|
||||
uniqueKmers.remove(kmer);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public void removeSingletonOrphanVertices() {
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final List<MultiDeBruijnVertex> verticesToRemove = new LinkedList<>();
|
||||
for( final MultiDeBruijnVertex v : vertexSet() ) {
|
||||
if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
this.removeVertex(null);
|
||||
removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -487,7 +543,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
int attempted = 0;
|
||||
int nRecovered = 0;
|
||||
for ( final MultiDeBruijnVertex v : vertexSet() ) {
|
||||
if ( outDegreeOf(v) == 0 && ! isRefNodeAndRefSink(v) ) {
|
||||
if ( outDegreeOf(v) == 0 && ! isRefSink(v) ) {
|
||||
attempted++;
|
||||
nRecovered += recoverDanglingChain(v, pruneFactor);
|
||||
}
|
||||
|
|
@ -594,6 +650,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
final SeqGraph seqGraph = new SeqGraph(kmerSize);
|
||||
final Map<MultiDeBruijnVertex, SeqVertex> vertexMap = new HashMap<MultiDeBruijnVertex, SeqVertex>();
|
||||
|
||||
|
||||
// create all of the equivalent seq graph vertices
|
||||
for ( final MultiDeBruijnVertex dv : vertexSet() ) {
|
||||
final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv)));
|
||||
|
|
@ -638,7 +695,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
* @param seqForKmers the sequence we want to thread into the graph
|
||||
* @return a pair of the starting vertex and its position in seqForKmer
|
||||
*/
|
||||
private Pair<MultiDeBruijnVertex, Integer> findStart(final SequenceForKmers seqForKmers) {
|
||||
protected Pair<MultiDeBruijnVertex, Integer> findStart(final SequenceForKmers seqForKmers) {
|
||||
final int uniqueStartPos = seqForKmers.isRef ? 0 : findUniqueStartPosition(seqForKmers.sequence, seqForKmers.start, seqForKmers.stop);
|
||||
|
||||
if ( uniqueStartPos == -1 )
|
||||
|
|
@ -670,7 +727,7 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
* @param allowRefSource if true, we will allow matches to the kmer that represents the reference starting kmer
|
||||
* @return a non-null vertex
|
||||
*/
|
||||
private Pair<MultiDeBruijnVertex, Integer> getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) {
|
||||
protected Pair<MultiDeBruijnVertex, Integer> getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) {
|
||||
final Kmer kmer = new Kmer(sequence, start, kmerSize);
|
||||
final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, allowRefSource);
|
||||
if ( vertex != null ) {
|
||||
|
|
@ -688,9 +745,11 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
*/
|
||||
private MultiDeBruijnVertex getUniqueKmerVertex(final Kmer kmer, final boolean allowRefSource) {
|
||||
if ( ! allowRefSource && kmer.equals(refSource) ) return null;
|
||||
|
||||
return uniqueKmers.get(kmer);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a new vertex for kmer. Add it to the uniqueKmers map if appropriate.
|
||||
*
|
||||
|
|
@ -809,4 +868,178 @@ public class ReadThreadingGraph extends BaseGraph<MultiDeBruijnVertex, MultiSamp
|
|||
"kmerSize=" + kmerSize +
|
||||
'}';
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public MultiDeBruijnVertex findKmer(final Kmer k) {
|
||||
return uniqueKmers.get(k);
|
||||
}
|
||||
|
||||
/*************************************************************
|
||||
* Simple string representation support for testing purposes *
|
||||
*************************************************************/
|
||||
|
||||
private static final Pattern PROPERTIES_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*\\]");
|
||||
private static final Pattern PATH_PATTERN = Pattern.compile("\\{((\\S+):)?([^\\}]*)\\}");
|
||||
private static final Pattern KMERSIZE_EXTRACTOR_PATTERN = Pattern.compile("^\\s*\\[[^\\]]*(ks|kmerSize)\\s*=\\s*(\\d+)\\s*[,\\]]");
|
||||
|
||||
|
||||
/**
|
||||
* Constructs a read-threadingg-graph for a string representation.
|
||||
*
|
||||
* <p>
|
||||
* Note: only used for testing.
|
||||
* Checkout {@link HaplotypeGraphUnitTest} for examples.
|
||||
* </p>
|
||||
* @param s the string representation of the graph {@code null}.
|
||||
*/
|
||||
public ReadThreadingGraph(final String s) {
|
||||
super(kmerSizeFromString(s),new MyEdgeFactory(1));
|
||||
debugGraphTransformations = false;
|
||||
minBaseQualityToUseInAssembly = 0;
|
||||
applyString(s);
|
||||
alreadyBuilt = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtain the kmer size for the string representation.
|
||||
* @param str the source string representation.
|
||||
* @return 1 or greater.
|
||||
* @throws IllegalArgumentException if {@code} str does not contain a valid representation.
|
||||
*/
|
||||
private static int kmerSizeFromString(final String str) {
|
||||
final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str);
|
||||
if (matcher.find()) {
|
||||
return Integer.parseInt(matcher.group(2));
|
||||
} else
|
||||
throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize");
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply description string into the graph.
|
||||
*
|
||||
* <p>
|
||||
* Note: this is done just for testing purposes.
|
||||
* Checkout {@link HaplotypeGraphUnitTest} for examples.
|
||||
* </p>
|
||||
* @param str the string representation.
|
||||
*/
|
||||
private void applyString(final String str) {
|
||||
final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str);
|
||||
final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0;
|
||||
|
||||
final String pathString = str.substring(pathStart);
|
||||
final Matcher pathMatcher = PATH_PATTERN.matcher(pathString);
|
||||
|
||||
boolean referenceFound = false;
|
||||
final Map<String,MultiDeBruijnVertex> vertexById = new HashMap<>();
|
||||
|
||||
// Loop between path strings and add them one by one.
|
||||
while (pathMatcher.find()) {
|
||||
final String label = pathMatcher.group(2);
|
||||
final boolean isReference = (label != null && label.equals("REF"));
|
||||
if (referenceFound) {
|
||||
if (isReference)
|
||||
throw new IllegalArgumentException("there are two reference paths");
|
||||
|
||||
} else
|
||||
referenceFound |= isReference;
|
||||
|
||||
// Divide each path into its elements getting a list of sequences and labels if applies:
|
||||
final String elementsString = pathMatcher.group(3);
|
||||
final String[] elements = elementsString.split("\\s*->\\s*");
|
||||
if (elements.length == 0)
|
||||
throw new IllegalArgumentException("empty path not allowed");
|
||||
final String[] seqs = new String[elements.length];
|
||||
final String[] ids = new String[elements.length];
|
||||
for (int i = 0; i < elements.length; i++) {
|
||||
ids[i] = pathElementId(elements[i]);
|
||||
seqs[i] = pathElementSeq(elements[i]);
|
||||
if (seqs[i].isEmpty() && ids[i] == null)
|
||||
throw new IllegalArgumentException("path with empty element without an id");
|
||||
}
|
||||
final boolean isSource = ids[0] == null || !vertexById.containsKey(ids[0]);
|
||||
if (isSource && seqs[0].length() != kmerSize)
|
||||
throw new IllegalArgumentException("source sequence length must be the same as the kmerSize "
|
||||
+ ids[0] + " " + seqs[0] + " " + pathMatcher.group());
|
||||
final MultiDeBruijnVertex firstVertex;
|
||||
if (ids[0] != null && vertexById.containsKey(ids[0]))
|
||||
firstVertex = vertexById.get(ids[0]);
|
||||
else {
|
||||
firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes());
|
||||
addVertex(firstVertex);
|
||||
if (ids[0] != null)
|
||||
vertexById.put(ids[0],firstVertex);
|
||||
}
|
||||
if (!seqs[0].isEmpty() &&
|
||||
((isSource && !firstVertex.getSequenceString().equals(seqs[0]))
|
||||
|| (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0])))
|
||||
throw new IllegalArgumentException("mismatched first element sequence");
|
||||
|
||||
MultiDeBruijnVertex lastVertex = firstVertex;
|
||||
for (int i = 1; i < elements.length; i++) {
|
||||
if (seqs[i].length() > 1)
|
||||
throw new IllegalArgumentException("non-source vertex sequence must have length 1");
|
||||
final MultiDeBruijnVertex nextVertex;
|
||||
if (ids[i] == null || !vertexById.containsKey(ids[i])) {
|
||||
final Set<MultiDeBruijnVertex> nextVertices = getNextVertices(lastVertex,seqs[i].getBytes()[0]);
|
||||
if (nextVertices.size() == 0) {
|
||||
nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(),seqs[i].getBytes()[0]));
|
||||
addVertex(nextVertex);
|
||||
} else {
|
||||
nextVertex = nextVertices.iterator().next();
|
||||
}
|
||||
if (ids[i] != null)
|
||||
vertexById.put(ids[i],nextVertex);
|
||||
} else {
|
||||
nextVertex = vertexById.get(ids[i]);
|
||||
}
|
||||
final MultiSampleEdge edge = addEdge(lastVertex,nextVertex);
|
||||
if (isReference) edge.setIsRef(true);
|
||||
lastVertex = nextVertex;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String pathElementId(final String element) {
|
||||
final int parentesysPos = element.indexOf('(');
|
||||
|
||||
if (parentesysPos == -1)
|
||||
return null;
|
||||
|
||||
final int closeParentesysPos = element.lastIndexOf(')');
|
||||
if (closeParentesysPos == -1)
|
||||
throw new IllegalArgumentException("non-closed id parantesys found in element: " + element);
|
||||
final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim();
|
||||
if (result.isEmpty())
|
||||
throw new IllegalArgumentException("empty id found in element: " + element);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the lenght of a path element in the string representation.
|
||||
* @param element the query element.
|
||||
* @return 0 or greater.
|
||||
*/
|
||||
private static String pathElementSeq(final String element) {
|
||||
final int parentesysPos = element.indexOf('(');
|
||||
|
||||
if (parentesysPos == -1)
|
||||
return element.trim();
|
||||
|
||||
return element.substring(0,parentesysPos).trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a base to the end of a byte sequence.
|
||||
* @param sequence sequence where to add the base to.
|
||||
* @param b base to add.
|
||||
* @return never {@code null}, a new array each time.
|
||||
*/
|
||||
private static byte[] extendSequence(final byte[] sequence, final byte b) {
|
||||
final byte[] result = new byte[sequence.length];
|
||||
System.arraycopy(sequence,1,result,0,sequence.length - 1);
|
||||
result[result.length - 1] = b;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -48,11 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.indels;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
|
|
@ -64,10 +65,9 @@ import org.broadinstitute.variant.variantcontext.Allele;
|
|||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Map;
|
||||
|
||||
//import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM;
|
||||
|
||||
|
||||
public class PairHMMIndelErrorModel {
|
||||
public static final int BASE_QUAL_THRESHOLD = 20;
|
||||
|
|
@ -120,8 +120,11 @@ public class PairHMMIndelErrorModel {
|
|||
case LOGLESS_CACHING:
|
||||
pairHMM = new LoglessPairHMM();
|
||||
break;
|
||||
case ARRAY_LOGLESS:
|
||||
pairHMM = new ArrayLoglessPairHMM();
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT or LOGLESS_CACHING.");
|
||||
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS.");
|
||||
}
|
||||
|
||||
// fill gap penalty table, affine naive model:
|
||||
|
|
@ -202,6 +205,39 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
private LinkedHashMap<Allele, Haplotype> trimHaplotypes(final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
long startLocationInRefForHaplotypes,
|
||||
long stopLocationInRefForHaplotypes,
|
||||
final ReferenceContext ref){
|
||||
|
||||
final LinkedHashMap<Allele, Haplotype> trimmedHaplotypeMap = new LinkedHashMap<>();
|
||||
for (final Allele a: haplotypeMap.keySet()) {
|
||||
|
||||
final Haplotype haplotype = haplotypeMap.get(a);
|
||||
|
||||
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
|
||||
stopLocationInRefForHaplotypes = haplotype.getStopPosition();
|
||||
|
||||
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
|
||||
startLocationInRefForHaplotypes = haplotype.getStartPosition();
|
||||
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
|
||||
startLocationInRefForHaplotypes = haplotype.getStopPosition();
|
||||
|
||||
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
|
||||
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
|
||||
|
||||
if (DEBUG)
|
||||
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n",
|
||||
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
|
||||
|
||||
// get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map
|
||||
final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
|
||||
final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference());
|
||||
trimmedHaplotypeMap.put(a, trimmedHaplotype);
|
||||
}
|
||||
return trimmedHaplotypeMap;
|
||||
}
|
||||
|
||||
|
||||
public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
|
||||
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
|
|
@ -218,6 +254,28 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we clip a downstream portion of a read because it spans off the end of a haplotype?
|
||||
*
|
||||
* @param read the read in question
|
||||
* @param refWindowStop the end of the reference window
|
||||
* @return true if the read needs to be clipped, false otherwise
|
||||
*/
|
||||
protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) {
|
||||
return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() > refWindowStop );
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we clip a upstream portion of a read because it spans off the end of a haplotype?
|
||||
*
|
||||
* @param read the read in question
|
||||
* @param refWindowStart the start of the reference window
|
||||
* @return true if the read needs to be clipped, false otherwise
|
||||
*/
|
||||
protected static boolean mustClipUpstream(final GATKSAMRecord read, final int refWindowStart) {
|
||||
return ( !read.isEmpty() && read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart );
|
||||
}
|
||||
|
||||
@Ensures("result != null && result.length == pileup.getNumberOfElements()")
|
||||
public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup,
|
||||
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
|
|
@ -227,6 +285,8 @@ public class PairHMMIndelErrorModel {
|
|||
final int[] readCounts) {
|
||||
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
|
||||
|
||||
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
|
||||
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
|
||||
int readIdx=0;
|
||||
for (PileupElement p: pileup) {
|
||||
// > 1 when the read is a consensus read representing multiple independent observations
|
||||
|
|
@ -245,9 +305,8 @@ public class PairHMMIndelErrorModel {
|
|||
// in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly
|
||||
// in case bases at edge of reads have lower quality.
|
||||
final int trailingBases = 3;
|
||||
final int extraOffset = Math.abs(eventLength);
|
||||
final int refWindowStart = ref.getWindow().getStart()+(trailingBases+extraOffset);
|
||||
final int refWindowStop = ref.getWindow().getStop()-(trailingBases+extraOffset);
|
||||
final int refWindowStart = ref.getWindow().getStart() + trailingBases;
|
||||
final int refWindowStop = ref.getWindow().getStop() - trailingBases;
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString());
|
||||
|
|
@ -255,11 +314,13 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
|
||||
|
||||
if (!read.isEmpty() && (read.getSoftEnd() > refWindowStop && read.getSoftStart() < refWindowStop))
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, refWindowStop);
|
||||
// if the read extends beyond the downstream (right) end of the reference window, clip it
|
||||
if ( mustClipDownstream(read, refWindowStop) )
|
||||
read = ReadClipper.hardClipByReadCoordinates(read, read.getSoftStart() + read.getReadLength() - refWindowStop + 1, read.getReadLength() - 1);
|
||||
|
||||
if (!read.isEmpty() && (read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart))
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, refWindowStart);
|
||||
// if the read extends beyond the upstream (left) end of the reference window, clip it
|
||||
if ( mustClipUpstream(read, refWindowStart) )
|
||||
read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart);
|
||||
|
||||
if (read.isEmpty())
|
||||
continue;
|
||||
|
|
@ -297,8 +358,9 @@ public class PairHMMIndelErrorModel {
|
|||
* trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to
|
||||
* differentiate context between two haplotypes
|
||||
*/
|
||||
long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0);
|
||||
long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset;
|
||||
final int absEventLength = Math.abs(eventLength);
|
||||
long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0);
|
||||
long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength;
|
||||
|
||||
if (DEBUG)
|
||||
System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes);
|
||||
|
|
@ -365,52 +427,30 @@ public class PairHMMIndelErrorModel {
|
|||
baseDeletionQualities = contextLogGapOpenProbabilities;
|
||||
}
|
||||
|
||||
boolean firstHap = true;
|
||||
for (Allele a: haplotypeMap.keySet()) {
|
||||
// Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
|
||||
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
|
||||
readList.add(processedRead);
|
||||
|
||||
Haplotype haplotype = haplotypeMap.get(a);
|
||||
// Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
|
||||
readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities);
|
||||
|
||||
if (stopLocationInRefForHaplotypes > haplotype.getStopPosition())
|
||||
stopLocationInRefForHaplotypes = haplotype.getStopPosition();
|
||||
// Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations
|
||||
final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref);
|
||||
|
||||
if (startLocationInRefForHaplotypes < haplotype.getStartPosition())
|
||||
startLocationInRefForHaplotypes = haplotype.getStartPosition();
|
||||
else if (startLocationInRefForHaplotypes > haplotype.getStopPosition())
|
||||
startLocationInRefForHaplotypes = haplotype.getStopPosition();
|
||||
|
||||
final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition();
|
||||
final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition();
|
||||
|
||||
double readLikelihood;
|
||||
if (DEBUG)
|
||||
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n",
|
||||
indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString());
|
||||
|
||||
final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop);
|
||||
|
||||
// it's possible that the indel starts at the last base of the haplotypes
|
||||
if ( haplotypeBases.length == 0 ) {
|
||||
readLikelihood = -Double.MAX_VALUE;
|
||||
} else {
|
||||
if (firstHap) {
|
||||
//no need to reallocate arrays for each new haplotype, as length won't change
|
||||
pairHMM.initialize(readBases.length, haplotypeBases.length);
|
||||
firstHap = false;
|
||||
}
|
||||
|
||||
readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals,
|
||||
baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap);
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("H:"+new String(haplotypeBases));
|
||||
System.out.println("R:"+new String(readBases));
|
||||
System.out.format("L:%4.2f\n",readLikelihood);
|
||||
}
|
||||
// Get the likelihoods for our clipped read against each of our trimmed haplotypes.
|
||||
final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap);
|
||||
|
||||
// Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array
|
||||
for (Allele a: trimmedHaplotypeMap.keySet()){
|
||||
double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a);
|
||||
perReadAlleleLikelihoodMap.add(p, a, readLikelihood);
|
||||
readLikelihoods[readIdx][j++] = readLikelihood;
|
||||
}
|
||||
// The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually
|
||||
readList.remove(processedRead);
|
||||
|
||||
// The same is true for the read/GCP-array map
|
||||
readGCPArrayMap.remove(processedRead);
|
||||
}
|
||||
}
|
||||
readIdx++;
|
||||
|
|
@ -434,16 +474,16 @@ public class PairHMMIndelErrorModel {
|
|||
return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength));
|
||||
}
|
||||
|
||||
private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
|
||||
if (b1.length != b2.length)
|
||||
return 0; // sanity check
|
||||
|
||||
for (int i=0; i < b1.length; i++ ){
|
||||
if ( b1[i]!= b2[i] )
|
||||
return i;
|
||||
}
|
||||
return b1.length;
|
||||
}
|
||||
// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) {
|
||||
// if (b1.length != b2.length)
|
||||
// return 0; // sanity check
|
||||
//
|
||||
// for (int i=0; i < b1.length; i++ ){
|
||||
// if ( b1[i]!= b2[i] )
|
||||
// return i;
|
||||
// }
|
||||
// return b1.length;
|
||||
// }
|
||||
|
||||
private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
|
||||
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
|
|
|
|||
|
|
@ -46,10 +46,8 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.apache.commons.math.util.MathUtils;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -112,6 +110,9 @@ import java.util.*;
|
|||
@PartitionBy(PartitionType.LOCUS)
|
||||
public class ApplyRecalibration extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
|
||||
|
||||
public static final String LOW_VQSLOD_FILTER_NAME = "LOW_VQSLOD";
|
||||
private final double DEFAULT_VQSLOD_CUTOFF = 0.0;
|
||||
|
||||
/////////////////////////////
|
||||
// Inputs
|
||||
/////////////////////////////
|
||||
|
|
@ -122,7 +123,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
public List<RodBinding<VariantContext>> input;
|
||||
@Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
|
||||
protected RodBinding<VariantContext> recal;
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
|
||||
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=false)
|
||||
protected File TRANCHES_FILE;
|
||||
|
||||
/////////////////////////////
|
||||
|
|
@ -134,9 +135,17 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Advanced
|
||||
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
|
||||
protected double TS_FILTER_LEVEL = 99.0;
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
protected Double TS_FILTER_LEVEL = null;
|
||||
@Advanced
|
||||
@Argument(fullName="lodCutoff", shortName="lodCutoff", doc="The VQSLOD score below which to start filtering", required=false)
|
||||
protected Double VQSLOD_CUTOFF = null;
|
||||
|
||||
/**
|
||||
* For this to work properly, the -ignoreFilter argument should also be applied to the VariantRecalibration command.
|
||||
*/
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the recalibration will be applied to variants marked as filtered by the specified filter name in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't output filtered loci after applying the recalibration", required=false)
|
||||
protected boolean EXCLUDE_FILTERED = false;
|
||||
|
|
@ -157,13 +166,15 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public void initialize() {
|
||||
for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
|
||||
if ( t.ts >= TS_FILTER_LEVEL ) {
|
||||
tranches.add(t);
|
||||
if( TS_FILTER_LEVEL != null ) {
|
||||
for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) {
|
||||
if ( t.ts >= TS_FILTER_LEVEL ) {
|
||||
tranches.add(t);
|
||||
}
|
||||
logger.info(String.format("Read tranche " + t));
|
||||
}
|
||||
logger.info(String.format("Read tranche " + t));
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity)
|
||||
}
|
||||
Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity)
|
||||
|
||||
for( final RodBinding rod : input ) {
|
||||
inputNames.add( rod.getName() );
|
||||
|
|
@ -180,19 +191,32 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
final TreeSet<String> samples = new TreeSet<>();
|
||||
samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames));
|
||||
|
||||
if( tranches.size() >= 2 ) {
|
||||
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
|
||||
final Tranche t = tranches.get(iii);
|
||||
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level for " + t.model.toString() + " model at VQS Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
|
||||
if( TS_FILTER_LEVEL != null ) {
|
||||
// if the user specifies both ts_filter_level and lodCutoff then throw a user error
|
||||
if( VQSLOD_CUTOFF != null ) {
|
||||
throw new UserException("Arguments --ts_filter_level and --lodCutoff are mutually exclusive. Please only specify one option.");
|
||||
}
|
||||
}
|
||||
if( tranches.size() >= 1 ) {
|
||||
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level for " + tranches.get(0).model.toString() + " model at VQS Lod < " + tranches.get(0).minVQSLod)));
|
||||
} else {
|
||||
throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL);
|
||||
}
|
||||
|
||||
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
|
||||
if( tranches.size() >= 2 ) {
|
||||
for( int iii = 0; iii < tranches.size() - 1; iii++ ) {
|
||||
final Tranche t = tranches.get(iii);
|
||||
hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level for " + t.model.toString() + " model at VQS Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod)));
|
||||
}
|
||||
}
|
||||
if( tranches.size() >= 1 ) {
|
||||
hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level for " + tranches.get(0).model.toString() + " model at VQS Lod < " + tranches.get(0).minVQSLod)));
|
||||
} else {
|
||||
throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL);
|
||||
}
|
||||
|
||||
logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1));
|
||||
} else {
|
||||
if( VQSLOD_CUTOFF == null ) {
|
||||
VQSLOD_CUTOFF = DEFAULT_VQSLOD_CUTOFF;
|
||||
}
|
||||
hInfo.add(new VCFFilterHeaderLine(LOW_VQSLOD_FILTER_NAME, "VQSLOD < " + VQSLOD_CUTOFF));
|
||||
logger.info("Keeping all variants with VQSLOD >= " + VQSLOD_CUTOFF);
|
||||
}
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
|
||||
vcfWriter.writeHeader(vcfHeader);
|
||||
|
|
@ -242,7 +266,6 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
}
|
||||
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
String filterString = null;
|
||||
|
||||
// Annotate the new record with its VQSLOD and the worst performing annotation
|
||||
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
|
||||
|
|
@ -252,21 +275,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY))
|
||||
builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true);
|
||||
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
final Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
if( i == tranches.size() - 1 ) {
|
||||
filterString = VCFConstants.PASSES_FILTERS_v4;
|
||||
} else {
|
||||
filterString = tranche.name;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( filterString == null ) {
|
||||
filterString = tranches.get(0).name+"+";
|
||||
}
|
||||
final String filterString = generateFilterString(lod);
|
||||
|
||||
if( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) {
|
||||
builder.passFilters();
|
||||
|
|
@ -286,6 +295,36 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> implements T
|
|||
return 1; // This value isn't used for anything
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the VCF filter string for this record based on the provided lod score
|
||||
* @param lod non-null double
|
||||
* @return the String to use as the VCF filter field
|
||||
*/
|
||||
protected String generateFilterString( final double lod ) {
|
||||
String filterString = null;
|
||||
if( TS_FILTER_LEVEL != null ) {
|
||||
for( int i = tranches.size() - 1; i >= 0; i-- ) {
|
||||
final Tranche tranche = tranches.get(i);
|
||||
if( lod >= tranche.minVQSLod ) {
|
||||
if( i == tranches.size() - 1 ) {
|
||||
filterString = VCFConstants.PASSES_FILTERS_v4;
|
||||
} else {
|
||||
filterString = tranche.name;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if( filterString == null ) {
|
||||
filterString = tranches.get(0).name+"+";
|
||||
}
|
||||
} else {
|
||||
filterString = ( lod < VQSLOD_CUTOFF ? LOW_VQSLOD_FILTER_NAME : VCFConstants.PASSES_FILTERS_v4 );
|
||||
}
|
||||
|
||||
return filterString;
|
||||
}
|
||||
|
||||
private static VariantContext getMatchingRecalVC(final VariantContext target, final List<VariantContext> recalVCs) {
|
||||
for( final VariantContext recalVC : recalVCs ) {
|
||||
if ( target.getEnd() == recalVC.getEnd() ) {
|
||||
|
|
|
|||
|
|
@ -267,7 +267,7 @@ public class GaussianMixtureModel {
|
|||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
int numRandomDraws = 0;
|
||||
double sumPVarInGaussian = 0.0;
|
||||
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
|
||||
final int numIterPerMissingAnnotation = 20; // Trade off here between speed of computation and accuracy of the marginalization
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
// for each dimension
|
||||
for( int iii = 0; iii < datum.annotations.length; iii++ ) {
|
||||
|
|
|
|||
|
|
@ -160,11 +160,11 @@ public class TrancheManager {
|
|||
}
|
||||
}
|
||||
|
||||
public static List<Tranche> findTranches( final ArrayList<VariantDatum> data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) {
|
||||
public static List<Tranche> findTranches( final List<VariantDatum> data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) {
|
||||
return findTranches( data, tranches, metric, model, null );
|
||||
}
|
||||
|
||||
public static List<Tranche> findTranches( final ArrayList<VariantDatum> data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) {
|
||||
public static List<Tranche> findTranches( final List<VariantDatum> data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) {
|
||||
logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size()));
|
||||
|
||||
Collections.sort( data, new VariantDatum.VariantDatumLODComparator() );
|
||||
|
|
@ -172,7 +172,7 @@ public class TrancheManager {
|
|||
|
||||
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); }
|
||||
|
||||
List<Tranche> tranches = new ArrayList<Tranche>();
|
||||
List<Tranche> tranches = new ArrayList<>();
|
||||
for ( double trancheThreshold : trancheThresholds ) {
|
||||
Tranche t = findTranche(data, metric, trancheThreshold, model);
|
||||
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ import java.util.*;
|
|||
*/
|
||||
|
||||
public class VariantDataManager {
|
||||
private ExpandingArrayList<VariantDatum> data;
|
||||
private List<VariantDatum> data;
|
||||
private double[] meanVector;
|
||||
private double[] varianceVector; // this is really the standard deviation
|
||||
public List<String> annotationKeys;
|
||||
|
|
@ -88,30 +88,30 @@ public class VariantDataManager {
|
|||
trainingSets = new ArrayList<>();
|
||||
}
|
||||
|
||||
public void setData( final ExpandingArrayList<VariantDatum> data ) {
|
||||
public void setData( final List<VariantDatum> data ) {
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> getData() {
|
||||
public List<VariantDatum> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public void normalizeData() {
|
||||
boolean foundZeroVarianceAnnotation = false;
|
||||
for( int iii = 0; iii < meanVector.length; iii++ ) {
|
||||
final double theMean = mean(iii);
|
||||
final double theSTD = standardDeviation(theMean, iii);
|
||||
final double theMean = mean(iii, true);
|
||||
final double theSTD = standardDeviation(theMean, iii, true);
|
||||
logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) );
|
||||
if( Double.isNaN(theMean) ) {
|
||||
throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpConstants.forumPost("discussion/49/using-variant-annotator"));
|
||||
}
|
||||
|
||||
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6);
|
||||
foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-5);
|
||||
meanVector[iii] = theMean;
|
||||
varianceVector[iii] = theSTD;
|
||||
for( final VariantDatum datum : data ) {
|
||||
// Transform each data point via: (x - mean) / standard deviation
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
datum.annotations[iii] = ( datum.isNull[iii] ? 0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD );
|
||||
}
|
||||
}
|
||||
if( foundZeroVarianceAnnotation ) {
|
||||
|
|
@ -129,7 +129,7 @@ public class VariantDataManager {
|
|||
|
||||
// re-order the data by increasing standard deviation so that the results don't depend on the order things were specified on the command line
|
||||
// standard deviation over the training points is used as a simple proxy for information content, perhaps there is a better thing to use here
|
||||
final List<Integer> theOrder = calculateSortOrder(varianceVector);
|
||||
final List<Integer> theOrder = calculateSortOrder(meanVector);
|
||||
annotationKeys = reorderList(annotationKeys, theOrder);
|
||||
varianceVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(varianceVector), theOrder));
|
||||
meanVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(meanVector), theOrder));
|
||||
|
|
@ -137,40 +137,41 @@ public class VariantDataManager {
|
|||
datum.annotations = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.annotations), theOrder));
|
||||
datum.isNull = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.isNull), theOrder));
|
||||
}
|
||||
logger.info("Annotations are now ordered by their information content: " + annotationKeys.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a list of indices which give the ascending sort order of the data array
|
||||
* @param data the data to consider
|
||||
* @param inputVector the data to consider
|
||||
* @return a non-null list of integers with length matching the length of the input array
|
||||
*/
|
||||
protected List<Integer> calculateSortOrder(final double[] data) {
|
||||
final List<Integer> theOrder = new ArrayList<>(data.length);
|
||||
final List<MyStandardDeviation> sortedData = new ArrayList<>(data.length);
|
||||
protected List<Integer> calculateSortOrder(final double[] inputVector) {
|
||||
final List<Integer> theOrder = new ArrayList<>(inputVector.length);
|
||||
final List<MyDoubleForSorting> toBeSorted = new ArrayList<>(inputVector.length);
|
||||
int count = 0;
|
||||
for( final double d : data ) {
|
||||
sortedData.add(new MyStandardDeviation(d, count++));
|
||||
for( int iii = 0; iii < inputVector.length; iii++ ) {
|
||||
toBeSorted.add(new MyDoubleForSorting(-1.0 * Math.abs(inputVector[iii] - mean(iii, false)), count++));
|
||||
}
|
||||
Collections.sort(sortedData); // sort the data in ascending order
|
||||
for( final MyStandardDeviation d : sortedData ) {
|
||||
Collections.sort(toBeSorted);
|
||||
for( final MyDoubleForSorting d : toBeSorted ) {
|
||||
theOrder.add(d.originalIndex); // read off the sort order by looking at the index field
|
||||
}
|
||||
return theOrder;
|
||||
}
|
||||
|
||||
// small private class to assist in reading off the new ordering of the standard deviation array
|
||||
private class MyStandardDeviation implements Comparable<MyStandardDeviation> {
|
||||
final Double standardDeviation;
|
||||
// small private class to assist in reading off the new ordering of the annotation array
|
||||
private class MyDoubleForSorting implements Comparable<MyDoubleForSorting> {
|
||||
final Double myData;
|
||||
final int originalIndex;
|
||||
|
||||
public MyStandardDeviation( final double standardDeviation, final int originalIndex ) {
|
||||
this.standardDeviation = standardDeviation;
|
||||
public MyDoubleForSorting(final double myData, final int originalIndex) {
|
||||
this.myData = myData;
|
||||
this.originalIndex = originalIndex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final MyStandardDeviation other) {
|
||||
return standardDeviation.compareTo(other.standardDeviation);
|
||||
public int compareTo(final MyDoubleForSorting other) {
|
||||
return myData.compareTo(other.myData);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -233,92 +234,77 @@ public class VariantDataManager {
|
|||
return false;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> getTrainingData() {
|
||||
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<>();
|
||||
public List<VariantDatum> getTrainingData() {
|
||||
final List<VariantDatum> trainingData = new ExpandingArrayList<>();
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.atTrainingSite && !datum.failingSTDThreshold && datum.originalQual > VRAC.QUAL_THRESHOLD ) {
|
||||
if( datum.atTrainingSite && !datum.failingSTDThreshold ) {
|
||||
trainingData.add( datum );
|
||||
}
|
||||
}
|
||||
logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." );
|
||||
if( trainingData.size() < VRAC.NUM_BAD_VARIANTS) {
|
||||
if( trainingData.size() < VRAC.MIN_NUM_BAD_VARIANTS ) {
|
||||
logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." );
|
||||
} else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) {
|
||||
logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." );
|
||||
Collections.shuffle(trainingData);
|
||||
return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA);
|
||||
}
|
||||
return trainingData;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> selectWorstVariants( final int minimumNumber ) {
|
||||
// The return value is the list of training variants
|
||||
final ExpandingArrayList<VariantDatum> trainingData = new ExpandingArrayList<>();
|
||||
public List<VariantDatum> selectWorstVariants() {
|
||||
final List<VariantDatum> trainingData = new ExpandingArrayList<>();
|
||||
|
||||
// First add to the training list all sites overlapping any bad sites training tracks
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) {
|
||||
trainingData.add( datum );
|
||||
}
|
||||
}
|
||||
final int numBadSitesAdded = trainingData.size();
|
||||
logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." );
|
||||
|
||||
// Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants
|
||||
Collections.sort( data, new VariantDatum.VariantDatumLODComparator() );
|
||||
final int numToAdd = minimumNumber - trainingData.size();
|
||||
if( numToAdd > data.size() ) {
|
||||
throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --numBadVariants arugment but this is unsafe." );
|
||||
}
|
||||
int index = 0, numAdded = 0;
|
||||
while( numAdded < numToAdd && index < data.size() ) {
|
||||
final VariantDatum datum = data.get(index++);
|
||||
if( datum != null && !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) {
|
||||
if( datum != null && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) && datum.lod < VRAC.BAD_LOD_CUTOFF ) {
|
||||
datum.atAntiTrainingSite = true;
|
||||
trainingData.add( datum );
|
||||
numAdded++;
|
||||
}
|
||||
}
|
||||
logger.info( "Additionally training with worst " + numToAdd + " scoring variants --> " + (trainingData.size() - numBadSitesAdded) + " variants with LOD <= " + String.format("%.4f", data.get(index).lod) + "." );
|
||||
|
||||
logger.info( "Training with worst " + trainingData.size() + " scoring variants --> variants with LOD <= " + String.format("%.4f", VRAC.BAD_LOD_CUTOFF) + "." );
|
||||
|
||||
return trainingData;
|
||||
}
|
||||
|
||||
public ExpandingArrayList<VariantDatum> getRandomDataForPlotting( int numToAdd ) {
|
||||
numToAdd = Math.min(numToAdd, data.size());
|
||||
final ExpandingArrayList<VariantDatum> returnData = new ExpandingArrayList<>();
|
||||
// add numToAdd non-anti training sites to plot
|
||||
for( int iii = 0; iii < numToAdd; iii++) {
|
||||
final VariantDatum datum = data.get(GenomeAnalysisEngine.getRandomGenerator().nextInt(data.size()));
|
||||
if( ! datum.atAntiTrainingSite && !datum.failingSTDThreshold ) {
|
||||
returnData.add(datum);
|
||||
}
|
||||
}
|
||||
public List<VariantDatum> getEvaluationData() {
|
||||
final List<VariantDatum> evaluationData = new ExpandingArrayList<>();
|
||||
|
||||
final int MAX_ANTI_TRAINING_SITES = 10000;
|
||||
int nAntiTrainingAdded = 0;
|
||||
// Add all anti-training sites to visual
|
||||
for( final VariantDatum datum : data ) {
|
||||
if ( nAntiTrainingAdded > MAX_ANTI_TRAINING_SITES )
|
||||
break;
|
||||
else if ( datum.atAntiTrainingSite ) {
|
||||
returnData.add(datum);
|
||||
nAntiTrainingAdded++;
|
||||
if( datum != null && !datum.failingSTDThreshold && !datum.atTrainingSite && !datum.atAntiTrainingSite ) {
|
||||
evaluationData.add( datum );
|
||||
}
|
||||
}
|
||||
|
||||
return evaluationData;
|
||||
}
|
||||
|
||||
public List<VariantDatum> getRandomDataForPlotting( final int numToAdd, final List<VariantDatum> trainingData, final List<VariantDatum> antiTrainingData, final List<VariantDatum> evaluationData ) {
|
||||
final List<VariantDatum> returnData = new ExpandingArrayList<>();
|
||||
Collections.shuffle(trainingData);
|
||||
Collections.shuffle(antiTrainingData);
|
||||
Collections.shuffle(evaluationData);
|
||||
returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size())));
|
||||
returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size())));
|
||||
returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size())));
|
||||
Collections.shuffle(returnData);
|
||||
return returnData;
|
||||
}
|
||||
|
||||
private double mean( final int index ) {
|
||||
protected double mean( final int index, final boolean trainingData ) {
|
||||
double sum = 0.0;
|
||||
int numNonNull = 0;
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.atTrainingSite && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; }
|
||||
if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; }
|
||||
}
|
||||
return sum / ((double) numNonNull);
|
||||
}
|
||||
|
||||
private double standardDeviation( final double mean, final int index ) {
|
||||
protected double standardDeviation( final double mean, final int index, final boolean trainingData ) {
|
||||
double sum = 0.0;
|
||||
int numNonNull = 0;
|
||||
for( final VariantDatum datum : data ) {
|
||||
if( datum.atTrainingSite && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; }
|
||||
if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; }
|
||||
}
|
||||
return Math.sqrt( sum / ((double) numNonNull) );
|
||||
}
|
||||
|
|
@ -343,12 +329,9 @@ public class VariantDataManager {
|
|||
try {
|
||||
value = vc.getAttributeAsDouble( annotationKey, Double.NaN );
|
||||
if( Double.isInfinite(value) ) { value = Double.NaN; }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM
|
||||
value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
||||
}
|
||||
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); }
|
||||
if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); }
|
||||
} catch( Exception e ) {
|
||||
value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -79,14 +80,14 @@ import java.util.*;
|
|||
* Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants.
|
||||
*
|
||||
* <p>
|
||||
* This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker.
|
||||
* This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker.
|
||||
*</p>
|
||||
*
|
||||
* <p>
|
||||
* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
|
||||
* One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
|
||||
* You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
|
||||
* The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
|
||||
* between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the the probability that a SNP is a true genetic
|
||||
* between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic
|
||||
* variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
|
||||
* as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
|
||||
* error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
|
||||
|
|
@ -94,12 +95,7 @@ import java.util.*;
|
|||
* the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
|
||||
* See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <h3>Inputs</h3>
|
||||
* <p>
|
||||
* The input raw variants to be recalibrated.
|
||||
* <p>
|
||||
|
|
@ -127,6 +123,17 @@ import java.util.*;
|
|||
* -rscriptFile path/to/output.plots.R
|
||||
* </pre>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
*
|
||||
* <ul>
|
||||
* <li>The values used in the example above are only meant to show how the command lines are composed.
|
||||
* They are not meant to be taken as specific recommendations of values to use in your own work, and they may be
|
||||
* different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on
|
||||
* how to set parameter values for you own analyses, please read the Best Practices section of the documentation.</li>
|
||||
*
|
||||
* <li>In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
|
||||
* See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.</li>
|
||||
* </ul>
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} )
|
||||
|
|
@ -136,7 +143,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model
|
||||
public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out
|
||||
public static final String NEGATIVE_LABEL_KEY = "NEGATIVE_TRAIN_SITE"; // this variant was used in the negative training set
|
||||
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive traning set
|
||||
public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; // this variant was used in the positive training set
|
||||
private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R";
|
||||
|
||||
@ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection();
|
||||
|
|
@ -155,7 +162,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
* Training - Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
|
||||
* Truth - When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
|
||||
* Known - The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
|
||||
* Bad - In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list with a database of known bad variants.
|
||||
* Bad - In addition to using the set of worst ranked variants as compared to the Gaussian mixture model (see -numBad argument), we can also supplement the list with a database of known bad variants.
|
||||
*/
|
||||
@Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm (training and truth sets are required to run)", required=true)
|
||||
public List<RodBinding<VariantContext>> resource = Collections.emptyList();
|
||||
|
|
@ -175,7 +182,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
/**
|
||||
* The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific
|
||||
* genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
|
||||
* practices documentation (http://www.broadinstitute.org/gatk/guide/topic?name=best-practices) for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
|
||||
* practices documentation (http://www.broadinstitute.org/gatk/guide/best-practices) for more information.
|
||||
* Normal values are 2.15 for human whole genome values and 3.2 for human whole exomes. Note
|
||||
* that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
|
||||
*/
|
||||
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
|
||||
|
|
@ -194,12 +202,18 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
*/
|
||||
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
|
||||
private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
|
||||
/**
|
||||
* For this to work properly, the -ignoreFilter argument should also be applied to the ApplyRecalibration command.
|
||||
*/
|
||||
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the variant recalibrator will also use variants marked as filtered by the specified filter name in the input VCF file", required=false)
|
||||
private String[] IGNORE_INPUT_FILTERS = null;
|
||||
@Output(fullName="rscript_file", shortName="rscriptFile", doc="The output rscript file generated by the VQSR to aid in visualization of the input data and learned model", required=false, defaultToStdout=false)
|
||||
private File RSCRIPT_FILE = null;
|
||||
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
|
||||
protected double TS_FILTER_LEVEL = 99.0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="replicate", shortName="replicate", doc="Used to debug the random number generation inside the VQSR. Do not use.", required=false)
|
||||
protected int REPLICATE = 200;
|
||||
private ArrayList<Double> replicate = new ArrayList<>();
|
||||
|
||||
/////////////////////////////
|
||||
// Debug Arguments
|
||||
|
|
@ -213,7 +227,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
/////////////////////////////
|
||||
private VariantDataManager dataManager;
|
||||
private PrintStream tranchesStream;
|
||||
private final Set<String> ignoreInputFilterSet = new TreeSet<String>();
|
||||
private final Set<String> ignoreInputFilterSet = new TreeSet<>();
|
||||
private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC );
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -222,8 +236,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
dataManager = new VariantDataManager( new ArrayList<String>(Arrays.asList(USE_ANNOTATIONS)), VRAC );
|
||||
dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC );
|
||||
|
||||
if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS)
|
||||
Utils.warnUser(logger, String.format(
|
||||
|
|
@ -252,9 +267,13 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
}
|
||||
|
||||
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
|
||||
final Set<VCFHeaderLine> hInfo = new HashSet<>();
|
||||
ApplyRecalibration.addVQSRStandardHeaderLines(hInfo);
|
||||
recalWriter.writeHeader( new VCFHeader(hInfo) );
|
||||
|
||||
for( int iii = 0; iii < REPLICATE * 2; iii++ ) {
|
||||
replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble());
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -263,8 +282,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public ExpandingArrayList<VariantDatum> map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<VariantDatum>();
|
||||
final ExpandingArrayList<VariantDatum> mapList = new ExpandingArrayList<>();
|
||||
|
||||
if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers
|
||||
return mapList;
|
||||
|
|
@ -284,7 +304,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
|
||||
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC );
|
||||
double priorFactor = QualityUtils.qualToProb( datum.prior );
|
||||
final double priorFactor = QualityUtils.qualToProb( datum.prior );
|
||||
datum.prior = Math.log10( priorFactor ) - Math.log10( 1.0 - priorFactor );
|
||||
|
||||
mapList.add( datum );
|
||||
|
|
@ -301,15 +321,18 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public ExpandingArrayList<VariantDatum> reduceInit() {
|
||||
return new ExpandingArrayList<VariantDatum>();
|
||||
return new ExpandingArrayList<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExpandingArrayList<VariantDatum> reduce( final ExpandingArrayList<VariantDatum> mapValue, final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
reduceSum.addAll( mapValue );
|
||||
return reduceSum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExpandingArrayList<VariantDatum> treeReduce( final ExpandingArrayList<VariantDatum> lhs, final ExpandingArrayList<VariantDatum> rhs ) {
|
||||
rhs.addAll( lhs );
|
||||
return rhs;
|
||||
|
|
@ -321,21 +344,23 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public void onTraversalDone( final ExpandingArrayList<VariantDatum> reduceSum ) {
|
||||
dataManager.setData( reduceSum );
|
||||
dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation
|
||||
|
||||
// Generate the positive model using the training data and evaluate each variant
|
||||
final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData(), VRAC.MAX_GAUSSIANS );
|
||||
final List<VariantDatum> positiveTrainingData = dataManager.getTrainingData();
|
||||
final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS );
|
||||
engine.evaluateData( dataManager.getData(), goodModel, false );
|
||||
|
||||
// Generate the negative model using the worst performing data and evaluate each variant contrastively
|
||||
final ExpandingArrayList<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants( VRAC.NUM_BAD_VARIANTS );
|
||||
final List<VariantDatum> negativeTrainingData = dataManager.selectWorstVariants();
|
||||
final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS));
|
||||
engine.evaluateData( dataManager.getData(), badModel, true );
|
||||
|
||||
if( badModel.failedToConverge || goodModel.failedToConverge ) {
|
||||
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --numBad 3000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") );
|
||||
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") );
|
||||
}
|
||||
|
||||
engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel );
|
||||
|
|
@ -346,31 +371,28 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
final List<Tranche> tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE );
|
||||
tranchesStream.print(Tranche.tranchesString( tranches ));
|
||||
|
||||
// Find the filtering lodCutoff for display on the model PDFs. Red variants are those which were below the cutoff and filtered out of the final callset.
|
||||
double lodCutoff = 0.0;
|
||||
for( final Tranche tranche : tranches ) {
|
||||
if( MathUtils.compareDoubles(tranche.ts, TS_FILTER_LEVEL, 0.0001) == 0 ) {
|
||||
lodCutoff = tranche.minVQSLod;
|
||||
}
|
||||
}
|
||||
|
||||
logger.info( "Writing out recalibration table..." );
|
||||
dataManager.writeOutRecalibrationTable( recalWriter );
|
||||
if( RSCRIPT_FILE != null ) {
|
||||
logger.info( "Writing out visualization Rscript file...");
|
||||
createVisualizationScript( dataManager.getRandomDataForPlotting( 6000 ), goodModel, badModel, lodCutoff, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) );
|
||||
createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) );
|
||||
}
|
||||
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class));
|
||||
executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV);
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
logger.info("Executing: " + executor.getApproximateCommandLine());
|
||||
executor.exec();
|
||||
if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) {
|
||||
// Print out an info message to make it clear why the tranches plot is not generated
|
||||
logger.info("Tranches plot will not be generated since we are running in INDEL mode");
|
||||
} else {
|
||||
// Execute the RScript command to plot the table of truth values
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class));
|
||||
executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV);
|
||||
// Print out the command line to make it clear to the user what is being executed and how one might modify it
|
||||
logger.info("Executing: " + executor.getApproximateCommandLine());
|
||||
executor.exec();
|
||||
}
|
||||
}
|
||||
|
||||
private void createVisualizationScript( final ExpandingArrayList<VariantDatum> randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) {
|
||||
private void createVisualizationScript( final List<VariantDatum> randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) {
|
||||
PrintStream stream;
|
||||
try {
|
||||
stream = new PrintStream(RSCRIPT_FILE);
|
||||
|
|
@ -394,7 +416,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
for( int jjj = iii + 1; jjj < annotationKeys.length; jjj++) {
|
||||
logger.info( "Building " + annotationKeys[iii] + " x " + annotationKeys[jjj] + " plot...");
|
||||
|
||||
final ExpandingArrayList<VariantDatum> fakeData = new ExpandingArrayList<VariantDatum>();
|
||||
final List<VariantDatum> fakeData = new ExpandingArrayList<>();
|
||||
double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0;
|
||||
for( final VariantDatum datum : randomData ) {
|
||||
minAnn1 = Math.min(minAnn1, datum.annotations[iii]);
|
||||
|
|
@ -403,8 +425,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
maxAnn2 = Math.max(maxAnn2, datum.annotations[jjj]);
|
||||
}
|
||||
// Create a fake set of data which spans the full extent of these two annotation dimensions in order to calculate the model PDF projected to 2D
|
||||
for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+=0.1) {
|
||||
for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+=0.1) {
|
||||
final double NUM_STEPS = 60.0;
|
||||
for(double ann1 = minAnn1; ann1 <= maxAnn1; ann1+= (maxAnn1 - minAnn1) / NUM_STEPS) {
|
||||
for(double ann2 = minAnn2; ann2 <= maxAnn2; ann2+= (maxAnn2 - minAnn2) / NUM_STEPS) {
|
||||
final VariantDatum datum = new VariantDatum();
|
||||
datum.prior = 0.0;
|
||||
datum.annotations = new double[randomData.get(0).annotations.length];
|
||||
|
|
@ -426,7 +449,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
stream.print("surface <- c(");
|
||||
for( final VariantDatum datum : fakeData ) {
|
||||
stream.print(String.format("%.3f, %.3f, %.3f, ",
|
||||
stream.print(String.format("%.4f, %.4f, %.4f, ",
|
||||
dataManager.denormalizeDatum(datum.annotations[iii], iii),
|
||||
dataManager.denormalizeDatum(datum.annotations[jjj], jjj),
|
||||
Math.min(4.0, Math.max(-4.0, datum.lod))));
|
||||
|
|
@ -436,7 +459,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
stream.print("data <- c(");
|
||||
for( final VariantDatum datum : randomData ) {
|
||||
stream.print(String.format("%.3f, %.3f, %.3f, %d, %d,",
|
||||
stream.print(String.format("%.4f, %.4f, %.4f, %d, %d,",
|
||||
dataManager.denormalizeDatum(datum.annotations[iii], iii),
|
||||
dataManager.denormalizeDatum(datum.annotations[jjj], jjj),
|
||||
(datum.lod < lodCutoff ? -1.0 : 1.0),
|
||||
|
|
|
|||
|
|
@ -73,27 +73,48 @@ public class VariantRecalibratorArgumentCollection {
|
|||
|
||||
@Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels (emitting SNPs untouched in the output VCF); and 3.) BOTH for recalibrating both SNPs and indels simultaneously (for testing purposes only, not recommended for general use).", required = false)
|
||||
public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="maxGaussians", shortName="mG", doc="The maximum number of Gaussians for the positive model to try during variational Bayes algorithm.", required=false)
|
||||
public int MAX_GAUSSIANS = 10;
|
||||
public int MAX_GAUSSIANS = 8;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="maxNegativeGaussians", shortName="mNG", doc="The maximum number of Gaussians for the negative model to try during variational Bayes algorithm. The actual maximum used is the min of the mG and mNG arguments. Note that this number should be small (like 4) to achieve the best results", required=false)
|
||||
public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 4;
|
||||
public int MAX_GAUSSIANS_FOR_NEGATIVE_MODEL = 2;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="maxIterations", shortName="mI", doc="The maximum number of VBEM iterations to be performed in variational Bayes algorithm. Procedure will normally end when convergence is detected.", required=false)
|
||||
public int MAX_ITERATIONS = 100;
|
||||
public int MAX_ITERATIONS = 150;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="numKMeans", shortName="nKM", doc="The number of k-means iterations to perform in order to initialize the means of the Gaussians in the Gaussian mixture model.", required=false)
|
||||
public int NUM_KMEANS_ITERATIONS = 30;
|
||||
public int NUM_KMEANS_ITERATIONS = 100;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="stdThreshold", shortName="std", doc="If a variant has annotations more than -std standard deviations away from mean then don't use it for building the Gaussian mixture model.", required=false)
|
||||
public double STD_THRESHOLD = 14.0;
|
||||
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
|
||||
public double QUAL_THRESHOLD = 80.0;
|
||||
public double STD_THRESHOLD = 10.0;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
|
||||
public double SHRINKAGE = 1.0;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
|
||||
public double DIRICHLET_PARAMETER = 0.001;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
|
||||
public double PRIOR_COUNTS = 20.0;
|
||||
@Argument(fullName="numBadVariants", shortName="numBad", doc="The number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false)
|
||||
public int NUM_BAD_VARIANTS = 1000;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="maxNumTrainingData", shortName="maxNumTrainingData", doc="Maximum number of training data to be used in building the Gaussian mixture model. Training sets large than this will be randomly downsampled.", required=false)
|
||||
protected int MAX_NUM_TRAINING_DATA = 2500000;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum number of worst scoring variants to use when building the Gaussian mixture model of bad variants.", required=false)
|
||||
public int MIN_NUM_BAD_VARIANTS = 1000;
|
||||
|
||||
@Advanced
|
||||
@Argument(fullName="badLodCutoff", shortName="badLodCutoff", doc="The LOD score below which to be used when building the Gaussian mixture model of bad variants.", required=false)
|
||||
public double BAD_LOD_CUTOFF = -5.0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ public class VariantRecalibratorEngine {
|
|||
// the unified argument collection
|
||||
final private VariantRecalibratorArgumentCollection VRAC;
|
||||
|
||||
private final static double MIN_PROB_CONVERGENCE = 2E-2;
|
||||
private final static double MIN_PROB_CONVERGENCE = 2E-3;
|
||||
|
||||
/////////////////////////////
|
||||
// Public Methods to interface with the Engine
|
||||
|
|
|
|||
|
|
@ -0,0 +1,100 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.utils;
|
||||
|
||||
/**
|
||||
* Helper class to handle sequence complexity analyses.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public class SequenceComplexity {
|
||||
|
||||
/**
|
||||
* Indicates what positions in a base sequence is found in homopolymers or STR repeat.
|
||||
*
|
||||
* <p>
|
||||
* The result is an boolean array with as many positions as the input base array.
|
||||
* </p>
|
||||
* <p>
|
||||
* Each entry the result makes reference to the base at the same position in the input, where {@code true}
|
||||
* means that it forms part of a repeat.
|
||||
* </p>
|
||||
*
|
||||
* @param bases the input bases.
|
||||
* @param maxRepeatUnitLength what is the largest repeat unit to consider.
|
||||
* @param minRepeatLengthInUnits what is minimum length of a repeat in units to consider it significantly long. Shorter
|
||||
* repeats won't be considered as such.
|
||||
* @return never {@code null} but an array with the same length as the reference haplotype.
|
||||
*/
|
||||
public static boolean[] findBasesInShortUnitRepeats(final byte[] bases, final int maxRepeatUnitLength,
|
||||
final int minRepeatLengthInUnits) {
|
||||
final boolean[] result = new boolean[bases.length];
|
||||
final int[] repeatAbsoluteLengthCount = new int[maxRepeatUnitLength];
|
||||
for (int i = 0; i < maxRepeatUnitLength; i++)
|
||||
repeatAbsoluteLengthCount[i] = i + 1;
|
||||
for (int i = 0; i < bases.length; i++)
|
||||
for (int j = 1; j <= maxRepeatUnitLength; j++) {
|
||||
final int prevI = i - j;
|
||||
if (prevI < 0) continue;
|
||||
if (bases[prevI] == bases[i]) // repeat continuation.
|
||||
repeatAbsoluteLengthCount[j - 1]++;
|
||||
else if (minRepeatLengthInUnits <= (repeatAbsoluteLengthCount[j - 1] / j)) { // end of a long enough repeat.
|
||||
for (int k = i - repeatAbsoluteLengthCount[j - 1]; k < i; k++)
|
||||
result[k] = true;
|
||||
repeatAbsoluteLengthCount[j - 1] = j;
|
||||
} else { // end of not long enough repeat.
|
||||
repeatAbsoluteLengthCount[j - 1] = j;
|
||||
}
|
||||
}
|
||||
// Do the marking for the last position in bases.
|
||||
for (int j = 1; j <= maxRepeatUnitLength; j++)
|
||||
if (minRepeatLengthInUnits <= (repeatAbsoluteLengthCount[j - 1] / j))
|
||||
for (int k = bases.length - repeatAbsoluteLengthCount[j - 1]; k < bases.length; k++)
|
||||
result[k] = true;
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,521 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.utils.collections;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import com.sun.istack.internal.NotNull;
|
||||
|
||||
import java.lang.reflect.Array;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Efficient implementation for a small set of primitive integer values.
 * <p>
 * It includes an increment operation {@link #incAll} which is convenient when analyzing the
 * read-threading graphs. Nevertheless it can also be used for general purposes.
 * </p>
 * <p>
 * It does not provide O(1) look-up of its elements though. These are kept in a sorted array so
 * look-up is implemented using a binary search, O(log n). Therefore it might not be optimal for
 * problems that require large integer sets.
 * </p>
 * <p>
 * Also note that addition can be costly for large sets unless done in order: O(n).
 * </p>
 *
 * @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
 */
public class CountSet implements Cloneable, Set<Integer> {

    /** The number of elements currently in the set. */
    private int size;

    /** Holds the elements of the set within the subrange [0 .. size - 1] in ascending order. */
    private int[] elements;

    /**
     * Creates a copy of an existing int-set.
     *
     * @param template the int-set to copy values from.
     */
    public CountSet(final CountSet template) {
        elements = template.elements.clone();
        size = template.size;
    }

    /**
     * Creates a new set indicating the expected maximum number of elements it will contain.
     *
     * @param initialCapacity the desired initial capacity of the set.
     * @throws IllegalArgumentException if {@code initialCapacity} is negative.
     */
    public CountSet(final int initialCapacity) {
        if (initialCapacity < 0)
            throw new IllegalArgumentException("the initial capacity cannot be negative: " + initialCapacity);
        elements = new int[initialCapacity];
        size = 0;
    }

    /**
     * Sets the set contents to a single integer value.
     *
     * @param value the integer value the set will contain.
     */
    public void setTo(final int value) {
        ensureCapacity(1);
        size = 1;
        elements[0] = value;
    }

    /**
     * Sets the content of this set to a collection of integers.
     * <p>
     * Duplicates in {@code values} are collapsed so that the sorted-unique invariant of
     * {@link #elements} holds (the previous implementation kept duplicates, corrupting the set).
     * </p>
     *
     * @param values the new values to be included in the set.
     * @throws NullPointerException if {@code values} is {@code null}.
     */
    public void setTo(final int ... values) {
        ensureCapacity(values.length);
        System.arraycopy(values, 0, elements, 0, values.length);
        Arrays.sort(elements, 0, values.length);
        int unique = 0;
        for (int i = 0; i < values.length; i++)
            if (unique == 0 || elements[unique - 1] != elements[i])
                elements[unique++] = elements[i];
        size = unique;
    }

    /**
     * Increases (or decreases) all elements in the set by a number.
     *
     * @param delta the number to add (or subtract if negative) to all elements.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean incAll(final int delta) {
        if (size == 0 || delta == 0)
            return false;
        for (int i = 0; i < size; i++)
            elements[i] += delta;
        return true;
    }

    /**
     * Returns the smallest integer value in the set.
     *
     * @throws NoSuchElementException if the set is empty (thus there is no minimum).
     * @return the smallest integer value in the set.
     */
    public int min() {
        if (size == 0)
            throw new NoSuchElementException("cannot have a min from an empty set");
        return elements[0];
    }

    /**
     * Returns the largest integer value in the set.
     *
     * @throws NoSuchElementException if the set is empty (thus there is no maximum).
     * @return the largest integer value in the set.
     */
    public int max() {
        if (size == 0)
            throw new NoSuchElementException("cannot have a max from an empty set");
        return elements[size - 1];
    }

    /**
     * Adds a range of integer values to the collection.
     * <p>
     * This method avoids the need to explicitly list all values in the range. The range is fully
     * inclusive, and may be given inverted ({@code fromValue > toValue}).
     * </p>
     *
     * @param fromValue the first value to add to the set (inclusive).
     * @param toValue the last value to add to the set (inclusive).
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean addRange(final int fromValue, final int toValue) {
        final int lowEnd = Math.min(fromValue, toValue);
        final int highEnd = Math.max(fromValue, toValue);
        // TODO could be optimized to add missing sub-ranges in one go:
        boolean result = false;
        for (int i = lowEnd; i <= highEnd; i++)
            result = add(i) | result;
        return result;
    }

    /**
     * Adds an integer value to the set.
     *
     * @param value value to add to the set.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean add(final int value) {
        final int pos = Arrays.binarySearch(elements, 0, size, value);
        if (pos >= 0)
            return false; // already present.
        final int insertPos = -pos - 1;
        ensureCapacity(size + 1);
        System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos);
        elements[insertPos] = value;
        size++;
        return true;
    }

    /**
     * Adds an arbitrary number of integers to the set.
     *
     * @param values integers to add to the set.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean addAll(final int ... values) {
        ensureCapacity(size + values.length);
        boolean result = false;
        for (final int v : values)
            result = add(v) | result;
        return result;
    }

    @Override
    public boolean addAll(final Collection<? extends Integer> numbers) {
        ensureCapacity(size + numbers.size());
        boolean result = false;
        for (final Integer n : numbers)
            result = add(n.intValue()) | result;
        return result;
    }

    /**
     * Adds all values within a range in an integer array.
     *
     * @param source array where the values to add are found.
     * @param fromIndex first position from {@code source} to add (inclusive).
     * @param toIndex index after the last position in {@code source} to add (thus exclusive).
     * @throws NullPointerException if {@code source} is {@code null}.
     * @throws ArrayIndexOutOfBoundsException if {@code fromIndex} or {@code toIndex} are beyond
     *         the bounds allowed, {@code [0 .. source.length]}.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean addAll(final int[] source, final int fromIndex, final int toIndex) {
        // fixed: was ensureCapacity(size + source.length), over-allocating for sub-ranges.
        ensureCapacity(size + (toIndex - fromIndex));
        boolean result = false;
        for (int i = fromIndex; i < toIndex; i++)
            result = add(source[i]) | result;
        return result;
    }

    /**
     * Adds all elements present in another int-set.
     *
     * @param other the other int-set.
     * @throws NullPointerException if {@code other} is {@code null}.
     * @return {@code true} if this set changed due to this operation, {@code false} otherwise.
     */
    public boolean addAll(final CountSet other) {
        return addAll(other.elements, 0, other.size);
    }

    /**
     * Checks whether an integer value is included in the set.
     *
     * @param value the value to check.
     * @return {@code true} if {@code value} is inside the set, {@code false} otherwise.
     */
    public boolean contains(final int value) {
        return Arrays.binarySearch(elements, 0, size, value) >= 0;
    }

    /**
     * Makes sure that this int-set has capacity to handle a number of elements.
     * <p>
     * If the set already has that or greater capacity nothing is changed.
     * </p>
     *
     * @param capacity the requested capacity.
     */
    private void ensureCapacity(final int capacity) {
        if (elements.length >= capacity)
            return;
        // grow geometrically so repeated additions stay amortized O(1).
        final int newLength = Math.max(elements.length << 1, capacity);
        elements = Arrays.copyOf(elements, newLength);
    }

    @Override
    public int size() {
        return size;
    }

    @Override
    public boolean isEmpty() {
        return size == 0;
    }

    @Override
    public boolean contains(final Object o) {
        return o instanceof Integer && contains(((Integer) o).intValue());
    }

    @Override
    public Iterator<Integer> iterator() {
        return new MyIterator();
    }

    @Override
    public Object[] toArray() {
        // Object[] runtime type per the Collection.toArray() contract.
        final Object[] result = new Object[size];
        for (int i = 0; i < size; i++)
            result[i] = elements[i];
        return result;
    }

    @Override
    @SuppressWarnings("unchecked")
    public <T> T[] toArray(final T[] a) {
        if (a == null)
            throw new NullPointerException();
        final Class<?> componentClass = a.getClass().getComponentType();
        if (!componentClass.isAssignableFrom(Integer.class))
            throw new ArrayStoreException();
        final T[] dest = (a.length < size) ? (T[]) Array.newInstance(componentClass, size) : a;
        for (int i = 0; i < size; i++)
            dest[i] = (T) (Integer) elements[i];
        return dest;
    }

    /**
     * Copies the content of the set into an integer array. The result can be freely modified by
     * the invoker.
     *
     * @return never {@code null}; a zero-length array if the set is empty.
     */
    public int[] toIntArray() {
        return Arrays.copyOfRange(elements, 0, size);
    }

    /**
     * Copies the content of the set into an array.
     *
     * @param dest the destination array.
     * @param offset where to store the first element of the set.
     * @throws NullPointerException if {@code dest} is {@code null}.
     * @throws ArrayIndexOutOfBoundsException if there is not enough space after {@code offset}
     *         in the destination array to hold all values in the set.
     */
    public void copyTo(final int[] dest, final int offset) {
        if (dest == null)
            throw new NullPointerException();
        if (dest.length < size + offset)
            throw new ArrayIndexOutOfBoundsException("destination is too short");
        System.arraycopy(elements, 0, dest, offset, size);
    }

    /**
     * Copies the content of the set into an array, starting at position 0.
     *
     * @param dest the destination array.
     * @throws NullPointerException if {@code dest} is {@code null}.
     * @throws ArrayIndexOutOfBoundsException if the destination array cannot hold all values in
     *         the set.
     */
    public void copyTo(final int[] dest) {
        copyTo(dest, 0);
    }

    @Override
    public boolean add(final Integer integer) {
        return add(integer.intValue());
    }

    @Override
    public boolean remove(final Object o) {
        return o instanceof Integer && remove(((Integer) o).intValue());
    }

    /**
     * Removes a single integer value from the set.
     *
     * @param i the value to remove.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean remove(final int i) {
        final int pos = Arrays.binarySearch(elements, 0, size, i);
        if (pos < 0)
            return false;
        removeIndex(pos);
        return true;
    }

    @Override
    public boolean containsAll(final Collection<?> c) {
        for (final Object o : c)
            if (!contains(o))
                return false;
        return true;
    }

    @Override
    public boolean retainAll(final Collection<?> c) {
        // Fixed: the previous implementation passed swapped index ranges to a broken helper and
        // mis-updated size, corrupting the set. A single compacting pass is both correct and O(n).
        int keep = 0;
        for (int i = 0; i < size; i++)
            if (c.contains(elements[i]))
                elements[keep++] = elements[i];
        if (keep == size)
            return false;
        size = keep;
        return true;
    }

    @Override
    public boolean removeAll(final Collection<?> c) {
        boolean result = false;
        for (final Object o : c)
            result = remove(o) | result;
        return result;
    }

    /**
     * Removes the element stored at a given index of {@link #elements}.
     * <p>
     * Fixed: the previous implementation shifted the tail but never decremented {@code size},
     * leaving a stale duplicate at the end.
     * </p>
     *
     * @param idx the index to remove; must be in {@code [0 .. size)}.
     */
    private void removeIndex(final int idx) {
        System.arraycopy(elements, idx + 1, elements, idx, size - idx - 1);
        size--;
    }

    @Override
    public void clear() {
        size = 0;
    }

    /**
     * Returns a copy of this set which can be changed without modifying the original one.
     *
     * @return never {@code null}.
     */
    @Override
    public CountSet clone() {
        return new CountSet(this);
    }

    @Override
    public String toString() {
        // Fixed: the previous implementation returned "}" for an empty set.
        final StringBuilder sb = new StringBuilder(2 + size * 10);
        sb.append('{');
        for (int i = 0; i < size; i++) {
            if (i > 0)
                sb.append(',');
            sb.append(elements[i]);
        }
        return sb.append('}').toString();
    }

    /**
     * Custom iterator class for {@link CountSet CountSets}.
     */
    private class MyIterator implements Iterator<Integer> {

        /** Index of the next element to return. */
        private int next = 0;

        /** Whether {@link #remove()} may be called (a next() happened since the last removal). */
        private boolean removable = false;

        @Override
        public boolean hasNext() {
            return next < size;
        }

        @Override
        public Integer next() {
            if (next >= size)
                throw new NoSuchElementException();
            removable = true;
            return elements[next++]; // fixed: previously the cursor never advanced.
        }

        @Override
        public void remove() {
            if (!removable)
                throw new IllegalStateException("next() has not been called since the last removal");
            removable = false;
            removeIndex(--next); // removeIndex now shrinks size, keeping the cursor consistent.
        }
    }

}
|
||||
|
|
@ -46,6 +46,9 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.gvcf;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Genotype;
|
||||
import org.broadinstitute.variant.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
|
|
@ -230,6 +233,7 @@ public class GVCFWriter implements VariantContextWriter {
|
|||
gb.DP(block.getMedianDP());
|
||||
gb.attribute(MIN_DP_FORMAT_FIELD, block.getMinDP());
|
||||
gb.attribute(MIN_GQ_FORMAT_FIELD, block.getMinGQ());
|
||||
gb.PL(block.getMinPLs());
|
||||
|
||||
return vcb.genotypes(gb.make()).make();
|
||||
}
|
||||
|
|
@ -283,7 +287,7 @@ public class GVCFWriter implements VariantContextWriter {
|
|||
}
|
||||
|
||||
final Genotype g = vc.getGenotype(0);
|
||||
if ( g.isHomRef() ) {
|
||||
if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) {
|
||||
// create bands
|
||||
final VariantContext maybeCompletedBand = addHomRefSite(vc, g);
|
||||
if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand);
|
||||
|
|
|
|||
|
|
@ -69,10 +69,11 @@ import java.util.List;
|
|||
*/
|
||||
final class HomRefBlock {
|
||||
private final VariantContext startingVC;
|
||||
int stop;
|
||||
private int stop;
|
||||
private final int minGQ, maxGQ;
|
||||
private List<Integer> GQs = new ArrayList<>(100);
|
||||
private List<Integer> DPs = new ArrayList<>(100);
|
||||
private int[] minPLs = null;
|
||||
final private List<Integer> GQs = new ArrayList<>(100);
|
||||
final private List<Integer> DPs = new ArrayList<>(100);
|
||||
private final Allele ref;
|
||||
|
||||
/**
|
||||
|
|
@ -116,9 +117,23 @@ final class HomRefBlock {
|
|||
public void add(final int pos, final Genotype g) {
|
||||
if ( g == null ) throw new IllegalArgumentException("g cannot be null");
|
||||
if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field");
|
||||
if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field");
|
||||
if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field");
|
||||
if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop);
|
||||
|
||||
if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs
|
||||
final int[] PL = g.getPL();
|
||||
if( PL.length == 3 ) {
|
||||
minPLs = PL.clone();
|
||||
}
|
||||
} else { // otherwise take the min with the provided genotype's PLs
|
||||
final int[] PL = g.getPL();
|
||||
if( PL.length == 3 ) {
|
||||
minPLs[0] = Math.min(minPLs[0], PL[0]);
|
||||
minPLs[1] = Math.min(minPLs[1], PL[1]);
|
||||
minPLs[2] = Math.min(minPLs[2], PL[2]);
|
||||
}
|
||||
}
|
||||
stop = pos;
|
||||
GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. of 99 emission
|
||||
DPs.add(g.getDP());
|
||||
|
|
@ -141,6 +156,8 @@ final class HomRefBlock {
|
|||
public int getMinDP() { return MathUtils.arrayMin(DPs); }
|
||||
/** Get the median DP observed within this band */
|
||||
public int getMedianDP() { return MathUtils.median(DPs); }
|
||||
/** Get the min PLs observed within this band, can be null if no PLs have yet been observed */
|
||||
public int[] getMinPLs() { return minPLs; }
|
||||
|
||||
protected int getGQUpperBound() { return maxGQ; }
|
||||
protected int getGQLowerBound() { return minGQ; }
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@
|
|||
package org.broadinstitute.sting.utils.haplotype;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
|
@ -100,7 +100,7 @@ public class HaplotypeLDCalculator {
|
|||
final Map<Haplotype, Double> map = new HashMap<Haplotype, Double>(haplotypes.size());
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
// count up the co-occurrences of the events for the R^2 calculation
|
||||
final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0];
|
||||
final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0];
|
||||
map.put(h, haplotypeLikelihood);
|
||||
}
|
||||
haplotypeLikelihoodsPerSample.add(map);
|
||||
|
|
|
|||
|
|
@ -46,15 +46,17 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.haplotypeBAMWriter;
|
||||
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Writes a BAM containing just the reads in stratifiedReadMap aligned to their
|
||||
|
|
@ -99,7 +101,9 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
|
|||
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) {
|
||||
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) {
|
||||
final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes);
|
||||
writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative());
|
||||
final Haplotype haplotype = alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele());
|
||||
if (haplotype == null) continue;
|
||||
writeReadAgainstHaplotype(entry.getKey(), haplotype, paddedReferenceLoc.getStart(), bestAllele.isInformative());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -77,8 +77,9 @@ public abstract class HaplotypeBAMWriter {
|
|||
protected final static String READ_GROUP_ID = "ArtificialHaplotype";
|
||||
protected final static String HAPLOTYPE_TAG = "HC";
|
||||
|
||||
final ReadDestination output;
|
||||
boolean writeHaplotypesAsWell = true;
|
||||
private final ReadDestination output;
|
||||
private boolean writeHaplotypesAsWell = true;
|
||||
private boolean onlyRealignInformativeReads = false;
|
||||
|
||||
/**
|
||||
* Possible modes for writing haplotypes to BAMs
|
||||
|
|
@ -181,9 +182,21 @@ public abstract class HaplotypeBAMWriter {
|
|||
final Haplotype haplotype,
|
||||
final int referenceStart,
|
||||
final boolean isInformative) {
|
||||
final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative);
|
||||
if ( alignedToRef != null )
|
||||
output.add(alignedToRef);
|
||||
if( onlyRealignInformativeReads && !isInformative ) {
|
||||
if( originalRead != null ) {
|
||||
output.add(originalRead);
|
||||
}
|
||||
} else if (haplotype == null) {
|
||||
output.add(originalRead);
|
||||
return;
|
||||
} else {
|
||||
final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative);
|
||||
if ( alignedToRef != null ) {
|
||||
output.add(alignedToRef);
|
||||
} else {
|
||||
output.add(originalRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -305,7 +318,15 @@ public abstract class HaplotypeBAMWriter {
|
|||
return writeHaplotypesAsWell;
|
||||
}
|
||||
|
||||
public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) {
|
||||
public void setWriteHaplotypesAsWell(final boolean writeHaplotypesAsWell) {
|
||||
this.writeHaplotypesAsWell = writeHaplotypesAsWell;
|
||||
}
|
||||
|
||||
public boolean getOnlyRealignInformativeReads() {
|
||||
return onlyRealignInformativeReads;
|
||||
}
|
||||
|
||||
public void setOnlyRealignInformativeReads(final boolean onlyRealignInformativeReads) {
|
||||
this.onlyRealignInformativeReads = onlyRealignInformativeReads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,450 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: bradt
|
||||
* Date: 6/11/13
|
||||
*/
|
||||
public class ArrayLoglessPairHMM extends PairHMM {
|
||||
private static final double INITIAL_CONDITION = Math.pow(2, 1020);
|
||||
private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
|
||||
|
||||
// we divide e by 3 because the observed base could have come from any of the non-observed alleles
|
||||
protected static final double TRISTATE_CORRECTION = 3.0;
|
||||
|
||||
private static final int matchToMatch = 0;
|
||||
private static final int indelToMatch = 1;
|
||||
private static final int matchToInsertion = 2;
|
||||
private static final int insertionToInsertion = 3;
|
||||
private static final int matchToDeletion = 4;
|
||||
private static final int deletionToDeletion = 5;
|
||||
|
||||
protected double[][] transition = null; // The transition probabilities cache
|
||||
protected double[][] prior = null; // The prior probabilities cache
|
||||
|
||||
// Array declarations for arrays implementation
|
||||
private double[] currentMatchArray = null;
|
||||
private double[] currentDeleteArray = null;
|
||||
private double[] currentInsertArray = null;
|
||||
private double[] parentMatchArray = null;
|
||||
private double[] parentDeleteArray = null;
|
||||
private double[] parentInsertArray = null;
|
||||
private double[] grandparentMatchArray = null;
|
||||
private double[] grandparentDeleteArray = null;
|
||||
private double[] grandparentInsertArray = null;
|
||||
|
||||
// When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading
|
||||
private double[] matchCacheArray = null;
|
||||
private double[] deleteCacheArray = null;
|
||||
private double[] insertCacheArray = null;
|
||||
|
||||
// These arrays store cache info for use with the next haplotype; for writing
|
||||
private double[] nextMatchCacheArray = null;
|
||||
private double[] nextDeleteCacheArray = null;
|
||||
private double[] nextInsertCacheArray = null;
|
||||
|
||||
// Used when caching to store our intermediate sum at point of first difference bw successive haplotypes
|
||||
private double partialSum;
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
|
||||
super.initialize(readMaxLength, haplotypeMaxLength);
|
||||
|
||||
transition = new double[paddedMaxReadLength][6];
|
||||
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
|
||||
// Initialize all arrays
|
||||
// Final Cell of array is a padding cell, initialized to zero.
|
||||
currentMatchArray = new double[paddedMaxReadLength];
|
||||
currentDeleteArray = new double[paddedMaxReadLength];
|
||||
currentInsertArray = new double[paddedMaxReadLength];
|
||||
|
||||
parentMatchArray = new double[paddedMaxReadLength];
|
||||
parentDeleteArray = new double[paddedMaxReadLength];
|
||||
parentInsertArray = new double[paddedMaxReadLength];
|
||||
|
||||
grandparentMatchArray = new double[paddedMaxReadLength];
|
||||
grandparentDeleteArray = new double[paddedMaxReadLength];
|
||||
grandparentInsertArray = new double[paddedMaxReadLength];
|
||||
|
||||
// Initialize the special arrays used for caching when successive haplotypes have a common prefix
|
||||
matchCacheArray = new double[paddedMaxReadLength];
|
||||
deleteCacheArray = new double[paddedMaxReadLength];
|
||||
insertCacheArray = new double[paddedMaxReadLength];
|
||||
|
||||
nextMatchCacheArray = new double[paddedMaxReadLength];
|
||||
nextDeleteCacheArray = new double[paddedMaxReadLength];
|
||||
nextInsertCacheArray = new double [paddedMaxReadLength];
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
int hapStartIndex,
|
||||
final boolean recacheReadValues,
|
||||
final int nextHapStartIndex) {
|
||||
|
||||
if ( ! constantsAreInitialized) {
|
||||
initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP);
|
||||
|
||||
// note that we initialized the constants
|
||||
constantsAreInitialized = true;
|
||||
}
|
||||
initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex);
|
||||
|
||||
// Some housekeeping to be done if we are starting a new read
|
||||
if (recacheReadValues) {
|
||||
hapStartIndex = 0;
|
||||
|
||||
initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP);
|
||||
// note that we initialized the constants
|
||||
constantsAreInitialized = true;
|
||||
|
||||
// Read length may have changed, so we need to set zero-value padding at the appropriate position.
|
||||
padMatchAndInsertArrays(readBases.length);
|
||||
}
|
||||
|
||||
// if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration
|
||||
if (hapStartIndex == 0) {
|
||||
clearPreviouslyCachedInfo(readBases.length);
|
||||
|
||||
// Haplotype length may have changed, so we need to set initial-value padding at the appropriate position.
|
||||
padDeleteArrays(haplotypeBases.length, readBases.length);
|
||||
}
|
||||
|
||||
// We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start.
|
||||
clearArraySolutionPosition();
|
||||
|
||||
// Some parameters to control behavior during the dynamic programming loop
|
||||
final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1;
|
||||
int startFill; // The lower bound of the array indices we want to over-write
|
||||
int endFill; // The upper bound of the array indices we want to over-write
|
||||
final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // This array will contain the partial sum to cache for the next haplotype
|
||||
double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction
|
||||
|
||||
// Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix
|
||||
for (int i = 1; i <= maxDiagonals; i++) {
|
||||
// set the bounds for cells we wish to fill in the arrays
|
||||
startFill = Math.max(readBases.length - i, 0);
|
||||
endFill = Math.min(maxDiagonals - i + 1, readBases.length);
|
||||
|
||||
// apply any previously cached array information
|
||||
if (i <= readBases.length)
|
||||
applyPreviouslyCachedInfo(startFill);
|
||||
|
||||
// fill in the cells for our current arrays
|
||||
updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i);
|
||||
|
||||
// final probability is the log10 sum of the last element in the Match and Insertion state arrays
|
||||
// this way we ignore all paths that ended in deletions! (huge)
|
||||
// but we have to sum all the paths ending in the M and I arrays, because they're no longer extended.
|
||||
// Where i > readBases.length, array[0] corresponds to bottom row of a [read] x [haplotype] matrix. Before this, they carries the 0's we set above.
|
||||
finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0];
|
||||
|
||||
// Partial sum for caching the next haplotype:
|
||||
// At the position of the last similar base between this haplotype and the next one...
|
||||
// ...remember the partial sum, so that we can start here on the next hap.
|
||||
if (i == cacheSumIndex)
|
||||
partialSum = finalArraySumProbabilities;
|
||||
|
||||
rotateArrayReferences();
|
||||
}
|
||||
// The cache arrays we wrote for this haplotype will be read for the next haplotype.
|
||||
rotateCacheArrays();
|
||||
|
||||
//return result
|
||||
return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to the editing
|
||||
* distance between the read and the haplotype.
|
||||
*
|
||||
* @param haplotypeBases the bases of the haplotype
|
||||
* @param readBases the bases of the read
|
||||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = startIndex; j < haplotypeBases.length; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the matrix that holds all the constants related to quality scores.
|
||||
*
|
||||
* @param insertionGOP insertion quality scores of the read
|
||||
* @param deletionGOP deletion quality scores of the read
|
||||
* @param overallGCP overall gap continuation penalty
|
||||
*/
|
||||
@Requires({
|
||||
"insertionGOP != null",
|
||||
"deletionGOP != null",
|
||||
"overallGCP != null"
|
||||
})
|
||||
@Ensures("constantsAreInitialized")
|
||||
protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) {
|
||||
for (int i = 0; i < insertionGOP.length; i++) {
|
||||
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
|
||||
transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP);
|
||||
transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]);
|
||||
transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]);
|
||||
transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]);
|
||||
transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Pad the ends of the Match and Insert arrays with 0.
|
||||
* Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM.
|
||||
*
|
||||
* @param padPosition Which index in the arrays we wish to pad
|
||||
*/
|
||||
private void padMatchAndInsertArrays(final int padPosition) {
|
||||
grandparentMatchArray[padPosition] = 0;
|
||||
grandparentInsertArray[padPosition] = 0;
|
||||
parentMatchArray[padPosition] = 0;
|
||||
parentInsertArray[padPosition] = 0;
|
||||
currentMatchArray[padPosition] = 0;
|
||||
currentInsertArray[padPosition] = 0;
|
||||
matchCacheArray[padPosition] = 0;
|
||||
insertCacheArray[padPosition] = 0;
|
||||
nextMatchCacheArray[padPosition] = 0;
|
||||
nextInsertCacheArray[padPosition] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pad the Delete arrays with an intial value. Let's us have free deletions at the beginning of the alignment.
|
||||
* Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM.
|
||||
*
|
||||
* @param haplotypeLength The length of the present haplotype. Necessary for calculating initial padding value
|
||||
* @param padPosition Which index in the arrays we wish to pad
|
||||
*/
|
||||
private void padDeleteArrays(final int haplotypeLength, final int padPosition) {
|
||||
final double initialValue = INITIAL_CONDITION / haplotypeLength;
|
||||
|
||||
// Pad the deletion arrays. Akin to padding the first row in the deletion matrix
|
||||
parentDeleteArray[padPosition] = initialValue;
|
||||
grandparentDeleteArray[padPosition] = initialValue;
|
||||
currentDeleteArray[padPosition] = initialValue;
|
||||
deleteCacheArray[padPosition] = initialValue;
|
||||
nextDeleteCacheArray[padPosition] = initialValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start.
|
||||
*
|
||||
*/
|
||||
private void clearArraySolutionPosition() {
|
||||
grandparentMatchArray[0] = 0;
|
||||
grandparentInsertArray[0] = 0;
|
||||
parentMatchArray[0] = 0;
|
||||
parentInsertArray[0] = 0;
|
||||
currentMatchArray[0] = 0;
|
||||
currentInsertArray[0] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears cached information saved from the last haplotype,
|
||||
* allowing us to start at the beginning of the present haplotype with intitial values of 0.
|
||||
*
|
||||
* @param fillLength How much of the cache arrays do we need to zero
|
||||
*/
|
||||
private void clearPreviouslyCachedInfo(final int fillLength) {
|
||||
Arrays.fill(matchCacheArray, 0, fillLength, 0);
|
||||
Arrays.fill(deleteCacheArray, 0, fillLength, 0);
|
||||
Arrays.fill(insertCacheArray, 0, fillLength, 0);
|
||||
|
||||
partialSum = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies cached information saved from the last haplotype,
|
||||
* allowing us to start in the middle of the present haplotype.
|
||||
*
|
||||
* @param indK the index in the arrays we wish to update with cached info
|
||||
*/
|
||||
private void applyPreviouslyCachedInfo(int indK) {
|
||||
// apply caching info necessary for calculating current DELETE array values
|
||||
parentMatchArray[indK] = matchCacheArray[indK];
|
||||
parentDeleteArray[indK] = deleteCacheArray[indK];
|
||||
|
||||
// apply caching info necessary for calculating current MATCH array values
|
||||
grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1];
|
||||
grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1];
|
||||
grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1];
|
||||
}
|
||||
|
||||
/**
|
||||
* Records the mid-process state of one location in the read/haplotype alignment.
|
||||
* Writes new cache information for use with the next haplotype we see.
|
||||
*
|
||||
* @param indK the index in the cache arrays we wish to store information in
|
||||
*/
|
||||
private void recordNewCacheInfo(int indK) {
|
||||
nextMatchCacheArray[indK] = currentMatchArray[indK];
|
||||
nextDeleteCacheArray[indK] = currentDeleteArray[indK];
|
||||
nextInsertCacheArray[indK] = currentInsertArray[indK];
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the HMM arrays for the current diagonal.
|
||||
*
|
||||
* @param readLength The length of the read
|
||||
* @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype
|
||||
* @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info
|
||||
* @param startFill The lower bound of the array indices we want to over-write
|
||||
* @param endFill The upper bound of the array indices we want to over-write
|
||||
* @param iii The index indicating which diagonal of the read/haplotype alignment we are working on
|
||||
*/
|
||||
private void updateArrays(final int readLength,
|
||||
final int hapStartIndex,
|
||||
final int nextHapStartIndex,
|
||||
final int startFill,
|
||||
final int endFill,
|
||||
final int iii) {
|
||||
|
||||
// The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment
|
||||
int matrixRow;
|
||||
int matrixCol;
|
||||
|
||||
int arrayIndex;
|
||||
for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) {
|
||||
// translate the array position into a row, column in the priors and transition matrices
|
||||
matrixRow = readLength - arrayIndex - 1;
|
||||
matrixCol = iii - matrixRow - 1 + hapStartIndex;
|
||||
|
||||
// update cell for each of our current arrays. Prior, transition matrices are padded +1 row,col
|
||||
updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]);
|
||||
|
||||
// Set up caching for the next haplotype
|
||||
// At the position of the final similar base between this haplotype and the next one, remember the mid-array values
|
||||
if (matrixCol == nextHapStartIndex - 1)
|
||||
recordNewCacheInfo(arrayIndex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a cell in the HMM arrays
|
||||
*
|
||||
* @param indK index in the arrays to update
|
||||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param transition an array with the six transition relevant to this location
|
||||
*/
|
||||
private void updateArrayCell( final int indK, final double prior, final double[] transition) {
|
||||
currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] +
|
||||
grandparentInsertArray[indK + 1] * transition[indelToMatch] +
|
||||
grandparentDeleteArray[indK + 1] * transition[indelToMatch] );
|
||||
currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion];
|
||||
currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion];
|
||||
}
|
||||
|
||||
/**
|
||||
* To prepare for the next diagonal in our loop, each array must be bumped to an older generation
|
||||
*
|
||||
*/
|
||||
private void rotateArrayReferences() {
|
||||
double[] tempMatchArray = grandparentMatchArray;
|
||||
double[] tempDeleteArray = grandparentDeleteArray;
|
||||
double[] tempInsertArray = grandparentInsertArray;
|
||||
|
||||
grandparentMatchArray = parentMatchArray;
|
||||
grandparentDeleteArray = parentDeleteArray;
|
||||
grandparentInsertArray = parentInsertArray;
|
||||
|
||||
parentMatchArray = currentMatchArray;
|
||||
parentDeleteArray = currentDeleteArray;
|
||||
parentInsertArray = currentInsertArray;
|
||||
|
||||
currentMatchArray = tempMatchArray;
|
||||
currentDeleteArray = tempDeleteArray;
|
||||
currentInsertArray = tempInsertArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* To prepare for the next haplotype, the caching info we wrote is copied into the cach-read arrays
|
||||
*
|
||||
*/
|
||||
private void rotateCacheArrays() {
|
||||
matchCacheArray = nextMatchCacheArray.clone();
|
||||
deleteCacheArray = nextDeleteCacheArray.clone();
|
||||
insertCacheArray = nextInsertCacheArray.clone();
|
||||
}
|
||||
}
|
||||
|
|
@ -47,12 +47,16 @@
|
|||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
||||
import java.io.File;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
|
||||
private static class HmmInput {
|
||||
|
|
@ -62,14 +66,14 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
|
|||
public byte[] deletionGOP;
|
||||
public byte[] overallGCP;
|
||||
public List<Haplotype> haplotypes;
|
||||
};
|
||||
}
|
||||
|
||||
public static class ResultQueue {
|
||||
private int offset;
|
||||
private List<double[]> batchResults;
|
||||
|
||||
public ResultQueue() {
|
||||
batchResults = new LinkedList<double[]>();
|
||||
batchResults = new LinkedList<>();
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
|
|
@ -92,7 +96,7 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
|
|||
final static String libName = "gmvhdl_gatk_hmm";
|
||||
|
||||
private static boolean loaded = false;
|
||||
private List<HmmInput> batchRequests = new LinkedList<HmmInput>();
|
||||
private List<HmmInput> batchRequests = new LinkedList<>();
|
||||
private ResultQueue resultQueue = new ResultQueue();
|
||||
|
||||
static public boolean isAvailable() {
|
||||
|
|
@ -184,6 +188,55 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
|
|||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap){
|
||||
|
||||
// initialize the pairHMM if necessary
|
||||
if (! initialized) {
|
||||
int readMaxLength = findMaxReadLength(reads);
|
||||
int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
|
||||
initialize(readMaxLength, haplotypeMaxLength);
|
||||
}
|
||||
|
||||
// Pass the read bases/quals, and the haplotypes as a list into the HMM
|
||||
performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap);
|
||||
|
||||
// Get the log10-likelihoods for each read/haplotype ant pack into the results map
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap);
|
||||
|
||||
return likelihoodMap;
|
||||
}
|
||||
|
||||
private void collectLikelihoodResults(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) {
|
||||
for(final GATKSAMRecord read : reads){
|
||||
final double[] likelihoods = batchGetResult();
|
||||
int jjj = 0;
|
||||
for (Allele allele : alleleHaplotypeMap.keySet()){
|
||||
final double log10l = likelihoods[jjj];
|
||||
likelihoodMap.add(read, allele, log10l);
|
||||
jjj++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void performBatchAdditions(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, Map<GATKSAMRecord, byte[]> GCPArrayMap) {
|
||||
final List<Haplotype> haplotypeList = getHaplotypeList(alleleHaplotypeMap);
|
||||
for(final GATKSAMRecord read : reads){
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final byte[] readQuals = read.getBaseQualities();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities();
|
||||
final byte[] overallGCP = GCPArrayMap.get(read);
|
||||
|
||||
batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
|
|
@ -191,10 +244,19 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
|
|||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
final boolean recacheReadValues,
|
||||
final int nextHapStartIndex) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
private List<Haplotype> getHaplotypeList(Map<Allele, Haplotype> alleleHaplotypeMap){
|
||||
final List<Haplotype> haplotypeList = new LinkedList<>();
|
||||
for (Allele a : alleleHaplotypeMap.keySet()){
|
||||
haplotypeList.add(alleleHaplotypeMap.get(a));
|
||||
}
|
||||
return haplotypeList;
|
||||
}
|
||||
|
||||
private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) {
|
||||
double[] results = null;
|
||||
int n = dequeueRequirement(haplotypeBases.length, readBases.length);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,813 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Fast partial PairHMM backed on the standard Logless PairHMM
|
||||
*
|
||||
*/
|
||||
public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM {
|
||||
|
||||
|
||||
/**
|
||||
* Initial read length capacity.
|
||||
*/
|
||||
private static final int INITIAL_READ_LENGTH_CAPACITY = 200;
|
||||
|
||||
/**
|
||||
* Initial haplotype length capacity.
|
||||
*/
|
||||
private static final int INITIAL_HAPLOTYPE_LENGTH_CAPACITY = 400;
|
||||
|
||||
|
||||
/**
|
||||
* Holds the current read capacity.
|
||||
* <p>It can only go up overtime.</p>
|
||||
*/
|
||||
private int readCapacity = INITIAL_READ_LENGTH_CAPACITY;
|
||||
|
||||
/**
|
||||
* Holds the current haplotype length capacity.
|
||||
* <p>It can only go up overtime.</p>
|
||||
*/
|
||||
private int haplotypeCapacity = INITIAL_HAPLOTYPE_LENGTH_CAPACITY;
|
||||
|
||||
private int maxToCol;
|
||||
private int haplotypeLength;
|
||||
|
||||
/**
|
||||
* Returns the currently loaded read base qualities.
|
||||
*
|
||||
* @throws IllegalStateException if no read was previously loaded using {@link #loadRead}.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public byte[] getReadQuals() {
|
||||
if (readQuals == null)
|
||||
throw new IllegalStateException("no read was loaded onto the pairhmm calculator");
|
||||
return readQuals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the currently loaded read insertion qualities.
|
||||
*
|
||||
* @throws IllegalStateException if no read was previously loaded using {@link #loadRead}.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public byte[] getReadInsQuals() {
|
||||
if (readQuals == null)
|
||||
throw new IllegalStateException("no read was loaded onto the pairhmm calculator");
|
||||
return readInsQuals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the currently loaded read deletion qualities.
|
||||
*
|
||||
* @throws IllegalStateException if no read was previously loaded using {@link #loadRead}.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public byte[] getReadDelQuals() {
|
||||
if (readQuals == null)
|
||||
throw new IllegalStateException("no read was loaded onto the pairhmm calculator");
|
||||
return readDelQuals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the currently loaded read gap extension penalty..
|
||||
*
|
||||
* @throws IllegalStateException if no read was previously loaded using {@link #loadRead}.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
public byte[] getReadGepQuals() {
|
||||
if (readQuals == null)
|
||||
throw new IllegalStateException("no read was loaded onto the pairhmm calculator");
|
||||
return readGepQuals;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new pair-hmm calculator instance give the gap continuation penalty.
|
||||
*
|
||||
* @param gcp the gap-continuation penalty.
|
||||
*/
|
||||
public FastLoglessPairHMM(final byte gcp) {
|
||||
constantGCP = gcp;
|
||||
initialize(readCapacity,haplotypeCapacity);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases,
|
||||
final byte[] readBases,
|
||||
final byte[] readQuals,
|
||||
final byte[] insertionGOP,
|
||||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues, final int nextHapStartIndex) {
|
||||
this.readBases = readBases;
|
||||
this.haplotypeBases = haplotypeBases;
|
||||
this.haplotypeLength = haplotypeBases.length;
|
||||
return super.subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases,readBases,readQuals,
|
||||
insertionGOP,deletionGOP,overallGCP,hapStartIndex,recacheReadValues,nextHapStartIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement the last step summation to calculate the total likelihood.
|
||||
*
|
||||
* @param row number of the last row of the pair-hmm where the likelihood values are present.
|
||||
* @param fromCol inclusive first column to include in the summation.
|
||||
* @param toCol exclusive last column to include in the summation.
|
||||
* @return 0 or less.
|
||||
*/
|
||||
protected double finalLikelihoodCalculation(final int row,
|
||||
final int fromCol, final int toCol) {
|
||||
|
||||
final double divider = Math.max(1,2 *(toCol - fromCol));
|
||||
final double dividerInverse = 1.0 / divider;
|
||||
double finalLikelihood = 0;
|
||||
|
||||
for (int j = fromCol; j < toCol; j++) {
|
||||
finalLikelihood += matchMatrix[row][j] * dividerInverse;
|
||||
finalLikelihood += insertionMatrix[row][j] * dividerInverse;
|
||||
}
|
||||
return StrictMath.log10(finalLikelihood) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the matrix values for a problem including the trailing end of the read.
|
||||
*
|
||||
* <p>
|
||||
* Notice that you can improve performance by omitting filling reusable values from
|
||||
* previous haplotype calculations. You can set {@code haplotypeStartOffset} to skill
|
||||
* those columns.
|
||||
* </p>
|
||||
*
|
||||
* @param readStart inclusive first position of the read used in the calculations.
|
||||
* @param readEnd exclusive last position of the read considered in the calculations.
|
||||
* @param haplotypeStartOffset offset of the haplotype right after the reusable prefix
|
||||
* from previous calls.
|
||||
*
|
||||
*
|
||||
*/
|
||||
protected void initializeMatrixValuesForTrailingProblem(final int readStart, final int readEnd,
|
||||
final int haplotypeStartOffset) {
|
||||
|
||||
@SuppressWarnings("all")
|
||||
final int zeroRow = readStart;
|
||||
final int toRow = readEnd + 1;
|
||||
final int toCol = haplotypeLength + 1;
|
||||
|
||||
// fill first row with -Inf fot M and I but not for Deletion if leading
|
||||
// to allow for free deletions at the beginning.
|
||||
if (readStart == 0) {
|
||||
// First row initialization:
|
||||
Arrays.fill(matchMatrix[zeroRow],haplotypeStartOffset,toCol,0);
|
||||
Arrays.fill(deletionMatrix[zeroRow],haplotypeStartOffset,toCol,INITIAL_CONDITION);
|
||||
|
||||
if (haplotypeStartOffset == 0)
|
||||
for (int i = zeroRow + 1; i < toRow; i++)
|
||||
insertionMatrix[i][0] = matchMatrix[i][0] = deletionMatrix[i][0] = 0;
|
||||
|
||||
} else {
|
||||
Arrays.fill(matchMatrix[zeroRow], Math.max(1,haplotypeStartOffset), toCol,0);
|
||||
Arrays.fill(insertionMatrix[zeroRow], haplotypeStartOffset, toCol,0);
|
||||
if (haplotypeStartOffset == 0) {
|
||||
matchMatrix[zeroRow][0] = INITIAL_CONDITION;
|
||||
deletionMatrix[zeroRow][0] = 0;
|
||||
}
|
||||
if (haplotypeStartOffset <= 1) deletionMatrix[zeroRow][1] = matchMatrix[zeroRow][1] * transition[zeroRow][matchToDeletion];
|
||||
for (int i = Math.max(haplotypeStartOffset,2); i < toCol; i++) {
|
||||
deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1]
|
||||
* transition[zeroRow][deletionToDeletion];
|
||||
}
|
||||
|
||||
if (haplotypeStartOffset == 0) {
|
||||
matchMatrix[zeroRow + 1][0] = deletionMatrix[zeroRow + 1][0] = 0;
|
||||
insertionMatrix[zeroRow + 1][0] = matchMatrix[zeroRow][0] * transition[zeroRow + 1][matchToInsertion];
|
||||
|
||||
|
||||
for (int i = zeroRow + 2; i < toRow; i++) {
|
||||
matchMatrix[i][0] = deletionMatrix[i][0] = 0;
|
||||
insertionMatrix[i][0] = insertionMatrix[i - 1][0]
|
||||
* transition[i][insertionToInsertion];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes calculation matrices give the characteristics of the next and previous problems.
|
||||
* @param currentProblem reference to the Lk calculation problem we are dealing currently.
|
||||
* @param previousProblem reference to the Lk calculation problem that has been solved just before.
|
||||
*
|
||||
*/
|
||||
protected void initializeMatrixValues(final Problem currentProblem, final Problem previousProblem) {
|
||||
if (previousProblem != null &&
|
||||
previousProblem.readStart == currentProblem.readStart &&
|
||||
previousProblem.hapStart == currentProblem.hapStart &&
|
||||
maxToCol >= currentProblem.hapEnd + 1)
|
||||
return;
|
||||
|
||||
final int zeroRow = currentProblem.readStart;
|
||||
final int zeroCol = currentProblem.hapStart;
|
||||
final int toRow = currentProblem.readEnd + 1;
|
||||
final int toCol = currentProblem.hapEnd + 1;
|
||||
maxToCol = toCol;
|
||||
|
||||
// fill first row with -Inf fot M and I but not for Deletion if leading
|
||||
// to allow for free deletions at the beginning.
|
||||
if (currentProblem.leading) {
|
||||
// First row initialization:
|
||||
Arrays.fill(matchMatrix[zeroRow],zeroCol,toCol,0);
|
||||
Arrays.fill(deletionMatrix[zeroRow],zeroCol,toCol,INITIAL_CONDITION);
|
||||
|
||||
for (int i = zeroRow + 1; i < toRow; i++)
|
||||
insertionMatrix[i][zeroCol] = matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0;
|
||||
|
||||
} else { // If not leading set the appropriate matching 1.0 prob and
|
||||
// deletion + extension.
|
||||
|
||||
Arrays.fill(matchMatrix[zeroRow], zeroCol + 1, toCol,0);
|
||||
Arrays.fill(insertionMatrix[zeroRow], zeroCol, toCol,0);
|
||||
matchMatrix[zeroRow][zeroCol] = INITIAL_CONDITION;
|
||||
deletionMatrix[zeroRow][zeroCol] = 0;
|
||||
deletionMatrix[zeroRow][zeroCol + 1] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow][matchToDeletion];
|
||||
for (int i = zeroCol + 2; i < toCol; i++) {
|
||||
deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1]
|
||||
* transition[zeroRow][deletionToDeletion];
|
||||
}
|
||||
|
||||
matchMatrix[zeroRow + 1][zeroCol] = deletionMatrix[zeroRow + 1][zeroCol] = 0;
|
||||
insertionMatrix[zeroRow + 1][zeroCol] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow + 1][matchToInsertion];
|
||||
|
||||
for (int i = zeroRow + 2; i < toRow; i++) {
|
||||
matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0;
|
||||
insertionMatrix[i][zeroCol] = insertionMatrix[i - 1][zeroCol]
|
||||
* transition[i][insertionToInsertion];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constant gap-continuation-penalty.
|
||||
*/
|
||||
private final byte constantGCP;
|
||||
|
||||
/**
|
||||
* Currently loaded haplotype base sequence.
|
||||
*/
|
||||
private byte[] haplotypeBases;
|
||||
|
||||
/**
|
||||
* Currently loaded read base sequence.
|
||||
*/
|
||||
private byte[] readBases;
|
||||
|
||||
/**
|
||||
* Read qualities.
|
||||
*/
|
||||
private byte[] readQuals;
|
||||
|
||||
/**
|
||||
* Read insertion qualities.
|
||||
*/
|
||||
private byte[] readInsQuals;
|
||||
|
||||
/**
|
||||
* Read deletion qualities.
|
||||
*/
|
||||
private byte[] readDelQuals;
|
||||
|
||||
/**
|
||||
* Read gap-extension-penalties.
|
||||
*/
|
||||
private byte[] readGepQuals;
|
||||
|
||||
/**
|
||||
* Cached results.
|
||||
*/
|
||||
private Map<Problem, Double> cachedResults = new HashMap<>();
|
||||
|
||||
/**
|
||||
* Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}.
|
||||
*
|
||||
* @param read the target read.
|
||||
* @throws NullPointerException if {@code read} is null.
|
||||
*/
|
||||
@Override
|
||||
public void loadRead(final GATKSAMRecord read) {
|
||||
loadRead(read.getReadBases(),read.getBaseQualities(),read.getBaseInsertionQualities(),read.getBaseDeletionQualities(),read.getMappingQuality());
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}.
|
||||
*
|
||||
* @param readBases the read bases.
|
||||
* @param readQuals the read base call quality scores.
|
||||
* @param readInsQuals the read insertion quality scores.
|
||||
* @param readDelQuals the read deletion quality scores.
|
||||
* @param mq the read mapping quality score.
|
||||
* @throws NullPointerException if any of the arrays passed is {@code null}.
|
||||
* @throws IllegalArgumentException if the arrays passed have incompatible sizes.
|
||||
*/
|
||||
public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) {
|
||||
// TODO This is a copy&paste from PairHMM*Engine read data preparation code.
|
||||
// TODO It is simply to difficult to share the code without changing that class and I don't want
|
||||
// TODO to do so for now.
|
||||
if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length");
|
||||
if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length");
|
||||
if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length");
|
||||
maxToCol = 0;
|
||||
|
||||
if (readBases.length > readCapacity) {
|
||||
readCapacity = readBases.length;
|
||||
initialize(readCapacity,haplotypeCapacity);
|
||||
}
|
||||
paddedReadLength = readBases.length + 1;
|
||||
final byte[] overallGCP = new byte[readBases.length];
|
||||
Arrays.fill(overallGCP, constantGCP); // Is there a way to derive
|
||||
|
||||
for (int kkk = 0; kkk < readQuals.length; kkk++) {
|
||||
readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk],
|
||||
mq); // cap base quality by mapping
|
||||
// TODO -- why is Q18 hard-coded here???
|
||||
readQuals[kkk] = (readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE
|
||||
: readQuals[kkk]);
|
||||
}
|
||||
this.readBases = readBases;
|
||||
this.readQuals = readQuals;
|
||||
this.readInsQuals = readInsQuals;
|
||||
this.readDelQuals = readDelQuals;
|
||||
this.readGepQuals = overallGCP;
|
||||
initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP);
|
||||
if (haplotypeBases != null)
|
||||
fillPriorsTable(0);
|
||||
cachedResults.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadHaplotypeBases(final byte[] haplotypeBases) {
|
||||
if (readBases == null)
|
||||
throw new IllegalStateException(
|
||||
"no read was loaded before the haplotype");
|
||||
this.haplotypeBases = haplotypeBases.clone();
|
||||
haplotypeLength = haplotypeBases.length;
|
||||
paddedHaplotypeLength = haplotypeLength;
|
||||
if (haplotypeCapacity < haplotypeLength) {
|
||||
haplotypeCapacity = haplotypeLength;
|
||||
initialize(readCapacity,haplotypeCapacity);
|
||||
initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals);
|
||||
}
|
||||
initializePriors(this.haplotypeBases, readBases, readQuals, 0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Changes only the suffix of the currently loaded haplotype.
|
||||
*
|
||||
* <p>
|
||||
* If from is 0, this is equivalent to call {@link #loadHaplotypeBases(byte[])} directly.
|
||||
* </p>
|
||||
* @param from first position on the current haplotype to substitute with the new suffix.
|
||||
* It can be up to the length of the haplotype in such case this operation is in
|
||||
* effect just extending that haplotype.
|
||||
* @param suffix the new bases for the end part of the current haplotype.
|
||||
* @param suffixFrom inclusive first position of the actual suffix within the {@code suffix} array.
|
||||
* @param suffixTo exclusive last position of the actual suffix within the {@code suffix} array.
|
||||
*
|
||||
* @throws IllegalStateException if no read was loaded with {@link #loadRead}.
|
||||
* @throws IllegalArgumentException if from is more than 0 but no haplotype was loaded previously or if indices passed are inconsistent.
|
||||
* @throws ArrayIndexOutOfBoundsException if indices passed are outside valid ranges.
|
||||
*/
|
||||
public void changeHaplotypeSuffix(final int from, final byte[] suffix, final int suffixFrom, final int suffixTo) {
|
||||
if (readBases == null)
|
||||
throw new IllegalStateException(
|
||||
"no read was loaded before the haplotype");
|
||||
if (haplotypeBases == null && from > 0)
|
||||
throw new IllegalArgumentException("from cannot be larger than 0 if no haplotype bases was previously loaded");
|
||||
if (suffixFrom < 0)
|
||||
throw new ArrayIndexOutOfBoundsException("the suffix from index cannot be negative");
|
||||
if (suffixTo > suffix.length)
|
||||
throw new ArrayIndexOutOfBoundsException("the suffix to index cannot be larger than the suffix array length");
|
||||
if (suffixFrom > suffixTo)
|
||||
throw new IllegalArgumentException("the suffix to index cannot be smaller than the suffix from index");
|
||||
if (from > haplotypeLength)
|
||||
throw new IllegalArgumentException("the from index cannot be greater than the current haplotype length");
|
||||
if (from < 0)
|
||||
throw new IllegalArgumentException("the from index cannot be negative");
|
||||
|
||||
int startIndex = from;
|
||||
if (haplotypeBases == null) {
|
||||
haplotypeBases = Arrays.copyOfRange(suffix,suffixFrom,suffixTo);
|
||||
haplotypeLength = suffixTo - suffixFrom;
|
||||
} else {
|
||||
final int newLength = from + suffixTo - suffixFrom;
|
||||
if (haplotypeBases.length < newLength)
|
||||
haplotypeBases = Arrays.copyOf(haplotypeBases,newLength);
|
||||
System.arraycopy(suffix,suffixFrom,haplotypeBases,from,newLength - from);
|
||||
haplotypeLength = newLength;
|
||||
}
|
||||
paddedHaplotypeLength = haplotypeLength + 1;
|
||||
if (haplotypeCapacity < haplotypeLength) {
|
||||
haplotypeCapacity = haplotypeLength;
|
||||
initialize(readCapacity,haplotypeCapacity);
|
||||
initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals);
|
||||
startIndex = 0;
|
||||
}
|
||||
//startIndex = 0;
|
||||
fillPriorsTable(startIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the bases of the current haplotype.
|
||||
*
|
||||
* @throws IllegalStateException if no haplotype was loaded previously
|
||||
* @return never {@code null}
|
||||
*/
|
||||
public byte[] getHaplotypeBases() {
|
||||
if (haplotypeBases == null)
|
||||
throw new IllegalStateException();
|
||||
return Arrays.copyOfRange(haplotypeBases,0,haplotypeLength);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a debug representation of the pair-hmm.
|
||||
* @return never {@code null}.
|
||||
*/
|
||||
public String toString() {
|
||||
return "" + haplotypeLength + ":" + new String(Arrays.copyOfRange(haplotypeBases,0,haplotypeLength));
|
||||
}
|
||||
|
||||
/**
 * Loads haplotype and read data directly and (re)builds the prior table.
 *
 * <p>The input arrays are kept by reference (no defensive copy), so callers
 * must not mutate them afterwards.</p>
 *
 * @param hapBases  haplotype base calls to load.
 * @param readBases read base calls to load.
 * @param baseQuals per-base quality scores for the read.
 * @param idx       first haplotype position from which the prior table needs refilling.
 */
@Override
protected void initializePriors(final byte[] hapBases, final byte[] readBases, final byte[] baseQuals, final int idx) {
    haplotypeBases = hapBases;
    haplotypeLength = haplotypeBases.length;
    this.readBases = readBases;
    this.readQuals = baseQuals;
    // Only columns from idx onward are recomputed; earlier columns are assumed current.
    fillPriorsTable(idx);
}
|
||||
|
||||
/**
|
||||
* Fills the prior table up.
|
||||
*
|
||||
* <p>
|
||||
* It accepts an argument to save unnecessary prefix filling up.
|
||||
* </p>
|
||||
*
|
||||
* @param idx first position in the haplotype to start filling from.
|
||||
*/
|
||||
protected void fillPriorsTable(final int idx) {
|
||||
for (int i = 0; i < readBases.length; i++) {
|
||||
final byte x = readBases[i];
|
||||
final byte qual = readQuals[i];
|
||||
for (int j = idx; j < haplotypeLength; j++) {
|
||||
final byte y = haplotypeBases[j];
|
||||
prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
|
||||
QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Decorates haplotype set with their likelihoods as compared with the currently loaded read.
|
||||
*
|
||||
*
|
||||
* @param readStart inclusive start position of the targeted section of the read.
|
||||
* @param readEnd exclusive end position just beyond the targeted section of the read.
|
||||
* @param haplotypes in/out set of haplotypes.
|
||||
*/
|
||||
public void calculateLocalLikelihoods(final int readStart, final int readEnd, final PairHMMReadyHaplotypes haplotypes) {
|
||||
final PairHMMReadyHaplotypes.Iterator entryIterator = haplotypes.iterator();
|
||||
boolean isFirst = true;
|
||||
while (entryIterator.hasNext()) {
|
||||
entryIterator.next();
|
||||
final int startIndex = entryIterator.startIndex();
|
||||
final byte[] bases = entryIterator.bases();
|
||||
changeHaplotypeSuffix(startIndex,bases,startIndex,bases.length);
|
||||
final double likelihood = calculateLikelihood(readStart, readEnd, startIndex, isFirst);
|
||||
isFirst = false;
|
||||
entryIterator.setLikelihood(likelihood);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public double calculateLocalLikelihood(final int readStart, final int readEnd,
|
||||
final int hapStart, final int hapEnd, final boolean kmerMatch) {
|
||||
if (readBases == null || haplotypeBases == null)
|
||||
throw new IllegalStateException("read or haplotype was not loaded");
|
||||
final int hapSegmentLength = hapEnd - hapStart;
|
||||
final int readSegmentLength = readEnd - readStart;
|
||||
// trivial case when there is a single base match.
|
||||
if (kmerMatch) {
|
||||
return calculateLocalLikelihoodsExactMatch(readStart, hapStart, hapSegmentLength, readSegmentLength);
|
||||
} else if (hapSegmentLength == readSegmentLength) {
|
||||
if (hapSegmentLength == 0) {
|
||||
return calculateLocalLikelihoodEmptySquare(readStart, readEnd);
|
||||
} else if (hapSegmentLength == 1) {
|
||||
return calculateLocalLikelihoodSingleBase(readStart, readEnd, hapStart);
|
||||
} else { // general (slower) solution.
|
||||
return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd);
|
||||
}
|
||||
} else if (hapSegmentLength == 0) { // must be full insertion we
|
||||
return calculateLocalLikelihoodInsertion(readStart, readEnd);
|
||||
} else if (readSegmentLength == 0) { // full deletion.
|
||||
return calculateLocalLikelihoodDeletion(readStart, hapStart, hapEnd);
|
||||
} else { // general (slower) solution.
|
||||
return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fast likelihood when the pair-hmm represents a deletion in the read.
 *
 * <p>Closed-form shortcut: one match-to-deletion step, {@code hapEnd - hapStart - 1}
 * deletion extensions and one return to the match state. No prior term is involved
 * because no read base aligns against the deleted haplotype segment.</p>
 */
private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) {
    double result = INITIAL_CONDITION;
    if (readStart > 0) { // no penalty if at the beginning.
        // NOTE(review): transitions here are indexed with row readStart, whereas the
        // insertion shortcut uses readStart + 1 — confirm the off-by-one is intended.
        result *= transition[readStart][matchToDeletion];
        result *=
                StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1);
        result *= transition[readStart][indelToMatch];
    }
    // Remove the initial-condition scaling to recover the plain log10 likelihood.
    return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
}
|
||||
|
||||
|
||||
/**
 * Fast likelihood when the pair-hmm represents a insertion in the read.
 *
 * <p>Closed-form shortcut: one match-to-insertion step, one insertion extension
 * per additional inserted read base, and a return to the match state unless the
 * insertion runs to the very end of the read.</p>
 */
private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) {
    double result = INITIAL_CONDITION;
    result *= transition[readStart + 1][matchToInsertion];
    // Extend the insertion across the remaining inserted read bases.
    for (int i = readStart + 1; i < readEnd; i++) {
        result *= transition[i + 1][insertionToInsertion];
    }
    // No closing transition when the read ends inside the insertion.
    if (readEnd < readBases.length) {
        result *= transition[readEnd + 1][indelToMatch];
    }
    return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
}
|
||||
|
||||
/**
 * Single base mismatch fast likelihood calculation.
 *
 * <p>A 1x1 sub-hmm: the prior for the single read-vs-haplotype base pair plus the
 * match transitions into and out of the cell when not at a read boundary.</p>
 */
private double calculateLocalLikelihoodSingleBase(final int readStart, final int readEnd, final int hapStart) {
    double result = INITIAL_CONDITION;
    result *= prior[readStart + 1][hapStart + 1];
    // Entering transition only applies when the cell is not at the read start.
    if (readStart > 0) {
        result *= transition[readStart + 1][matchToMatch];
    }
    // Leaving transition only applies when the cell is not at the read end.
    if (readEnd < readBases.length) {
        result *= transition[readEnd + 1][matchToMatch];
    }
    return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
}
|
||||
|
||||
/**
|
||||
* Empty square Pair-hmm.
|
||||
*/
|
||||
private double calculateLocalLikelihoodEmptySquare(final int readStart, final int readEnd) {
|
||||
double result = INITIAL_CONDITION;
|
||||
if (readStart > 0 && readEnd < readBases.length) {
|
||||
result *= transition[readStart + 1][matchToMatch];
|
||||
}
|
||||
return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
|
||||
}
|
||||
|
||||
/**
 * Likelihood assuming that there is a exact match between both sequences: read and haplotype.
 *
 * <p>Multiplies the priors along the diagonal and, after the first pair, the
 * match-to-match transition between consecutive cells.</p>
 */
private double calculateLocalLikelihoodsExactMatch(final int readStart, final int hapStart, final int hapSegmentLength, final int readSegmentLength) {
    double result = INITIAL_CONDITION;
    if (hapSegmentLength == 1) {
        // Single pair: just the one prior, no transitions.
        result *= prior[readStart + 1][hapStart + 1];
    } else {
        // NOTE(review): loop bound is readSegmentLength while the branch keyed off
        // hapSegmentLength — under kmerMatch both are presumably equal; confirm.
        for (int i = 0; i < readSegmentLength; i++) {
            result *= prior[readStart + i + 1][hapStart + i + 1];
            if (i > 0) { // no entering transition for the first cell.
                result *= transition[readStart + i + 1][matchToMatch];
            }
        }
    }
    return StrictMath.log10(result) - INITIAL_CONDITION_LOG10;
}
|
||||
|
||||
/**
|
||||
* Revert to a general pair-hmm solution.
|
||||
*/
|
||||
private double calculateLocalLikelihoodsGeneral(final int readStart, final int readEnd, final int hapStart, final int hapEnd) {
|
||||
final Problem p = new Problem(readStart, readEnd, hapStart, hapEnd);
|
||||
final Double cachedCost = cachedResults.get(p);
|
||||
if (cachedCost != null) {
|
||||
return cachedCost;
|
||||
}
|
||||
double cost = calculateLocalLikelihoodGeneral(p);
|
||||
cachedResults.put(p, cost);
|
||||
return cost;
|
||||
}
|
||||
|
||||
/**
 * Resolve the regular full pair-hmm.
 *
 * <p>
 * With the possibility of reuse the previous haplotype common prefix by using
 * a startIndex which is greater than 0.
 *
 * @param readStart       inclusive start position on the read.
 * @param readEnd         exclusive end position on the read.
 * @param startIndex      first haplotype column that needs recomputation (shared-prefix reuse).
 * @param initializeEdges whether matrix edge values must be (re)initialized from column 0.
 */
private double calculateLikelihood(final int readStart, final int readEnd, final int startIndex, final boolean initializeEdges) {
    // Edges before the shared prefix can be kept from the previous haplotype.
    final int edgeStart = initializeEdges ? 0 : startIndex + 1;
    initializeMatrixValuesForTrailingProblem(readStart, readEnd, edgeStart);
    updateTable(readStart + 1, readEnd + 1, startIndex + 1, haplotypeLength + 1);
    if (readEnd == readBases.length)
        // Read runs to its end: sum over the final row, correcting for the
        // haplotype-length spread when starting at the read's first base.
        return finalLikelihoodCalculation(readEnd,0,haplotypeLength + 1) - (readStart == 0 ? StrictMath.log10(haplotypeLength) : 0);
    else {
        // Read is truncated: combine the three state matrices at the last cell,
        // splitting mass evenly over the three states (hence divider == 3).
        final double divider = 3.0;
        final double dividerInverted = 1.0 / divider;
        return StrictMath.log10(matchMatrix[readEnd][haplotypeLength]
                * transition[readEnd][matchToMatch] * dividerInverted +
                insertionMatrix[readEnd][haplotypeLength]
                * transition[readEnd][indelToMatch] * dividerInverted +
                deletionMatrix[readEnd][haplotypeLength]
                * transition[readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider);
    }
}
|
||||
|
||||
|
||||
private double calculateLocalLikelihoodGeneral(final Problem p) {
|
||||
|
||||
initializeMatrixValues(p,null);
|
||||
int fromCol = p.hapStart + 1;
|
||||
// if (previousProblem == null) {
|
||||
// fromCol = p.hapStart + 1;
|
||||
// } else {
|
||||
// final int sharedPrefix = previousProblem.followerStartIndex(p);
|
||||
// if (sharedPrefix >= 0)
|
||||
// fromCol = sharedPrefix + 1;
|
||||
// else
|
||||
// fromCol = p.hapStart + 1;
|
||||
// }
|
||||
// previousProblem = p;
|
||||
|
||||
updateTable(p.readStart + 1, p.readEnd + 1,
|
||||
fromCol, p.hapEnd + 1);
|
||||
|
||||
if (p.trailing) {
|
||||
return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1)
|
||||
- (p.leading ? StrictMath.log10(p.hapEnd - p.hapStart) : 0);
|
||||
} else {
|
||||
final double divider = 3.0;
|
||||
final double dividerInverted = 1.0 / divider;
|
||||
return StrictMath.log10(matchMatrix[p.readEnd][p.hapEnd]
|
||||
* transition[p.readEnd][matchToMatch] * dividerInverted +
|
||||
insertionMatrix[p.readEnd][p.hapEnd]
|
||||
* transition[p.readEnd][indelToMatch] * dividerInverted +
|
||||
deletionMatrix[p.readEnd][p.hapEnd]
|
||||
* transition[p.readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider);
|
||||
}
|
||||
}
|
||||
|
||||
private void updateTable(final int rowFrom, final int rowTo,
|
||||
final int colFrom, final int colTo) {
|
||||
|
||||
for (int i = rowFrom; i < rowTo; i++) {
|
||||
for (int j = colFrom; j < colTo; j++) {
|
||||
updateCell(i, j, prior[i][j], transition[i]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Holds the properties of a pair-hmm computational problem.
 *
 * <p>Instances serve as keys of the local-likelihood cache: equality is based on
 * the read range and the haplotype segment <em>content</em> (not its offsets).
 * The class is a non-static inner class because the constructor validates against
 * the enclosing hmm's current read and haplotype state.</p>
 */
public class Problem {
    // Copy of the haplotype bases in [hapStart, hapEnd); used for equality and hashing.
    private final byte[] haplotypeSegment;
    // Inclusive read start offset.
    private final int readStart;
    // Exclusive read end offset.
    private final int readEnd;
    // Inclusive haplotype start offset.
    private final int hapStart;
    // Exclusive haplotype end offset.
    private final int hapEnd;
    // Precomputed hash; instances are immutable so it is safe to cache.
    private final int hashCode;
    // True when the read range extends to the read's last base.
    private final boolean trailing;
    // True when the read range starts at the read's first base.
    private final boolean leading;

    /**
     * Construct a new project object.
     * @param start inclusive start position on the read to consider.
     * @param end exclusive after last position on the read to consider.
     * @param hapStart inclusive start position on the haplotype to consider.
     * @param hapEnd exclusive after last position on the haplotype to consider.
     * @throws IllegalArgumentException if any bound falls outside the loaded read
     *         or haplotype, or the ranges are inverted.
     */
    public Problem(final int start, final int end, final int hapStart,
                   final int hapEnd) {
        if (start < 0 || start > readBases.length)
            throw new IllegalArgumentException("bad start index " + start);
        if (end < start || end > readBases.length)
            throw new IllegalArgumentException("bad end index " + end + " < " + start + " or " + end + " > " + readBases.length);
        if (hapStart < 0 || hapStart > haplotypeLength)
            throw new IllegalArgumentException("bad hap start index "
                    + hapStart + " is larger than the haplotypeLength " + haplotypeLength);
        if (hapEnd < hapStart || hapEnd > haplotypeLength)
            throw new IllegalArgumentException("bad hap end index "
                    + hapEnd + " outside [" + hapStart + ","
                    + haplotypeLength + "]");

        // Snapshot the segment so later haplotype swaps cannot change this key.
        haplotypeSegment = Arrays.copyOfRange(haplotypeBases, hapStart, hapEnd);
        readStart = start;
        readEnd = end;
        this.hapStart = hapStart;
        this.hapEnd = hapEnd;
        trailing = readEnd == readBases.length;
        leading = readStart == 0;

        // NOTE(review): hash ignores hapStart/hapEnd directly (the segment content
        // stands in for them) and carries a trailing *31 — kept as-is since equals
        // is consistent with it.
        hashCode = ((start * 31 + end) * 31 + Arrays.hashCode(haplotypeSegment) * 31);
    }

    @Override
    public int hashCode() {
        return hashCode;
    }

    @Override
    public boolean equals(Object o) {
        if (o == this)
            return true;
        else if (o == null)
            return false;
        else if (o.getClass() != this.getClass())
            return false;
        else {
            final Problem p = (Problem) o;
            // Cheap precomputed-hash check first, then field-by-field comparison.
            return (p.hashCode == this.hashCode) && (p.readStart == this.readStart) && (p.readEnd == this.readEnd) && Arrays.equals(haplotypeSegment, p.haplotypeSegment);
        }
    }

}
|
||||
|
||||
/**
|
||||
* Returns the currently loaded read base calls.
|
||||
* @return {@code never null}.
|
||||
*/
|
||||
public byte[] getReadBases() {
|
||||
if (readBases == null)
|
||||
throw new IllegalStateException("no read was previously loaded.");
|
||||
return readBases;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.pairhmm;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* API for the fast (partial) HMM calculation engine.
|
||||
*/
|
||||
public interface FlexibleHMM {
|
||||
|
||||
/**
|
||||
* Load a read into the HMM calculation matrices.
|
||||
* @param read the read record to load into the HMM calculating engine.
|
||||
*/
|
||||
public void loadRead(GATKSAMRecord read);
|
||||
|
||||
/**
|
||||
* Returns the current read bases.
|
||||
*
|
||||
* @return never null.
|
||||
*/
|
||||
public byte[] getReadBases();
|
||||
|
||||
/**
|
||||
* Loads a haplotype bases in the HMM calculation matrices.
|
||||
* @param haplotype the haplotype sequence.
|
||||
*
|
||||
* @throws IllegalStateException if no read has been previously loaded.
|
||||
* @throws NullPointerException if {@code haplotype} is {@code null}.
|
||||
*/
|
||||
public void loadHaplotypeBases(byte[] haplotype);
|
||||
|
||||
/**
|
||||
* Resolve the partial Fast PairHMM for a section of the read and haplotype
|
||||
* @param readFrom inclusive offset of the first position on the read.
|
||||
* @param readTo exclusive offset of the last position on the read.
|
||||
* @param haplotypeFrom inclusive offset of the first position on the haplotype.
|
||||
* @param haplotypeTo exclusive offset of the last position on the haplotype.
|
||||
* @param treatAsMatch can assume that both pieces are the same sequence.
|
||||
* @return the cost the sub-HMM.
|
||||
*/
|
||||
public double calculateLocalLikelihood(int readFrom, int readTo, int haplotypeFrom, int haplotypeTo, boolean treatAsMatch);
|
||||
|
||||
/**
|
||||
* Load a read given its relevant information pieces by separate.
|
||||
* @param bases read bases.
|
||||
* @param bq base qualities.
|
||||
* @param iq insertion qualities.
|
||||
* @param dq deletion qualities.
|
||||
* @param mq read mapping quality.
|
||||
*/
|
||||
public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq);
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -55,32 +55,21 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
* User: rpoplin, carneiro
|
||||
* Date: 10/16/12
|
||||
*/
|
||||
public final class LoglessPairHMM extends N2MemoryPairHMM {
|
||||
public class LoglessPairHMM extends N2MemoryPairHMM {
|
||||
protected static final double INITIAL_CONDITION = Math.pow(2, 1020);
|
||||
protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION);
|
||||
|
||||
// we divide e by 3 because the observed base could have come from any of the non-observed alleles
|
||||
protected static final double TRISTATE_CORRECTION = 3.0;
|
||||
|
||||
private static final int matchToMatch = 0;
|
||||
private static final int indelToMatch = 1;
|
||||
private static final int matchToInsertion = 2;
|
||||
private static final int insertionToInsertion = 3;
|
||||
private static final int matchToDeletion = 4;
|
||||
private static final int deletionToDeletion = 5;
|
||||
protected static final int matchToMatch = 0;
|
||||
protected static final int indelToMatch = 1;
|
||||
protected static final int matchToInsertion = 2;
|
||||
protected static final int insertionToInsertion = 3;
|
||||
protected static final int matchToDeletion = 4;
|
||||
protected static final int deletionToDeletion = 5;
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) {
|
||||
super.initialize(readMaxLength, haplotypeMaxLength);
|
||||
|
||||
transition = new double[paddedMaxReadLength][6];
|
||||
prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength];
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
|
|
@ -92,7 +81,8 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
|
|||
final byte[] deletionGOP,
|
||||
final byte[] overallGCP,
|
||||
final int hapStartIndex,
|
||||
final boolean recacheReadValues ) {
|
||||
final boolean recacheReadValues,
|
||||
final int nextHapStartIndex) {
|
||||
|
||||
if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) {
|
||||
final double initialValue = INITIAL_CONDITION / haplotypeBases.length;
|
||||
|
|
@ -138,7 +128,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
|
|||
* @param readQuals the base quality scores of the read
|
||||
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
|
||||
*/
|
||||
public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
|
||||
protected void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) {
|
||||
|
||||
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
|
||||
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
|
||||
|
|
@ -190,7 +180,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM {
|
|||
* @param prior the likelihood editing distance matrix for the read x haplotype
|
||||
* @param transition an array with the six transition relevant to this location
|
||||
*/
|
||||
private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
|
||||
protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) {
|
||||
|
||||
matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] +
|
||||
insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] +
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Represents a hard-clipped view of a read.
|
||||
*/
|
||||
public class ClippedGATKSAMRecord extends GATKSAMRecord {
|
||||
|
||||
private byte[] insertionQuals;
|
||||
|
||||
private byte[] deletionQuals;
|
||||
|
||||
/**
|
||||
* Creates a hard-clipped view on a existing read record.
|
||||
* @param read the underlying unclipped read.
|
||||
* @param start inclusive first position in {@code read} included in the clipped view.
|
||||
* @param end inclusive last position in {@code read} included in the clipped view.
|
||||
*/
|
||||
public ClippedGATKSAMRecord(final GATKSAMRecord read, int start, int end) {
|
||||
super(read.getHeader(), read.getReferenceIndex(), read.getAlignmentStart() + start, (short) read.getReadNameLength(),
|
||||
(short) 100, -1, read.getCigarLength(), read.getFlags(), end - start,
|
||||
read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getInferredInsertSize(),
|
||||
new byte[0]);
|
||||
this.setReadBases(Arrays.copyOfRange(read.getReadBases(), start, end));
|
||||
this.setBaseQualities(Arrays.copyOfRange(read.getBaseQualities(),start,end));
|
||||
this.setReadName(read.getReadName());
|
||||
insertionQuals = Arrays.copyOfRange(read.getBaseInsertionQualities(),start,end);
|
||||
deletionQuals = Arrays.copyOfRange(read.getBaseDeletionQualities(),start,end);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBaseDeletionQualities() {
|
||||
return deletionQuals;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBaseInsertionQualities() {
|
||||
return insertionQuals;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMappingQuality() {
|
||||
return 100;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getReadName().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (o instanceof GATKSAMRecord) {
|
||||
return getReadName().equals(((GATKSAMRecord)o).getReadName());
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -46,14 +46,25 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broad.tribble.readers.LineIterator;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.variant.variantcontext.VariantContext;
|
||||
import org.broadinstitute.variant.vcf.VCFCodec;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
||||
|
||||
final static String REF = b37KGReference;
|
||||
final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
|
||||
|
||||
public static String baseTestString() {
|
||||
return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s";
|
||||
}
|
||||
|
|
@ -290,4 +301,96 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
executeTest("Testing InbreedingCoeff annotation with PED file", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStrandBiasBySample() throws IOException {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList(""));
|
||||
final File outputVCF = executeTest("testStrandBiasBySample", spec).getFirst().get(0);
|
||||
|
||||
final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample";
|
||||
final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList(""));
|
||||
specNoFS.disableShadowBCF();
|
||||
final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0);
|
||||
|
||||
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand";
|
||||
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList(""));
|
||||
specAnn.disableShadowBCF();
|
||||
final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0);
|
||||
|
||||
// confirm that the FisherStrand values are identical for the two pipelines
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final FileInputStream s = new FileInputStream(outputVCF);
|
||||
final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s));
|
||||
codec.readHeader(lineIterator);
|
||||
|
||||
final VCFCodec codecAnn = new VCFCodec();
|
||||
final FileInputStream sAnn = new FileInputStream(outputVCFAnn);
|
||||
final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn));
|
||||
codecAnn.readHeader(lineIteratorAnn);
|
||||
|
||||
while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) {
|
||||
final String line = lineIterator.next();
|
||||
Assert.assertFalse(line == null);
|
||||
final VariantContext vc = codec.decode(line);
|
||||
|
||||
final String lineAnn = lineIteratorAnn.next();
|
||||
Assert.assertFalse(lineAnn == null);
|
||||
final VariantContext vcAnn = codecAnn.decode(lineAnn);
|
||||
|
||||
Assert.assertTrue(vc.hasAttribute("FS"));
|
||||
Assert.assertTrue(vcAnn.hasAttribute("FS"));
|
||||
Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), vcAnn.getAttributeAsDouble("FS", -1.0));
|
||||
}
|
||||
|
||||
Assert.assertFalse(lineIterator.hasNext());
|
||||
Assert.assertFalse(lineIteratorAnn.hasNext());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQualByDepth() throws IOException {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList(""));
|
||||
final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0);
|
||||
|
||||
final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA QualByDepth";
|
||||
final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList(""));
|
||||
specNoQD.disableShadowBCF();
|
||||
final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0);
|
||||
|
||||
final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth";
|
||||
final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249"));
|
||||
specAnn.disableShadowBCF();
|
||||
final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0);
|
||||
|
||||
// confirm that the QD values are present in the new file for all biallelic variants
|
||||
// QD values won't be identical because some filtered reads are missing during re-annotation
|
||||
|
||||
final VCFCodec codec = new VCFCodec();
|
||||
final FileInputStream s = new FileInputStream(outputVCF);
|
||||
final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s));
|
||||
codec.readHeader(lineIterator);
|
||||
|
||||
final VCFCodec codecAnn = new VCFCodec();
|
||||
final FileInputStream sAnn = new FileInputStream(outputVCFAnn);
|
||||
final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn));
|
||||
codecAnn.readHeader(lineIteratorAnn);
|
||||
|
||||
while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) {
|
||||
final String line = lineIterator.next();
|
||||
Assert.assertFalse(line == null);
|
||||
final VariantContext vc = codec.decode(line);
|
||||
|
||||
final String lineAnn = lineIteratorAnn.next();
|
||||
Assert.assertFalse(lineAnn == null);
|
||||
final VariantContext vcAnn = codecAnn.decode(lineAnn);
|
||||
|
||||
if( vc.isBiallelic() ) {
|
||||
Assert.assertTrue(vc.hasAttribute("QD"));
|
||||
Assert.assertTrue(vcAnn.hasAttribute("QD"));
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertFalse(lineIterator.hasNext());
|
||||
Assert.assertFalse(lineIteratorAnn.hasNext());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -860,6 +860,29 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUpdateHeaderForReadWithHighMQ() {
|
||||
|
||||
// set up the window header
|
||||
final int currentHeaderStart = 100;
|
||||
final LinkedList<HeaderElement> windowHeader = new LinkedList<>();
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
windowHeader.add(new HeaderElement(currentHeaderStart + i));
|
||||
|
||||
// set up the read
|
||||
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, readLength);
|
||||
read.setReadBases(Utils.dupBytes((byte) 'A', readLength));
|
||||
read.setBaseQualities(Utils.dupBytes((byte)30, readLength));
|
||||
read.setMappingQuality(180);
|
||||
read.setReadNegativeStrandFlag(false);
|
||||
|
||||
// add the read and make sure it's not filtered because of low MQ (byte vs. int)
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0);
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
//// This section tests functionality related to polyploid consensus creation ////
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
|||
|
|
@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testSingleSample() {
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "1771e95aed2b3b240dc353f84e19847d");
|
||||
DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "6ca3d3917a7b65eaa877aa3658d80912");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultiSample() {
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "c7f1691dbe5f121c4a79be823d3057e5");
|
||||
DTTest("testMultiSample ", "-I " + multiSample, "f50c6b9bef9f63f0a8b32ae9a9bdd51a");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,131 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.diagnostics.missing;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import java.util.List;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 9/20/13
|
||||
* Time: 3:59 PM
|
||||
*/
|
||||
public class QualifyMissingIntervalsUnitTest extends BaseTest {
|
||||
@Test(enabled = true)
|
||||
public void testInterpretation() {
|
||||
final QualifyMissingIntervals tool = new QualifyMissingIntervals();
|
||||
|
||||
final Metrics unmappable = new Metrics(0.5, 7500.0, 0.0, 2500, 20);
|
||||
final Metrics highGC = new Metrics(0.99, 0.0, 0.0, 0, 20);
|
||||
final Metrics lowGC = new Metrics(0.09, 0.0, 0.0, 0, 20);
|
||||
final Metrics unsequenceable = new Metrics(0.5, 3.0, 1200.0, 10, 20);
|
||||
final Metrics noData = new Metrics(0.5, 0.0, 0.0, 0, 20);
|
||||
final Metrics unknown = new Metrics(0.5, 30.0, 120000.0, 2500, 20);
|
||||
|
||||
final Metrics[] array = {unmappable, highGC, lowGC, unsequenceable, noData, unknown};
|
||||
|
||||
final GenomeLoc testInterval = new UnvalidatingGenomeLoc("chr1", 0, 10000, 20000);
|
||||
final GenomeLoc smallInterval = new UnvalidatingGenomeLoc("chr1", 0, 1, 4);
|
||||
|
||||
|
||||
Assert.assertNotEquals(tool.checkMappability(unmappable), "");
|
||||
Assert.assertNotEquals(tool.checkGCContent(highGC), "");
|
||||
Assert.assertNotEquals(tool.checkGCContent(lowGC), "");
|
||||
Assert.assertNotEquals(tool.checkContext(unsequenceable), "");
|
||||
|
||||
Assert.assertEquals(tool.interpret(unmappable, testInterval), QualifyMissingIntervals.Interpretation.UNMAPPABLE.toString());
|
||||
Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString());
|
||||
Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString());
|
||||
Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString());
|
||||
Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString());
|
||||
Assert.assertEquals(tool.interpret(unknown, testInterval), QualifyMissingIntervals.Interpretation.UNKNOWN.toString());
|
||||
|
||||
for (Metrics m : array)
|
||||
Assert.assertEquals(tool.interpret(m, smallInterval), QualifyMissingIntervals.Interpretation.SMALL_INTERVAL.toString());
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
void testGetPositionInTarget () {
|
||||
final UnvalidatingGenomeLoc target = new UnvalidatingGenomeLoc("a", 0, 30, 50);
|
||||
final List<GenomeLoc> targets = new ObjectArrayList<>(1);
|
||||
targets.add(target);
|
||||
|
||||
// left overlap
|
||||
UnvalidatingGenomeLoc interval = new UnvalidatingGenomeLoc("a", 0, 10, 50);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -20);
|
||||
|
||||
// right overlap
|
||||
interval = new UnvalidatingGenomeLoc("a", 0, 40, 60);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -10);
|
||||
|
||||
// interval > target with short right tail
|
||||
interval = new UnvalidatingGenomeLoc("a", 0, 10, 60);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -10);
|
||||
|
||||
// interval > target with short left tail
|
||||
interval = new UnvalidatingGenomeLoc("a", 0, 10, 80);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -30);
|
||||
|
||||
// interval < target with short right tail
|
||||
interval = new UnvalidatingGenomeLoc("a", 0, 32, 40);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), 2);
|
||||
|
||||
// interval < target with short left tail
|
||||
interval = new UnvalidatingGenomeLoc("a", 0, 40, 42);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), 8);
|
||||
|
||||
// no overlap
|
||||
interval = new UnvalidatingGenomeLoc("a", 0, 40, 42);
|
||||
Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, new ObjectArrayList<GenomeLoc>()), Integer.MIN_VALUE);
|
||||
}
|
||||
}
|
||||
|
|
@ -129,7 +129,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest {
|
|||
}
|
||||
|
||||
private List<Allele> getConsensusAlleles(int eventLength, boolean isInsertion, int minCnt, double minFraction, String altBases) {
|
||||
final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(pileupProvider.genomeLocParser, true, minCnt, minFraction);
|
||||
final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, minCnt, minFraction);
|
||||
return counter.computeConsensusAlleles(pileupProvider.referenceContext,
|
||||
pileupProvider.getAlignmentContextFromAlleles(isInsertion?eventLength:-eventLength,altBases,numReadsPerAllele),
|
||||
AlignmentContextUtils.ReadOrientation.COMPLETE);
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,500,000",
|
||||
1,
|
||||
Arrays.asList("c6f0fa039ca5672469838bc9f52c72d3"));
|
||||
Arrays.asList("3d12bdb816d27bf7c9efb4c13dc2aec7"));
|
||||
|
||||
executeTest(String.format("test indel calling, multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
|
|||
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
|
||||
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
|
||||
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
|
||||
Arrays.asList("8682738c2c66b502cdbf7db466a5c3e2"));
|
||||
Arrays.asList("a2c8e83f37cd1e114b42af4b873f57bc"));
|
||||
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -o %s" +
|
||||
" -L 1:10,000,000-10,100,000",
|
||||
1,
|
||||
Arrays.asList("9f4e663e3b156b14fd55df3f5f0336a5"));
|
||||
Arrays.asList("150b31ba05113ca1996b548be5170d6d"));
|
||||
|
||||
executeTest(String.format("test multiple technologies"), spec);
|
||||
}
|
||||
|
|
@ -279,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
" -L 1:10,000,000-10,100,000" +
|
||||
" -baq CALCULATE_AS_NECESSARY",
|
||||
1,
|
||||
Arrays.asList("260bb73e2900334d5c3ff8123be0d2d8"));
|
||||
Arrays.asList("7d0ee85cd89f4addd84c5511daaaa5c5"));
|
||||
|
||||
executeTest(String.format("test calling with BAQ"), spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
|
|||
public void testMultiSamplePilot1() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
|
||||
Arrays.asList("7f26ca78e550afa28df11d593c90ec9a"));
|
||||
Arrays.asList("ec0977e3fd3e2ac29c9821f0ca830455"));
|
||||
executeTest("test MultiSample Pilot1", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,424 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.caliper.Param;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: valentin
|
||||
* Date: 8/13/13
|
||||
* Time: 2:48 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ActiveRegionTestDataSetUnitTest {
|
||||
|
||||
|
||||
|
||||
@Test(dataProvider="activeRegionTestDataSets")
|
||||
public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) {
|
||||
Assert.assertNotNull(as);
|
||||
Assert.assertEquals(as.assemblyResultSet().getMaximumKmerSize(),kmerSize);
|
||||
final List<GATKSAMRecord> reads = as.readList();
|
||||
Assert.assertEquals(reads.size(),readCount);
|
||||
for (final GATKSAMRecord r : reads) {
|
||||
Assert.assertEquals(r.getReadLength(),readLength);
|
||||
}
|
||||
|
||||
final List<Haplotype> haplotypes = as.haplotypeList();
|
||||
final List<Civar> haplotypeCivars = Civar.fromCharSequence(variation).optionalizeAll().unroll();
|
||||
|
||||
Assert.assertEquals(haplotypes.size(),haplotypeCivars.size());
|
||||
Assert.assertTrue(haplotypeCivars.size() > 1);
|
||||
int variants = 0;
|
||||
for (int i = 0; i < variation.length(); i++) {
|
||||
final char c = variation.charAt(i);
|
||||
switch (c) {
|
||||
case 'W':
|
||||
case 'T':
|
||||
case 'C':
|
||||
case 'D':
|
||||
case 'I':
|
||||
variants++;
|
||||
default:
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(haplotypes.size(),(int) Math.pow(2,variants));
|
||||
|
||||
final Map<String,Integer> haplotypeNumberByString = new HashMap<>();
|
||||
for (int i = 0; i < haplotypes.size(); i++) {
|
||||
final Haplotype hap = haplotypes.get(i);
|
||||
final Civar civar = haplotypeCivars.get(i);
|
||||
Assert.assertEquals(hap.getBaseString(),civar.applyTo(as.getReference()));
|
||||
if (i == 0) {
|
||||
Assert.assertEquals(hap.getBaseString(), as.getReference());
|
||||
} else {
|
||||
Assert.assertNotEquals(hap.getBaseString(),as.getReference());
|
||||
}
|
||||
Assert.assertFalse(haplotypeNumberByString.containsKey(hap.getBaseString()));
|
||||
haplotypeNumberByString.put(hap.getBaseString(), i);
|
||||
}
|
||||
|
||||
final int[] hapReadsNotInReference = new int[haplotypes.size()];
|
||||
|
||||
for (int i = 0; i < readCount; i++) {
|
||||
final GATKSAMRecord r = as.readList().get(i);
|
||||
|
||||
final int hapNumber = i % haplotypes.size();
|
||||
final int offset = i % (haplotypes.get(hapNumber).length() - readLength);
|
||||
Assert.assertEquals(r.getReadString(),haplotypes.get(hapNumber).getBaseString().substring(offset,offset+readLength));
|
||||
if (as.getReference().indexOf(r.getReadString()) == -1) {
|
||||
hapReadsNotInReference[hapNumber]++;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.assertEquals(hapReadsNotInReference[0],0);
|
||||
|
||||
for (int i = 1; i < hapReadsNotInReference.length; i++) {
|
||||
Assert.assertNotEquals(hapReadsNotInReference[i],0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a test data-set based on the given parameters.
|
||||
* @param kmerSize length of the kmer.
|
||||
* @param readLength length of the read.
|
||||
* @param variation variation in that active region.
|
||||
* @param readCount number of reads in the active region
|
||||
* @param regionSize Active region size (~ size of the haplotype(s))
|
||||
* @param bq Base quality value common for all base-calls.
|
||||
* @param iq Insertion quality based for all read positions.
|
||||
* @param dq Deletion quality based for all read positions.
|
||||
* @return never null.
|
||||
*/
|
||||
public static ActiveRegionTestDataSet createActiveRegionTestDataSet(final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) {
|
||||
|
||||
final String reference = REF.substring(0, regionSize);
|
||||
|
||||
final ActiveRegionTestDataSet result = new ActiveRegionTestDataSet(kmerSize, reference,
|
||||
new String[]{"Civar:" + variation},
|
||||
new String[]{"*:" + readCount + ":" + readLength}, byteRepeat(bq, readLength), byteRepeat(dq, readLength), byteRepeat(iq, readLength));
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@DataProvider(name="activeRegionTestDataSets")
|
||||
public Iterator<Object[]> activeRegionTestDataSets() {
|
||||
return new java.util.Iterator<Object[]>() {
|
||||
|
||||
private int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < ACTIVE_REGION_TEST_DATA_SET_PARAMETERS.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] next() {
|
||||
|
||||
final Object[] params = ACTIVE_REGION_TEST_DATA_SET_PARAMETERS[i++];
|
||||
final int kmerSize = (Integer) params[0];
|
||||
final int readLength = (Integer) params[1];
|
||||
final String variation = (String) params[2];
|
||||
final int readCount = (Integer) params[3];
|
||||
final int regionSize = (Integer) params[4];
|
||||
final ActiveRegionTestDataSet dataSet = createActiveRegionTestDataSet(kmerSize, readLength, variation, readCount, regionSize, (byte) 20, (byte) 35, (byte) 35);
|
||||
return new Object[] { dataSet , kmerSize, readLength, variation, readCount, regionSize, (byte)20, (byte) 35, (byte) 35};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Parameter grids for the data-provider. Any grid left null is filled in by the
// static block below from the matching @Param annotation on the benchmark class.
private static int[] KMER_SIZES = new int[] { 10 };

private static int[] READ_COUNTS = new int[] { 1000 };

private static int[] READ_LENGTHS = new int[] { 100 };

private static String[] VARIATION_CIVARS = new String[] {
        "*1T*",
        "*3Iacg*",
        "*30Igctcggatgccttgcggggctccagagtcc*",
        "*3D*",
        "*30D*",
        "*1T3=3Iacg*",
        "*1T*3Iacg*",
        "*1T8=1T8=1T8=1T8=1T*",
        "*1T*1T*1T*1T*1T*"
};

private static int[] REGION_SIZE = new int[] { 300 };

static {
    try {
        // Fallback: pull the benchmark's @Param values for any grid not set above.
        // NOTE(review): dormant while every grid is initialized inline; kept so a
        // grid can be switched back to the benchmark's values by setting it to null.
        if (KMER_SIZES == null) KMER_SIZES = intValues(benchmarkParamValues("kmerSize"));
        if (READ_COUNTS == null) READ_COUNTS = intValues(benchmarkParamValues("readCount"));
        if (READ_LENGTHS == null) READ_LENGTHS = intValues(benchmarkParamValues("readLength"));
        if (VARIATION_CIVARS == null) VARIATION_CIVARS = benchmarkParamValues("variation");
        if (REGION_SIZE == null) REGION_SIZE = intValues(benchmarkParamValues("regionSize"));
    } catch (final NoSuchFieldException e) {
        throw new ExceptionInInitializerError(e);
    }
}

/**
 * Reads the {@code @Param} annotation values of a declared field of
 * {@link HCLikelihoodCalculationEnginesBenchmark}.
 * @param fieldName name of the benchmark field carrying the annotation.
 * @throws NoSuchFieldException if the benchmark has no such field.
 */
private static String[] benchmarkParamValues(final String fieldName) throws NoSuchFieldException {
    return HCLikelihoodCalculationEnginesBenchmark.class.getDeclaredField(fieldName).getAnnotation(Param.class).value();
}
|
||||
|
||||
private static int[] intValues(final String[] kmerSizes) {
|
||||
final int[] result = new int[kmerSizes.length];
|
||||
for (int i = 0; i < result.length; i++)
|
||||
result[i] = Integer.parseInt(kmerSizes[i]);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Cartesian product of all parameter grids, one row per test case:
// { kmerSize, readLength, variation, readCount, regionSize }.
private static final Object[][] ACTIVE_REGION_TEST_DATA_SET_PARAMETERS;

static {
    // Enumerate every combination in the same nesting order as the grids
    // (kmer, readCount, readLength, variation, regionSize).
    final List<Object[]> combinations = new ArrayList<>();
    for (final int kmerSize : KMER_SIZES)
        for (final int readCount : READ_COUNTS)
            for (final int readLength : READ_LENGTHS)
                for (final String variation : VARIATION_CIVARS)
                    for (final int regionSize : REGION_SIZE)
                        combinations.add(new Object[] { kmerSize, readLength, variation, readCount, regionSize });
    ACTIVE_REGION_TEST_DATA_SET_PARAMETERS = combinations.toArray(new Object[combinations.size()][]);
}
|
||||
|
||||
private static byte[] byteRepeat(final byte bq, final int readLength) {
|
||||
final byte[] result = new byte[readLength];
|
||||
Arrays.fill(result, bq);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static final String REF =
|
||||
"TCGAGAAATTTGTATCCCGCCCCCGCAGCTTGCCAGCTCTTTCAGTATCATGGAGCCCAT" +
|
||||
"GGTTGAATGAGTCCAATAACGAACTTCGACATGATAAAATCCCCCCCTCGCGACTTCCAG" +
|
||||
"AGAAGAAGACTACTGACTTGAGCGTTCCCAGCACTTCAGCCAAGGAAGTTACCAATTTTT" +
|
||||
"TGTTTCCGAATGACACGCGTCTCCTTGCGGGTAGATCGCCGACCGCAGAACTTACGAGCC" +
|
||||
"AGGGGAAACAGTAAGGCCTAATTAGGTAAAGGGAGTAAGTGCTCGAACGCTTCAGATGTA" +
|
||||
"ACCATATACTTACGCTGGATCTTCTCCCGCGAATTTTAACCCTCACCAACTACGAGATTT" +
|
||||
"GAGGTAAACCAAATAAGCACGTAGTGGCGCTATCCGACTGTTCCCAAATTGTAACTTATC" +
|
||||
"GTTCCGTGAAGGCCAGAGTTACTTCCCGGCCCTTTCCATGCGCGCACCATACCCTCCTAG" +
|
||||
"TTCCCCGGTTATCTCTCCGAGGAGGGAGTGAGCGATCCTCCGTTTACGTTTTGTTACCAA" +
|
||||
"TGACGTAGCTATGTATTTTGTACAGGTTGCCAACGGGTTTCACAATTCACAGATAGTGGG" +
|
||||
"GATCCCGGCAAAGGGCCTATATTTGCGGTCCAACTTAGGCGTAAACTACGATGGTACCTA" +
|
||||
"CTCAGACCCAGCTCGCGCGGCGTAAATAACGCACTCATCCCAGCTGATTCTCGGCGATCT" +
|
||||
"ACGCAGCGACATGATTATCAACAGCTGTCTGGCAGCTCTAATCTTTTACCATGGTCGTAA" +
|
||||
"AAGCCTCCAAGAGTTAGATCATACCTAACGCCACAAAAGTGACACGACGCCGATGGGTAC" +
|
||||
"CGGACTTTAGGTCGACCACAGTTCGGTAAGGGAGAGGCCCTGCGGCGTACTTCATTTTGT" +
|
||||
"ATATGCAACGTGCCCAAGTGGCGCCAGGCAAGTCTCAGCTGGTTCCTGTGTTAGCTCGAG" +
|
||||
"GCTAGGCATGGGAGCTGATTGAACATGGGTTGGGGGCCTCGAACCGTCGAGGACCCCATA" +
|
||||
"GTACCTCGGACACCAAGTAGGGCAGCCTATAGTTTGAAGCAGTACTATTTCAGGGGGGGA" +
|
||||
"GCCCTCATGGTCTCTTCTACTGATGACTCAACACGCTAGGGACGTGAAGTCGATTCCTTC" +
|
||||
"GATGGTTATAAATCAAAGGCTCAGAGTGCAGTCTGGAGCGCCCATCTAACGGTACGCATC" +
|
||||
"TCGATTGCTCGGTCGCCTTTCACACTCCGCGAAAATTCATACCGCTCATTCACTAGGTTG" +
|
||||
"CGAAGCCTACACTGATATATGAATCCAAGCTAGAGCAGGGCTCTTAAAATTCGGAGTTGT" +
|
||||
"AGATGCTCAATACTCCAATCGGTTTTTTCGTGCACCACCGCGGGTGGCTGACAAGGGTTT" +
|
||||
"GACATCGAGAAACAAGGCAGTTCCGGGCTGAAAGTAGCGCCGGGTAAGGTACGCGCCTGG" +
|
||||
"TATGGCAGGACTATGAAGCCAATACAAAGGCTACATCCTCACTCGGGTGGACGGAAACGC" +
|
||||
"AGAATTATGGTTACTTTTTGGATACGTGAAACATGTCCCATGGTAGCCCAAAGACTTGGG" +
|
||||
"AGTCTATCACCCCTAGGGCCCATTTCTGGATATAGACGCCAGGTTGAATCCGTATTTGGA" +
|
||||
"GGTACGATGGATCAGTCTGGGTGGGACGTGCTCCATTTATACCCTGCGCAGGCTGGACCG" +
|
||||
"AGGACCGCAAGATGCGACGGTGCACAAGTAATTGACAACAAACCATCGTGTTTTCATTAT" +
|
||||
"GGTACCAGGATCTTCAAGCCGAGTCAATCAAGCTCGGATTACAGTGTTTACCGCGTCTTG" +
|
||||
"CGGTTACTCACAAACTGTAATCCACCACAAGTCAAGCCATTGCCTCTCTGAGACGCCGTA" +
|
||||
"TGAATTAATATGTAAACTTTGCGCGGGTTCACTGCGATCCGTTCAGTCTCGTCCAAGGGC" +
|
||||
"ACAATCGAATTCCCATTTGTATGTTCGGCTAACTTCTACCCATCCCCCGAAGTTTAGCAG" +
|
||||
"GTCGTGAGGTGTCATGGAGGCTCTCGTTCATCCCGTGGGACATCAAGCTTCGCCTTGATA" +
|
||||
"AAGCACCCCGCTCGGGTGTAGCAGAGAAGACGCCTACTGAATTGTGCGATCCCTCCACCT" +
|
||||
"CAGCTAAGGTAGCTACCAATATTTAGTTTTTTAGCCTTGCGACAGACCTCCTACTTAGAT" +
|
||||
"TGCCACGCATTGAGCTAGCGAGTCAGCGATAAGCATGACGCGCTTTCAAGCGTCGCGAGT" +
|
||||
"ATGTGAACCAAGGCTCCGGACAGGACTATATACTTGGGTTTGATCTCGCCCCGACAACTG" +
|
||||
"CAAACCTCAACATTTATAGATTATAAGGTTAGCCGAAATTGCACGTGGTGGCGCCCGCCG" +
|
||||
"ACTGCTCCCCGAGTGTGGCTCTTTGATCTGACAACGCGCGACCTCCATCGCGGCCGATTG" +
|
||||
"TTTCTGCGGACCATGTCGTCCTCATAGTTTGGGCATGTTTCCGTTGTAGGAGTGAAGCCA" +
|
||||
"CTTAGCTTTGCGCCGTAGTCCCAATGAAAAACCTATGGACTTTGTTTTGGGTAGCATCAG" +
|
||||
"GAATCTGAACCCTGTGAATGTGGGGGTCGCGCGCATAGACCTTTATCTCCGGTTCAAGTT" +
|
||||
"AGGCATGAGGCTGCATGCTACGTTGTCACACCTACACTGCTCGAAGTAAATATGGGAAGC" +
|
||||
"GCGCGGCCTGGCCCGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGC" +
|
||||
"ACATAAGCAATACCGTAGTCCCTCAAATTCAGCTCTGTTATCTCGAGCGTTATGTGTCAA" +
|
||||
"ATGGCGTAGAACGGGATTGACTGTTTGACACTAGCTGGTGTTCGGTTCGGTAACGGAGAA" +
|
||||
"TCTGTGGGGCTATGTCACTAATACTTTCGAAACGCCCCGTACCGATGCTGAACAAGTCGA" +
|
||||
"TGCAGGCTCCCGTCTTTGAATAGGGGTAAACATACAAGTCGATAGAAGATGGGTAGGGGC" +
|
||||
"CTCCAATTCATCCAACACTCTACGCCTTCTCCAAGAGCTAGTAGGGCACCCTGCAGTTGG" +
|
||||
"AAAGGGAACTATTTCGTAGGGCGAGCCCATACCGTCTCTCTTGCGGAAGACTTAACACGA" +
|
||||
"TAGGAAGCTGGAATAGTTTCGAACGATGGTTATTAATCCTAATAACGGAACGCTGTCTGG" +
|
||||
"AGGATGAGTGTGACGGAGTGTAACTCGATGAGTTACCCGCTAATCGAACTGGGCGAGAGA" +
|
||||
"TCCCAGCGCTGATGCACTCGATCCCGAGGCCTGACCCGACATATCAGCTCAGACTAGAGC" +
|
||||
"GGGGCTGTTGACGTTTGGGGTTGAAAAAATCTATTGTACCAATCGGCTTCAACGTGCTCC" +
|
||||
"ACGGCTGGCGCCTGAGGAGGGGCCCACACCGAGGAAGTAGACTGTTGCACGTTGGCGATG" +
|
||||
"GCGGTAGCTAACTAAGTCGCCTGCCACAACAACAGTATCAAAGCCGTATAAAGGGAACAT" +
|
||||
"CCACACTTTAGTGAATCGAAGCGCGGCATCAGAATTTCCTTTTGGATACCTGATACAAAG" +
|
||||
"CCCATCGTGGTCCTTAGACTTCGTGCACATACAGCTGCACCGCACGCATGTGGAATTAGA" +
|
||||
"GGCGAAGTACGATTCCTAGACCGACGTACGATACAACTATGTGGATGTGACGAGCTTCTT" +
|
||||
"TTATATGCTTCGCCCGCCGGACCGGCCTCGCGATGGCGTAGCTGCGCATAAGCAAATGAC" +
|
||||
"AATTAACCACTGTGTACTCGTTATAACATCTGGCAGTTAAAGTCGGGAGAATAGGAGCCG" +
|
||||
"CAATACACAGTTTACCGCATCTAGACCTAACTGAGATACTGCCATAGACGACTAGCCATC" +
|
||||
"CCTCTGGCTCTTAGATAGCCCGATACAGTGATTTTGAAAGGTTTGCGGGGCACAGCTATG" +
|
||||
"ACTTGCTTAGCTACGTGTGAGGGAAGGAACTTTTGCGTATTTGTATGTTCACCCGTCTAC" +
|
||||
"TACGCATGCGGGCAGATTATGTAGGTTGAGAGATGCGGGAGAAGTTCTCGACCTTCCCGT" +
|
||||
"GGGACGTGAACCTATCCCCTAATAGAGCATTCCGTTCGAGCATGGCAGTAAGTACGCCTT" +
|
||||
"CTCAATTGTGCTAACCTTCATCCCTATCAAAGCTTGGAGCCAATGATCAGGGTTATTCCC" +
|
||||
"TTGGGACAGACTTCCTACTCACAGTCGGTCACATTGGGCTACTCCATGGGTCTTCAGCTT" +
|
||||
"GACCCGGTCTGTTGGGCCGCGATTACGTGAGTTAGGGCCCCGGACTGCGCTGTATAGTCG" +
|
||||
"ATTCTCATCCGGCCCCCACATCTGGAAACCCCAACTTATTTAGATAACATGATTAGCCGA" +
|
||||
"AGTTGCACGGCGTGTCCACCGTGGAGTCCTCCCCGGGTGTCCCTCCTTCATTTGACGATA" +
|
||||
"AGCAGCGGCTACCACCATTGATTAACACAAGGAACGGTGATGTTAACATAGATTCGGCAC" +
|
||||
"ATTACTCTTGTAGGTGTGGAATCACTTAGCTACGCGGCGAAGCCTTATGGCAAAACCGAT" +
|
||||
"GGGCAATGATTCGGGTAGCGCTAAAAGTCCATAGCACGTGCATCCCAACGTGGCGTGCGT" +
|
||||
"ACAGCTTGACCACCGCTTCACGCTAAGGTGCTGGCCACATGCTAAATTGATGCGCCTGCA" +
|
||||
"CTGCTCAAAGGATAATTACGAAGCGGGCGGCCTGGCGGGAGCACTACCCCATCGACGCGT" +
|
||||
"ACTCGAATACTGTTTATTGCTCACACATGAACAAATTAGTAGAGTGCCACTTTCAGCCCT" +
|
||||
"CTTGTCGTCGGCGATGTGTGTAAAATGGCGTTGATGTGGATCGACTCTATAAAGGTATCT" +
|
||||
"ACTGATGCGTAGGGAGATCCGGAATCTATTGGCCTATGTCACTGAAACTATCCAAACACC" +
|
||||
"CCATGTCGATACTGAACGTATCGACGCATACCTCCTTCCTTGAAAACGCACAATCATACA" +
|
||||
"ACTGGGCACATAATGCGTACGCCCATCTAGTACACCCATCTCTGTGGGTCCAGTTCAAGA" +
|
||||
"GCTGGAAGAGCACCCTCCACAAGGTCAAGTGGTATCCTGGTAAGGTAAGCTCGTACCGTG" +
|
||||
"ATTCATGCGACAGGGGTAAGACCATCAGTAGTAGGGATAGTGCCAAACCTCACTCACCAC" +
|
||||
"TGCCAATAAGGGGTCCTTACCTGAAGAATAAGTGTCAGCCAGTGTAACCCGATGAGGAAC" +
|
||||
"CCAAAAGGCGAACCGGGCCAGACAACCCGGCGGTATCGCACTCAAAGCCGGGACACGACG" +
|
||||
"CGTCACAGCCGGTAAGAGTAACCCCGGAGTGAAGACCTATGGGGCTGGATAAAACTGCCG" +
|
||||
"TGGTAACCGCCTTCAACAACCCGAATACGTGGCACTTCAGGAGGCGCCCGGAGGGGGGAT" +
|
||||
"GTTTTCTACTATTCGAGGCCGTTCGTTATAACTAGTTGCGTTCCTAGCCGCTATAATTGT" +
|
||||
"CTCTTTGCCGACTAATGAGAACAACCACACCATAGCGATTTGACGCGGCGCCTCGGAATA" +
|
||||
"CCGTTTCAGCAGGCGCTTGGTAAGGCCATCGCGAATACCAGGTATCGTGTAAGTAGCGTA" +
|
||||
"GGCCCGCACGCAAGATAAACTGCTAGGGAACCGCGTTTCCACGACCGGTGCACGATTTAA" +
|
||||
"TTTCGCCGACGTGATGACATTCCAGGCAGTGCCTCTGCCGCCGGACCCCTCTCGTGATTG" +
|
||||
"GGTAGCTGGACATGCCCTTGTAAGATATAACAAGAGCCTGCCTGTCTAATGATCTCACGG" +
|
||||
"CGAAAGTCGGGGAGACAGCAGCGGCTGCAGACATTATACCGCAACAACACTAAGGTGAGA" +
|
||||
"TAACTCCGTAATTGACTACGCGTTCCTCTAGACCTTACTTGACCGGATACAGTGTCTTTG" +
|
||||
"ACACGTTTATGGGTTACAGCAATCACATCCAAGACTGGCTATGCACGAAGCAACTCTTGA" +
|
||||
"GTGTTAAAATGTTGACCCCTGTATTTGGGATGCGGGTAGTAGATGAGTGCAGGGACTCCG" +
|
||||
"AGGTCAAGTACATTACCCTCTCATAGGGGGCGTTCTAGATCACGTTACCACCATATCATT" +
|
||||
"CGAGCATGACACCATCTCCGCTGTGCCCATCCTAGTAGTCATTATTCCTATCACGCTTTC" +
|
||||
"GAGTGTCTGGTGGCGGATATCCCCCACGAATGAAAATGTTTTTCGCTGACAGTCATATTG" +
|
||||
"GGGTGCTCCTAAGCTTTTCCACTTGGCTGGGTCAGCTAGGCCTCCGTGCCCGGAGTTTCG" +
|
||||
"GCGCAGTGCTGCCGACAGCCGGCCATTGTCTTTGGGGCCTCATTCGAGGGTACCCGGACC" +
|
||||
"TATCTTGTCGGGACCACCCGGGGTAGTCGTTGGGCTTATGCACCGAAAAGCCCTGCGCCG" +
|
||||
"GCCTCCCCGCTACGGAAGGTGATAAGCTCCGGCAAGCAATTATGAACAACGCAAGGATCG" +
|
||||
"CGGATATAAACAGAGAAACGGCTGATTACACCTGTTCGTGTGGTATCGGTAAATAGCCTC" +
|
||||
"GCGGAGCCTTATGCCATACTCGTCCGCGGAGCACTCTGGTAATGCATATGGTCCACAGGA" +
|
||||
"CATTCGTCGCTTCCGGGTATGCGCTCTATTTGACGGTCCTTTGGCGCACAGATGCTGGCC" +
|
||||
"ACCATTTAAATTAGAGCGACTCCACATCTGTAAGGTCCGCCACGCAGACGACAGCCCAGG" +
|
||||
"GAGACCACTGACCGATCTACCTGAACGGCAACCTTCTGTATCGTACTGGGGCGGAGAGAT" +
|
||||
"AACTACAGTGCCGCTTACAGCCCCTCTGTCGTCGCCGACGTCTGTAGTCTAGCCTCATTA" +
|
||||
"TGATTGCACGCTATTGAGGCATTGACTGATGCCGGAAGACATCTGAAATGAACTGGTCTA" +
|
||||
"TGCGACAGAAACCGTGCACCTACCAAATCTCCTTAGTGTAGGTTCTGACCGATTCGTGCT" +
|
||||
"TCGTTGAGAACTCACATTTTAACAACAGAGGACATATGCCCTACCTCCATGATCTACTGA" +
|
||||
"CGTCCCTGAGGCTGCAATTCATGTAATGGGGCAGTATCCGCGGCAAGTCCTAGTGCAATG" +
|
||||
"GCGGTTTTTTACCCTCGTTCTGAAGAAGAGGCGACGCGGGTGCGGTCATCACTAATGTGG" +
|
||||
"AAATTGGGAAGACTCTCGGGCCTCCGCCTTTAGGCGGTGCTTACTCTTTCATAAAGGGGC" +
|
||||
"TGTTAGTTATGCCCCGCGAGGATTCGAAAAGGTGAGCCAACTCGGCCGATCCGGAGAGAC" +
|
||||
"GGGCTTCAAAGCTGCCTGACGACGGTTGTGGGCCCGTAACAAAATCCTCCCAATAAGCCC" +
|
||||
"CCGTGAGCGTCGGTTGAACAGCCCTGGTCGGCCCGACCAGAAGCCCGAATATATCGCTTT" +
|
||||
"ACGGCTCTTGGGCCGGGGTGCGTTACCTTGCAGAAATCGAGGCCGTCCGTTAATTCCTGT" +
|
||||
"TGCATTCATACCGCGTATATTTGTCTCTTTACCCGCTTACTTGGATAAGCATGGCATAGC" +
|
||||
"TTTTTATCGGAGCGCCTCCGTACACGGTACGATCGCACGCCTCGTGAGATCAATACGTAT" +
|
||||
"ACCAGGTGTCCTGTGAGCAGCGAAAGCCTAAACGGGAGATACACCGCCAAAAGTCCGTGT" +
|
||||
"GAATACGAGTCGTGGCAAATTTGGTCTGGCTGTGATCTAGATATTCCAGGCGGTACGTCT" +
|
||||
"GCTCTCGCGTGCCTCTAGTGGCTCGCTAGATAGTCTAGCCGCTGGTAAACACTCCATGAC" +
|
||||
"CCCGGCTCTCCATTGATGCCACGGCGATTGTTGGAGAGCCAGCAGCGACTGCAAACGTCA" +
|
||||
"GATCAGAGTAATACTAGCAAGCGATAAGTCCCTAACTGGTTGTGGCCTTCTGTAGAGTGA" +
|
||||
"ACTTCACCACATATGCTGTCTCTGGCACGTGGATGGTTTGGAGAAATCAGATTCAAGTCT" +
|
||||
"GATCAACCTTCAAACAGATCTAGAGTCTAAAACAGTGATCTCCTGCGTGCGAGATAGAAA" +
|
||||
"TACTAGGTAACTACAGGGACTGCGACGTTTTAAACGTTGGTCCGTCAGAAGCGCCATTCA" +
|
||||
"GGATCACGTTACCCCGAAAAAAAGGTACCAGGAGCTCTTCTCCTCTGCAGTCAGGTCTAT" +
|
||||
"AGAAACTACACCATTAACCTTCCTGAGAACCGGGAGGTGGGAATCCGTCACATATGAGAA" +
|
||||
"GGTATTTGCCCGATAATCAATACTCCAGGCTTCTAACTTTTTCCACTCGCTTGAGCCGGC" +
|
||||
"TTGGCCTTTCTGCCTGAAGATTCGTTGGACTGGTGCCAACGCGCAGGCATAGTTCCAGGA" +
|
||||
"GAATTATCCGGGGGCAGTGACAACCAACATCTCGGGTCTTGCCCAACCGGTCTACACGCT" +
|
||||
"GATATAGCGAATCACCGAGAACCCGGCGCCACGCAATGGAACGTCCTTAACTCTGGCAGG" +
|
||||
"CAATTAAAGGGAACGTATATATAACGCAAAAAAACTGGAAAATTGGCGAGAGAATCTTCT" +
|
||||
"CTCTGTCTATCGAAGAATGGCCACGCGGAGGCATGCGTCATGCTAGCGTGCGGGGTACTC" +
|
||||
"TTGCTATCCATTTGGGTCACAGGACACTCGCTGTTTTCGAATTTACCCTTTATGCGCCGG" +
|
||||
"TATTGAACCACGCTTATGCCCAGCATCGTTACAACCAGACTGATACTAGATGTATAATGT" +
|
||||
"CCGCCATGCAGACGAAACCAGTCGGAGATTACCGAGCATTCTATCACGTCGGCGACCACT" +
|
||||
"AGTGAGCTACTGGAGCCGAGGGGTAACGATGATGCCCCTAAGAACCTCTCGGTCGACGCA" +
|
||||
"AGCGATTACACTCCTGTCACATCATAATCGTTTGCTATTCAGGGGTTGACCAACACCGGA" +
|
||||
"TAGCTTTTCACTTGAAGTATTATGCACGACAGGGTGCGTGTACCAACTAAACCTGTTTTA" +
|
||||
"ACTTACCTCAGACTAGTTGGAAGTGTGGCTAGATCTTAGCTTTCGTCACTAGAGGGCCCA" +
|
||||
"CGCTTAGTTTTTATGATCCATTGATCTCCTAGACGCTGCAAGATTTGCAACCAGGCAGAC" +
|
||||
"TTAGCGGTAGGTCCTAGTGCAGCGGGACTTTTTTTCTATAGTCGTTGAGAGGAGGAGTCG" +
|
||||
"TCAGACCAGATACCTTTGATGTCCTGATTGGAAGGACCGTTGGCCCCCGACCCTTAGACA" +
|
||||
"GTGTACTCAGTTCTATAAACGAGCTATTAGATATGAGATCCGTAGATTGAAAAGGGTGAC" +
|
||||
"GGAATTCGCCCGGACGCAAAAGACGGACAGCTAGGTATCCTGAGCACGGTTGCGCGTCCG" +
|
||||
"AATCAAGCTCCTCTTTACAGGCCCCGGTTTCTGTTGGTCGTAGAGCGCAGAACGGATTGG" +
|
||||
"GGGGATGTACGACAATATCTCTTAGTCACCTTTGGGTCACGGTCTGCTACCTTACAGGAA" +
|
||||
"TTCAGACCGTCCTTTAATTTCCCTTGCATATATGTTGCGTTTCTTCGACCTTCTAACCGC" +
|
||||
"ACCCTTAGGACGAAGACAGATACGTTCTTACCCATACTCCACCGTTGGCAGCGGGATCGC" +
|
||||
"ATGTCCCACGTGAAACATTGCTAAACCCTCAGGCCTCTGAGCGACAAAAGCTTTAAAGGG" +
|
||||
"AAATTCGCGCCCATAACTTGGTCCGAATACGGGTTCTAGCATCGTTCGTCTGAGTTTGTT" +
|
||||
"CTATATAAAACGGGCGCAATGTCTGCTTTGATCAACCTCCAATACCTCGTATGATTGTGC" +
|
||||
"ACCCGCCGGTGACCACTCAATGATGTGGGGTCCCCGTTGCAACTACGAGGATTTATTGAG" +
|
||||
"ACCGACCTACGTTCGGCATTGTGGGCAGAGTGAAGTATTGGCAAACGTTAAGTGCCGAAC" +
|
||||
"TAGATCTGACCTAACGGTAAGAGAGTTTCATAATACGTCCAGCCGCATGCGCAGGGTACA" +
|
||||
"TTTGGACAGTATTGAATGGACTCTGATCAACCTTCACACCGATCTAGAAACGAGTGCGTA" +
|
||||
"GATCAGCCAGGTGCAAACCAAAAATTCTAGGTTACTAGAAGTTTTGCGACGTTCTAAGAG" +
|
||||
"TTGGACGAAATGTTTCGCGACCTAGGATGAGGTCGCCCTAGAAAATAGATTTCTGCTACT" +
|
||||
"CTCCTCATAAGCAGTCCGGTGTATCGAAAGTACAAGACTAGCCTTGCTAGCAACCGCGGG" +
|
||||
"CTGGGAGCCTAAGGCATCACTCAAGATACAGGCTCGGTAACGTACGCTCTAGCCATCTAA" +
|
||||
"CTATCCCCTATGTCTTATAGGGACCTACGTTATCTGCCTG";
|
||||
|
||||
protected final static String REF_MD5 = new String(DigestUtils.md5(REF));
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,249 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph;
|
||||
import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.RandomDNA;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.haplotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Tests for {@link AssemblyResultSet}.
|
||||
*
|
||||
* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
|
||||
*/
|
||||
public class AssemblyResultSetUnitTest extends BaseTest
|
||||
{
|
||||
private GenomeLocParser genomeLocParser;
|
||||
private SAMFileHeader header;
|
||||
|
||||
@BeforeClass
|
||||
public void init() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testEmptyResultSet() {
|
||||
final AssemblyResultSet subject = new AssemblyResultSet();
|
||||
|
||||
Assert.assertEquals(subject.getHaplotypeList().size(), 0);
|
||||
Assert.assertEquals(subject.getHaplotypeCount(),0);
|
||||
Assert.assertEquals(subject.getReferenceHaplotype(),null);
|
||||
Assert.assertEquals(subject.getFullReferenceWithPadding(),null);
|
||||
Assert.assertEquals(subject.getPaddedReferenceLoc(),null);
|
||||
Assert.assertEquals(subject.getRegionForGenotyping(),null);
|
||||
Assert.assertEquals(subject.getUniqueReadThreadingGraph(10),null);
|
||||
Assert.assertFalse(subject.hasMultipleKmerSizes());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddReferenceHaplotype() {
|
||||
|
||||
final Haplotype ref = new Haplotype("ACGT".getBytes(),true);
|
||||
ref.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,ref.length() + 1 ));
|
||||
final AssemblyResultSet subject = new AssemblyResultSet();
|
||||
|
||||
Assert.assertTrue(subject.add(ref));
|
||||
Assert.assertFalse(subject.add(ref));
|
||||
|
||||
Assert.assertEquals(subject.getReferenceHaplotype(),ref);
|
||||
Assert.assertEquals(subject.getHaplotypeCount(),1);
|
||||
Assert.assertEquals(subject.getHaplotypeList().size(),1);
|
||||
}
|
||||
|
||||
@Test(dataProvider="assemblyResults")
|
||||
public void testAddManyHaplotypes(final java.util.List<AssemblyResult> assemblyResults,
|
||||
final java.util.List<java.util.List<Haplotype>> haplotypes) {
|
||||
final AssemblyResultSet subject = new AssemblyResultSet();
|
||||
for (int i = 0; i < haplotypes.size(); i++) {
|
||||
final int haplotypeCountBefore = subject.getHaplotypeCount();
|
||||
final java.util.List<Haplotype> haplos = haplotypes.get(i);
|
||||
final AssemblyResult ar = assemblyResults.get(i);
|
||||
for (final Haplotype h : haplos) {
|
||||
Assert.assertTrue(subject.add(h, ar));
|
||||
Assert.assertFalse(subject.add(h,ar));
|
||||
if (h.isReference())
|
||||
Assert.assertEquals(subject.getReferenceHaplotype(),h);
|
||||
}
|
||||
final int haplotypeCountAfter = subject.getHaplotypeCount();
|
||||
Assert.assertEquals(haplos.size(),haplotypeCountAfter - haplotypeCountBefore);
|
||||
Assert.assertTrue(subject.getMaximumKmerSize() >= ar.getKmerSize());
|
||||
Assert.assertTrue(subject.getMinimumKmerSize() <= ar.getKmerSize());
|
||||
Assert.assertEquals(subject.getUniqueReadThreadingGraph(ar.getKmerSize()), ar.getThreadingGraph());
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider="trimmingData")
|
||||
public void testTrimTo(final Map<Haplotype,AssemblyResult> haplotypesAndResultSets, final ActiveRegion original) {
|
||||
final AssemblyResultSet subject = new AssemblyResultSet();
|
||||
for (final Map.Entry<Haplotype,AssemblyResult> entry : haplotypesAndResultSets.entrySet())
|
||||
subject.add(entry.getKey(),entry.getValue());
|
||||
subject.setRegionForGenotyping(original);
|
||||
final GenomeLoc originalLocation = original.getExtendedLoc();
|
||||
final int length = originalLocation.size();
|
||||
final GenomeLoc newLocation = originalLocation.setStop(originalLocation.setStart(originalLocation,originalLocation.getStart() + length / 2),originalLocation.getStop() - length / 2);
|
||||
final ActiveRegion newRegion = original.trim(newLocation);
|
||||
|
||||
final Map<Haplotype,Haplotype> originalHaplotypesByTrimmed = new HashMap<>(haplotypesAndResultSets.size());
|
||||
for (final Haplotype h : haplotypesAndResultSets.keySet())
|
||||
originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h);
|
||||
|
||||
final AssemblyResultSet trimmed = subject.trimTo(newRegion, originalHaplotypesByTrimmed);
|
||||
|
||||
Assert.assertFalse(subject.wasTrimmed());
|
||||
Assert.assertTrue(trimmed.wasTrimmed());
|
||||
|
||||
for (final Haplotype h : trimmed.getHaplotypeList()) {
|
||||
Assert.assertEquals(h.getGenomeLocation(),newLocation);
|
||||
Assert.assertEquals(h.getBases().length,newLocation.size());
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name="trimmingData")
|
||||
public Iterator<Object[]> trimmingData() {
|
||||
final ActiveRegion activeRegion = new ActiveRegion(genomeLocParser.createGenomeLoc("chr1",1000,1100),genomeLocParser,25);
|
||||
final int length = activeRegion.getExtendedLoc().size();
|
||||
final RandomDNA rnd = new RandomDNA(13); // keep it prepoducible by fixing the seed to lucky 13.
|
||||
final ActiveRegionTestDataSet actd = new ActiveRegionTestDataSet(10,new String(rnd.nextBases(length)),new String[] {
|
||||
"Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]);
|
||||
|
||||
final List<Haplotype> haplotypes = actd.haplotypeList();
|
||||
for (final Haplotype h : haplotypes)
|
||||
h.setGenomeLocation(activeRegion.getExtendedLoc());
|
||||
|
||||
final ReadThreadingGraph rtg = new ReadThreadingGraph(10);
|
||||
for (final Haplotype h : haplotypes)
|
||||
rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference());
|
||||
final SeqGraph seqGraph = rtg.convertToSequenceGraph();
|
||||
final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph);
|
||||
ar.setThreadingGraph(rtg);
|
||||
final Map<Haplotype,AssemblyResult> result =
|
||||
new HashMap<>();
|
||||
for (final Haplotype h : haplotypes)
|
||||
result.put(h,ar);
|
||||
return Collections.singleton(new Object[] {result,activeRegion}).iterator();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@DataProvider(name="assemblyResults")
|
||||
public java.util.Iterator<Object[]> assemblyResults() {
|
||||
final int size = THREE_KS_GRAPH_AND_HAPLOTYPES.length * (1 + TEN_KS_GRAPH_AND_HAPLOTYPES.length);
|
||||
final Object[][] result = new Object[size][];
|
||||
|
||||
for (int i = 0; i < THREE_KS_GRAPH_AND_HAPLOTYPES.length; i++) {
|
||||
final ReadThreadingGraph rtg = new ReadThreadingGraph((String) THREE_KS_GRAPH_AND_HAPLOTYPES[i][0]);
|
||||
final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg.convertToSequenceGraph());
|
||||
ar.setThreadingGraph(rtg);
|
||||
final Object[] haplotypeStrings = (Object[]) THREE_KS_GRAPH_AND_HAPLOTYPES[i][1];
|
||||
final Haplotype[] haplotypes = new Haplotype[haplotypeStrings.length];
|
||||
for (int j = 0; j < haplotypeStrings.length; j++) {
|
||||
haplotypes[j] = new Haplotype(((String)haplotypeStrings[j]).getBytes(),j == 0);
|
||||
haplotypes[j].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,haplotypes[j].length() + 1));
|
||||
}
|
||||
result[i] = new Object[] { Collections.singletonList(ar),Arrays.asList(Arrays.asList(haplotypes))};
|
||||
for (int j = 0; j < TEN_KS_GRAPH_AND_HAPLOTYPES.length; j++) {
|
||||
final ReadThreadingGraph rtg10 = new ReadThreadingGraph((String) TEN_KS_GRAPH_AND_HAPLOTYPES[j][0]);
|
||||
final AssemblyResult ar10 = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg10.convertToSequenceGraph());
|
||||
ar10.setThreadingGraph(rtg10);
|
||||
final Object[] haplotypeStrings10 = (Object[]) TEN_KS_GRAPH_AND_HAPLOTYPES[j][1];
|
||||
final Haplotype[] haplotype10 = new Haplotype[haplotypeStrings10.length];
|
||||
for (int k = 0; k < haplotypeStrings10.length; k++) {
|
||||
haplotype10[k] = new Haplotype(((String)haplotypeStrings10[k]).getBytes(),false);
|
||||
haplotype10[k].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1", 1, haplotype10[k].length() + 1));
|
||||
}
|
||||
|
||||
result[THREE_KS_GRAPH_AND_HAPLOTYPES.length + i * TEN_KS_GRAPH_AND_HAPLOTYPES.length + j] = new Object[] { Arrays.asList(ar,ar10),
|
||||
Arrays.asList( Arrays.asList(haplotypes), Arrays.asList(haplotype10)) };
|
||||
}
|
||||
}
|
||||
return Arrays.asList(result).iterator();
|
||||
}
|
||||
|
||||
|
||||
private static final Object[][] THREE_KS_GRAPH_AND_HAPLOTYPES = new Object[][] {
|
||||
{"[ks=3]{REF: ACT}",new Object[] {"ACT"}},
|
||||
{"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" +
|
||||
"{ (3) -> A -> G -> (2) }" +
|
||||
"{ (1) -> A -> G -> (2) }",new Object[] {"ACTTGA","ACTAGGA","ACTTAGGA"}},
|
||||
{"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}", new Object[] {"ACTCG"}} ,
|
||||
{"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" +
|
||||
"{A(1) -> T -> A(2) }", new Object[] {"ACTAGACGT","ACTATACGT"}} ,
|
||||
{"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" +
|
||||
"{ ACT -> A -> T(2) -> C -> T -> A -> C -> G -> T -> A(1) -> T}",
|
||||
new Object[] {"ACTATCAGTACGTAT","ACTATCTACGTAT"}} ,
|
||||
{"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T}",
|
||||
new Object[] {"ACTATCAGTACGTAT"}},
|
||||
{"[ks=3]{REF: ACT -> A -> T(1) }" +
|
||||
"{ ACT -> A -> T(1) }", new Object[] {"ACTAT"}},
|
||||
{"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ", new Object[] {"TTTACT","TTTAT"}}
|
||||
};
|
||||
|
||||
private static final Object[][] TEN_KS_GRAPH_AND_HAPLOTYPES = new Object[][] {
|
||||
{"[ks=10]{ACTAGTAAAT -> A -> T -> A -> A -> T -> A", new Object[] {"ACTAGTAAATATAATA"}},
|
||||
{"[ks=10]{ATAGTAATAA(1) -> A -> C -> T -> A(2) -> C}{ (1) -> C -> C -> C -> A(2) -> C}",
|
||||
new Object[] {"ATAGTAATAAACTAC","ATAGTAATAACCCAC"}},
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,401 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: valentin
|
||||
* Date: 8/7/13
|
||||
* Time: 5:58 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class CivarUnitTest extends BaseTest {
|
||||
|
||||
|
||||
@Test(dataProvider="validCivarExamples")
|
||||
public void testValidCivarInstanciation(final String civarString) {
|
||||
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertNotNull(civar);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider="expectedElementLengths")
|
||||
public void testValidCivarElementLength(final String civarString, final int expected) {
|
||||
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.elements().size(), expected);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider="expectedElementSizes")
|
||||
public void testValidCivarElementSizes(final String civarString, final int[] expected) {
|
||||
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.elements().size(),expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
Assert.assertEquals(civar.elements().get(i).size(),expected[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider="expectedElementOperators")
|
||||
public void testValidCivarElementOperators(final String civarString, final String expected) {
|
||||
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.elements().size(),expected.length());
|
||||
for (int i = 0; i < expected.length(); i++) {
|
||||
Assert.assertEquals(civar.elements().get(i).operator().charValue,expected.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider="expectedMinimumSequenceLength")
|
||||
public void testValidCivarMinimumSequenceLength(final String civarString, final int expected) {
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.minimumTemplateSequenceSize(),expected);
|
||||
}
|
||||
|
||||
@Test(dataProvider="expectedHasVariation")
|
||||
public void testValidCivarHasVariation(final String civarString, final boolean expected) {
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.hasVariation(),expected);
|
||||
}
|
||||
|
||||
|
||||
@Test(dataProvider="invalidCivarExamples", expectedExceptions = {IllegalArgumentException.class})
|
||||
public void testInvalidInstanciation(final String civarString) {
|
||||
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
}
|
||||
|
||||
@Test(dataProvider="unrolledTestDataIsUnrolledExamples")
|
||||
public void testInUnrolled(final String civarString, final boolean expected) {
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.isUnrolled(),expected);
|
||||
}
|
||||
|
||||
@Test(dataProvider="unrolledTestDataUnrolledCivarExamples")
|
||||
public void testValidCivarUnrolling(final String civarString, final String[] expected) {
|
||||
Set<String> expectedSet = new HashSet<>();
|
||||
expectedSet.addAll(Arrays.asList(expected));
|
||||
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
java.util.List<Civar> unrolledList = civar.unroll();
|
||||
Assert.assertEquals(unrolledList.size(),expected.length);
|
||||
for (int i = 0; i < expected.length; i++) {
|
||||
Assert.assertTrue(expectedSet.contains(unrolledList.get(i).toString()),
|
||||
"Unrolled civar " + unrolledList.get(i).toString() + " not present in expected Set: " +
|
||||
Arrays.toString(expected) + ". Unrolled set is: " + Arrays.toString(unrolledList.toArray()));
|
||||
}
|
||||
}
|
||||
|
||||
@Test(dataProvider="applyToDataExamples")
|
||||
public void testValidCivarUnrolling(final String civarString, final String before, final String expectedAfter) {
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.applyTo(before),expectedAfter);
|
||||
}
|
||||
|
||||
@Test(dataProvider="optionizeDataExamples")
|
||||
public void testValidOptionizeAll(final String civarString, final String expected) {
|
||||
final Civar civar = Civar.fromCharSequence(civarString);
|
||||
Assert.assertEquals(civar.optionalizeAll().toString(),expected);
|
||||
}
|
||||
|
||||
@DataProvider(name="validCivarExamples")
|
||||
public Iterator<Object[]> validCivarExamples() {
|
||||
return new Iterator<Object[]>() {
|
||||
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < VALID_CIVAR_EXAMPLES.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] next() {
|
||||
return new Object[] { VALID_CIVAR_EXAMPLES[i++][0] };
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@DataProvider(name="expectedHasVariation")
|
||||
public Iterator<Object[]> expectedHasVariation () {
|
||||
return validCivarExamples(5);
|
||||
}
|
||||
|
||||
@DataProvider(name="expectedMinimumSequenceLength")
|
||||
public Iterator<Object[]> expectedMinimumSequenceLength () {
|
||||
return validCivarExamples(4);
|
||||
}
|
||||
|
||||
@DataProvider(name="expectedElementOperators")
|
||||
public Iterator<Object[]> expectedElementOperators() {
|
||||
return validCivarExamples(3);
|
||||
}
|
||||
|
||||
@DataProvider(name="expectedElementSizes")
|
||||
public Iterator<Object[]> expectedElementSizes() {
|
||||
return validCivarExamples(2);
|
||||
}
|
||||
|
||||
@DataProvider(name="expectedElementLengths")
|
||||
public Iterator<Object[]> expectedElementLengths() {
|
||||
return validCivarExamples(1);
|
||||
}
|
||||
|
||||
public Iterator<Object[]> validCivarExamples(final int field) {
|
||||
return new Iterator<Object[]>() {
|
||||
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < VALID_CIVAR_EXAMPLES.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] next() {
|
||||
return new Object[] { VALID_CIVAR_EXAMPLES[i][0], VALID_CIVAR_EXAMPLES[i++][field] };
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@DataProvider(name="unrolledTestDataIsUnrolledExamples")
|
||||
public Iterator<Object[]> unrolledTestDataIsUnrolledExamples() {
|
||||
return unrolledTestDataExamples(1);
|
||||
}
|
||||
|
||||
@DataProvider(name="unrolledTestDataUnrolledCivarExamples")
|
||||
public Iterator<Object[]> unrolledTestDataUnrolledCivarExamples() {
|
||||
return unrolledTestDataExamples(2);
|
||||
}
|
||||
|
||||
public Iterator<Object[]> unrolledTestDataExamples(final int field) {
|
||||
return new Iterator<Object[]>() {
|
||||
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < UNROLLED_TEST_DATA.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] next() {
|
||||
return new Object[] { UNROLLED_TEST_DATA[i][0], UNROLLED_TEST_DATA[i++][field] };
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@DataProvider(name="optionizeDataExamples")
|
||||
public Iterator<Object[]> optionizeDataExamples() {
|
||||
return optionizeDataExamples(1);
|
||||
}
|
||||
|
||||
public Iterator<Object[]> optionizeDataExamples(final int field) {
|
||||
return new Iterator<Object[]>() {
|
||||
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < OPTIONIZED_TEST_DATA.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] next() {
|
||||
return new Object[] { OPTIONIZED_TEST_DATA[i][0], OPTIONIZED_TEST_DATA[i++][field] };
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@DataProvider(name="applyToDataExamples")
|
||||
public Iterator<Object[]> applyToDataExamples() {
|
||||
return new Iterator<Object[]>() {
|
||||
|
||||
int i = 0;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return i < APPLY_TO_TEST_DATA.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object[] next() {
|
||||
return APPLY_TO_TEST_DATA[i++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@DataProvider(name="invalidCivarExamples")
|
||||
public Object[][] invalidCivarExamples() {
|
||||
return INVALID_CIVAR_EXAMPLES;
|
||||
}
|
||||
|
||||
// columns : Civar string, number of elements.
|
||||
private static final Object[][] INVALID_CIVAR_EXAMPLES = new Object[][] {
|
||||
{"(100="},
|
||||
{"*=)"},
|
||||
{"10(=2T30="},
|
||||
{"2*=2T/3*="},
|
||||
{"3I(acc)"},
|
||||
{"a"},
|
||||
{")"},
|
||||
{"100&=1"},
|
||||
{"?100="},
|
||||
|
||||
};
|
||||
|
||||
|
||||
private static final Object[][] VALID_CIVAR_EXAMPLES = new Object[][] {
|
||||
{"100=", 1, ints(100), "=", 100, false },
|
||||
{"*=", 1 , ints(1), "=", 0, false },
|
||||
{"10=2T30=", 3, ints(10,2,30), "=T=",42 , true},
|
||||
{"*=2T3*=", 3, ints(1,2,3), "=T=",2 , true},
|
||||
{"3Iacc",1 , ints(3), "I", 0, true},
|
||||
{"Ia",1, ints(1), "I", 0, true},
|
||||
{"10D",1, ints(10), "D", 10, true},
|
||||
{"*", 1, ints(1), "=", 0, false},
|
||||
{"*D", 1, ints(1), "D", 0, true},
|
||||
{"10(1D)10=",3, ints(10,1,10), "=(=", 21, true},
|
||||
{"1*",1, ints(1), "=", 0, false},
|
||||
{"1*2*",2, ints(1,2), "==", 0, false},
|
||||
{"*11",2, ints(1,11), "==", 11, false},
|
||||
{"100=1T100=", 3, ints(100,1,100), "=T=", 201, true},
|
||||
{"100=3Iacg101=", 3, ints(100,3,101), "=I=", 201, true},
|
||||
{"100=30Igctcggatgccttgcggggctccagagtcc101=", 3 , ints(100,30,101), "=I=", 201, true},
|
||||
{"99=3D99=", 3, ints(99,3,99), "=D=", 201, true},
|
||||
{"84=30D84=", 3, ints(84,30,84), "=D=", 198, true},
|
||||
{"91=1T9=3Iacg100=", 5, ints(91,1,9,3,100), "=T=I=", 201, true},
|
||||
{"71=1T29=3Iacg100=",5, ints(71,1,29,3,100), "=T=I=",201, true},
|
||||
{"75=1T8=1T8=1T8=1T8=1T75=", 11, ints(75,1,8,1,8,1,8,1,8,1,75), "=T=T=T=T=T=",187, true},
|
||||
{"75=1T?8=", 3, ints(75,1,8), "=T=", 84, true}
|
||||
};
|
||||
|
||||
private static final Object[][] UNROLLED_TEST_DATA = new Object[][] {
|
||||
{ "10=1D10=", true, strs( "10=1D10=") },
|
||||
{ "10=(1D)10=", true, strs( "10=(1D)10=") },
|
||||
{ "10=1D?10=", false, strs("10=1=10=", "10=1D10=") },
|
||||
{ "10=1D?10=3Iacg?10=", false , strs("10=1=10=0=10=","10=1=10=3Iacg10=", "10=1D10=0=10=", "10=1D10=3Iacg10=") },
|
||||
{ "10=1D?10=" , false, strs("10=1D10=","10=1=10=") },
|
||||
{ "100=1T?100=" , false, strs("100=1T100=","100=1=100=") },
|
||||
{ "100=3Iacg?101=" , false, strs("100=3Iacg101=","100=0=101=") },
|
||||
{ "100=30Igctcggatgccttgcggggctccagagtcc?101=", false ,strs("100=30Igctcggatgccttgcggggctccagagtcc101=", "100=0=101=") },
|
||||
{ "99=3D?99=", false , strs("99=3D99=","99=3=99=") },
|
||||
{ "84=30D?84=", false, strs("84=30D84=", "84=30=84=")},
|
||||
{ "91=1T?9=3Iacg?100=", false, strs("91=1T9=3Iacg100=", "91=1=9=3Iacg100=", "91=1=9=0=100=", "91=1T9=0=100=") },
|
||||
{ "71=1T?29=3Iacg?100=", false , strs("71=1T29=3Iacg100=","71=1=29=3Iacg100=","71=1=29=0=100=", "71=1T29=0=100=") },
|
||||
// { "75=1T?8=1T?8=1T?8=1T?8=1T?75=", false, },
|
||||
{ "75=1T?8=", false, strs("75=1T8=","75=1=8=") }
|
||||
};
|
||||
|
||||
private static final Object[][] OPTIONIZED_TEST_DATA = new Object[][] {
|
||||
{ "10=1D10=", "10=1D?10=" },
|
||||
{"100=1T100=","100=1T?100=" },
|
||||
{"100=3Iacg101=", "100=3Iacg?101=" },
|
||||
{"100=30Igctcggatgccttgcggggctccagagtcc101=","100=30Igctcggatgccttgcggggctccagagtcc?101="},
|
||||
{"99=3D99=", "99=3D?99="},
|
||||
{"84=30D84=", "84=30D?84="},
|
||||
{"91=1T9=3Iacg100=", "91=1T?9=3Iacg?100="},
|
||||
{"71=1T29=3Iacg100=","71=1T?29=3Iacg?100="},
|
||||
{"75=1T8=1T8=1T8=1T8=1T75=", "75=1T?8=1T?8=1T?8=1T?8=1T?75="},
|
||||
{"75=1T?8=", "75=1T?8="}
|
||||
};
|
||||
|
||||
private static final Object[][] APPLY_TO_TEST_DATA = new Object[][] {
|
||||
{"3=1D3=", "ACTAACT", "ACTACT" },
|
||||
{"*=1C*=","ACTTACT", "ACTAACT" },
|
||||
{"4=3Iacg3=","ACTGACT","ACTGACGACT" },
|
||||
{"*=30Igctcggatgccttgcggggctccagagtcc*=","AA","AGCTCGGATGCCTTGCGGGGCTCCAGAGTCCA"},
|
||||
{"*=3D*=", "ACTTTAC","ACAC"},
|
||||
{"1=30D1=", "AGCTCGGATGCCTTGCGGGGCTCCAGAGTCCA","AA"},
|
||||
};
|
||||
|
||||
|
||||
private static int[] ints(final int ... iii) {
|
||||
return iii;
|
||||
}
|
||||
|
||||
private static String[] strs(final String ... sss) {
|
||||
return sss;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.caliper.Param;
|
||||
import com.google.caliper.SimpleBenchmark;
|
||||
import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet;
|
||||
import org.broadinstitute.sting.utils.pairhmm.FastLoglessPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: valentin
|
||||
* Date: 8/6/13
|
||||
* Time: 3:00 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark {
|
||||
// ./private/shell/googleCaliperCommand.csh org.broadinstitute.sting.gatk.walkers.haplotypecaller.HCLikelihoodCalculationEnginesBenchmark --saveResults build/benchmark/HCLikelihoodCalculationEnginesBenchmark
|
||||
|
||||
// @Param({"10", "25"})
|
||||
@Param({"10"})
|
||||
protected int kmerSize;
|
||||
|
||||
|
||||
// @Param({"100","250"})
|
||||
@Param({"100"})
|
||||
protected int readLength;
|
||||
|
||||
@Param({"*1T*", "*3Iacg*","*30Igctcggatgccttgcggggctccagagtcc*",
|
||||
"*3D*","*30D*","*1T3=3Iacg*","*1T*3Iacg*","*1T8=1T8=1T8=1T8=1T*","*1T*1T*1T*1T*1T*"})
|
||||
// @Param({"*1T*"})
|
||||
protected String variation;
|
||||
|
||||
@Param({"10000"})
|
||||
// @Param({"100", "300", "1000"})// "3000", "10000"})
|
||||
protected int readCount;
|
||||
|
||||
// @Param({"300","1000","3000"})
|
||||
@Param({"300"})
|
||||
protected int regionSize;
|
||||
|
||||
// Invariants:
|
||||
|
||||
protected final byte bq = 20;
|
||||
|
||||
protected final byte iq = 35;
|
||||
|
||||
protected final byte dq = 35;
|
||||
|
||||
protected ActiveRegionTestDataSet dataSet;
|
||||
|
||||
@Param({"true"})
|
||||
public boolean withErrors;
|
||||
|
||||
@Param({"13"})
|
||||
public int randomSeed;
|
||||
|
||||
public void setUp() {
|
||||
dataSet = ActiveRegionTestDataSetUnitTest.createActiveRegionTestDataSet(kmerSize, readLength, variation, readCount, regionSize, bq, iq, dq);
|
||||
final Random rnd = new Random(randomSeed);
|
||||
if (withErrors) dataSet.introduceErrors(rnd);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void timeGraphBasedLikelihoods(final int reps) {
|
||||
for (int i = 0; i < reps; i++) {
|
||||
GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX);
|
||||
rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonMap("anonymous", dataSet.readList()));
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void timeLoglessPairHMM(final int reps) {
|
||||
for (int i = 0; i < reps; i++) {
|
||||
final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, false,
|
||||
PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE);
|
||||
engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonMap("anonymous", dataSet.readList()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -94,6 +94,6 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGAMultiAllelic() {
|
||||
HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337",
|
||||
"8e6a2002c59eafb78bdbf1db9660164b");
|
||||
"f74d68cbc1ecb66a7128258e111cd030");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,6 +47,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variant.GATKVCFIndexType;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
|
@ -57,18 +59,18 @@ import java.util.List;
|
|||
public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
||||
@DataProvider(name = "MyDataProvider")
|
||||
public Object[][] makeMyDataProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000";
|
||||
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
|
||||
|
||||
// this functionality can be adapted to provide input data for whatever you might want in your data
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e32b7fc4de29ed141dcafc0d789d5ed6"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "ecac86e8ef4856e6dfa306c436e9b545"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"});
|
||||
tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9b9923ef41bfc7346c905fdecf918f92"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "7828256b82df377cc3a26a55dbf68f91"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e41e0acf172a994e938a150390badd39"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"});
|
||||
tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"});
|
||||
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
@ -79,8 +81,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
*/
|
||||
@Test(dataProvider = "MyDataProvider")
|
||||
public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) {
|
||||
final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header",
|
||||
b37KGReference, bam, intervals, mode);
|
||||
final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
|
||||
b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
|
||||
executeTest(name, spec);
|
||||
|
|
@ -88,10 +90,42 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testERCRegionWithNoCalledHaplotypes() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF",
|
||||
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001");
|
||||
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",
|
||||
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testERCRegionWithNoCalledHaplotypes", spec);
|
||||
}
|
||||
|
||||
@Test()
|
||||
public void testMissingGVCFIndexException() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF",
|
||||
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001");
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testMissingGVCFIndexingStrategyException", spec);
|
||||
}
|
||||
|
||||
@Test()
|
||||
public void testWrongParameterGVCFIndexException() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",
|
||||
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testMissingGVCFIndexingStrategyException", spec);
|
||||
}
|
||||
|
||||
@Test()
|
||||
public void testWrongTypeGVCFIndexException() {
|
||||
// ensure non-optimal, if optimal changes
|
||||
GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK;
|
||||
if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK)
|
||||
type = GATKVCFIndexType.DYNAMIC_SIZE;
|
||||
|
||||
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",
|
||||
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class);
|
||||
spec.disableShadowBCF();
|
||||
executeTest("testMissingGVCFIndexingStrategyException", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -91,6 +91,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerGraphBasedSingleSample() {
|
||||
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "213df0bdaa78a695e9336128333e4407");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerGraphBasedMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ceee711cac50b4bb66a084acb9264941");
|
||||
}
|
||||
|
||||
@Test(enabled = false) // can't annotate the rsID's yet
|
||||
public void testHaplotypeCallerSingleSampleWithDbsnp() {
|
||||
HCTest(NA12878_BAM, "-D " + b37dbSNP132, "");
|
||||
|
|
@ -136,16 +146,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) {
|
||||
final List<Pair<GenomeLoc, GenotypingEngine.Event>> VCs = new ArrayList<Pair<GenomeLoc, GenotypingEngine.Event>>();
|
||||
final List<Pair<GenomeLoc, GenotypingEngine.Event>> VCs = new ArrayList<>();
|
||||
try {
|
||||
for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) {
|
||||
VCs.add(new Pair<GenomeLoc, GenotypingEngine.Event>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc)));
|
||||
VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc)));
|
||||
}
|
||||
} catch( IOException e ) {
|
||||
throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read.");
|
||||
}
|
||||
|
||||
final Set<Pair<GenomeLoc, GenotypingEngine.Event>> VCsAsSet = new HashSet<Pair<GenomeLoc, GenotypingEngine.Event>>(VCs);
|
||||
final Set<Pair<GenomeLoc, GenotypingEngine.Event>> VCsAsSet = new HashSet<>(VCs);
|
||||
return VCsAsSet.size() != VCs.size(); // The set will remove duplicate Events.
|
||||
}
|
||||
|
||||
|
|
@ -233,7 +243,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestDBSNPAnnotationWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("f3e636d64042e766cc6515987e85a968"));
|
||||
Arrays.asList("a43d6226a51eb525f0774f88e3778189"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -256,7 +266,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestAggressivePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb"));
|
||||
Arrays.asList("19c2992541ede7407192660fdc1fadbf"));
|
||||
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
|
||||
|
|
@ -264,7 +274,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
public void HCTestConservativePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("16f7ffa063511c70bad795639a1c2638"));
|
||||
Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16"));
|
||||
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,10 +58,10 @@ import java.util.List;
|
|||
public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
|
||||
@DataProvider(name = "NCTDataProvider")
|
||||
public Object[][] makeNCTDataProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
List<Object[]> tests = new ArrayList<>();
|
||||
|
||||
for ( final int nct : Arrays.asList(1, 2, 4) ) {
|
||||
tests.add(new Object[]{nct, "e4bf389676fa090c95980349310ba5ca"});
|
||||
tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
|
|
|
|||
|
|
@ -1,266 +0,0 @@
|
|||
/*
|
||||
* By downloading the PROGRAM you agree to the following terms of use:
|
||||
*
|
||||
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
|
||||
*
|
||||
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
|
||||
*
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
|
||||
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
|
||||
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
|
||||
*
|
||||
* 1. DEFINITIONS
|
||||
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
|
||||
*
|
||||
* 2. LICENSE
|
||||
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
|
||||
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
|
||||
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
|
||||
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
|
||||
*
|
||||
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
|
||||
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
|
||||
* Copyright 2012 Broad Institute, Inc.
|
||||
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
|
||||
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
|
||||
*
|
||||
* 4. INDEMNIFICATION
|
||||
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
|
||||
*
|
||||
* 5. NO REPRESENTATIONS OR WARRANTIES
|
||||
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
|
||||
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
|
||||
*
|
||||
* 6. ASSIGNMENT
|
||||
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
|
||||
*
|
||||
* 7. MISCELLANEOUS
|
||||
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
|
||||
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
|
||||
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
|
||||
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
|
||||
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
|
||||
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
|
||||
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/14/12
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Unit tests for LikelihoodCalculationEngine
|
||||
*/
|
||||
public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
||||
|
||||
@Test
|
||||
public void testNormalizeDiploidLikelihoodMatrixFromLog10() {
|
||||
double[][] likelihoodMatrix = {
|
||||
{-90.2, 0, 0},
|
||||
{-190.1, -2.1, 0},
|
||||
{-7.0, -17.5, -35.9}
|
||||
};
|
||||
double[][] normalizedMatrix = {
|
||||
{-88.1, 0, 0},
|
||||
{-188.0, 0.0, 0},
|
||||
{-4.9, -15.4, -33.8}
|
||||
};
|
||||
|
||||
|
||||
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix));
|
||||
|
||||
double[][] likelihoodMatrix2 = {
|
||||
{-90.2, 0, 0, 0},
|
||||
{-190.1, -2.1, 0, 0},
|
||||
{-7.0, -17.5, -35.9, 0},
|
||||
{-7.0, -17.5, -35.9, -1000.0},
|
||||
};
|
||||
double[][] normalizedMatrix2 = {
|
||||
{-88.1, 0, 0, 0},
|
||||
{-188.0, 0.0, 0, 0},
|
||||
{-4.9, -15.4, -33.8, 0},
|
||||
{-4.9, -15.4, -33.8, -997.9},
|
||||
};
|
||||
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
|
||||
}
|
||||
|
||||
@DataProvider(name = "PcrErrorModelTestProvider")
|
||||
public Object[][] createPcrErrorModelTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) {
|
||||
for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) {
|
||||
tests.add(new Object[]{repeat, repeatLength});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PcrErrorModelTestProvider", enabled = true)
|
||||
public void createPcrErrorModelTest(final String repeat, final int repeatLength) {
|
||||
|
||||
final LikelihoodCalculationEngine engine = new LikelihoodCalculationEngine((byte)0, false, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE);
|
||||
|
||||
final String readString = Utils.dupString(repeat, repeatLength);
|
||||
final byte[] insQuals = new byte[readString.length()];
|
||||
final byte[] delQuals = new byte[readString.length()];
|
||||
Arrays.fill(insQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
|
||||
Arrays.fill(delQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
|
||||
|
||||
engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals);
|
||||
|
||||
final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
|
||||
repeatCovariate.initialize(LikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, LikelihoodCalculationEngine.MAX_REPEAT_LENGTH);
|
||||
|
||||
for ( int i = 1; i < insQuals.length; i++ ) {
|
||||
|
||||
final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond();
|
||||
final byte adjustedScore = LikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0);
|
||||
|
||||
Assert.assertEquals(insQuals[i-1], adjustedScore);
|
||||
Assert.assertEquals(delQuals[i-1], adjustedScore);
|
||||
}
|
||||
}
|
||||
|
||||
// BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests!
|
||||
/*
|
||||
private class BasicLikelihoodTestProvider extends TestDataProvider {
|
||||
public Double readLikelihoodForHaplotype1;
|
||||
public Double readLikelihoodForHaplotype2;
|
||||
public Double readLikelihoodForHaplotype3;
|
||||
|
||||
public BasicLikelihoodTestProvider(double a, double b) {
|
||||
super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f",a,b));
|
||||
readLikelihoodForHaplotype1 = a;
|
||||
readLikelihoodForHaplotype2 = b;
|
||||
readLikelihoodForHaplotype3 = null;
|
||||
}
|
||||
|
||||
public BasicLikelihoodTestProvider(double a, double b, double c) {
|
||||
super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f / %f",a,b,c));
|
||||
readLikelihoodForHaplotype1 = a;
|
||||
readLikelihoodForHaplotype2 = b;
|
||||
readLikelihoodForHaplotype3 = c;
|
||||
}
|
||||
|
||||
public double[][] expectedDiploidHaplotypeMatrix() {
|
||||
if( readLikelihoodForHaplotype3 == null ) {
|
||||
double maxValue = Math.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2);
|
||||
double[][] normalizedMatrix = {
|
||||
{readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY},
|
||||
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue}
|
||||
};
|
||||
return normalizedMatrix;
|
||||
} else {
|
||||
double maxValue = MathUtils.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2,readLikelihoodForHaplotype3);
|
||||
double[][] normalizedMatrix = {
|
||||
{readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY},
|
||||
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue, Double.NEGATIVE_INFINITY},
|
||||
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue,
|
||||
Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype2) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, readLikelihoodForHaplotype3 - maxValue}
|
||||
};
|
||||
return normalizedMatrix;
|
||||
}
|
||||
}
|
||||
|
||||
public double[][] calcDiploidHaplotypeMatrix() {
|
||||
ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
|
||||
for( int iii = 1; iii <= 3; iii++) {
|
||||
Double readLikelihood = ( iii == 1 ? readLikelihoodForHaplotype1 : ( iii == 2 ? readLikelihoodForHaplotype2 : readLikelihoodForHaplotype3) );
|
||||
int readCount = 1;
|
||||
if( readLikelihood != null ) {
|
||||
Haplotype haplotype = new Haplotype( (iii == 1 ? "AAAA" : (iii == 2 ? "CCCC" : "TTTT")).getBytes() );
|
||||
haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood}, new int[]{readCount});
|
||||
haplotypes.add(haplotype);
|
||||
}
|
||||
}
|
||||
final HashSet<String> sampleSet = new HashSet<String>(1);
|
||||
sampleSet.add("myTestSample");
|
||||
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "BasicLikelihoodTestProvider")
|
||||
public Object[][] makeBasicLikelihoodTests() {
|
||||
new BasicLikelihoodTestProvider(-1.1, -2.2);
|
||||
new BasicLikelihoodTestProvider(-2.2, -1.1);
|
||||
new BasicLikelihoodTestProvider(-1.1, -1.1);
|
||||
new BasicLikelihoodTestProvider(-9.7, -15.0);
|
||||
new BasicLikelihoodTestProvider(-1.1, -2000.2);
|
||||
new BasicLikelihoodTestProvider(-1000.1, -2.2);
|
||||
new BasicLikelihoodTestProvider(0, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, 0);
|
||||
new BasicLikelihoodTestProvider(0, -2.2);
|
||||
new BasicLikelihoodTestProvider(-100.1, -200.2);
|
||||
|
||||
new BasicLikelihoodTestProvider(-1.1, -2.2, 0);
|
||||
new BasicLikelihoodTestProvider(-2.2, -1.1, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, -1.1, 0);
|
||||
new BasicLikelihoodTestProvider(-9.7, -15.0, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, -2000.2, 0);
|
||||
new BasicLikelihoodTestProvider(-1000.1, -2.2, 0);
|
||||
new BasicLikelihoodTestProvider(0, 0, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, 0, 0);
|
||||
new BasicLikelihoodTestProvider(0, -2.2, 0);
|
||||
new BasicLikelihoodTestProvider(-100.1, -200.2, 0);
|
||||
|
||||
new BasicLikelihoodTestProvider(-1.1, -2.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(-2.2, -1.1, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1.1, -1.1, -12.121);
|
||||
new BasicLikelihoodTestProvider(-9.7, -15.0, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1.1, -2000.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1000.1, -2.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(0, 0, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1.1, 0, -12.121);
|
||||
new BasicLikelihoodTestProvider(0, -2.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(-100.1, -200.2, -12.121);
|
||||
|
||||
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
|
||||
public void testOneReadWithTwoOrThreeHaplotypes(BasicLikelihoodTestProvider cfg) {
|
||||
double[][] calculatedMatrix = cfg.calcDiploidHaplotypeMatrix();
|
||||
double[][] expectedMatrix = cfg.expectedDiploidHaplotypeMatrix();
|
||||
logger.warn(String.format("Test: %s", cfg.toString()));
|
||||
Assert.assertTrue(compareDoubleArrays(calculatedMatrix, expectedMatrix));
|
||||
}
|
||||
*/
|
||||
|
||||
//Private function to compare 2d arrays
|
||||
private boolean compareDoubleArrays(double[][] b1, double[][] b2) {
|
||||
if( b1.length != b2.length ) {
|
||||
return false; // sanity check
|
||||
}
|
||||
|
||||
for( int i=0; i < b1.length; i++ ){
|
||||
if( b1[i].length != b2[i].length) {
|
||||
return false; // sanity check
|
||||
}
|
||||
for( int j=0; j < b1.length; j++ ){
|
||||
if ( MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0 && !Double.isInfinite(b1[i][j]) && !Double.isInfinite(b2[i][j]))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue