diff --git a/build.xml b/build.xml index 16db1cec1..fd0801bfb 100644 --- a/build.xml +++ b/build.xml @@ -22,6 +22,7 @@ ~ OTHER DEALINGS IN THE SOFTWARE. --> + @@ -291,7 +292,7 @@ - + @@ -465,7 +466,7 @@ - + @@ -502,7 +503,7 @@ - + Generating Queue GATK extensions... @@ -520,11 +521,12 @@ - + + @@ -536,7 +538,7 @@ Building Scala... - + @@ -595,8 +597,7 @@ - + @@ -687,7 +688,7 @@ - + @@ -786,20 +787,20 @@ - + - + - + - + @@ -831,7 +832,7 @@ - + @@ -849,7 +850,7 @@ - + @@ -909,7 +910,7 @@ - + - + @@ -948,17 +949,17 @@ - + - + - + - + - + @@ -994,7 +995,7 @@ - + - + @@ -1218,7 +1219,7 @@ - + @@ -1367,7 +1368,7 @@ - + @@ -1399,7 +1400,7 @@ - + @@ -1414,6 +1415,13 @@ + + + + + + + @@ -1433,7 +1441,7 @@ - + diff --git a/ivy.xml b/ivy.xml index ed13af1c2..2e45247ab 100644 --- a/ivy.xml +++ b/ivy.xml @@ -82,8 +82,8 @@ - - + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java index 70ee049f3..c331451d5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -54,8 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; -import java.io.PrintStream; -import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -70,16 +68,40 @@ import java.util.Map; public class StandardCallerArgumentCollection { /** - * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: - * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 + * The expected heterozygosity value used to compute prior probability that a locus is non-reference. 
+ * + * The default priors are provided for humans: + * + * het = 1e-3 + * + * which means that the probability of N samples being hom-ref at a site is: + * + * 1 - sum_i_2N (het / i) + * + * Note that heterozygosity as used here is the population genetics concept: + * + * http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics + * + * That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population of organisms + * would differ from each other (one being A and the other B) at a rate of 1 in 100 bp. + * + * Note that this quantity has nothing to do with the likelihood of any given sample having a heterozygous genotype, + * which in the GATK is purely determined by the probability of the observed data P(D | AB) under the model that there + * may be an AB het genotype. The posterior probability of this AB genotype would use the het prior, but the GATK + * only uses this posterior probability in determining the prob. that a site is polymorphic. So changing the + * het parameters only increases the chance that a site will be called non-reference across all samples, but + * doesn't actually change the output genotype likelihoods at all, as these aren't posterior probabilities at all. + * + * The quantity that changes whether the GATK considers the possibility of a het genotype at all is the ploidy, + * which determines how many chromosomes each individual in the species carries. */ - @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. 
See the GATKDocs for full details on the meaning of this population genetics concept", required = false) public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; /** * This argument informs the prior probability of having an indel at a site. */ - @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false) + @Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false) public double INDEL_HETEROZYGOSITY = 1.0/8000; @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) @@ -147,11 +169,11 @@ public class StandardCallerArgumentCollection { */ @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION; - public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05; + public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0; /** * This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples. - * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION + * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION. **/ @Advanced @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. 
Format should be \"\" (Contamination is double) per line; No header.", required = false) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index b22ea7931..0da865a85 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -112,18 +112,18 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { - HashMap alleleCounts = new HashMap(); - for ( Allele allele : vc.getAlleles() ) + final HashMap alleleCounts = new HashMap<>(); + for ( final Allele allele : vc.getAlleles() ) alleleCounts.put(allele.getBases()[0], 0); - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - for ( PileupElement p : pileup ) { + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); + for ( final PileupElement p : pileup ) { if ( alleleCounts.containsKey(p.getBase()) ) alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); } // we need to add counts in the correct order - int[] counts = new int[alleleCounts.size()]; + final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); @@ -141,7 +141,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final HashMap alleleCounts = new HashMap<>(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + for ( final 
Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); if (! a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java index 9bd641011..21325e6f1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java @@ -51,7 +51,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -68,11 +67,15 @@ import java.util.*; /** - * The depth of coverage of each allele per sample + * The depth of coverage for informative reads for each sample. * - * the depth for the HC is the sum of the informative alleles at this site. It's not perfect (as we cannot - * differentiate between reads that align over the event but aren't informative vs. those that aren't even - * close) but it's a pretty good proxy and it matches with the AD field (i.e., sum(AD) = DP). + * An informative read is defined as one from which the allele it carries can be easily distinguished. 
An example of a + * case where a read might be uninformative is where it only partially overlaps a short tandem repeat and it is not clear + * whether the read contains the reference allele or e.g. an extra repeat. + * The depth here is the sum of the informative reads at this site as determined by the Haplotype Caller; as such it can + * only be calculated and generated through the Haplotype Caller (it will not work when run through the Variant Annotator). + * This calculation is not perfect but it is a pretty good proxy for depth and it does match the values in the AD field + * (i.e., sum(AD) = DP). */ public class DepthPerSampleHC extends GenotypeAnnotation { public void annotate(final RefMetaDataTracker tracker, @@ -121,6 +124,6 @@ public class DepthPerSampleHC extends GenotypeAnnotation { } public List getDescriptions() { - return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0))); + return Collections.singletonList(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY)); } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 876dbf039..95be967a2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -58,6 +58,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import 
org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -81,6 +83,7 @@ import java.util.*; *

The Fisher Strand test may not be calculated for certain complex indel cases or for multi-allelic sites.

*/ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { + private final static boolean ENABLE_DEBUGGING = false; private final static Logger logger = Logger.getLogger(FisherStrand.class); private static final String FS = "FS"; @@ -96,9 +99,18 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( !vc.isVariant() ) return null; + if ( vc.hasGenotypes() ) { + final int[][] tableFromPerSampleAnnotations = getTableFromSamples( vc.getGenotypes() ); + if ( tableFromPerSampleAnnotations != null ) { + return pValueForBestTable(tableFromPerSampleAnnotations, null); + } + } + if (vc.isSNP() && stratifiedContexts != null) { final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); + printTable("unfiltered", tableNoFiltering); + printTable("filtered", tableFiltering); return pValueForBestTable(tableFiltering, tableNoFiltering); } else if (stratifiedPerReadAlleleLikelihoodMap != null) { @@ -114,6 +126,32 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return null; } + /** + * Create the FisherStrand table by retrieving the per-sample strand bias annotation and adding them together + * @param genotypes the genotypes from which to pull out the per-sample strand bias annotation + * @return the table used for the FisherStrand p-value calculation, will be null if none of the genotypes contain the per-sample SB annotation + */ + private int[][] getTableFromSamples( final GenotypesContext genotypes ) { + if( genotypes == null ) { throw new IllegalArgumentException("Genotypes cannot be null."); } + + final int[] sbArray = {0,0,0,0}; // forward-reverse -by- alternate-reference + boolean foundData = false; + + for( 
final Genotype g : genotypes ) { + if( g.isNoCall() || ! g.hasAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME) ) + continue; + + foundData = true; + final String sbbsString = (String) g.getAnyAttribute(StrandBiasBySample.STRAND_BIAS_BY_SAMPLE_KEY_NAME); + final int[] data = encodeSBBS(sbbsString); + for( int index = 0; index < sbArray.length; index++ ) { + sbArray[index] += data[index]; + } + } + + return ( foundData ? decodeSBBS(sbArray) : null ); + } + /** * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 * @@ -145,12 +183,56 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat } public List getKeyNames() { - return Arrays.asList(FS); + return Collections.singletonList(FS); } public List getDescriptions() { - return Arrays.asList( - new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + return Collections.singletonList(new VCFInfoHeaderLine(FS, 1, VCFHeaderLineType.Float, "Phred-scaled p-value using Fisher's exact test to detect strand bias")); + } + + /** + * Helper function to turn the FisherStrand table into the SB annotation array + * @param table the table used by the FisherStrand annotation + * @return the array used by the per-sample Strand Bias annotation + */ + public static int[] getContingencyArray( final int[][] table ) { + if(table.length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + if(table[0].length != 2) { throw new IllegalArgumentException("Expecting a 2x2 strand bias table."); } + final int[] array = new int[4]; // TODO - if we ever want to do something clever with multi-allelic sites this will need to change + array[0] = table[0][0]; + array[1] = table[0][1]; + array[2] = table[1][0]; + array[3] = table[1][1]; + return array; + } + + /** + * Helper function to parse the genotype annotation into the SB annotation array + * @param string the 
string that is returned by genotype.getAnnotation("SB") + * @return the array used by the per-sample Strand Bias annotation + */ + private static int[] encodeSBBS( final String string ) { + final int[] array = new int[4]; + final StringTokenizer tokenizer = new StringTokenizer(string, ",", false); + for( int index = 0; index < 4; index++ ) { + array[index] = Integer.parseInt(tokenizer.nextToken()); + } + return array; + } + + /** + * Helper function to turn the SB annotation array into the FisherStrand table + * @param array the array used by the per-sample Strand Bias annotation + * @return the table used by the FisherStrand annotation + */ + private static int[][] decodeSBBS( final int[] array ) { + if(array.length != 4) { throw new IllegalArgumentException("Expecting a length = 4 strand bias array."); } + final int[][] table = new int[2][2]; + table[0][0] = array[0]; + table[0][1] = array[1]; + table[1][0] = array[2]; + table[1][1] = array[3]; + return table; } private Double pValueForContingencyTable(int[][] originalTable) { @@ -203,6 +285,20 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue)); } + /** + * Printing information to logger.info for debugging purposes + * + * @param name the name of the table + * @param table the table itself + */ + private void printTable(final String name, final int[][] table) { + if ( ENABLE_DEBUGGING ) { + final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(FS); + logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s", + name, table[0][0], table[0][1], table[1][0], table[1][1], pValue)); + } + } + private static boolean rotateTable(int[][] table) { table[0][0] -= 1; table[1][0] += 1; @@ -267,13 +363,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency 
table */ - private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + public static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + if( stratifiedPerReadAlleleLikelihoodMap == null ) { throw new IllegalArgumentException("stratifiedPerReadAlleleLikelihoodMap cannot be null"); } + if( vc == null ) { throw new IllegalArgumentException("input vc cannot be null"); } + final Allele ref = vc.getReference(); final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); - int[][] table = new int[2][2]; + final int[][] table = new int[2][2]; - for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); final int representativeCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index bdf37df71..3f815346d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -70,14 +70,16 @@ import java.util.*; * * A continuous generalization of the Hardy-Weinberg test for disequilibrium that works * well with limited coverage per sample. See the 1000 Genomes Phase I release for - * more information. 
Note that the Inbreeding Coefficient will not be calculated for files - * with fewer than a minimum (generally 10) number of samples. + * more information. Note that the Inbreeding Coefficient can only be calculated for + * cohorts containing at least 10 founder samples. */ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final int MIN_SAMPLES = 10; + private static final String INBREEDING_COEFFICIENT_KEY_NAME = "InbreedingCoeff"; private Set founderIds; + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -92,15 +94,15 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) + if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) return null; int idxAA = 0, idxAB = 1, idxBB = 2; if (!vc.isBiallelic()) { // for non-bliallelic case, do test with most common alt allele. - // Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB. - int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); + // Get the corresponding indices in GL vectors to retrieve GL of AA,AB and BB. 
+ final int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); idxAA = idxVector[0]; idxAB = idxVector[1]; idxBB = idxVector[2]; @@ -132,12 +134,12 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno final double q = 1.0 - p; // expected alternative allele frequency final double F = 1.0 - ( hetCount / ( 2.0 * p * q * (double)N ) ); // inbreeding coefficient - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.4f", F)); - return map; + return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F)); } - public List getKeyNames() { return Arrays.asList("InbreedingCoeff"); } + @Override + public List getKeyNames() { return Collections.singletonList(INBREEDING_COEFFICIENT_KEY_NAME); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("InbreedingCoeff", 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } + @Override + public List getDescriptions() { return Collections.singletonList(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY_NAME, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index ad974a083..4e6e87797 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -70,12 +70,19 @@ import java.util.*; *

Given a variant context, this tool uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation * versus the likelihood of the site transmitting according to mendelian rules.

* + *

Caveats

+ * + *

This tool assumes that the organism is diploid.

+ * *

Note that this annotation requires a valid ped file.

* - *

Caveat

- *

This tool assumes that the organism is diploid. When multiple trios are present, the annotation is simply the maximum + *

When multiple trios are present, the annotation is simply the maximum * of the likelihood ratios, rather than the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain * sites and many trios.

+ * + *

This annotation can only be used from the Variant Annotator. + * If you attempt to use it from the UnifiedGenotyper, the run will fail with an error message to that effect. + * If you attempt to use it from the HaplotypeCaller, the run will complete successfully but the annotation will not be added to any variants.

*/ public class MVLikelihoodRatio extends InfoFieldAnnotation implements RodRequiringAnnotation { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index a3fbcc439..906cfa021 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -54,6 +54,8 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.coverage.DepthOfCoverage; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; @@ -94,19 +96,20 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( !genotype.isHet() && !genotype.isHomVar() ) continue; - if (stratifiedContexts!= null) { - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if (stratifiedContexts!= null && !stratifiedContexts.isEmpty()) { + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; depth += context.getBasePileup().depthOfCoverage(); - } - else if (perReadAlleleLikelihoodMap != null) { - PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + } else if (perReadAlleleLikelihoodMap != null) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = 
perReadAlleleLikelihoodMap.get(genotype.getSampleName()); if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) continue; depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + } else if (genotype.hasDP() && vc.isBiallelic()) { // TODO -- this currently only works with biallelic variants for now because multiallelics have had their PLs stripped out and therefore their qual score can't be recomputed + depth += genotype.getDP(); } } @@ -116,7 +119,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); QD = fixTooHighQD(QD); - Map map = new HashMap(); + Map map = new HashMap<>(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 1ba13afa1..ab5a40145 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -83,7 +83,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null + // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null final GenotypesContext genotypes = vc.getGenotypes(); if (genotypes == null || genotypes.size() == 0) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java new file mode 100644 index 
000000000..fde344e9f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/StrandBiasBySample.java @@ -0,0 +1,100 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypeBuilder; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; + +import java.util.*; + +/** + * Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias + * User: rpoplin + * Date: 
8/28/13 + */ + +public class StrandBiasBySample extends GenotypeAnnotation implements ExperimentalAnnotation { + + public final static String STRAND_BIAS_BY_SAMPLE_KEY_NAME = "SB"; + + @Override + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) + return; + + if (alleleLikelihoodMap == null ) + throw new IllegalStateException("StrandBiasBySample can only be used with likelihood based annotations in the HaplotypeCaller"); + + final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc); + + gb.attribute(STRAND_BIAS_BY_SAMPLE_KEY_NAME, FisherStrand.getContingencyArray(table)); + } + + @Override + public List getKeyNames() { return Collections.singletonList(STRAND_BIAS_BY_SAMPLE_KEY_NAME); } + + @Override + public List getDescriptions() { return Collections.singletonList(new VCFFormatHeaderLine(getKeyNames().get(0), 4, VCFHeaderLineType.Integer, "Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.")); } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index ba2c2ae56..5e84076fd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -59,7 +59,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; * out due to mapping or base quality. 
*/ public class HeaderElement { - private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site. + private BaseAndQualsCounts positiveConsensusBaseCounts; // How many A,C,G,T (and D's) from positive-strand reads are in this site. + private BaseAndQualsCounts negativeConsensusBaseCounts; // How many A,C,G,T (and D's) from negative-strand reads are in this site. private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site. private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right private int location; // Genome location of this site (the sliding window knows which contig we're at @@ -70,14 +71,20 @@ public class HeaderElement { return location; } - public BaseAndQualsCounts getFilteredBaseCounts() { + /** + * Get the base counts object for the consensus type + * + * @param consensusType the type to use; anything other than POSITIVE_CONSENSUS or NEGATIVE_CONSENSUS selects the filtered counts + * @return non-null base counts + */ + public BaseAndQualsCounts getBaseCounts(final SlidingWindow.ConsensusType consensusType) { + if ( consensusType == SlidingWindow.ConsensusType.POSITIVE_CONSENSUS ) + return positiveConsensusBaseCounts; + if ( consensusType == SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS ) + return negativeConsensusBaseCounts; return filteredBaseCounts; } - public BaseAndQualsCounts getConsensusBaseCounts() { - return consensusBaseCounts; - } - /** * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty * filteredBaseCounts - 0 insertions to the right - empty mappingQuality list @@ -85,7 +92,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location); } /** @@ -95,20 +102,22 @@ public class HeaderElement { * @param location the reference location for the new element */ public
HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location); } /** * Creates a new HeaderElement with all given parameters * - * @param consensusBaseCounts the BaseCounts object for the running consensus synthetic read + * @param positiveConsensusBaseCounts the BaseCounts object for the running positive consensus synthetic read + * @param negativeConsensusBaseCounts the BaseCounts object for the running negative consensus synthetic read * @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read * @param insertionsToTheRight number of insertions to the right of this HeaderElement * @param location the reference location of this reference element * HeaderElement */ - public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int location) { - this.consensusBaseCounts = consensusBaseCounts; + public HeaderElement(final BaseAndQualsCounts positiveConsensusBaseCounts, final BaseAndQualsCounts negativeConsensusBaseCounts, final BaseAndQualsCounts filteredBaseCounts, final int insertionsToTheRight, final int location) { + this.positiveConsensusBaseCounts = positiveConsensusBaseCounts; + this.negativeConsensusBaseCounts = negativeConsensusBaseCounts; this.filteredBaseCounts = filteredBaseCounts; this.insertionsToTheRight = insertionsToTheRight; this.location = location; @@ -124,7 +133,8 @@ public class HeaderElement { * @return true if site is variant by any definition. False otherwise. 
*/ public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) { - return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); + return ( hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) ) + && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); } /** @@ -138,13 +148,18 @@ public class HeaderElement { * @param minBaseQual the minimum base qual allowed to be a good base * @param minMappingQual the minimum mapping qual allowed to be a good read * @param isSoftClipped true if the base is soft-clipped in the original read + * @param isNegativeStrand true if the base comes from a read on the negative strand */ - public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { + public void addBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) { // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts - if ( baseMappingQuality >= minMappingQual ) - consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - else + if ( baseMappingQuality >= minMappingQual ) { + if ( isNegativeStrand ) + negativeConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); + else + 
positiveConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); + } else { filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); + } } /** @@ -158,14 +173,20 @@ public class HeaderElement { * @param minBaseQual the minimum base qual allowed to be a good base * @param minMappingQual the minimum mapping qual allowed to be a good read * @param isSoftClipped true if the base is soft-clipped in the original read + * @param isNegativeStrand true if the base comes from a read on the negative strand */ - public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { + public void removeBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) { // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts - if ( baseMappingQuality >= minMappingQual ) - consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - else + if ( baseMappingQuality >= minMappingQual ) { + if ( isNegativeStrand ) + negativeConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); + else + positiveConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); + } else { filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); + } } + /** * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions * should be added to the right of the element. 
@@ -177,19 +198,11 @@ public class HeaderElement { /** * Does this HeaderElement contain consensus data? * + * @param consensusType the type to use * @return whether or not this HeaderElement contains consensus data */ - public boolean hasConsensusData() { - return consensusBaseCounts.totalCount() > 0; - } - - /** - * Does this HeaderElement contain filtered data? - * - * @return whether or not this HeaderElement contains filtered data - */ - public boolean hasFilteredData() { - return filteredBaseCounts.totalCount() > 0; + public boolean hasConsensusData(final SlidingWindow.ConsensusType consensusType) { + return getBaseCounts(consensusType).totalCount() > 0; } /** @@ -198,7 +211,7 @@ public class HeaderElement { * @return whether or not this HeaderElement has no data */ public boolean isEmpty() { - return (!hasFilteredData() && !hasConsensusData()); + return !hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.FILTERED); } /** @@ -224,7 +237,7 @@ public class HeaderElement { * @return whether or not the HeaderElement is variant due to excess insertions */ private boolean isVariantFromInsertions(double minIndelProportion) { - final int numberOfBases = consensusBaseCounts.totalCount(); + final int numberOfBases = totalCountForBothStrands(); if (numberOfBases == 0) return (insertionsToTheRight > 0); // do we only have insertions? 
@@ -232,13 +245,18 @@ public class HeaderElement { return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; } + private int totalCountForBothStrands() { + return positiveConsensusBaseCounts.totalCount() + negativeConsensusBaseCounts.totalCount(); + } + /** * Whether or not the HeaderElement is variant due to excess deletions * * @return whether or not the HeaderElement is variant due to excess deletions */ private boolean isVariantFromDeletions(double minIndelProportion) { - return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; + return positiveConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || positiveConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion + || negativeConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || negativeConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; } /** @@ -249,9 +267,23 @@ public class HeaderElement { * @return whether or not the HeaderElement is variant due to excess mismatches */ protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) { - final int totalCount = consensusBaseCounts.totalCountWithoutIndels(); - final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); - final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon); + return isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || + isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS); + } + + /** + * Whether or not the HeaderElement is variant due to excess mismatches + * + * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage). 
+ * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage). + * @param consensusType the consensus type to use + * @return whether or not the HeaderElement is variant due to excess mismatches + */ + private boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion, final SlidingWindow.ConsensusType consensusType) { + final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType); + final int totalCount = baseAndQualsCounts.totalCountWithoutIndels(); + final BaseIndex mostCommon = baseAndQualsCounts.baseIndexWithMostProbabilityWithoutIndels(); + final int countOfOtherBases = totalCount - baseAndQualsCounts.countOfBase(mostCommon); return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion); } @@ -262,8 +294,20 @@ public class HeaderElement { * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. */ protected boolean isVariantFromSoftClips() { - final int nSoftClippedBases = consensusBaseCounts.nSoftclips(); - return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); + return isVariantFromSoftClips(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || isVariantFromSoftClips(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS); + } + + /** + * This handles the special case where we have more bases that came from soft clips than bases that came from + * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information. + * + * @param consensusType the consensus type to use + * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. 
+ */ + private boolean isVariantFromSoftClips(final SlidingWindow.ConsensusType consensusType) { + final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType); + final int nSoftClippedBases = baseAndQualsCounts.nSoftclips(); + return nSoftClippedBases > 0 && nSoftClippedBases >= (baseAndQualsCounts.totalCount() - nSoftClippedBases); } /** @@ -287,9 +331,9 @@ public class HeaderElement { */ public ObjectArrayList getAlleles(final double minVariantPvalue, final double minVariantProportion) { // make sure we have bases at all - final int totalBaseCount = consensusBaseCounts.totalCount(); + final int totalBaseCount = totalCountForBothStrands(); if ( totalBaseCount == 0 ) - return new ObjectArrayList(0); + return new ObjectArrayList<>(0); // next, check for insertions; technically, the insertion count can be greater than totalBaseCount // (because of the way insertions are counted), so we need to account for that @@ -297,9 +341,9 @@ public class HeaderElement { return null; // finally, check for the bases themselves (including deletions) - final ObjectArrayList alleles = new ObjectArrayList(4); + final ObjectArrayList alleles = new ObjectArrayList<>(4); for ( final BaseIndex base : BaseIndex.values() ) { - final int baseCount = consensusBaseCounts.countOfBase(base); + final int baseCount = positiveConsensusBaseCounts.countOfBase(base) + negativeConsensusBaseCounts.countOfBase(base); if ( baseCount == 0 ) continue; @@ -320,7 +364,7 @@ public class HeaderElement { * @return true if there are significant softclips, false otherwise */ public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) { - return hasSignificantCount(consensusBaseCounts.nSoftclips(), consensusBaseCounts.totalCount(), minVariantPvalue, minVariantProportion); + return hasSignificantCount(positiveConsensusBaseCounts.nSoftclips() + negativeConsensusBaseCounts.nSoftclips(), totalCountForBothStrands(), minVariantPvalue, minVariantProportion); } /* 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 31fe7e380..383ba5ee9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -84,10 +84,10 @@ import java.util.List; * Reduces the BAM file using read based compression that keeps only essential information for variant calling * *

- * This walker will generated reduced versions of the BAM files that still follow the BAM spec - * and contain all the information necessary for the GSA variant calling pipeline. Some options - * allow you to tune in how much compression you want to achieve. The default values have been - * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the + * This tool will generate reduced versions of the BAM files that still follow the BAM specification + * and contain all the information necessary to call variants according to the GATK Best Practices recommendations. + * Some options allow you to tune how much compression you want to achieve. The default values have been + * shown to reduce a typical whole exome BAM file by 100x. The higher the coverage, the bigger the * savings in file size and performance of the downstream tools. * *

Input

@@ -121,25 +121,25 @@ public class ReduceReads extends ReadWalker, Redu private SAMFileWriter writerToUse = null; /** - * The number of bases to keep around mismatches (potential variation) + * */ - @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) + @Argument(fullName = "context_size", shortName = "cs", doc = "The number of bases to keep around mismatches (potential variation)", required = false) public int contextSize = 10; /** - * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have + * Reads that have * mapping quality below this threshold will not be counted towards consensus, but are still counted * towards variable regions. */ - @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) + @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "The minimum mapping quality to be considered for the consensus synthetic read", required = false) public int minMappingQuality = 20; /** - * The minimum base quality to be considered for the consensus synthetic read. Reads that have + * Reads that have * base quality below this threshold will not be counted towards consensus, but are still counted * towards variable regions. */ - @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) + @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "The minimum base quality to be considered for the consensus synthetic read", required = false) public byte minBaseQual = 15; /** @@ -160,81 +160,77 @@ public class ReduceReads extends ReadWalker, Redu public List> known = Collections.emptyList(); /** - * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals - * and read group). + * This strips away all extra information of the read -- anything other than bases, quals + * and read group. 
*/ - @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) + @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "Do not simplify read", required = false) public boolean DONT_SIMPLIFY_READS = false; /** - * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired. - * The program will behave correctly in those cases. + * Note that it is not necessary to turn this on for reads that are not mate paired. + * The program will behave correctly by default in those cases. */ - @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) + @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "Do not hard clip adaptor sequences", required = false) public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; /** - * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail + * This option overrides the argument of minimum tail * quality. */ - @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) + @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "Do not hard clip the low quality tails of the reads", required = false) public boolean DONT_CLIP_LOW_QUAL_TAILS = false; /** - * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped + * By default, ReduceReads will hard clip away any low quality soft clipped * base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant * regions. 
The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) */ - @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) + @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "Do not use high quality soft-clipped bases", required = false) public boolean DONT_USE_SOFTCLIPPED_BASES = false; /** - * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee + * By default, ReduceReads will compress read names to numbers and guarantee * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. */ - @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) + @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "Do not compress read names", required = false) public boolean DONT_COMPRESS_READ_NAMES = false; /** - * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval - * border. + * The hard clips will happen exactly at the interval border. */ - @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false) + @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "Hard clip all incoming reads to the desired intervals", required = false) public boolean HARD_CLIP_TO_INTERVAL = false; /** - * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be + * Anything below this will be * considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that * this value is used only regions with high coverage. 
*/ @Advanced - @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) + @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "Minimum proportion of mismatches in a site to trigger a variant region", required = false) public double minAltProportionToTriggerVariant = 0.05; /** - * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region. * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to * trigger polyploid compression). Note that this value is used only regions with low coverage. */ @Advanced - @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false) + @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region", required = false) public double minAltPValueToTriggerVariant = 0.01; /** - * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be - * considered consensus. + * Anything below this will be considered consensus. */ - @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) + @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "Minimum proportion of indels in a site to trigger a variant region", required = false) public double minIndelProportionToTriggerVariant = 0.05; /** - * The number of reads emitted per sample in a variant region can be downsampled for better compression. * This level of downsampling only happens after the region has been evaluated, therefore it can * be combined with the engine level downsampling. * A value of 0 turns downsampling off. 
*/ - @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) + @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "Downsample the number of reads emitted per sample in a variant region for better compression", required = false) public int downsampleCoverage = 250; /** @@ -243,27 +239,27 @@ public class ReduceReads extends ReadWalker, Redu * To prevent users from unintentionally running the tool in a less than ideal manner, we require them * to explicitly enable multi-sample analysis with this argument. */ - @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "enable multi-samples reduction for cancer analysis", required = false) + @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "Enable multi-sample reduction for cancer analysis", required = false) public boolean ALLOW_MULTIPLE_SAMPLES = false; @Hidden - @Argument(fullName = "nwayout", shortName = "nw", doc = "", required = false) + @Argument(fullName = "nwayout", shortName = "nw", doc = "Generate separate output files per input file", required = false) public boolean nwayout = false; @Hidden - @Argument(fullName = "", shortName = "dl", doc = "", required = false) + @Argument(fullName = "", shortName = "dl", doc = "Debug level", required = false) public int debugLevel = 0; @Hidden - @Argument(fullName = "", shortName = "dr", doc = "", required = false) + @Argument(fullName = "", shortName = "dr", doc = "Debug read", required = false) public String debugRead = ""; @Hidden - @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) + @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "Downsampling strategy", required = false) public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; @Hidden - @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags", required = 
false) public boolean NO_PG_TAG = false; public enum DownsampleStrategy { @@ -297,7 +293,7 @@ public class ReduceReads extends ReadWalker, Redu throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes"); if ( nwayout && out != null ) - throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other"); + throw new UserException.CommandLineException("--out and --nwayout cannot be used simultaneously; please use one or the other"); if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); @@ -306,7 +302,7 @@ public class ReduceReads extends ReadWalker, Redu throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES ) - throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis"); + throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis. 
If that is what you want to do, use the -cancer_mode flag."); if ( known.isEmpty() ) knownSnpPositions = null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 5115a6777..d5aa8f944 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -59,7 +59,6 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -87,12 +86,10 @@ public class SlidingWindow { protected int downsampleCoverage; // Running consensus data - protected SyntheticRead runningConsensus; protected int consensusCounter; protected String consensusReadName; // Filtered Data Consensus data - protected SyntheticRead filteredDataConsensus; protected int filteredDataConsensusCounter; protected String filteredDataReadName; @@ -109,12 +106,12 @@ public class SlidingWindow { private static CompressionStash emptyRegions = new CompressionStash(); /** - * The types of synthetic reads to use in the finalizeAndAdd method + * The types of synthetic reads */ - private enum ConsensusType { - CONSENSUS, - FILTERED, - BOTH + protected enum ConsensusType { + POSITIVE_CONSENSUS, + NEGATIVE_CONSENSUS, + FILTERED } public int getStopLocation() { @@ -144,9 +141,9 @@ public class SlidingWindow { contextSize = 10; - this.windowHeader = new LinkedList(); + 
this.windowHeader = new LinkedList<>(); windowHeader.addFirst(new HeaderElement(startLocation)); - this.readsInWindow = new PriorityQueue(100, new Comparator() { + this.readsInWindow = new PriorityQueue<>(100, new Comparator() { @Override public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { return read1.getSoftEnd() - read2.getSoftEnd(); @@ -168,8 +165,8 @@ public class SlidingWindow { this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; this.MIN_MAPPING_QUALITY = minMappingQuality; - this.windowHeader = new LinkedList(); - this.readsInWindow = new PriorityQueue(1000, new Comparator() { + this.windowHeader = new LinkedList<>(); + this.readsInWindow = new PriorityQueue<>(1000, new Comparator() { @Override public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { return read1.getSoftEnd() - read2.getSoftEnd(); @@ -187,9 +184,6 @@ public class SlidingWindow { this.filteredDataConsensusCounter = 0; this.filteredDataReadName = "Filtered-" + windowNumber + "-"; - this.runningConsensus = null; - this.filteredDataConsensus = null; - this.downsampleStrategy = downsampleStrategy; this.hasIndelQualities = hasIndelQualities; } @@ -209,7 +203,9 @@ public class SlidingWindow { @Ensures("result != null") public CompressionStash addRead(GATKSAMRecord read) { addToHeader(windowHeader, read); // update the window header counts - readsInWindow.add(read); // add read to sliding reads + // no need to track low mapping quality reads + if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) + readsInWindow.add(read); // add read to sliding reads return slideWindow(read.getUnclippedStart()); } @@ -296,7 +292,7 @@ public class SlidingWindow { } while (!readsInWindow.isEmpty() && readsInWindow.peek().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.poll(); + readsInWindow.poll(); } return regions; @@ -413,280 +409,83 @@ public class SlidingWindow { * * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus * - * @param 
header the window header + * @param header the header to use * @param start the first header index to add to consensus * @param end the first header index NOT TO add to consensus - * @param strandType the strandedness that the synthetic read should be represented as having + * @param consensusType the consensus type to use * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. */ @Requires({"start >= 0 && (end >= start || end == 0)"}) @Ensures("result != null") - protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { - final ObjectArrayList reads = new ObjectArrayList(); + protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final ConsensusType consensusType) { + final ObjectArrayList reads = new ObjectArrayList<>(); - if ( start < end ) { - final ListIterator headerElementIterator = header.listIterator(start); + SyntheticRead consensus = null; + final ListIterator headerElementIterator = header.listIterator(start); + boolean wasInConsensus = false; - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end)); + for ( int currentPosition = start; currentPosition < end; currentPosition++ ) { - HeaderElement headerElement = headerElementIterator.next(); + if ( ! 
headerElementIterator.hasNext() ) + throw new IllegalStateException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end)); + final HeaderElement headerElement = headerElementIterator.next(); - if (headerElement.hasConsensusData()) { + if ( headerElement.hasConsensusData(consensusType) ) { + wasInConsensus = true; - // find the end of the consecutive consensus data in the window - final int endOfConsensus = findNextNonConsensusElement(header, start, end); - if (endOfConsensus <= start) - throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); + // add to running consensus + if ( consensus == null ) + consensus = createNewConsensus(consensusType, headerElement.getLocation()); - // add to running consensus and recurse - addToRunningConsensus(header, start, endOfConsensus, strandType); - reads.addAll(addToSyntheticReads(header, endOfConsensus, end, strandType)); + genericAddBaseToConsensus(consensus, headerElement.getBaseCounts(consensusType)); } else { // add any outstanding consensus data - reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); + if ( wasInConsensus ) { + reads.addAll(finalizeAndAdd(consensus, consensusType)); + consensus = null; + } - // find the end of the consecutive empty data in the window - final int endOfEmptyData = findNextConsensusElement(header, start, end); - if (endOfEmptyData <= start) - throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); - - // recurse out of the empty region - reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, strandType)); + wasInConsensus = false; } } + // add any outstanding consensus data + reads.addAll(finalizeAndAdd(consensus, consensusType)); + return reads; } + private SyntheticRead createNewConsensus(final ConsensusType consensusType, final int start) { + if ( consensusType == 
ConsensusType.FILTERED ) + return new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, start, hasIndelQualities, SyntheticRead.StrandType.STRANDLESS); + return new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, start, hasIndelQualities, consensusType == ConsensusType.POSITIVE_CONSENSUS ? SyntheticRead.StrandType.POSITIVE : SyntheticRead.StrandType.NEGATIVE); + } + /** - * Finalizes one or more synthetic reads. + * Finalizes a synthetic read. * + * @param consensus the consensus to finalize * @param type the synthetic reads you want to close - * @return a possibly null list of GATKSAMRecords generated by finalizing the synthetic reads + * @return a possibly empty list of GATKSAMRecords generated by finalizing the synthetic reads */ - private ObjectArrayList finalizeAndAdd(final ConsensusType type) { + private ObjectArrayList finalizeAndAdd(final SyntheticRead consensus, final ConsensusType type) { - final ObjectArrayList list = new ObjectArrayList(); + final ObjectArrayList list = new ObjectArrayList<>(); - if ( type == ConsensusType.CONSENSUS || type == ConsensusType.BOTH ) { - final GATKSAMRecord read = finalizeRunningConsensus(); - if ( read != null ) - list.add(read); - } + final GATKSAMRecord read; + if ( type == ConsensusType.FILTERED ) + read = finalizeFilteredDataConsensus(consensus); + else + read = finalizeRunningConsensus(consensus); - if ( type == ConsensusType.FILTERED || type == ConsensusType.BOTH ) { - final GATKSAMRecord read = finalizeFilteredDataConsensus(); - if ( read != null ) - list.add(read); - } + if ( read != null ) + list.add(read); return list; } - /** - * Looks for the next position without consensus data - * - * @param header the header to check - * @param start beginning of the filtered region - * @param upTo limit to search for another consensus element - * @return next position in local coordinates 
(relative to the windowHeader) with consensus data; otherwise, the start position - */ - private int findNextNonConsensusElement(final LinkedList header, final int start, final int upTo) { - final Iterator headerElementIterator = header.listIterator(start); - int index = start; - while (index < upTo) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("There are no more header elements in this window"); - - if (!headerElementIterator.next().hasConsensusData()) - break; - index++; - } - return index; - } - - /** - * Looks for the next position witho consensus data - * - * @param header the header to check - * @param start beginning of the filtered region - * @param upTo limit to search for another consensus element - * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position - */ - private int findNextConsensusElement(final LinkedList header, final int start, final int upTo) { - final Iterator headerElementIterator = header.listIterator(start); - int index = start; - while (index < upTo) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("There are no more header elements in this window"); - - if (headerElementIterator.next().hasConsensusData()) - break; - index++; - } - return index; - } - - /** - * Adds bases to the filtered data synthetic read. - * - * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData - * bases. 
- * - * @param header the window header - * @param start the first header index to add to consensus - * @param end the first header index NOT TO add to consensus - * @param strandType the strandedness that the synthetic read should be represented as having - */ - @Requires({"start >= 0 && (end >= start || end == 0)"}) - private void addToRunningConsensus(final LinkedList header, final int start, final int end, final SyntheticRead.StrandType strandType) { - if (runningConsensus == null) - runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), hasIndelQualities, strandType); - - final Iterator headerElementIterator = header.listIterator(start); - - for (int index = start; index < end; index++) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist"); - - final HeaderElement headerElement = headerElementIterator.next(); - if (!headerElement.hasConsensusData()) - throw new ReviewedStingException("No CONSENSUS data in " + index); - - genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts()); - } - } - - /** - * Adds bases to the running filtered data accordingly - * - * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus - * - * @param header the window header - * @param start the first header index to add to consensus - * @param end the first header index NOT TO add to consensus - * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. 
- */ - @Requires({"start >= 0 && (end >= start || end == 0)"}) - @Ensures("result != null") - protected ObjectArrayList addToFilteredReads(final LinkedList header, final int start, final int end) { - final ObjectArrayList reads = new ObjectArrayList(); - - if ( start < end ) { - final ListIterator headerElementIterator = header.listIterator(start); - - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end)); - - HeaderElement headerElement = headerElementIterator.next(); - - if (headerElement.hasFilteredData()) { - - // find the end of the consecutive filtered data in the window - final int endOfFiltered = findNextNonFilteredElement(header, start, end); - if (endOfFiltered <= start) - throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFiltered, start)); - - // add to running filtered consensus and recurse - addToFilteredData(header, start, endOfFiltered); - reads.addAll(addToFilteredReads(header, endOfFiltered, end)); - - } else { - - // add any outstanding filtered data - reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); - - // find the end of the consecutive empty data in the window - final int endOfEmptyData = findNextFilteredElement(header, start, end); - if (endOfEmptyData <= start) - throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); - - // recurse out of the empty region - reads.addAll(addToFilteredReads(header, endOfEmptyData, end)); - } - } - - return reads; - } - - /** - * Looks for the next position without consensus data - * - * @param header the header to check - * @param start beginning of the filtered region - * @param upTo limit to search for another consensus element - * @return next position in local coordinates (relative to the windowHeader) with consensus data; 
otherwise, the start position - */ - private int findNextNonFilteredElement(final LinkedList header, final int start, final int upTo) { - final Iterator headerElementIterator = header.listIterator(start); - int index = start; - while (index < upTo) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("There are no more header elements in this window"); - - if (!headerElementIterator.next().hasFilteredData()) - break; - index++; - } - return index; - } - - /** - * Looks for the next position witho consensus data - * - * @param header the header to check - * @param start beginning of the filtered region - * @param upTo limit to search for another consensus element - * @return next position in local coordinates (relative to the windowHeader) with consensus data; otherwise, the start position - */ - private int findNextFilteredElement(final LinkedList header, final int start, final int upTo) { - final Iterator headerElementIterator = header.listIterator(start); - int index = start; - while (index < upTo) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("There are no more header elements in this window"); - - if (headerElementIterator.next().hasFilteredData()) - break; - index++; - } - return index; - } - - - /** - * Adds bases to the filtered data synthetic read. - * - * Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData bases. 
- * - * @param header the window header - * @param start the first header index to add to consensus - * @param end the first header index NOT TO add to consensus - */ - @Requires({"start >= 0 && (end >= start || end == 0)"}) - @Ensures("result != null") - private void addToFilteredData(final LinkedList header, final int start, final int end) { - - if (filteredDataConsensus == null) - filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), hasIndelQualities, SyntheticRead.StrandType.STRANDLESS); - - ListIterator headerElementIterator = header.listIterator(start); - for (int index = start; index < end; index++) { - if (!headerElementIterator.hasNext()) - throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist"); - - final HeaderElement headerElement = headerElementIterator.next(); - - if (!headerElement.hasFilteredData()) - throw new ReviewedStingException("No filtered data in " + index); - - genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts()); - } - } - /** * Generic accessor to add base and qualities to a synthetic read * @@ -695,10 +494,10 @@ public class SlidingWindow { */ private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) { final BaseIndex base = baseCounts.baseIndexWithMostProbability(); - byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE); - byte qual = baseCounts.averageQualsOfBase(base); - byte insQual = baseCounts.averageInsertionQualsOfBase(base); - byte delQual = baseCounts.averageDeletionQualsOfBase(base); + final int count = baseCounts.countOfBase(base); + final byte qual = baseCounts.averageQualsOfBase(base); + final byte insQual = baseCounts.averageInsertionQualsOfBase(base); + final byte delQual = 
baseCounts.averageDeletionQualsOfBase(base); syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS()); } @@ -734,20 +533,24 @@ public class SlidingWindow { final int refStart = windowHeader.get(start).getLocation(); final int refStop = windowHeader.get(stop).getLocation(); - final ObjectList toRemove = new ObjectArrayList(); + final ObjectList toRemoveFromWindow = new ObjectArrayList<>(); + final ObjectList toEmit = new ObjectArrayList<>(); for ( final GATKSAMRecord read : readsInWindow ) { if ( read.getSoftStart() <= refStop ) { if ( read.getAlignmentEnd() >= refStart ) { - allReads.reads.add(read); + toEmit.add(read); removeFromHeader(windowHeader, read); } - toRemove.add(read); + toRemoveFromWindow.add(read); } } // remove all used reads - for ( final GATKSAMRecord read : toRemove ) + for ( final GATKSAMRecord read : toRemoveFromWindow ) readsInWindow.remove(read); + + // down-sample the unreduced reads if needed + allReads.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(toEmit) : toEmit); } return allReads; @@ -814,7 +617,7 @@ public class SlidingWindow { continue; if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) || - headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) > 1 ) + headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) != 1 ) return true; } @@ -833,14 +636,23 @@ public class SlidingWindow { @Ensures("result != null") protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions); + allReads.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1)); + return allReads; + } - final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed); 
- result.reads.addAll(downsampleCoverage > 0 ? downsampleVariantRegion(allReads.reads) : allReads.reads); - result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS)); - result.reads.addAll(addToFilteredReads(windowHeader, 0, allReads.stopPerformed + 1)); - result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); - - return result; // finalized reads will be downsampled if necessary + /** + * Adds reads for all possible strands (positive, negative, filtered) from the global windowHeader object + * + * @param start the start position (inclusive) + * @param end the end position (exclusive) + * @return non-null but possibly empty array list with reduced reads + */ + private ObjectArrayList addAllSyntheticReadTypes(final int start, final int end) { + final ObjectArrayList reads = new ObjectArrayList<>(); + reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.POSITIVE_CONSENSUS)); + reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.NEGATIVE_CONSENSUS)); + reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.FILTERED)); + return reads; } /* @@ -851,7 +663,7 @@ public class SlidingWindow { } private static final class CloseVariantRegionResult { - final private ObjectList reads = new ObjectArrayList(); + final private ObjectList reads = new ObjectArrayList<>(); private int stopPerformed; public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; } @@ -866,7 +678,7 @@ public class SlidingWindow { * @return a non-null set of reduced reads representing the finalized regions */ public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions, final boolean forceCloseFullRegions) { - final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); + final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet<>(new 
AlignmentStartWithNoTiesComparator()); if ( !regions.isEmpty() ) { int windowHeaderStart = getStartLocation(windowHeader); @@ -945,9 +757,9 @@ public class SlidingWindow { if (downsampleCoverage >= nReads) return allReads; - ReservoirDownsampler downsampler = new ReservoirDownsampler(downsampleCoverage); + ReservoirDownsampler downsampler = new ReservoirDownsampler<>(downsampleCoverage); downsampler.submit(allReads); - return new ObjectArrayList(downsampler.consumeFinalizedItems()); + return new ObjectArrayList<>(downsampler.consumeFinalizedItems()); } @@ -962,7 +774,7 @@ public class SlidingWindow { @Ensures("result != null") public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { // mark variant regions - ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); + ObjectSet finalizedReads = new ObjectAVLTreeSet<>(new AlignmentStartWithNoTiesComparator()); CompressionStash regions = new CompressionStash(); if (!windowHeader.isEmpty()) { @@ -970,48 +782,45 @@ public class SlidingWindow { regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true); finalizedReads = closeVariantRegions(regions, knownSnpPositions, true); - if (!windowHeader.isEmpty()) { - finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS)); - finalizedReads.addAll(addToFilteredReads(windowHeader, 0, windowHeader.size())); - finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up - } + if (!windowHeader.isEmpty()) + finalizedReads.addAll(addAllSyntheticReadTypes(0, windowHeader.size())); } - return new Pair, CompressionStash>(finalizedReads, regions); + return new Pair<>(finalizedReads, regions); } /** * generates the SAM record for the running consensus read and resets it (to null) * + * @param runningConsensus the consensus to finalize * @return the read contained in the running consensus or 
null */ - protected GATKSAMRecord finalizeRunningConsensus() { + protected GATKSAMRecord finalizeRunningConsensus(final SyntheticRead runningConsensus) { GATKSAMRecord finalizedRead = null; - if (runningConsensus != null) { - if (runningConsensus.size() > 0) + + if ( runningConsensus != null ) { + if ( runningConsensus.size() > 0 ) finalizedRead = runningConsensus.close(); else consensusCounter--; - - runningConsensus = null; } + return finalizedRead; } /** * generates the SAM record for the filtered data consensus and resets it (to null) * + * @param filteredDataConsensus the consensus to finalize * @return the read contained in the running consensus or null */ - protected GATKSAMRecord finalizeFilteredDataConsensus() { + protected GATKSAMRecord finalizeFilteredDataConsensus(final SyntheticRead filteredDataConsensus) { GATKSAMRecord finalizedRead = null; if (filteredDataConsensus != null) { if (filteredDataConsensus.size() > 0) finalizedRead = filteredDataConsensus.close(); else filteredDataConsensusCounter--; - - filteredDataConsensus = null; } return finalizedRead; } @@ -1021,7 +830,7 @@ public class SlidingWindow { private final static class SingleStrandConsensusData { final HeaderElementList consensus = new HeaderElementList(); - final ObjectList reads = new ObjectArrayList(); + final ObjectList reads = new ObjectArrayList<>(); } /** @@ -1042,6 +851,7 @@ public class SlidingWindow { // initialize the mapping from base (allele) to header final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); + alleleHeaderMap.defaultReturnValue(-1); for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ) { final int currentIndex = alleleHeaderMap.size(); if ( currentIndex > 1 ) @@ -1056,7 +866,7 @@ public class SlidingWindow { if ( alleleHeaderMap.size() != 2 ) throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + 
alleleHeaderMap.size()); - final ObjectList readsToRemove = new ObjectArrayList(); + final ObjectList readsToRemove = new ObjectArrayList<>(); for ( final GATKSAMRecord read : readsInWindow ) { @@ -1081,10 +891,10 @@ public class SlidingWindow { final byte base = read.getReadBases()[readPosOfHet]; // check which allele this read represents - final Integer allele = alleleHeaderMap.get(base); + final int allele = alleleHeaderMap.get(base); // ignore the read if it represents a base that's not part of the consensus - if ( allele != null ) { + if ( allele != -1 ) { // add to the appropriate polyploid header final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; header.reads.add(read); @@ -1096,7 +906,7 @@ public class SlidingWindow { readsInWindow.remove(read); // create the polyploid synthetic reads if we can - final ObjectList hetReads = new ObjectArrayList(); + final ObjectList hetReads = new ObjectArrayList<>(); // sanity check that no new "variant region" exists on just a single consensus strand due to softclips // or multi-allelic sites now that we've broken everything out into their component parts. if one does @@ -1125,10 +935,12 @@ public class SlidingWindow { * @param result list in which to store results */ protected void finalizeHetConsensus(final LinkedList header, final boolean isNegativeStrand, final ObjectList result) { - if ( header.size() > 0 ) - result.addAll(addToSyntheticReads(header, 0, header.size(), isNegativeStrand ? 
SyntheticRead.StrandType.NEGATIVE : SyntheticRead.StrandType.POSITIVE)); - if ( runningConsensus != null ) - result.add(finalizeRunningConsensus()); + if ( header.size() > 0 ) { + if ( isNegativeStrand ) + result.addAll(addToSyntheticReads(header, 0, header.size(), ConsensusType.NEGATIVE_CONSENSUS)); + else + result.addAll(addToSyntheticReads(header, 0, header.size(), ConsensusType.POSITIVE_CONSENSUS)); + } } private void addToHeader(LinkedList header, GATKSAMRecord read) { @@ -1221,7 +1033,8 @@ public class SlidingWindow { protected void actuallyUpdateHeaderForRead(final LinkedList header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) { final Iterator headerElementIterator = header.listIterator(startIndex); - final byte mappingQuality = (byte) read.getMappingQuality(); + final int mappingQuality = read.getMappingQuality(); + final boolean isNegativeStrand = read.getReadNegativeStrandFlag(); // iterator variables int locationIndex = startIndex; @@ -1249,14 +1062,15 @@ public class SlidingWindow { break; case D: - // deletions are added to the baseCounts with the read mapping quality as it's quality score + // deletions are added to the baseCounts with the read mapping quality as its quality score final int nDeletionBases = cigarElement.getLength(); + final byte MQbyte = mappingQuality > Byte.MAX_VALUE ? 
Byte.MAX_VALUE : (byte)mappingQuality; for ( int i = 0; i < nDeletionBases; i++ ) { headerElement = headerElementIterator.next(); if (removeRead) - headerElement.removeBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); + headerElement.removeBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand); else - headerElement.addBase(BaseUtils.Base.D.base, mappingQuality, mappingQuality, mappingQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false); + headerElement.addBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand); } locationIndex += nDeletionBases; break; @@ -1279,9 +1093,9 @@ public class SlidingWindow { final byte deletionQuality = readHasIndelQuals ? deletionQuals[readBaseIndex] : -1; if ( removeRead ) - headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); + headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip, isNegativeStrand); else - headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip); + headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip, isNegativeStrand); readBaseIndex++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java index ad6023579..bd69cbdbd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/FindCoveredIntervals.java @@ -101,11 +101,24 @@ public class FindCoveredIntervals extends ActiveRegionWalker { @Argument(fullName = "coverage_threshold", shortName = "cov", doc = "The minimum allowable coverage to be considered covered", required = false) private int coverageThreshold = 20; + @Argument(fullName = "minBaseQuality", shortName = "minBQ", doc = "The minimum allowable base quality score to be counted for coverage",required = false) + private int minBaseQuality = 0; + + @Argument(fullName = "minMappingQuality", shortName = "minMQ", doc = "The minimum allowable mapping quality score to be counted for coverage",required = false) + private int minMappingQuality = 0; + + + + @Override // Look to see if the region has sufficient coverage public ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - int depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage(); + int depth; + if(minBaseQuality == 0 && minMappingQuality == 0) + depth = context.getBasePileup().getBaseFilteredPileup(coverageThreshold).depthOfCoverage(); + else + depth = context.getBasePileup().getBaseAndMappingFilteredPileup(minBaseQuality,minMappingQuality).depthOfCoverage(); // note the linear probability scale return new ActivityProfileState(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index bde324e3c..fbf4b23c6 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -89,7 +89,12 @@ import java.util.*; *

*

Output

*

- * A modified VCF detailing each interval by sample + * A modified VCF detailing each interval by sample and information for each interval according to the thresholds used. + * Interval information includes GC Content, average interval depth, callable status among others. + * + * If you use the --missing option, you can get as a second output an intervals file with the loci that have missing data. + * This file can then be used as input to QualifyMissingIntervals for full qualification and interpretation of why + * the data is missing. *

*

*

Examples

@@ -117,6 +122,7 @@ public class DiagnoseTargets extends LocusWalker { private static final String AVG_INTERVAL_DP_KEY = "IDP"; private static final String LOW_COVERAGE_LOCI = "LL"; private static final String ZERO_COVERAGE_LOCI = "ZL"; + private static final String GC_CONTENT_KEY = "GC"; @Output(doc = "File to which interval statistics should be written") @@ -161,7 +167,7 @@ public class DiagnoseTargets extends LocusWalker { // at this point, all intervals in intervalMap overlap with this locus, so update all of them for (IntervalStratification intervalStratification : intervalMap.values()) - intervalStratification.addLocus(context); + intervalStratification.addLocus(context, ref); return 1L; } @@ -276,6 +282,7 @@ public class DiagnoseTargets extends LocusWalker { attributes.put(VCFConstants.END_KEY, interval.getStop()); attributes.put(AVG_INTERVAL_DP_KEY, stats.averageCoverage(interval.size())); + attributes.put(GC_CONTENT_KEY, stats.gcContent()); vcb = vcb.attributes(attributes); vcb = vcb.genotypes(genotypes); @@ -391,6 +398,7 @@ public class DiagnoseTargets extends LocusWalker { // INFO fields for overall data headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); headerLines.add(new VCFInfoHeaderLine(AVG_INTERVAL_DP_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. 
Sum of the depth in a loci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine(GC_CONTENT_KEY, 1, VCFHeaderLineType.Float, "GC Content of the interval")); headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java index 3b5a23d51..cd38d28c6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.diagnosetargets; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -57,9 +58,13 @@ final class IntervalStratification extends AbstractStratification { private final Map samples; private final GenomeLoc interval; private List callableStatuses; + private long gcCount = 0; public IntervalStratification(Set samples, GenomeLoc interval, ThresHolder thresholds) { super(thresholds); + + assert interval != null && interval.size() > 0; // contracts + this.interval = interval; this.samples = new HashMap(samples.size()); for (String sample : samples) @@ -83,8 +88,11 @@ final class IntervalStratification extends AbstractStratification { * This takes the input and manages passing the data to the SampleStatistics and Locus Statistics * * @param context The alignment context given from the walker + * @param ref The reference 
context given from the walker */ - public void addLocus(AlignmentContext context) { + public void addLocus(final AlignmentContext context, final ReferenceContext ref) { + assert ref != null; // contracts + ReadBackedPileup pileup = context.getBasePileup(); Map samplePileups = pileup.getPileupsForSamples(samples.keySet()); @@ -99,7 +107,11 @@ final class IntervalStratification extends AbstractStratification { sampleStratification.addLocus(context.getLocation(), samplePileup); } + gcCount += (ref.getBase() == 'G' || ref.getBase() == 'C') ? 1 : 0; + } + public double gcContent() { + return (double) gcCount / interval.size(); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java index a6cbc1da3..b088951e5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/ThresHolder.java @@ -77,7 +77,7 @@ final class ThresHolder { * If at any locus, a sample has more coverage than this, it will be reported as EXCESSIVE_COVERAGE */ @Argument(fullName = "maximum_coverage", shortName = "max", doc = "The maximum allowable coverage, used for calling EXCESSIVE_COVERAGE", required = false) - public int maximumCoverage = 700; + public int maximumCoverage = Integer.MAX_VALUE / 2; /** * If any sample has a paired read whose distance between alignment starts (between the pairs) is greater than this, it will be reported as BAD_MATE diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java index 5e3da5f4f..63c35fd65 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/Metrics.java @@ -47,29 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; /** - * Short one line description of the walker. - *

- *

- * [Long description of the walker] - *

- *

- *

- *

Input

- *

- * [Description of the Input] - *

- *

- *

Output

- *

- * [Description of the Output] - *

- *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T [walker name]
- *  
+ * Metrics class for the QualifyMissingInterval walker * * @author Mauricio Carneiro * @since 5/1/13 @@ -81,6 +59,8 @@ final class Metrics { private int reads; private int refs; + public Metrics() {} + void reads(int reads) {this.reads = reads;} void refs(int refs) {this.refs = refs;} @@ -91,6 +71,7 @@ final class Metrics { double gccontent() {return refs > 0 ? gccontent/refs : 0.0;} double baseQual() {return reads > 0 ? baseQual/reads : 0.0;} double mapQual() {return reads > 0 ? mapQual/reads : 0.0;} + double depth() {return refs > 0 ? (double) reads/refs : 0.0;} /** * Combines two metrics @@ -107,4 +88,13 @@ final class Metrics { return this; } + + // Test related constructor and methods + protected Metrics(double gccontent, double baseQual, double mapQual, int reads, int refs) { + this.gccontent = gccontent; + this.baseQual = baseQual; + this.mapQual = mapQual; + this.reads = reads; + this.refs = refs; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java index d0db3ef98..54fc6e97e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -47,28 +47,24 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Gather; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; -import 
org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.report.GATKReportGatherer; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.text.XReadLines; -import java.io.File; -import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.List; @@ -79,10 +75,12 @@ import java.util.List; *
    *
  • Average Base Quality
  • *
  • Average Mapping Quality
  • + *
  • Average Depth
  • *
  • GC Content
  • - *
  • Position in the target
  • - *
  • Coding Sequence / Intron
  • - *
  • Length of the uncovered area
  • + *
  • Position in the target (Integer.MIN_VALUE if no overlap)
  • + *
Length of the overlapping target (-1 if no overlap)
  • + *
  • Coding Sequence / Intron (optional)
  • + *
  • Length of the uncovered interval
  • *
* *

Input

@@ -92,7 +90,7 @@ import java.util.List; * *

Output

*

- * GC content calculations per interval. + * GC content, distance from the end of the target, coding sequence intersection, mapping and base quality averages and average depth per "missing" interval. *

* *

Example

@@ -110,32 +108,96 @@ import java.util.List; */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @By(DataSource.REFERENCE) +@PartitionBy(PartitionType.INTERVAL) public final class QualifyMissingIntervals extends LocusWalker implements NanoSchedulable { + /** + * A single GATKReport table with the qualifications on why the intervals passed by the -L argument were missing. + */ + @Gather(GATKReportGatherer.class) @Output protected PrintStream out; + /** + * List of targets used in the experiment. This file will be used to calculate the distance your missing + * intervals are to the targets (usually exons). Typically this is your hybrid selection targets file + * (e.g. Agilent exome target list) + */ @Argument(shortName = "targets", required = true) - public File targetsFile; + public String targetsFile; - @Argument(shortName = "cds", required = false) - public File cdsFile = null; + /** + * List of baits to distinguish untargeted intervals from those that are targeted but not covered + */ + @Argument(shortName = "baits", required = false) + public String baitsFile = null; + + /** + * This value will be used to determine whether or not an interval had too high or too low GC content to be + * sequenced. This is only applied if there was not enough data in the interval. + */ + @Argument(doc = "upper and lower bound for an interval to be considered high/low GC content", + shortName = "gc", required = false) + public double gcThreshold = 0.3; + + /** + * The coverage of a missing interval may determine whether or not an interval is sequenceable. A low coverage will + * trigger gc content, mapping, base qualities and other checks to figure out why this interval was deemed + * unsequenceable. 
+ */ + @Argument(doc = "minimum coverage to be considered sequenceable", + shortName = "cov", required = false) + public int coverageThreshold = 20; + + /** + * An average mapping quality above this value will determine the interval to be mappable. + */ + @Argument(doc = "minimum mapping quality for it to be considered usable", + shortName = "mmq", required = false) + public byte mappingThreshold = 20; + + /** + * An average base quality above this value will rule out the possibility of context specific problems with the + * sequencer. + */ + @Argument(doc = "minimum base quality for it to be considered usable", + shortName = "mbq", required = false) + public byte qualThreshold = 20; + + /** + * Intervals that are too small generate biased analysis. For example an interval of size 1 will have GC content + * 1 or 0. To avoid misinterpreting small intervals, all intervals below this threshold will be ignored in the + * interpretation. + */ + @Argument(doc = "minimum interval length to be considered", + shortName = "size", required = false) + public byte intervalSizeThreshold = 10; + + enum Interpretation { + UNKNOWN, + UNMAPPABLE, + UNSEQUENCEABLE, + GCCONTENT, + NO_DATA, + SMALL_INTERVAL + } GATKReport simpleReport; - GenomeLocSortedSet target; - GenomeLocSortedSet cds; + GenomeLocSortedSet targets; + GenomeLocSortedSet baits; public boolean isReduceByInterval() { return true; } public void initialize() { - simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "IN", "GC", "BQ", "MQ", "TP", "CD", "LN"); + // if cds file is not provided, just use the targets file (no harm done) + if (baitsFile == null) + baitsFile = targetsFile; + + simpleReport = GATKReport.newSimpleReport("QualifyMissingIntervals", "INTERVAL", "GC", "BQ", "MQ", "DP", "POS_IN_TARGET", "TARGET_SIZE", "BAITED", "MISSING_SIZE", "INTERPRETATION"); final GenomeLocParser parser = getToolkit().getGenomeLocParser(); - target = new GenomeLocSortedSet(parser); - cds = new 
GenomeLocSortedSet(parser); - parseFile(targetsFile, target, parser); - if (cdsFile != null) - parseFile(cdsFile, cds, parser); + targets = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, targetsFile)); + baits = new GenomeLocSortedSet(parser, IntervalUtils.intervalFileToList(parser, baitsFile)); } public Metrics reduceInit() { @@ -156,7 +218,7 @@ public final class QualifyMissingIntervals extends LocusWalker baseQual += qual; } double mapQual = 0.0; - for (byte qual : pileup.getMappingQuals()) { + for (int qual : pileup.getMappingQuals()) { mapQual += qual; } @@ -176,53 +238,90 @@ public final class QualifyMissingIntervals extends LocusWalker public void onTraversalDone(List> results) { for (Pair r : results) { - GenomeLoc interval = r.getFirst(); - Metrics metrics = r.getSecond(); + final GenomeLoc interval = r.getFirst(); + final Metrics metrics = r.getSecond(); + final List overlappingIntervals = targets.getOverlapping(interval); + simpleReport.addRow( interval.toString(), metrics.gccontent(), metrics.baseQual(), metrics.mapQual(), - getPositionInTarget(interval), - cds.overlaps(interval), - interval.size() + metrics.depth(), + getPositionInTarget(interval, overlappingIntervals), + getTargetSize(overlappingIntervals), + baits.overlaps(interval), + interval.size(), + interpret(metrics, interval) ); } simpleReport.print(out); out.close(); } - private static GenomeLoc parseInterval(String s, GenomeLocParser parser) { - if (s.isEmpty()) { - return null; - } - String[] first = s.split(":"); - if (first.length == 2) { - String[] second = first[1].split("\\-"); - return parser.createGenomeLoc(first[0], Integer.decode(second[0]), Integer.decode(second[1])); - } else { - throw new UserException.BadInput("Interval doesn't parse correctly: " + s); + protected static int getPositionInTarget(final GenomeLoc interval, final List targets) { + if (targets.size() > 0) { + final GenomeLoc target = targets.get(0); + + // interval is larger on both ends 
than the target -- return the maximum distance to either side as a negative number. (min of 2 negative numbers) + if (interval.getStart() < target.getStart() && interval.getStop() > target.getStop()) + return Math.min(target.getStart() - interval.getStart(), target.getStop() - interval.getStop()); + + // interval is a left overlap -- return a negative number representing the distance between the two starts + else if (interval.getStart() < target.getStart()) + return interval.getStart() - target.getStart(); + + // interval is a right overlap -- return a negative number representing the distance between the two stops + else if (interval.getStop() > target.getStop()) + return target.getStop() - interval.getStop(); + + // interval is fully contained -- return the smallest distance to the edge of the target (left or right) as a positive number + return Math.min(interval.getStart() - target.getStart(), target.getStop() - interval.getStop()); } + // if there is no overlapping interval, return int min value. + return Integer.MIN_VALUE; } - private void parseFile(File file, GenomeLocSortedSet set, GenomeLocParser parser) { - try { - for (String s : new XReadLines(file) ) { - GenomeLoc interval = parseInterval(s, parser); - if (interval != null) - set.add(interval, true); - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } + private int getTargetSize(final List overlappingIntervals) { + return overlappingIntervals.size() > 0 ? overlappingIntervals.get(0).size() : -1; } - private int getPositionInTarget(GenomeLoc interval) { - final List hits = target.getOverlapping(interval); - int result = 0; - for (GenomeLoc hit : hits) { - result = interval.getStart() - hit.getStart(); // if there are multiple hits, we'll get the last one. 
+ String interpret(final Metrics metrics, final GenomeLoc interval) { + if (interval.size() < intervalSizeThreshold) { + return Interpretation.SMALL_INTERVAL.toString(); } - return result; + else if (metrics.depth() == 0.0) { + return Interpretation.NO_DATA.toString(); + } + return trim(checkMappability(metrics) + checkGCContent(metrics) + checkContext(metrics)); } + + String checkMappability(Metrics metrics) { + return metrics.depth() >= coverageThreshold && metrics.mapQual() < mappingThreshold ? + Interpretation.UNMAPPABLE + ", " : ""; + } + + String checkGCContent(Metrics metrics) { + return metrics.depth() < coverageThreshold && (metrics.gccontent() < gcThreshold || metrics.gccontent() > 1.0-gcThreshold) ? + Interpretation.GCCONTENT + ", " : ""; + } + + String checkContext(Metrics metrics) { + return metrics.depth() < coverageThreshold && metrics.baseQual() < qualThreshold ? + Interpretation.UNSEQUENCEABLE + ", " : ""; + } + + String trim (String s) { + if (s.isEmpty()) + return Interpretation.UNKNOWN.toString(); + + s = s.trim(); + if (s.endsWith(",")) + s = s.substring(0, s.length() - 1); + return s; + } + + + + } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index ddf47805f..6f16a704f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -76,16 +76,13 @@ public class ConsensusAlleleCounter { private final int minIndelCountForGenotyping; private final boolean doMultiAllelicCalls; private final double minFractionInOneSample; - private final GenomeLocParser locParser; - public ConsensusAlleleCounter(final GenomeLocParser locParser, - final boolean doMultiAllelicCalls, + public ConsensusAlleleCounter(final boolean doMultiAllelicCalls, final int 
minIndelCountForGenotyping, final double minFractionInOneSample) { this.minIndelCountForGenotyping = minIndelCountForGenotyping; this.doMultiAllelicCalls = doMultiAllelicCalls; this.minFractionInOneSample = minFractionInOneSample; - this.locParser = locParser; } /** @@ -289,7 +286,7 @@ public class ConsensusAlleleCounter { if (vcs.isEmpty()) return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false); return mergedVC.getAlleles(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 93df9f091..f3b26f295 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.DiploidGenotype; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -94,7 +95,7 @@ import static java.lang.Math.pow; */ public 
class DiploidSNPGenotypeLikelihoods implements Cloneable { - public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; + public final static double DEFAULT_PCR_ERROR_RATE = FragmentUtils.DEFAULT_PCR_ERROR_RATE; protected final static int FIXED_PLOIDY = 2; protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index 9c4694955..3cee8f2d8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -108,7 +108,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener final List allAllelesToUse){ - List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true); + List alleles = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, UAC,true); if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE) alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 0f3f7739d..4a3231b3e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -89,9 +89,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood 
protected static List computeConsensusAlleles(final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenomeLocParser locParser, final UnifiedArgumentCollection UAC) { - ConsensusAlleleCounter counter = new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); + ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE); return counter.computeConsensusAlleles(ref, contexts, contextType); } @@ -113,7 +112,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // starting a new site: clear allele list haplotypeMap.clear(); perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods - alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); + alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, UAC, ignoreSNPAllelesWhenGenotypingIndels); if (alleleList.isEmpty()) return null; } @@ -212,7 +211,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final ReferenceContext ref, final Map contexts, final AlignmentContextUtils.ReadOrientation contextType, - final GenomeLocParser locParser, final UnifiedArgumentCollection UAC, final boolean ignoreSNPAllelesWhenGenotypingIndels) { @@ -244,7 +242,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } else { - alleles = computeConsensusAlleles(ref, contexts, contextType, locParser, UAC); + alleles = computeConsensusAlleles(ref, contexts, contextType, UAC); } return alleles; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 
360f88e51..f94baf09f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -147,13 +147,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // if we only want variants, then we don't need to calculate genotype likelihoods if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) return builder.make(); - // if user requires all PLs at all sites, add all possible alt alleles - else if (UAC.annotateAllSitesWithPLs) { - for ( final byte base : BaseUtils.BASES ) { - if ( base != refBase ) - alleles.add(Allele.create(base)); - } - } else // otherwise, choose any alternate allele (it doesn't really matter) @@ -199,6 +192,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(myLikelihoods, false, true); gb.PL(genotypeLikelihoods); gb.DP(sampleData.depth); + if (UAC.annotateAllSitesWithPLs) + gb.attribute(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true))); genotypes.add(gb.make()); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 4fae3d6e3..04c5587c3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -95,7 +95,11 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a 
base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 17; - @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) + /** + * If the fraction of reads with deletions spanning a locus is greater than this value, the site will not be considered callable and will be skipped. + * To disable the use of this parameter, set its value to >1. + */ + @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable", required = false) public Double MAX_DELETION_FRACTION = 0.05; /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 54fcad1df..17d0217f0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -183,6 +183,10 @@ public class UnifiedGenotyper extends LocusWalker, Unif @Output(doc="File to which variants should be written") protected VariantContextWriter writer = null; + @Advanced + @Argument(fullName = "onlyEmitSamples", shortName = "onlyEmitSamples", doc = "If provided, only these samples will be emitted into the VCF, regardless of which samples are present in the BAM file", required = false) + protected Set onlyEmitSamples = Collections.emptySet(); + @Hidden @Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false) protected PrintStream verboseWriter = null; @@ -288,9 +292,16 @@ public class UnifiedGenotyper extends LocusWalker, Unif // and perform any necessary initialization/validation steps 
annotationEngine.invokeAnnotationInitializationMethods(headerInfo); - writer.writeHeader(new VCFHeader(headerInfo, samples)); - - + final Set samplesForHeader; + if ( ! onlyEmitSamples.isEmpty() ) { + // make sure that onlyEmitSamples is a subset of samples + if ( ! samples.containsAll(onlyEmitSamples) ) + throw new UserException.BadArgumentValue("onlyEmitSamples", "must be a subset of the samples in the BAM files but it wasn't"); + samplesForHeader = onlyEmitSamples; + } else { + samplesForHeader = samples; + } + writer.writeHeader(new VCFHeader(headerInfo, samplesForHeader)); } public static Set getHeaderInfo(final UnifiedArgumentCollection UAC, @@ -318,6 +329,9 @@ public class UnifiedGenotyper extends LocusWalker, Unif headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth")); } + if (UAC.annotateAllSitesWithPLs) { + headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. 
Ordering is always PL for AA AC CC GA GC GG TA TC TG TT.")); + } VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.DOWNSAMPLED_KEY, VCFConstants.MLE_ALLELE_COUNT_KEY, @@ -384,7 +398,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif try { // we are actually making a call sum.nCallsMade++; - writer.add(call); + writer.add(subsetToEmitSamples(call)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException(e.getMessage()); } @@ -400,6 +414,19 @@ public class UnifiedGenotyper extends LocusWalker, Unif return sum; } + /** + * Subset the VariantContext down to just the emitting samples, if onlyEmitSamples has been provided + * @param fullVC the VariantContext containing calls for all samples in the BAM files + * @return a VariantContext that has been appropriately reduced to a subset of samples, if required + */ + private VariantContext subsetToEmitSamples(final VariantContext fullVC) { + if ( onlyEmitSamples.isEmpty() ) { + return fullVC; + } else { + return GATKVariantContextUtils.trimAlleles(fullVC.subContextFromSamples(onlyEmitSamples, false), false, true); + } + } + public void onTraversalDone(UGStatistics sum) { if ( metricsWriter != null ) { metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 9f3368cf8..5c6e9dc01 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -79,6 +79,7 @@ public class UnifiedGenotyperEngine { private static final String GPSTRING = "GENERALPLOIDY"; public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; + public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL"; public static final 
double HUMAN_SNP_HETEROZYGOSITY = 1e-3; public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; @@ -373,7 +374,7 @@ public class UnifiedGenotyperEngine { final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java index 063e3b218..f1db5bcd7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTrimmer.java @@ -86,6 +86,7 @@ class ActiveRegionTrimmer { if ( maxDistanceInExtensionForGenotyping < 0 ) throw new IllegalArgumentException("maxDistanceInExtensionForGenotyping must be >= 0 but got " + maxDistanceInExtensionForGenotyping); if ( parser == null ) throw new IllegalArgumentException("parser cannot be null"); + logger.debug("Trimmer created with parameters " + logTrimming + " " + snpPadding + " " + nonSnpPadding + " " + maxDistanceInExtensionForGenotyping); this.logTrimming = logTrimming; this.snpPadding = snpPadding; this.nonSnpPadding = nonSnpPadding; @@ -101,28 +102,35 @@ class ActiveRegionTrimmer { * * @param region our full active region * @param allVariantsWithinExtendedRegion all of the variants found in the entire region, sorted by their start position + * @param emitReferenceConfidence are we going to estimate the reference confidence with this active region? 
* @return a new ActiveRegion trimmed down to just what's needed for genotyping, or null if we couldn't do this successfully */ - public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion) { + public ActiveRegion trimRegion(final ActiveRegion region, final TreeSet allVariantsWithinExtendedRegion, final boolean emitReferenceConfidence) { + if ( allVariantsWithinExtendedRegion.isEmpty() ) // no variants, so just return the current region return null; - final List withinActiveRegion = new LinkedList(); - int pad = snpPadding; + final List withinActiveRegion = new LinkedList<>(); + boolean foundNonSnp = false; GenomeLoc trimLoc = null; for ( final VariantContext vc : allVariantsWithinExtendedRegion ) { final GenomeLoc vcLoc = parser.createGenomeLoc(vc); if ( region.getLocation().overlapsP(vcLoc) ) { if ( ! vc.isSNP() ) // if anything isn't a SNP use the bigger padding - pad = nonSnpPadding; + foundNonSnp = true; trimLoc = trimLoc == null ? vcLoc : trimLoc.endpointSpan(vcLoc); withinActiveRegion.add(vc); } } + final int pad = ( emitReferenceConfidence || foundNonSnp ? nonSnpPadding : snpPadding ); // we don't actually have anything in the region after removing variants that don't overlap the region's full location if ( trimLoc == null ) return null; +// final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); + // Try to have one kmer before and after any event. 
+ + final GenomeLoc regionLoc = region.getLocation(); final GenomeLoc maxSpan = parser.createPaddedGenomeLoc(region.getLocation(), maxDistanceInExtensionForGenotyping); final GenomeLoc idealSpan = parser.createPaddedGenomeLoc(trimLoc, pad); final GenomeLoc finalSpan = maxSpan.intersect(idealSpan); @@ -130,6 +138,7 @@ class ActiveRegionTrimmer { final ActiveRegion trimmedRegion = region.trim(finalSpan); if ( logTrimming ) { logger.info("events : " + withinActiveRegion); + logger.info("region : " + regionLoc); logger.info("trimLoc : " + trimLoc); logger.info("pad : " + pad); logger.info("idealSpan : " + idealSpan); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java index f07dbb392..658ffc10e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResult.java @@ -47,6 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; /** * Result of assembling, with the resulting graph and status @@ -57,6 +58,7 @@ import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; */ public class AssemblyResult { private final Status status; + private ReadThreadingGraph threadingGraph; private final SeqGraph graph; /** @@ -72,9 +74,25 @@ public class AssemblyResult { this.graph = graph; } + /** + * Sets the threading-graph associated with this assembly-result. 
+ */ + public void setThreadingGraph(final ReadThreadingGraph threadingGraph) { + this.threadingGraph = threadingGraph; + } + + public ReadThreadingGraph getThreadingGraph() { + return threadingGraph; + } + public Status getStatus() { return status; } public SeqGraph getGraph() { return graph; } + public int getKmerSize() { + return graph.getKmerSize(); + } + + /** * Status of the assembly result */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java new file mode 100644 index 000000000..091c09e8d --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSet.java @@ -0,0 +1,466 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.*; + +/** + * Collection of read assembly using several kmerSizes. + * + *

+ * There could be a different assembly per each kmerSize. In turn, haplotypes are result of one of those + * assemblies. + *

+ * + *

+ * Where there is more than one possible kmerSize that generates a haplotype we consider the smaller one. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class AssemblyResultSet { + + private final Map assemblyResultByKmerSize; + private final Set haplotypes; + private final Map assemblyResultByHaplotype; + private ActiveRegion regionForGenotyping; + private byte[] fullReferenceWithPadding; + private GenomeLoc paddedReferenceLoc; + private boolean variationPresent; + private Haplotype refHaplotype; + private boolean wasTrimmed = false; + private final CountSet kmerSizes; + + /** + * Constructs a new empty assembly result set. + */ + public AssemblyResultSet() { + assemblyResultByKmerSize = new LinkedHashMap<>(4); + haplotypes = new LinkedHashSet<>(10); + assemblyResultByHaplotype = new LinkedHashMap<>(10); + kmerSizes = new CountSet(4); + } + + /** + * Trims an assembly result set down based on a new set of trimmed haplotypes. + * + * @param originalByTrimmedHaplotypes map from trimmed to original haplotypes. + * @param trimmedActiveRegion the trimmed down active region. + * + * @throws NullPointerException if any argument is {@code null} or + * if there are {@code null} entries in {@code originalByTrimmedHaplotypes} for trimmed haplotype keys. + * @throws IllegalStateException if this result set has no reference haplotype, or if there is none amongst the trimmed ones. + * + * + * @return never {@code null}, a new trimmed assembly result set. 
+ */ + public AssemblyResultSet trimTo(final ActiveRegion trimmedActiveRegion, + final Map originalByTrimmedHaplotypes) { + if (refHaplotype == null) throw new IllegalStateException(); + if (trimmedActiveRegion == null) throw new NullPointerException(); + final AssemblyResultSet result = new AssemblyResultSet(); + + for (final Haplotype trimmed : originalByTrimmedHaplotypes.keySet()) { + final Haplotype original = originalByTrimmedHaplotypes.get(trimmed); + if (original == null) + throw new NullPointerException("all trimmed haplotypes must have an original one"); + final AssemblyResult as = assemblyResultByHaplotype.get(original); + if (as == null) result.add(trimmed); else result.add(trimmed, as); + } + + result.setRegionForGenotyping(trimmedActiveRegion); + result.setFullReferenceWithPadding(this.fullReferenceWithPadding); + result.setPaddedReferenceLoc(this.paddedReferenceLoc); + if (result.refHaplotype == null) + throw new IllegalStateException("missing reference haplotype in the trimmed set"); + result.wasTrimmed = true; + return result; + } + + /** + * Queries the reference haplotype in the result set. + * @return {@code null} if none has been added yet, otherwise the reference haplotype. + */ + public Haplotype getReferenceHaplotype() { + return refHaplotype; + } + + /** + * Checks whether there is any variation present in the assembly result set. + * + *

+ * This is equivalent to whether there is more than one haplotype. + *

+ * + * @return {@code true} if there is variation present, {@code false} otherwise. + */ + public boolean isVariationPresent() { + return variationPresent && haplotypes.size() > 1; + } + + /** + * Dumps debugging information into a print-writer. + * + * @param pw where to dump the information. + * + * @throws NullPointerException if {@code pw} is {@code null}. + */ + public void debugDump(final PrintWriter pw) { + if (getHaplotypeList().size() == 0) { + return; + } + pw.println("Active Region " + this.regionForGenotyping.getLocation()); + pw.println("Extended Act Region " + this.getRegionForGenotyping().getExtendedLoc()); + pw.println("Ref haplotype coords " + getHaplotypeList().get(0).getGenomeLocation()); + pw.println("Haplotype count " + haplotypes.size()); + final Map kmerSizeToCount = new HashMap<>(); + + for (final Map.Entry e : assemblyResultByHaplotype.entrySet()) { + final AssemblyResult as = e.getValue(); + final int kmerSize = as.getGraph().getKmerSize(); + if (kmerSizeToCount.containsKey(kmerSize)) { + kmerSizeToCount.put(kmerSize,kmerSizeToCount.get(kmerSize) + 1); + } else { + kmerSizeToCount.put(kmerSize,1); + } + } + pw.println("Kmer sizes count " + kmerSizeToCount.entrySet().size() ); + Integer[] kmerSizes = new Integer[kmerSizeToCount.size()]; + kmerSizes = kmerSizeToCount.keySet().toArray(kmerSizes); + Arrays.sort(kmerSizes); + pw.println("Kmer sizes values " + Arrays.toString(kmerSizes)); + for (int size : kmerSizes) { + pw.println("Kmer size " + size + " count " + kmerSizeToCount.get(size)); + } + } + + /** + * Adds a haplotype to the result set without indicating a generating assembly result. + * + *

+ * It is possible to call this method with the same haplotype several times. In that case the second and further + * calls won't have any effect (thus returning {@code false}). + *

+ * + * @param h the haplotype to add to the assembly result set. + * + * @throws NullPointerException if {@code h} is {@code null} + * @throws IllegalArgumentException if {@code h} does not have a genome location. + * + * @return {@code true} if the assembly result set has been modified as a result of this call. + */ + public boolean add(final Haplotype h) { + if (h == null) throw new NullPointerException("input haplotype cannot be null"); + if (h.getGenomeLocation() == null) + throw new IllegalArgumentException("the haplotype provided must have a genomic location"); + if (haplotypes.contains(h)) + return false; + haplotypes.add(h); + updateReferenceHaplotype(h); + return true; + } + + /** + * Adds simultaneously a haplotype and the generating assembly-result. + * + *

+ * Haplotypes and their assembly-result can be added multiple times although just the first call will have + * any effect (return value is {@code true}). + *

+ * + * + * @param h haplotype to add. + * @param ar assembly-result that is assumed to have given rise to that haplotype. + * + * @throws NullPointerException if {@code h} or {@code ar} is {@code null}. + * @throws IllegalArgumentException if {@code h} has no defined genome location. + * @throws IllegalStateException if {@code h} was already added with a different assembly result. + * + * @return {@code true} iff this call changes the assembly result set. + */ + public boolean add(final Haplotype h, final AssemblyResult ar) { + if (h == null) throw new NullPointerException("input haplotype cannot be null"); + if (ar == null) throw new NullPointerException("input assembly-result cannot be null"); + if (h.getGenomeLocation() == null) + throw new IllegalArgumentException("the haplotype provided must have a genomic location"); + + final boolean assemblyResultAdditionReturn = add(ar); + + if (haplotypes.contains(h)) { + final AssemblyResult previousAr = assemblyResultByHaplotype.get(h); + if (previousAr == null) { + assemblyResultByHaplotype.put(h, ar); + return true; + } else if (!previousAr.equals(ar)) + throw new IllegalStateException("there is already a different assembly result for the input haplotype"); + else + return assemblyResultAdditionReturn; + } else { + haplotypes.add(h); + assemblyResultByHaplotype.put(h,ar); + updateReferenceHaplotype(h); + if (h.isNonReference()) variationPresent = true; + return true; + } + } + + /** + * Adds an assembly-result object. + * + * @param ar the assembly result to add. + * + * @throws NullPointerException if {@code ar} is {@code null}. + * @throws IllegalStateException if a different assembly result with the same kmerSize was already added. + * @return {@code true} iff this addition changed the assembly result set. 
+ */ + public boolean add(final AssemblyResult ar) { + if (ar == null) + throw new NullPointerException("input assembly-result cannot be null"); + final int kmerSize = ar.getKmerSize(); + if (assemblyResultByKmerSize.containsKey(kmerSize)) { + if (!assemblyResultByKmerSize.get(kmerSize).equals(ar)) + throw new IllegalStateException("a different assembly result with the same kmerSize was already added"); + return false; + } else { + assemblyResultByKmerSize.put(kmerSize, ar); + kmerSizes.add(kmerSize); + return true; + } + } + + /** + * Returns the current region for genotyping. + * + * @return might be {@code null}. + */ + public ActiveRegion getRegionForGenotyping() { + return regionForGenotyping; + } + + /** + * Sets the region for genotyping. + * + * @param regionForGenotyping the new value. + */ + public void setRegionForGenotyping(final ActiveRegion regionForGenotyping) { + this.regionForGenotyping = regionForGenotyping; + } + + /** + * Returns the current full reference with padding. + * + * The array held by this object is returned directly, not a copy. + * + * @return might be {@code null}. + */ + public byte[] getFullReferenceWithPadding() { + return fullReferenceWithPadding; + } + + /** + * Sets the full reference with padding base sequence. + * + * The array reference is stored as is, no copy is made. + * + * @param fullReferenceWithPadding the new value. + */ + public void setFullReferenceWithPadding(final byte[] fullReferenceWithPadding) { + this.fullReferenceWithPadding = fullReferenceWithPadding; + } + + /** + * Returns the padded reference location. + * + * @return might be {@code null}. + */ + public GenomeLoc getPaddedReferenceLoc() { + return paddedReferenceLoc; + } + + /** + * Changes the padded reference location. + * @param paddedReferenceLoc the new value. + */ + public void setPaddedReferenceLoc(final GenomeLoc paddedReferenceLoc) { + this.paddedReferenceLoc = paddedReferenceLoc; + } + + /** + * Returns the number of haplotypes in the assembly result set. + * @return {@code 0} or greater. + */ + public int getHaplotypeCount() { + return haplotypes.size(); + } + + /** + * Returns the haplotypes as a list. + * + *

+ * The result is a fixed-size snapshot list: it cannot grow or shrink, and changes to it are not reflected in this set. + *

+ * + * @return never {@code null}, but perhaps a empty list if no haplotype was generated during assembly. + */ + public List getHaplotypeList() { + return Arrays.asList(haplotypes.toArray(new Haplotype[haplotypes.size()])); + } + + /** + * Returns the maximum kmerSize available. + * + * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. + * + * @return greater than 0. + */ + public int getMaximumKmerSize() { + if (kmerSizes.size() == 0) + throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); + return kmerSizes.max(); + } + + /** + * Indicates whether there are more than one kmerSize in the set. + * + * @return {@code true} iff there is more than one kmerSize assembly in the set. + */ + public boolean hasMultipleKmerSizes() { + return kmerSizes.size() > 1; + } + + /** + * Returns the minimum kmerSize available. + * + * @throws IllegalStateException if no assembly-result was added to the set, thus there is no kmerSize. + * + * @return greater than 0. + */ + public int getMinimumKmerSize() { + if (kmerSizes.size() == 0) + throw new IllegalStateException("there is yet no kmerSize in this assembly result set"); + return kmerSizes.min(); + } + + /** + * Returns a read-threading graph in the assembly set that has a particular kmerSize. + * + * @param kmerSize the requested kmerSize. + * + * @return {@code null} if there is no read-threading-graph amongst assembly results with that kmerSize. + */ + public ReadThreadingGraph getUniqueReadThreadingGraph(final int kmerSize) { + final AssemblyResult assemblyResult = assemblyResultByKmerSize.get(kmerSize); + if (assemblyResult == null) return null; + return assemblyResult.getThreadingGraph(); + } + + /** + * Checks whether this assembly result set was trimmed. + * + * @return {@code true} iff this assembly result set was trimmed. 
+ */ + public boolean wasTrimmed() { + return wasTrimmed; + } + + /** + * Marks the assembly as not having variation even if it has more than one haplotype. + */ + public void resetVariationPresent() { + variationPresent = false; + } + + /** + * Dumps debugging information into a logger, emitting each non-empty line of the dump as a separate debug message. + * + * @param logger where to dump the information. + * + * @throws NullPointerException if {@code logger} is {@code null}. + */ + public void debugDump(final Logger logger) { + final StringWriter sw = new StringWriter(); + final PrintWriter pw = new PrintWriter(sw); + debugDump(pw); + final String str = sw.toString(); + final String[] lines = str.split("\n"); + for (final String line : lines) { + if (line.isEmpty()) { + continue; + } + logger.debug(line); + } + } + + /** + * Checks whether a new haplotype that has already been added to the {@link #haplotypes} collection is the + * reference haplotype and updates {@link #refHaplotype} accordingly. + * + *

+ * This method assumes that the calling code has verified that the haplotype was not already in {@link #haplotypes}, + * i.e. that it is really a new one. Otherwise it will result in an exception if it happens to be a reference + * haplotype and this has already been set. This is the case even if the new haplotype and the current reference + * are equal. + *

+ * + * @param newHaplotype the new haplotype. + * @throws NullPointerException if {@code newHaplotype} is {@code null}. + * @throws IllegalStateException if there is already a reference haplotype. + */ + private void updateReferenceHaplotype(final Haplotype newHaplotype) { + if (!newHaplotype.isReference()) return; + if (refHaplotype == null) + refHaplotype = newHaplotype; + else // assumes that we have checked wether the haplotype is already in the collection and so is no need to check equality. + throw new IllegalStateException("the assembly-result-set already have a reference haplotype that is different"); + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java deleted file mode 100644 index 1a59cdb63..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssembler.java +++ /dev/null @@ -1,263 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; - -/** - * DeBruijn assembler for the HaplotypeCaller - * - * User: ebanks, rpoplin - * Date: Mar 14, 2011 - */ -public class DeBruijnAssembler extends LocalAssemblyEngine { - private final static Logger logger = Logger.getLogger(DeBruijnAssembler.class); - - // TODO -- this number is very low, and limits our ability to explore low-frequency variants. 
It should - // TODO -- be increased to a large number of eliminated altogether when moving to the bubble caller where - // TODO -- we are no longer considering a combinatorial number of haplotypes as the number of bubbles increases - private final static int NUM_PATHS_PER_GRAPH = 25; - private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers - private static final int GRAPH_KMER_STEP = 6; - private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000; - - private final int minKmer; - private final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - - protected DeBruijnAssembler() { - this(25, -1); - } - - public DeBruijnAssembler(final int minKmer, final int onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) { - super(NUM_PATHS_PER_GRAPH); - this.minKmer = minKmer; - this.onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms = onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms; - } - - @Override - protected List assemble(final List reads, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { - final List results = new LinkedList<>(); - - final int maxKmer = ReadUtils.getMaxReadLength(reads) - KMER_OVERLAP - 1; - if( maxKmer < minKmer) { - // Reads are too small for assembly so don't try to create any assembly graphs - return Collections.emptyList(); - } - // create the graph for each possible kmer - for( int kmer = maxKmer; kmer >= minKmer; kmer -= GRAPH_KMER_STEP ) { - if ( debugGraphTransformations && kmer > onlyBuildKmersOfThisSizeWhenDebuggingGraphAlgorithms) - continue; - - if ( debug ) logger.info("Creating de Bruijn graph for " + kmer + " kmer using " + reads.size() + " reads"); - DeBruijnGraph graph = createGraphFromSequences(reads, kmer, refHaplotype, activeAlleleHaplotypes); - if( graph != null ) { // graphs that fail during creation ( for example, because there are cycles in the reference graph ) will show up here as a null graph object - // do a series of steps 
to clean up the raw assembly graph to make it analysis-ready - if ( debugGraphTransformations ) graph.printGraph(new File("unpruned.dot"), pruneFactor); - - if ( shouldErrorCorrectKmers() ) { - throw new UserException("Error correction no longer supported because of the " + - "incredibly naive way this was implemented. The command line argument remains because some" + - " future subsystem will actually go and error correct the reads"); - } - - results.add(cleanupSeqGraph(graph.convertToSequenceGraph())); - - if ( debugGraphTransformations ) // we only want to use one graph size - break; - } - } - - return results; - } - - @Requires({"reads != null", "kmerLength > 0", "refHaplotype != null"}) - protected DeBruijnGraph createGraphFromSequences( final List reads, final int kmerLength, final Haplotype refHaplotype, final List activeAlleleHaplotypes ) { - final DeBruijnGraph graph = new DeBruijnGraph(kmerLength); - final DeBruijnGraphBuilder builder = new DeBruijnGraphBuilder(graph); - - // First pull kmers from the reference haplotype and add them to the graph - if ( ! addReferenceKmersToGraph(builder, refHaplotype.getBases()) ) - // something went wrong, so abort right now with a null graph - return null; - - // add the artificial GGA haplotypes to the graph - if ( ! addGGAKmersToGraph(builder, activeAlleleHaplotypes) ) - // something went wrong, so abort right now with a null graph - return null; - - // now go through the graph already seeded with the reference sequence and add the read kmers to it - if ( ! 
addReadKmersToGraph(builder, reads) ) - // some problem was detected adding the reads to the graph, return null to indicate we failed - return null; - - graph.cleanNonRefPaths(); - return graph; - } - - /** - * Add the high-quality kmers from the artificial GGA haplotypes to the graph - * - * @param builder a debruijn graph builder to add the read kmers to - * @param activeAlleleHaplotypes a list of haplotypes to add to the graph for GGA mode - * @return true if we successfully added the read kmers to the graph without corrupting it in some way - */ - protected boolean addGGAKmersToGraph(final DeBruijnGraphBuilder builder, final List activeAlleleHaplotypes) { - - final int kmerLength = builder.getKmerSize(); - - for( final Haplotype haplotype : activeAlleleHaplotypes ) { - final int end = haplotype.length() - kmerLength; - for( int start = 0; start < end; start++ ) { - builder.addKmerPairFromSeqToGraph( haplotype.getBases(), start, GGA_MODE_ARTIFICIAL_COUNTS ); - } - } - - // always returns true now, but it's possible that we'd add kmers and decide we don't like the graph in some way - return true; - } - - /** - * Add the high-quality kmers from the reads to the graph - * - * @param builder a debruijn graph builder to add the read kmers to - * @param reads a non-null list of reads whose kmers we want to add to the graph - * @return true if we successfully added the read kmers to the graph without corrupting it in some way - */ - protected boolean addReadKmersToGraph(final DeBruijnGraphBuilder builder, final List reads) { - final int kmerLength = builder.getKmerSize(); - - // Next pull kmers out of every read and throw them on the graph - for( final GATKSAMRecord read : reads ) { - final byte[] sequence = read.getReadBases(); - final byte[] qualities = read.getBaseQualities(); - final int[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced - if ( sequence.length > kmerLength + KMER_OVERLAP ) { - int lastGood = -1; // the index 
of the last good base we've seen - for( int end = 0; end < sequence.length; end++ ) { - if ( qualities[end] < minBaseQualityToUseInAssembly ) { - lastGood = -1; // reset the last good base - } else if ( lastGood == -1 ) { - lastGood = end; // we're at a good base, the last good one is us - } else if ( end - kmerLength >= lastGood ) { - // end - kmerLength (the start) is after the lastGood base, so that kmer is good - final int start = end - kmerLength; - // how many observations of this kmer have we seen? A normal read counts for 1, but - // a reduced read might imply a higher multiplicity for our the edge - int countNumber = 1; - if ( read.isReducedRead() ) { - // compute mean number of reduced read counts in current kmer span - // precise rounding can make a difference with low consensus counts - // TODO -- optimization: should extend arrayMax function to take start stop values - countNumber = MathUtils.arrayMax(Arrays.copyOfRange(reducedReadCounts, start, end)); - } - - builder.addKmerPairFromSeqToGraph(sequence, start, countNumber); - } - } - } - } - - builder.flushKmersToGraph(false); - - // always returns true now, but it's possible that we'd add reads and decide we don't like the graph in some way - return true; - } - - /** - * Add the kmers from the reference sequence to the DeBruijnGraph - * - * @param builder the graph to add the reference kmers to. 
Must be empty - * @param refSequence the reference sequence from which we'll get our kmers - * @return true if we succeeded in creating a good graph from the reference sequence, false otherwise - */ - protected boolean addReferenceKmersToGraph(final DeBruijnGraphBuilder builder, final byte[] refSequence) { - if ( builder == null ) throw new IllegalArgumentException("graph cannot be null"); - if ( builder.getGraph().vertexSet().size() != 0 ) - throw new IllegalArgumentException("Reference sequences must be added before any other vertices, but got a graph with " + builder.getGraph().vertexSet().size() + " vertices in it already: " + builder.getGraph()); - if ( refSequence == null ) throw new IllegalArgumentException("refSequence cannot be null"); - - final int kmerLength = builder.getKmerSize(); - if( refSequence.length < kmerLength + KMER_OVERLAP ) { - // not enough reference sequence to build a kmer graph of this length, return null - return false; - } - - final int kmersInSequence = refSequence.length - kmerLength + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - builder.addKmerPairFromSeqToGraph(refSequence, iii, 1); - } - builder.flushKmersToGraph(true); - - // we expect that every kmer in the sequence is unique, so that the graph has exactly kmersInSequence vertices - if ( builder.getGraph().vertexSet().size() != kmersInSequence ) { - if( debug ) logger.info("Cycle detected in reference graph for kmer = " + kmerLength + " ...skipping"); - return false; - } - - return true; - } - - @Override - public String toString() { - return "DeBruijnAssembler{" + - "minKmer=" + minKmer + - '}'; - } -} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java new file mode 100644 index 000000000..be7305085 --- /dev/null +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlock.java @@ -0,0 +1,169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; + +import java.util.*; + +/** + * Represents an event block in the graph. + * + *

+ * An event block is defined as the non-trivial section of the haplotype-graph between two vertices along the + * reference route that has at least one alternative route between those two vertices. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class EventBlock { + + private final HaplotypeGraph graph; + + private final MultiDeBruijnVertex source; + + private final int sourcePosition; + + private final MultiDeBruijnVertex sink; + + private final int sinkPosition; + + private Set> routesAcross; + + + /** + * Constructs an event block given the base haplotype graph and the source and sink vertices (both included in the block) + * @param graph the base haplotype graph. + * @param source the starting vertex. + * @param sink the ending vertex. + * + * @throws NullPointerException if any of the inputs is {@code null}. + * @throws IllegalArgumentException if {@code source} or {@code sink} are not part of the graph's reference route, + * such a route does not exist, any of the vertices is not part of such a route or they are out of order. + */ + public EventBlock(final HaplotypeGraph graph, final MultiDeBruijnVertex source, final MultiDeBruijnVertex sink) { + if (graph == null) throw new NullPointerException("the graph cannot be null"); + if (source == null) throw new NullPointerException("the source vertex is null"); + if (sink == null) throw new NullPointerException("the sink node is null"); + this.graph = graph; + this.source = source; + this.sink = sink; + final HaplotypeRoute route = graph.getReferenceRoute(); + if (route == null) + throw new IllegalArgumentException("there is reference route in the graph"); + this.sourcePosition = route.getVertexPosition(source); + this.sinkPosition = route.getVertexPosition(sink); + if (sourcePosition == -1) + throw new IllegalArgumentException("the source vertex does not belong to the reference route"); + if (sinkPosition == -1) + throw new IllegalArgumentException("the sink vertex does not belong to the reference route"); + if (sourcePosition > sinkPosition) + throw new IllegalArgumentException("source and sink vertices are out of order in reference route"); + } + + /** + * Returns a reference
to the event block graph. + * + * @return never {@code null}. + */ + public HaplotypeGraph getGraph() { + return graph; + } + + /** + * Returns a reference to the block starting vertex. + * + * @return never {@code null}. + */ + public MultiDeBruijnVertex getSource() { + return source; + } + + /** + * Returns the reference to the end block vertex. + * + * @return never {@code null}. + */ + public MultiDeBruijnVertex getSink() { + return sink; + } + + /** + * Returns all possible routes between the event block start and end vertices. + * @return never {@code null}, an unmodifiable route set. + */ + public Set> getRoutesAcross() { + // caching: + if (routesAcross != null) return routesAcross; + + final Set> result = new HashSet<>(10); // 10 is rather generous. + + // breadth-first iterative search for all paths. + final Queue> queue = new LinkedList<>(); + + queue.add(new Route<>(source, graph)); // the seed is the empty route at the start vertex. + + final HaplotypeRoute referenceRoute = graph.getReferenceRoute(); + + while (!queue.isEmpty()) { + final Route route = queue.remove(); + final MultiDeBruijnVertex routeEndVertex = route.getLastVertex(); + + if (routeEndVertex == sink) // bingo!!! + result.add(route); + else { // only queue promising extension of this route.
+ final int routeEndPosition = referenceRoute.getVertexPosition(routeEndVertex); + if (routeEndPosition == -1 || (routeEndPosition >= sourcePosition && routeEndPosition < sinkPosition)) + for (final MultiSampleEdge e : graph.outgoingEdgesOf(routeEndVertex)) + queue.add(new Route<>(route, e)); + } + } + return routesAcross = Collections.unmodifiableSet(result); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java new file mode 100644 index 000000000..ca4985cef --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/EventBlockFinder.java @@ -0,0 +1,287 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.*; + +/** + * Encapsulates the graph traversals needed to find event-blocks. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class EventBlockFinder { + + private final HaplotypeGraph graph; + + private final Map,EventBlock> eventBlockCache; + + /** + * Constructs a new engine. + * + * @param graph the base haplotype graph to iterate over. + */ + public EventBlockFinder(final HaplotypeGraph graph) { + if (graph == null) throw new NullPointerException(); + this.graph = graph; + eventBlockCache = new HashMap<>(20); + } + + /** + * Create a new traversal object based on a read anchoring. + * @param anchoring + * @return never {@code null}. + */ + public Traversal traversal(final ReadAnchoring anchoring) { + if (anchoring == null) throw new NullPointerException(); + return new Traversal(anchoring); + } + + + public class Traversal implements Iterable { + + private final ReadAnchoring anchoring; + + private EventBlock lastEventBlock; + + + private Traversal(final ReadAnchoring anchoring) { + this.anchoring = anchoring; + lastEventBlock = findLastEventBlock(anchoring); + } + + @Override + public java.util.Iterator iterator() { + return lastEventBlock == null ? 
Collections.EMPTY_SET.iterator() : new Iterator(); + } + + private class Iterator implements java.util.Iterator { + + private MultiDeBruijnVertex currentVertex; + + private Iterator() { + currentVertex = anchoring.leftAnchorVertex; + } + + @Override + public boolean hasNext() { + return currentVertex != null; + } + + @Override + public EventBlock next() { + final EventBlock result; + if (currentVertex == null) + throw new NoSuchElementException("going beyond last event block"); + else if (currentVertex == lastEventBlock.getSource()) { + result = lastEventBlock; + currentVertex = null; + } else { + final EventBlock candidate = findEventBlock(anchoring,false,currentVertex,lastEventBlock.getSource()); + if (candidate == null) { + result = findEventBlock(anchoring,false,currentVertex,anchoring.rightAnchorVertex); + currentVertex = null; + } else { + result = candidate; + currentVertex = candidate.getSink(); + } + } + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + } + + /** + * Finds the last event block. + *

+ * The search always runs backwards, from the right anchor vertex towards the left anchor vertex. + *

+ * + * @param anchoring target read anchoring information. + * @return {@code null} if there is no event block between the left and right anchor vertices. + */ + private EventBlock findLastEventBlock( + final ReadAnchoring anchoring) { + return findEventBlock(anchoring,true,anchoring.leftAnchorVertex,anchoring.rightAnchorVertex); + } + + /** + * Finds an event block forward or backwards along the reference route. + * @param anchoring the read anchoring information. + * @param backwards true if the block should be constructed from right to left. + * @param leftVertex the left vertex + * @param rightVertex the right vertex + * @return {@code null} if there is no such event block between these coordinates. + */ + private EventBlock findEventBlock( + final ReadAnchoring anchoring, final boolean backwards, + final MultiDeBruijnVertex leftVertex, final MultiDeBruijnVertex rightVertex) { + + MultiDeBruijnVertex currentVertex = backwards ? rightVertex : leftVertex; + boolean foundEvent = false; + final CountSet pathSizes = new CountSet(10); // typically more than enough. + pathSizes.setTo(0); + + // Map between reference vertices where there is some expected open alternative path rejoining and the + // predicted length of paths rejoining at that point counting from the beginning of the block. + final Map expectedAlternativePathRejoins = new HashMap<>(4); + + // Keeps record of possible left-clipping vertices; those that are located before any event path furcation + // has been found. The value indicates the blockLength at the time we traverse that node. + final Deque> possibleClippingPoints = new LinkedList<>(); + + // We keep the distance from the beginning of the block (leftVertex, or rightVertex when going backwards). + int blockLength = 0; + while (currentVertex != null) { + int openingDegree = backwards ?
graph.outDegreeOf(currentVertex) : graph.inDegreeOf(currentVertex); + if (openingDegree > 1) { + final CountSet joiningPathLengths = expectedAlternativePathRejoins.remove(currentVertex); + if (joiningPathLengths != null) + pathSizes.addAll(joiningPathLengths); + } + final boolean isValidBlockEnd = isValidBlockEnd(anchoring, currentVertex, expectedAlternativePathRejoins); + if (foundEvent && isValidBlockEnd) // !gotcha we found a valid block end. + break; + else if (!foundEvent && isValidBlockEnd) // if no event has been found yet, still is a good clipping point. + possibleClippingPoints.addLast(new Pair<>(currentVertex, blockLength)); + + // We reached the end: + if (currentVertex == (backwards ? leftVertex : rightVertex)) + break; + + // process next vertices, the next one on the reference and also possible start of alternative paths, + // updates traversal structures accordingly. + currentVertex = advanceOnReferencePath(anchoring, backwards, currentVertex, pathSizes, expectedAlternativePathRejoins); + foundEvent |= expectedAlternativePathRejoins.size() > 0; + pathSizes.incAll(1); + blockLength++; + } + + // we have not found an event, thus there is no block to report: + if (!foundEvent) + return null; + + // We try to clip off as much as we can from the beginning of the block before any event, but at least + // leaving enough block length to meet the shortest path unless all paths have the same size (SNPs only) + final int maxClipping = pathSizes.size() <= 1 ? blockLength : pathSizes.min(); + MultiDeBruijnVertex clippingEnd = backwards ? anchoring.rightAnchorVertex : anchoring.leftAnchorVertex; + while (!possibleClippingPoints.isEmpty()) { + final Pair candidate = possibleClippingPoints.removeLast(); + if (candidate.getSecond() <= maxClipping) { + clippingEnd = candidate.getFirst(); + break; + } + } + + return resolveEventBlock(backwards ? 
new Pair<>(currentVertex, clippingEnd) : new Pair<>(clippingEnd, currentVertex)); + } + + /** + * Gets or constructs a event-block through the cache. + * @param borders the source and sink vertex pair for the requested event block. + * @return never {@code null} + */ + @Requires("borders != null && border.getFirst() != null && border.getSecond() != null") + private EventBlock resolveEventBlock(final Pair borders) { + EventBlock result = eventBlockCache.get(borders); + if (result == null) + eventBlockCache.put(borders,result = new EventBlock(graph, borders.getFirst(),borders.getSecond())); + return result; + } + + /** + * Move on vertex along the reference path checking for the presence of new opening alternative paths. + * + * @param anchoring anchoring information on the targeted read. + * @param backwards whether we are extending the block backwards or forwards. + * @param currentVertex the current vertex. + * @param pathSizes current block path sizes. + * @param expectedAlternativePathRejoins information about location of vertices along the reference path where open alternative paths will rejoin. + * @return the next current-vertex, never {@code null} unless there is a bug. + */ + private MultiDeBruijnVertex advanceOnReferencePath(final ReadAnchoring anchoring, final boolean backwards, final MultiDeBruijnVertex currentVertex, final CountSet pathSizes, final Map expectedAlternativePathRejoins) { + final Set nextEdges = backwards ? graph.incomingEdgesOf(currentVertex) : graph.outgoingEdgesOf(currentVertex); + MultiDeBruijnVertex nextReferenceVertex = null; + for (final MultiSampleEdge e : nextEdges) { + final MultiDeBruijnVertex nextVertex = backwards ? 
graph.getEdgeSource(e) : graph.getEdgeTarget(e); + if (e.isRef()) + nextReferenceVertex = nextVertex; + else { + final CountSet pathSizesPlusOne = pathSizes.clone(); + pathSizesPlusOne.incAll(1); + graph.calculateRejoins(nextVertex, expectedAlternativePathRejoins, anchoring.referenceWithinAnchorsMap.keySet(), pathSizesPlusOne, true, backwards); + } + } + return nextReferenceVertex; + } + + /** + * Check whether the current vertex is a valid block end. + * + * @param anchoring reads anchoring information necessary to make the evaluation. + * @param currentVertex target potential block end + * @param expectedAlternativePathRejoins traversal states regarding open alternative paths. + * + * @return {@code true} iff so. + */ + private boolean isValidBlockEnd(final ReadAnchoring anchoring, final MultiDeBruijnVertex currentVertex, final Map expectedAlternativePathRejoins) { + final boolean isUniqueKmer = anchoring.uniqueKmerOffsets.containsKey(currentVertex); + final boolean isAnchorable = graph.getAnchorableVertices().contains(currentVertex) && isUniqueKmer && expectedAlternativePathRejoins.size() == 0; + return isUniqueKmer && isAnchorable; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 04173b64f..697d162fd 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -56,7 +56,9 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.DefaultHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import 
org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -166,6 +168,8 @@ public class GenotypingEngine { // Walk along each position in the key set and create each event to be outputted final Set calledHaplotypes = new HashSet<>(); final List returnCalls = new ArrayList<>(); + final Map emptyDownSamplingMap = new DefaultHashMap<>(0.0); + for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region final List eventsAtThisLoc = getVCsAtThisLocation(haplotypes, loc, activeAllelesToGenotype); @@ -179,11 +183,12 @@ public class GenotypingEngine { final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false); if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { - throw new ReviewedStingException("Record size mismatch! 
Something went wrong in the merging of alleles."); + // this is possible in GGA mode when the same event is represented in multiple input records + throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. You will need to remove one of the records in order to proceed with your input file(s)."); } final Map mergeMap = new LinkedHashMap<>(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele @@ -197,13 +202,13 @@ public class GenotypingEngine { logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); } - final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION ); + final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? 
alleleReadMap : - convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) ); + convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); @@ -332,7 +337,7 @@ public class GenotypingEngine { for( final String sample : alleleReadMap.keySet() ) { final int numHaplotypes = mergedVC.getAlleles().size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); + final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -406,7 +411,7 @@ public class GenotypingEngine { // BUGBUG: ugh, too complicated protected Map convertHaplotypeReadMapToAlleleReadMap( final Map haplotypeReadMap, final Map> alleleMapper, - final double downsamplingFraction ) { + final Map perSampleDownsamplingFraction ) { final Map alleleReadMap = new LinkedHashMap<>(); for( final Map.Entry haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample @@ -423,7 +428,7 @@ public class GenotypingEngine { perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood); } } - perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling 
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java new file mode 100644 index 000000000..8a35ccb05 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -0,0 +1,158 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM; +import org.broadinstitute.sting.utils.pairhmm.FastLoglessPairHMM; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.io.FileWriter; +import java.io.PrintWriter; +import java.util.List; +import java.util.Map; + +/** + * Read likelihood calculation engine base on applying heuristic on the assembly graph. + */ +public class GraphBasedLikelihoodCalculationEngine implements LikelihoodCalculationEngine { + + private static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngine.class); + + /** + * Gap extension penalty in Phred scale. + */ + private byte gcpHMM; + + /** + * Fast-hmm implementation reused across active regions. + */ + private FlexibleHMM hmm; + + /** + * The worst reference vs best-alternative haplotype ratio for any read. The reference haplotype likelihood + * is changes to meet this maximum is needed. + */ + private double log10GlobalReadMismappingRate; + + /** + * How we resolve cases in where we have haplotypes coming from different kmer sizes. + */ + private HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution; + + private enum DebugMode { NONE, DEBUG, EXTRA_DEBUG }; + + private DebugMode debugMode; + + /** + * Creates a new likelihood engine. + * + * @param gapExtensionPenalty the gap extension penalty Phred scale. + * @param log10GlobalReadMismappingRate the global read mismapping rate. 
+ * @param heterogeneousKmerSizeResolution who to resolve assembly with haplotypes generated from different kmerSizes. + * @param debug whether to output some debug messages. + * @param debugHaplotypeGraphAndLikelihoods whether to generate haplotype graph and likelihood files, please only use with small intervals. + */ + public GraphBasedLikelihoodCalculationEngine(final int gapExtensionPenalty, final double log10GlobalReadMismappingRate, + final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution, + final boolean debug, final boolean debugHaplotypeGraphAndLikelihoods) { + gcpHMM = (byte) gapExtensionPenalty; + hmm = new FastLoglessPairHMM(gcpHMM); + this.log10GlobalReadMismappingRate = log10GlobalReadMismappingRate; + this.heterogeneousKmerSizeResolution = heterogeneousKmerSizeResolution; + debugMode = debugHaplotypeGraphAndLikelihoods ? DebugMode.EXTRA_DEBUG : debug ? DebugMode.DEBUG : DebugMode.NONE; + } + + + @Override + public Map computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList) { + final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine = + new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet, + hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution); + final List haplotypes = assemblyResultSet.getHaplotypeList(); + final List supportedHaplotypes = graphLikelihoodEngine.getHaplotypeList(); + if (supportedHaplotypes.size() != haplotypes.size()) logger.warn("Some haplotypes were drop due to missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size()); + final Map result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes, + perSampleReadList ); + if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result); + return result; + } + + /** + * A few debug messages associated with the GraphBased likelihoods engine. 
+ */ + private void graphLikelihoodDebugDumps(final ActiveRegion originalActiveRegion, final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine, + final Map result) { + if (graphLikelihoodEngine.hasCycles()) + logger.debug("Resulting haplotype graph combining several kmer sizes has cycles"); + else if (graphLikelihoodEngine.haplotypeGraph.hasNonReferenceEnds()) + logger.debug("Resulting haplotype graph has ends that do not belong to the reference: " + originalActiveRegion.getLocation()); + else if (!graphLikelihoodEngine.hasVariation()) + logger.debug("Resulting haplotype graph does not contain any alternative haplotype path"); + if (debugMode == DebugMode.EXTRA_DEBUG) { + graphLikelihoodEngine.printGraph(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeGraph.dot"); + final SeqGraph sq = graphLikelihoodEngine.haplotypeGraph.convertToSequenceGraph(); + sq.simplifyGraph(); + sq.printGraph(new File(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeSeqGraph.dot"), 10000); + try { + FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt")); + PrintWriter pw = new PrintWriter(fw); + //Note: we only output the first sample likelihoods, perhaps should output all of them but for debugging this is normally what is needed. 
+ pw.println(result.entrySet().iterator().next().getValue().toString()); + pw.close(); + fw.close(); + } catch (Exception ex) { + throw new StingException("", ex); + } + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java new file mode 100644 index 000000000..66ea7be03 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -0,0 +1,915 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Fast pseudo-likelihood calculation engine based on the assembly haplotype graph. + * + *

+ * An instance is good for one active region. {@link GraphBasedLikelihoodCalculationEngine} instantiates them on demand + * as requested by the {@code HaplotypeCaller} code. + *

+ */ +public class GraphBasedLikelihoodCalculationEngineInstance { + + private final static Logger logger = Logger.getLogger(GraphBasedLikelihoodCalculationEngineInstance.class); + + + /** + * Unified kmer size used for the Haplotype graph. + */ + protected final int kmerSize; + + /** + * Reference to the haplotype graph. + */ + protected final HaplotypeGraph haplotypeGraph; + + /** + * Haplotypes included in the haplotype graph. + */ + private final List haplotypes; + + /** + * Whether there is some variation present in the haplotype assembly. + */ + private final boolean hasVariation; + + + /** + * Count of reads that anchored somewhere. + * + *

Used for debugging purposes

+ */ + private int anchoredReads = 0; + + /** + * Count of reads that didn't anchor anywhere. + * + *

Used for debugging purposes

+ */ + private int nonAnchoredReads = 0; + + /** + * Pair-hmm implementation to use to calculate read likelihoods. + */ + private final FlexibleHMM hmm; + + /** + * Holds the log10 probability of passing from a indel to a match. + */ + private final double indelToMatchTransitionLog10Probability; + + /** + * Maximum likelihood difference between the reference haplotype and the best alternative haplotype. + * + *

If the difference is greater for a read, the reference haplotype likelihood is increased in order to not go + * beyond this limit

+ */ + protected final double log10globalReadMismappingRate; + + protected final EventBlockFinder eventBlockSearchEngine; + + + /** + * Constructs a new engine based on the results of the assembly. + * + * @param assemblyResultSet assembly-result set + * @param hmm fast-hmm implementation to use. + * @param log10globalReadMismappingRate maximum cost for the reference haplotype vs the best alternative available. + * @param heterogeneousKmerSizeResolution multi-kmersize dataset resolution. + * @throws NullPointerException if any argument is null. + * @throws IllegalArgumentException if log10globalReadMismappingRate >= 0. + */ + public GraphBasedLikelihoodCalculationEngineInstance(final AssemblyResultSet assemblyResultSet, final FlexibleHMM hmm, final double log10globalReadMismappingRate, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) { + if (heterogeneousKmerSizeResolution == null) throw new NullPointerException("the kmerSize resolution cannot be null"); + if (assemblyResultSet == null) throw new NullPointerException("the assembly result set cannot be null"); + if (hmm == null) throw new NullPointerException("the fast-hmm component cannot be null"); + if (log10globalReadMismappingRate >= 0) + throw new IllegalArgumentException("the global reading mismapping rate cannot be positive or zero"); + + this.hmm = hmm; + this.indelToMatchTransitionLog10Probability = QualityUtils.qualToProbLog10(hmm.getGapExtensionPenalty()); + this.log10globalReadMismappingRate = log10globalReadMismappingRate; + + haplotypes = new ArrayList<>(assemblyResultSet.getHaplotypeList()); + Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); + + // make sure that kmerSize is not bigger than the smallest haplotype. It can well happen when there are cycles and kmerSize inflates. 
+ final Haplotype referenceHaplotype = assemblyResultSet.getReferenceHaplotype(); + int minHaplotypeLength = referenceHaplotype.length(); + for (final Haplotype h : haplotypes) + if (minHaplotypeLength > h.length()) + minHaplotypeLength = h.length(); + + // Determine the kmerSize to use for the unified haplotype assembly graph + + kmerSize = Math.min(minHaplotypeLength, + heterogeneousKmerSizeResolution.useMaximum() ? assemblyResultSet.getMaximumKmerSize() : assemblyResultSet.getMinimumKmerSize()); + + haplotypeGraph = new HaplotypeGraph(kmerSize,haplotypes); + + + if (haplotypeGraph.hasCycles()) + Utils.warnUser(logger, "cycle caused at merging haplotypes with different kmerSizes: active region " + assemblyResultSet.getRegionForGenotyping() + " will be skipped"); + + //TODO haplpotypeGraph.getReferenceSourceVertex() == null + //TODO Is a quick patch to ignore cases where the trimming has rendered kmerSize so big that is bigger than the haplotype + //TODO and reduction to the minimum haplotype size result in no unique kmers. + //TODO the actual solution: we need to impose a maximum trimming at least for Graph-based HC runs as we are loosing + //TODO a bit of sensitivity as trimming results in lack of unique kmers. + if (haplotypeGraph.hasCycles() || haplotypeGraph.getReferenceHaplotype() == null) { + hasVariation = false; + eventBlockSearchEngine = null; + return; + } + + haplotypeGraph.mergeCommonChains(); + //TODO recover dangling ends. Did not work the last time I tried but may be worth to retry. + //haplotypeGraph.recoverDanglingTails(-1); + logger.debug("using haplotype graph with kmerSize " + haplotypeGraph.getKmerSize()); + + hasVariation = !haplotypeGraph.hasCycles() && haplotypeGraph.getHaplotypes().size() > 1; + + eventBlockSearchEngine = new EventBlockFinder(haplotypeGraph); + } + + /** + * Determines whether based on result from assembly and the relevant user options we can reuse th existing + * + * @param assemblyResultSet assembly result set. 
+ * @param kmerSize intended kmerSize for the haplotype graph. + * @param heterogeneousKmerSizeResolution user instruction as to how to resolve situation where we have haplotypes comming from different kmer sizes. + * @return {@code true} iff we can reuse an existing read-threading graph with that kmerSize in the assembly result set. + */ + @SuppressWarnings("unused") + private static boolean canReuseReadThreadingGraphAsHaplotypeGraph(final AssemblyResultSet assemblyResultSet, final int kmerSize, final HeterogeneousKmerSizeResolution heterogeneousKmerSizeResolution) { + return !assemblyResultSet.wasTrimmed() && (!assemblyResultSet.hasMultipleKmerSizes() || heterogeneousKmerSizeResolution.combinesKmerSizes()) && + assemblyResultSet.getUniqueReadThreadingGraph(kmerSize) != null; + } + + /** + * Checks whether the underlying haplotype graph assembly contains any variation worth analyzing. + * + * @return {@code true} iff so. + */ + public boolean hasVariation() { + return hasVariation; + } + + /** + * Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the + * active region assembly process. + * + * @param haplotypes to evaluate. + * @param perSampleReadList the input read sets stratified per sample. + * + * @throws NullPointerException if either parameter is {@code null}. + * + * @return never {@code null}, and with at least one entry for input sample (keys in {@code perSampleReadList}. + * The value maps can be potentially empty though. 
+ */ + public Map computeReadLikelihoods( + final List haplotypes, + final Map> perSampleReadList) { + // General preparation on the input haplotypes: + Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR); + final Map alleleVersions = new LinkedHashMap<>(haplotypes.size()); + for (final Haplotype haplotype : haplotypes) + alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference())); + + // The actual work: + final HashMap result = new HashMap<>(perSampleReadList.size()); + for (final Map.Entry> e : perSampleReadList.entrySet()) { + final String sample = e.getKey(); + final List reads = e.getValue(); + final Set mayNeedAdjustment = new HashSet<>(reads.size()); + // Get the cost/likelihood of each read at relevant subpaths on the tree: + final Map> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment); + // Create the resulting per-read maps: + final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions); + result.put(sample, prallm); + } + logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + ""); + return result; + } + + + /** + * Prints a graph into a dot file. + * + * @param fileName name of the output file. + */ + public void printGraph(final String fileName) { + if (haplotypeGraph != null) + haplotypeGraph.printGraph(fileName); + } + + /** + * Returns the kmerSize the engine is using to match read vs graph kmers thus reducing computation. + * + * @return greater than 0. + */ + public int getKmerSize() { + return kmerSize; + } + + /** + * Tells whether the underlying haplotype graph contained cycles. + * + * @return {@code true} iff so. + */ + public boolean hasCycles() { + // It is set to null if it contained cycles. + return haplotypeGraph == null; + } + + + /** + * Builds the result per-read allele likelihood map. + * + * @param haplotypes haplotypes to process. 
+ * @param costsEndingByVertex Read vs haplotype graph subpaths cost indexed by ending vertex. + * @param alleleVersions map between haplotypes and the corresponding allele. + * @return never {@code null} although perhaps empty. + */ + protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap( + final Collection haplotypes, + final Map> costsEndingByVertex, final Map alleleVersions) { + + final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); + if (haplotypeGraph == null) + return result; + final Map maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10); + final Set supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes()); + supportedHaplotypes.retainAll(haplotypes); + for (final Haplotype haplotype : supportedHaplotypes) + calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex); + + makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk); + applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk); + return result; + } + + /** + * Work done per haplotype to build the result per-read allele likelihood map. + *

+ *

+ * Basically for each haplotype we go through its path in the graph collecting all the read costs that we find + * on the way. For each read present we add up all its costs resulting in a single value per read, i.e. its + * "likelihood". + *

+ * + * @param haplotype the target haplotype + * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output. + * @param result target where to add the read-vs-haplotype likelihoods. + * @param maxAlleleLogLk where to place the maximum likelihood achieve on any haplotype for each read. + * @param costsEndingByVertex read costs assorted by their end vertex. + */ + private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype, + final Map alleleVersions, + final PerReadAlleleLikelihoodMap result, + final Map maxAlleleLogLk, + final Map> costsEndingByVertex) { + final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype); + final Set haplotypeVertices = haplotypeRoute.vertexSet(); + final Map readCostByRead = new HashMap<>(); + final Set visitedVertices = new HashSet<>(haplotypeVertices.size()); + final List edgeList = haplotypeRoute.getEdges(); + MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex(); + Route pathSoFar = new Route<>(currentVertex, haplotypeGraph); + final Iterator edgeIterator = edgeList.iterator(); + while (true) { + visitedVertices.add(currentVertex); + final Set finishingAtElementCostSet = costsEndingByVertex.get(currentVertex); + updateReadCosts(readCostByRead, visitedVertices, pathSoFar, finishingAtElementCostSet); + if (!edgeIterator.hasNext()) break; + final MultiSampleEdge nextEdge = edgeIterator.next(); + pathSoFar = new Route<>(pathSoFar, nextEdge); + currentVertex = pathSoFar.getLastVertex(); + } + + final List readCosts = new ArrayList<>(readCostByRead.values()); + Collections.sort(readCosts, ReadCost.COMPARATOR); + for (final ReadCost rc : readCosts) + result.add(rc.read, alleleVersions.get(haplotype), rc.getCost()); + + for (final ReadCost rc : readCosts) { + final Double currentMax = maxAlleleLogLk.get(rc.read); + if (currentMax == null || currentMax < rc.getCost()) + maxAlleleLogLk.put(rc.read, rc.getCost()); + } + } + + 
/** + * Updates the accumulated per-read costs with the read-segment costs in {@code finishingAtElementCostSet}; a segment cost is skipped when its path starts at a vertex not yet visited or when {@code pathSoFar.isSuffix(pc.path)} is false. + * + * @param readCosts collection of read costs so far; a new entry is created for a read seen for the first time. + * @param visitedVertices visited vertices collection. + * @param pathSoFar the haplotype path visited so far. + * @param finishingAtElementCostSet collection of path costs to process; may be {@code null}, in which case nothing is done. + */ + private void updateReadCosts(final Map readCosts, + final Set visitedVertices, + final Route pathSoFar, + final Set finishingAtElementCostSet) { + if (finishingAtElementCostSet != null) { + for (final ReadSegmentCost pc : finishingAtElementCostSet) { + if (!visitedVertices.contains(pc.path.getFirstVertex())) + continue; + if (!pathSoFar.isSuffix(pc.path)) + continue; + ReadCost rc = readCosts.get(pc.read); + if (rc == null) + readCosts.put(pc.read, rc = new ReadCost(pc.read,indelToMatchTransitionLog10Probability)); + rc.addCost(pc.getCost()); + } + } + } + + /** + * Likelihood penalty for unreported haplotype vs read likelihood with respect to the worst reported one. + */ + private static final int UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY = -3; + + /** + * Re-scales all haplotype vs read likelihoods so that, for each read, the best haplotype has likelihood 0. + * + * @param alleleVersions map between input haplotypes and output alleles. + * @param result where to change the likelihoods. + * @param mayNeedAdjustment set of reads that might need adjustment. Reads not in this set are left untouched. + * @param maxAlternative map from each read to the maximum alternative haplotype likelihood; set to 0.0 for every adjusted read. 
+ */ + @SuppressWarnings("unused") + private void makeLikelihoodAdjustment(final Map alleleVersions, + final PerReadAlleleLikelihoodMap result, + final Set mayNeedAdjustment, + final Map maxAlternative) { + final Map> map = result.getLikelihoodReadMap(); + + for (final GATKSAMRecord read : mayNeedAdjustment) { + final Map existingLikelihoods = map.get(read); + if (existingLikelihoods != null) { + Allele bestAllele = null; + double worstRelativeLikelihood = 0; + double bestRelativeLikelihood = Double.NEGATIVE_INFINITY; + for (final Map.Entry entry : map.get(read).entrySet()) { + final double candidateRelativeLikelihood = entry.getValue(); + if (candidateRelativeLikelihood > bestRelativeLikelihood) { + bestAllele = entry.getKey(); + bestRelativeLikelihood = candidateRelativeLikelihood; + } + if (!Double.isInfinite(candidateRelativeLikelihood) && worstRelativeLikelihood > candidateRelativeLikelihood) + worstRelativeLikelihood = candidateRelativeLikelihood; + } + + worstRelativeLikelihood += UNREPORTED_HAPLOTYPE_LIKELIHOOD_PENALTY; /* value used below for alleles with no likelihood recorded for this read */ + if (bestAllele == null) + throw new IllegalStateException("No best allele for read " + read.getReadName()); + final double bestLikelihood = 0.0; // the best becomes zero. + maxAlternative.put(read, bestLikelihood); + for (final Map.Entry entry : alleleVersions.entrySet()) { + final Allele a = entry.getValue(); + final Double relativeLikelihoodO = existingLikelihoods.get(a); + final double relativeLikelihood = relativeLikelihoodO == null ? 
worstRelativeLikelihood : relativeLikelihoodO; + final double likelihood = relativeLikelihood - bestRelativeLikelihood + bestLikelihood; + if (likelihood > 0) + throw new IllegalStateException("Likelihood larger than 1 with read " + read.getReadName()); + existingLikelihoods.put(a, likelihood); + } + } + } + } + + /** + * Makes sure that the reference allele likelihood is not too much smaller than the best alternative allele. 
+ * The justification of this constraint is explained in + * {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}. + * + * @param alleleVersions correspondence between input haplotypes and output alleles. + * @param result the target result map. + * @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele; reads without an entry are skipped. + */ + private void applyGlobalReadMismappingRate(final Map alleleVersions, + final PerReadAlleleLikelihoodMap result, + final Map maxAlleleLogLk) { + if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) { + final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype()); + for (final Map.Entry> entry : result.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = entry.getKey(); + final Map likelihoods = entry.getValue(); + final Double maxLogLk = maxAlleleLogLk.get(read); + if (maxLogLk == null) continue; /* no best-allele likelihood recorded for this read: skip it rather than unbox null below */ + final Double referenceLogLk = likelihoods.get(referenceAllele); + final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate; + if (referenceLogLk == null || referenceLogLk < minReferenceLogLk) + likelihoods.put(referenceAllele, minReferenceLogLk); + } + } + } + + /** + * Calculates path costs for a set of reads. + *

+ *

+ * The resulting map has one entry per read, where the read is the key and the value list of path-cost sets. + * Each element in that list corresponds to an event block. Each path cost in one of those sets indicate the + * likelihood (cost) of traversing a possible path across the event block using that read. + *

+ * + * @param reads reads to analyze. + * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment. + * @return never {@code null}. + */ + protected Map> calculatePathCostsByRead( + final List reads, final Set mayNeedAdjustment) { + final Map> result = new HashMap<>(reads.size()); + if (!hasVariation) + return Collections.emptyMap(); /* hasVariation is false: nothing to compute, an immutable empty map is returned */ + for (final GATKSAMRecord r : reads) { + calculatePathCostsByRead(r, mayNeedAdjustment, result); + } + return result; + } + + /** + * Calculates path costs for a single read. Reads that are not anchored anywhere fall back to the regular pair-hmm and are not added to {@code mayNeedAdjustment}. + * + * @param read target read. + * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment. + * @param result map where to add the result. + */ + private void calculatePathCostsByRead(final GATKSAMRecord read, final Set mayNeedAdjustment, + final Map> result) { + + final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph); + // cannot anchor so go the traditional pair-hmm way. + hmm.loadRead(read); + if (!anchoring.isAnchoredSomewhere()) { + defaultToRegularPairHMM(anchoring, result); + nonAnchoredReads++; + return; + } + + calculateReadSegmentCosts(anchoring, hmm, result); + + if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result); + mayNeedAdjustment.add(read); + anchoredReads++; + } + + /** + * Calculates read vs haplotype likelihoods using the classic PairHMM approach. + *

+ *

+ * It basically compares the read with each haplotype full path without short cuts. + *

+ * + * @param anchoring anchoring information of the read. + * @param destination where to leave the results indexed by ending veretex. + */ + private void defaultToRegularPairHMM(final ReadAnchoring anchoring, final Map> destination) { + + for (final Map.Entry entry : haplotypeGraph.getHaplotypeRouteMap().entrySet()) { + if (entry.getValue() == null) continue; /* skip haplotypes whose route entry is null */ + final byte[] haplotypeBases = entry.getKey().getBases(); + hmm.loadHaplotypeBases(haplotypeBases); + final double cost = hmm.calculateLocalLikelihood(0, anchoring.read.getReadLength(), 0, haplotypeBases.length, false); /* whole read against whole haplotype */ + final ReadSegmentCost readSegmentCost = new ReadSegmentCost(anchoring.read, entry.getValue(), cost); + addReadSegmentCost(destination, readSegmentCost); + } + } + + /** + * Adds a read-segment-cost to the map indexed by ending vertex, creating the per-vertex set on first use. + * @param destination where to add the read-segment-cost. + * @param cost the read-segment-cost to add; it is filed under the last vertex of its path. + */ + private void addReadSegmentCost(final Map> destination, final ReadSegmentCost cost) { + final MultiDeBruijnVertex endVertex = cost.path.getLastVertex(); + Set vpcSet = destination.get(endVertex); + if (vpcSet == null) + destination.put(endVertex, vpcSet = new HashSet<>(10)); + vpcSet.add(cost); + } + + /** + * Calculate the likelihood cost of path section of a read across the graph. + *

+ *

+ * Given a read, its anchors and other unique kmers mappable to the reference path, we can divide the graph + * into event blocks: a set of one or more variations and the possible paths across that block. + *

+ *

+ *

+ * The result value will have one element for each block. Each element is the set of all path costs (likelihoods) + * to traverse the block using all possible paths (different haplotypes). + *

+ *

+ *

+ * The current implementation has some added complexity in order to avoid a situation where the last part + * of the anchored section of the read is thrown out. We first determine the last event block boundaries and we + * make sure that we won't run over its left limit when covering for earlier event blocks. + *

+ * + * @param anchoring target read graph anchoring information. + * @param hmm the pair-hmm calculation engine. It must have been loaded with the same {@code read} already. + * @param destination where to add the costs. + */ + private void calculateReadSegmentCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) { + + final EventBlockFinder.Traversal traversal = eventBlockSearchEngine.traversal(anchoring); + + for (final EventBlock eventBlock : traversal) { + + // final Set> acrossBlockPaths = + // calculateAllPathsBetweenVertices(anchoring, + // eventBlock.getSource(), eventBlock.getSink());//eventBlock.getRoutesAcross(); + + final Set> acrossBlockPaths = eventBlock.getRoutesAcross(); + + int leftBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSource()); /* read offset recorded for the block's source vertex */ + int rightBlockBoundaryIndex = anchoring.uniqueKmerOffsets.get(eventBlock.getSink()); /* read offset recorded for the block's sink vertex */ + calculateCostForPathSet(anchoring.read, acrossBlockPaths, hmm, leftBlockBoundaryIndex, rightBlockBoundaryIndex, true, false, null, null, destination); + } + } + + /** + * Calculates path costs for a set of paths across an event block. + * + * @param read the target read. + * @param acrossBlockPaths event block paths to evaluate. + * @param hmm pair-hmm engine to use to calculate likelihoods. + * @param beforeBlockReadOffset kmer offset on the read for the vertex kmer before the block. + * @param afterBlockReadOffset kmer offset on the read for the vertex kmer after the block. + * @param doClipping whether to perform any clipping in order to save cpu time. + * @param prependVertex if not null, the end cost path will be prepended with this vertex. + * @param appendVertex if not null, the end cost path will be appended with this vertex. + * @param includePathEnds whether to include or exclude the vertices at the very end or beginning of the paths. 
+ */ + private void calculateCostForPathSet( + final GATKSAMRecord read, final Set> acrossBlockPaths, + final FlexibleHMM hmm, final int beforeBlockReadOffset, final int afterBlockReadOffset, + final boolean doClipping, final boolean includePathEnds, + final MultiDeBruijnVertex prependVertex, + final MultiDeBruijnVertex appendVertex, + final Map> destination) { + + + final Set readSegmentCosts = new TreeSet<>(ReadSegmentComparator.INSTANCE); + + final int readStart = beforeBlockReadOffset + kmerSize; + final int readEnd = Math.max(readStart, afterBlockReadOffset + kmerSize - 1); + final byte[][] pathBases = new byte[acrossBlockPaths.size()][]; + final CountSet pathSizes = new CountSet(acrossBlockPaths.size()); + int nextPath = 0; + + // Complete the read segment cost with the corresponding path bases + for (final Route p : acrossBlockPaths) { + final ReadSegmentCost readSegmentCost = new ReadSegmentCost(read, p, Double.NaN); + pathBases[nextPath++] = readSegmentCost.bases = eventBlockPathBases(p, includePathEnds); + pathSizes.add(readSegmentCost.bases.length); + readSegmentCosts.add(readSegmentCost); + } + + // Add the read segment length as one more 'path size'. + pathSizes.add(readEnd - readStart); + + final byte[] readBases = hmm.getReadBases(); + + // Perform right clipping of bases that are common to all paths and read. + int rightClipping = !doClipping ? 0 : calculateRightClipping(readEnd, pathBases, readBases,pathSizes); + + // Calculate the costs. 
+ for (final ReadSegmentCost readSegmentCost : readSegmentCosts) { + hmm.loadHaplotypeBases(readSegmentCost.bases); + readSegmentCost.setCost(hmm.calculateLocalLikelihood(Math.max(0, readStart), readEnd - rightClipping, 0, readSegmentCost.bases.length - rightClipping, false)); + if (prependVertex != null) + readSegmentCost.path = new Route<>(prependVertex,readSegmentCost.path); + if (appendVertex != null) + readSegmentCost.path = new Route<>(readSegmentCost.path,appendVertex); + addReadSegmentCost(destination,readSegmentCost); + } + + + } + + /** + * Determines how much we can clip away from the right side of a set of paths without losing accuracy when comparing + * likelihood vs the read. + * + * @param readEnd exclusive position right after the last one of the region considered. + * @param pathBases bases of possible path in the same event block. + * @param readBases full length read bases. + * @param pathSizes path size set. + * + * @return 0 or greater; always 0 when {@code pathSizes.size() > 1}. + */ + private int calculateRightClipping(final int readEnd, final byte[][] pathBases, + final byte[] readBases, final CountSet pathSizes) { + final int maxClipping = pathSizes.size() > 1 ? 0 : Math.min(pathSizes.min(), kmerSize - 1); /* never clips more than kmerSize - 1 bases */ + int rightClipping = 0; + while (rightClipping < maxClipping) { + final byte readBase = readBases[readEnd - rightClipping - 1]; + boolean dontGoFurther = false; + for (int i = 0; !dontGoFurther && i < pathBases.length; i++) + if (pathBases[i][pathBases[i].length - rightClipping - 1] != readBase) + dontGoFurther = true; + if (dontGoFurther) + break; + rightClipping++; + } + return rightClipping; + } + + /** + * Calculates a graph path bases. + *

+ *

+ * When the path starts on a source vertex, all its sequence is considered as part of the path bases. For regular + * start vertices only the suffix (last) base is considered. + *

+ * + * @param path the targeted path. + * @param includePathEnds whether the bases included in the first and last vertex of the path should be included or excluded. + * @return never {@code null} but perhaps a zero-length base array if the final requested path length is zero. + */ + //TODO this method could be moved to the Path class, but that requires considering how to make the API more concise. + private byte[] eventBlockPathBases(final Path path, + final boolean includePathEnds) { + // We first calculate the size of the return. + final List vertices = path.getVertices(); + final boolean pathStartsAtSource = haplotypeGraph.isSource(path.getFirstVertex()); + final int resultLength = includePathEnds + ? vertices.size() + (pathStartsAtSource ? path.getFirstVertex().getSequence().length - 1 : 0) + : vertices.size() - 2; + // Trivial empty return cases (zero or negative computed length): + if (resultLength <= 0) + return new byte[0]; + final byte[] result = new byte[resultLength]; + if (result.length == 0) { /* never true here: resultLength is greater than 0 at this point */ + return result; + } + // General return cases: + final ListIterator it = vertices.listIterator(includePathEnds ? 0 : 1); // skip the first vertex when path ends are excluded + for (int i = 0; i < resultLength; i++) { // i < resultLength implicitly skips the last vertex (exclusive). + final MultiDeBruijnVertex vertex = it.next(); + if (i == 0 && includePathEnds && pathStartsAtSource) { + System.arraycopy(vertex.getSequence(), 0, result, 0, kmerSize); + i = kmerSize - 1; + } else + result[i] = vertex.getSuffix(); + } + return result; + } + + /** + * Calculate the path cost of dangling ends. + *

+ *

+ * A dangling end is the section of the read that falls before the left anchor or after the right anchor. + *

+ * + * @param anchoring anchoring information of the read vs the haplotype assembly graph. + * @param hmm the PairHMM engine to use to calculate likelihoods. + * @param destination cost destination. + */ + private void danglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, final Map> destination) { + if (anchoring.leftAnchorIndex > 0 || anchoring.leftAnchorIndex == 0 + && anchoring.leftAnchorVertex.hasAmbiguousSequence()) + leftDanglingEndPathCosts(anchoring, hmm,destination); + + if (anchoring.rightAnchorIndex < anchoring.read.getReadLength() - kmerSize) + rightDanglingEndPathCosts(anchoring, hmm, destination); + } + + /** + * Generates all relevant right dangling end path costs. Nothing is added unless at least two haplotype routes extend forwards from the right anchor. + * + * @param anchoring the anchoring information for the read under analysis. + * @param hmm pair-hmm implementation to use to calculate likelihoods. It is assumed to be loaded with + * the same read as {@code anchoring} refers to. + * @param destination where to place the resulting read-segment-costs. + */ + private void rightDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, + final Map> destination) { + final int readStart = anchoring.rightAnchorIndex; + final int readEnd = anchoring.read.getReadLength() - kmerSize + 1; + final Set> haplotypeRoutes = + extendsHaplotypeRoutesForwards(anchoring.rightAnchorVertex); + if (haplotypeRoutes.size() >= 2) + calculateCostForPathSet(anchoring.read, + haplotypeRoutes, hmm, readStart, readEnd, false, true,anchoring.rightAnchorVertex,null,destination); + + } + + /** + * Generates all relevant left dangling end path costs. Nothing is added unless at least two haplotype routes extend backwards from the left anchor. + * + * @param anchoring the anchoring information for the read under analysis. + * @param hmm pair-hmm implementation to use to calculate likelihoods. 
It is assumed to be loaded with + * the same read as {@code anchoring} refers to. + * @param destination where to place the resulting read-segment-costs. 
+ */ + private void leftDanglingEndPathCosts(final ReadAnchoring anchoring, final FlexibleHMM hmm, + final Map> destination) { + final int readStart = -kmerSize; /* calculateCostForPathSet adds kmerSize to this offset, so the segment starts at read position 0 */ + final int readEnd = anchoring.leftAnchorIndex; + final Set> haplotypeRoutes = + extendsHaplotypeRoutesBackwards(anchoring.leftAnchorVertex); + if (haplotypeRoutes.size() >= 2) // if there is just one haplotype route there is no relevant variation in the dangling end. + calculateCostForPathSet(anchoring.read, haplotypeRoutes, hmm, + readStart, readEnd, false, true, null, anchoring.leftAnchorVertex, destination); + } + + /** + * Constructs haplotype route prefixes to an anchor vertex. + *

+ *

+ * The output should contain a route for each haplotype that includes the input anchor vertex. + * This route would be the prefix of the haplotype that finishes at that vertex. + *

+ * + * @param anchorVertex the target anchor vertex. + * @return never {@code null}. + */ + private Set> extendsHaplotypeRoutesBackwards( + final MultiDeBruijnVertex anchorVertex) { + final Set> result = new HashSet<>(haplotypes.size()); /* initial capacity hint only: the number of haplotypes */ + for (final MultiDeBruijnVertex parent : haplotypeGraph.incomingVerticesOf(anchorVertex)) /* one backward extension is seeded per incoming vertex of the anchor */ + extendsHaplotypeRoutesFrom(parent, result, false); + return result; + } + + /** + * Construct haplotype routes suffix from an anchor vertex. + *

+ *

+ * The output should contain a route for each haplotype that includes the input anchor vertex. + * This route would be the suffix of the haplotype that starts at that vertex. + *

+ * + * @param anchorVertex the target anchor vertex. + * @return never {@code null}. + */ + private Set> extendsHaplotypeRoutesForwards( + final MultiDeBruijnVertex anchorVertex) { + final Set> result = new HashSet<>(haplotypes.size()); /* initial capacity hint only: the number of haplotypes */ + for (final MultiDeBruijnVertex parent : haplotypeGraph.outgoingVerticesOf(anchorVertex)) /* despite its name, 'parent' iterates the outgoing vertices of the anchor here */ + extendsHaplotypeRoutesFrom(parent, result, true); + return result; + } + + /** + * Extends from a vertex considering path furcations that are part of some valid haplotype + *

+ *

+ * In other words, it will ignore subpaths that are not valid part of an assembled haplotype. + *

+ * + * @param start start seed vertex. + * @param result destination for found extensions. + * @param forward whether to traverse edges forward or backwards. + */ + private void extendsHaplotypeRoutesFrom(final MultiDeBruijnVertex start, final Set> result, final boolean forward) { + final Set validHaplotypeRoutes = haplotypeGraph.getEnclosingHaplotypeRoutes(start); + if (validHaplotypeRoutes.size() == 0) return; /* the seed vertex has no enclosing haplotype route: nothing to extend */ + final Deque, Set>> queue = new LinkedList<>(); + queue.add(new Pair<>(new Route<>(start, haplotypeGraph), validHaplotypeRoutes)); + while (!queue.isEmpty()) { /* open paths are processed first-in first-out */ + final Pair, Set> current = queue.remove(); + final Route path = current.getFirst(); + final MultiDeBruijnVertex vertex = forward ? path.getLastVertex() : path.getFirstVertex(); + final Set validRoutes = current.getSecond(); + for (final HaplotypeRoute hr : validRoutes) { + final MultiDeBruijnVertex routeEndVertex = forward ? hr.getLastVertex() : hr.getFirstVertex(); + if (vertex.equals(routeEndVertex)) { /* the path reached the end of one of its valid routes: report it once */ + result.add(path); + break; + } + } + final Set nextVertices = forward ? haplotypeGraph.outgoingVerticesOf(vertex) : + haplotypeGraph.incomingVerticesOf(vertex); + for (final MultiDeBruijnVertex candidate : nextVertices) { + extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex(forward, queue, path, validRoutes, candidate); + } + } + } + + /** + * Checks a candidate vertex to extend a path. + * + *

+ * This method updates the traversal queue accordingly. + *

+ * + * @param forward whether the extension is forward, or backwards. + * @param queue queue with open paths yet to be explored. + * @param path path extension to evaluate. + * @param validRoutes collection of valid haplotype routes used to discard non-informative extensions. + * @param candidate the candidate extending vertex. + */ + private void extendsHaplotypeRoutesFrom$ProcessCandidateExtendingVertex( + final boolean forward, + final Deque, Set>> queue, + final Route path, + final Set validRoutes, final MultiDeBruijnVertex candidate) { + final Set parentValidHaplotypes = haplotypeGraph.getEnclosingHaplotypeRoutes(candidate); + switch (parentValidHaplotypes.size()) { + case 0: /* the candidate has no enclosing haplotype route: the extension is dropped */ + return; + case 1: + if (validRoutes.containsAll(parentValidHaplotypes)) + queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes)); + else + return; + break; + default: + if (parentValidHaplotypes.size() == validRoutes.size() && parentValidHaplotypes.containsAll(validRoutes)) { + queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), parentValidHaplotypes)); + } else { /* keep only the routes present in both validRoutes and the candidate's routes */ + final Set newValidHaplotypeRoutes = new HashSet<>(validRoutes.size()); + for (final HaplotypeRoute hr : validRoutes) + if (parentValidHaplotypes.contains(hr)) + newValidHaplotypeRoutes.add(hr); + if (newValidHaplotypeRoutes.size() == 0) + return; + queue.add(new Pair<>(forward ? new Route<>(path, candidate) : new Route<>(candidate, path), newValidHaplotypeRoutes)); + } + } + } + + public List getHaplotypeList() { /* returns a new list built from haplotypeGraph.getHaplotypes() */ + return new ArrayList<>(haplotypeGraph.getHaplotypes()); + } + + /** + * Returns the haplotype graph associated with this instance. 
+ * @return never {@code null} + */ + public HaplotypeGraph getHaplotypeGraph() { + return haplotypeGraph; + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 18c93f2fb..82015d153 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.samtools.*; +import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsamplingUtils; import org.broadinstitute.sting.gatk.filters.BadMateFilter; @@ -70,13 +71,18 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import 
org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileState; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.gvcf.GVCFWriter; import org.broadinstitute.sting.utils.haplotype.*; @@ -86,6 +92,7 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; @@ -148,6 +155,17 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Output(doc="File to which variants should be written") protected VariantContextWriter vcfWriter = null; + @Hidden + @Advanced + @Argument(fullName="likelihoodCalculationEngine",shortName="likelihoodEngine", + doc="what likelihood calculation engine to use to calculate the relative likelihood of reads vs haplotypes",required=false) + protected LikelihoodCalculationEngine.Implementation likelihoodEngineImplementation = LikelihoodCalculationEngine.Implementation.PairHMM; + + @Hidden + @Advanced + 
@Argument(fullName="heterogeneousKmerSizeResolution",shortName="hksr",doc="how to solve heterogeneous kmer situations using the fast method",required=false) + protected HeterogeneousKmerSizeResolution heterogeneousKmerSizeResultion = HeterogeneousKmerSizeResolution.COMBO_MIN; + @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false, defaultToStdout = false) protected PrintStream graphWriter = null; @@ -194,6 +212,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + private double log10GlobalReadMismappingRate; + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } /** @@ -225,7 +245,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); + protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
@@ -236,22 +256,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); - // ----------------------------------------------------------------------------------------------- - // arguments to control internal behavior of the debruijn assembler - // ----------------------------------------------------------------------------------------------- - - @Advanced - @Argument(fullName="useDebruijnAssembler", shortName="useDebruijnAssembler", doc="If specified, we will use the old DeBruijn assembler. Depreciated as of 2.6", required = false) - protected boolean useDebruijnAssembler = false; - - @Advanced - @Argument(fullName="minKmerForDebruijnAssembler", shortName="minKmerForDebruijnAssembler", doc="Minimum kmer length to use in the debruijn assembly graph", required = false) - protected int minKmerForDebruijnAssembler = 11; - - @Advanced - @Argument(fullName="onlyUseKmerSizeForDebruijnAssembler", shortName="onlyUseKmerSizeForDebruijnAssembler", doc="If specified, we will only build kmer graphs with this kmer size in the debruijn", required = false) - protected int onlyUseKmerSizeForDebruijnAssembler = -1; - // ----------------------------------------------------------------------------------------------- // arguments to control internal behavior of the read threading assembler // ----------------------------------------------------------------------------------------------- @@ -268,23 +272,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) protected int numPruningSamples = 1; - /** - * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. 
Each haplotype - * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the - * run of the haplotype caller we only take maxPathsPerSample * nSample paths from the graph, in order of their - * weights, no matter how many paths are possible to generate from the graph. Putting this number too low - * will result in dropping true variation because paths that include the real variant are not even considered. - */ - @Advanced - @Argument(fullName="maxPathsPerSample", shortName="maxPathsPerSample", doc="Max number of paths to consider for the read threading assembler per sample.", required = false) - protected int maxPathsPerSample = 10; - - /** - * The minimum number of paths to advance forward for genotyping, regardless of the - * number of samples - */ - private final static int MIN_PATHS_PER_GRAPH = 128; - @Hidden @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) protected boolean dontRecoverDanglingTails = false; @@ -317,7 +304,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * B <= X < C * X >= C * - * The default bands give the following GQ blocks: + * The default bands with (1, 10, 20, 30, 40, 50) give the following GQ blocks: * * [0, 0] * (0, 10] @@ -331,7 +318,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="GVCFGQBands", shortName="GQB", doc="Emit experimental reference confidence scores", required = false) - protected List GVCFGQBands = Arrays.asList(1, 10, 20, 30, 40, 50); + protected List GVCFGQBands = Arrays.asList(5, 20, 60); /** * This parameter determines the maximum size of an indel considered as potentially segregating in the @@ -348,8 +335,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // general advanced arguments to control haplotype caller behavior // 
----------------------------------------------------------------------------------------------- + /** + * Users should be aware that this argument can really affect the results of the variant calling and should exercise caution. + * Using a prune factor of 1 (or below) will prevent any pruning from the graph which is generally not ideal; it can make the + * calling much slower and even less accurate (because it can prevent effective merging of "tails" in the graph). Higher values + * tend to make the calling much faster, but also lowers the sensitivity of the results (because it ultimately requires higher + * depth to produce calls). + */ @Advanced - @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) + @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with < X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 2; @Advanced @@ -391,9 +385,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="phredScaledGlobalReadMismappingRate", shortName="globalMAPQ", doc="The global assumed mismapping rate for reads", required = false) protected int phredScaledGlobalReadMismappingRate = 45; + /** + * Assembly graph can be quite complex, and could imply a very large number of possible haplotypes. Each haplotype + * considered requires N PairHMM evaluations if there are N reads across all samples. In order to control the + * run of the haplotype caller we only take maxNumHaplotypesInPopulation paths from the graph, in order of their + * weights, no matter how many paths are possible to generate from the graph. Putting this number too low + * will result in dropping true variation because paths that include the real variant are not even considered. 
+ */ @Advanced @Argument(fullName="maxNumHaplotypesInPopulation", shortName="maxNumHaplotypesInPopulation", doc="Maximum number of haplotypes to consider for your population. This number will probably need to be increased when calling organisms with high heterozygosity.", required = false) - protected int maxNumHaplotypesInPopulation = 25; + protected int maxNumHaplotypesInPopulation = 128; @Advanced @Argument(fullName="mergeVariantsViaLD", shortName="mergeVariantsViaLD", doc="If specified, we will merge variants together into block substitutions that are in strong local LD", required = false) @@ -453,6 +454,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="allowCyclesInKmerGraphToGeneratePaths", shortName="allowCyclesInKmerGraphToGeneratePaths", doc="If specified, we will allow cycles in the kmer graphs to generate paths with multiple copies of the path sequenece rather than just the shortest paths", required = false) protected boolean allowCyclesInKmerGraphToGeneratePaths = false; + @Hidden + @Argument(fullName="noFpga", shortName="noFpga", doc="If provided, disables the use of the FPGA HMM implementation", required = false) + protected boolean noFpga = false; + // Parameters to control read error correction @Hidden @Argument(fullName="errorCorrectReads", shortName="errorCorrectReads", doc = "Use an exploratory algorithm to error correct the kmers used during assembly. 
May cause fundamental problems with the assembly graph itself", required=false) @@ -466,6 +471,36 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="minObservationsForKmerToBeSolid", shortName="minObservationsForKmerToBeSolid", doc = "A k-mer must be seen at least these times for it considered to be solid", required=false) protected int minObservationsForKmerToBeSolid = 20; + /** + * the maximum extent into the full active region extension that we're willing to go in genotyping our events + */ + @Hidden + @Argument(fullName="maxDiscARExtension", shortName="maxDiscARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for discovery", required=false) + protected int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25; + + @Hidden + @Argument(fullName="maxGGAARExtension", shortName="maxGGAARExtension", doc = "the maximum extent into the full active region extension that we're willing to go in genotyping our events for GGA mode", required=false) + protected int MAX_GGA_ACTIVE_REGION_EXTENSION = 300; + + /** + * Include at least this many bases around an event for calling it + */ + @Hidden + @Argument(fullName="paddingAroundIndels", shortName="paddingAroundIndels", doc = "Include at least this many bases around an event for calling indels", required=false) + protected int PADDING_AROUND_OTHERS_FOR_CALLING = 150; + + @Hidden + @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) + protected int PADDING_AROUND_SNPS_FOR_CALLING = 20; + + /** + * Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base + * insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator). 
+ * VERY IMPORTANT: when using PCR-free sequencing data we definitely recommend setting this argument to NONE. + */ + @Advanced + @Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false) + public PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE; // ----------------------------------------------------------------------------------------------- // done with Haplotype caller parameters @@ -484,22 +519,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // the genotyping engine private GenotypingEngine genotypingEngine = null; - private VariantAnnotatorEngine annotationEngine = null; - // fasta reference reader to supplement the edges of the reference sequence - private CachingIndexedFastaSequenceFile referenceReader; + protected CachingIndexedFastaSequenceFile referenceReader; // reference base padding size private static final int REFERENCE_PADDING = 500; - // include at least this many bases around an event for calling it - private final static int PADDING_AROUND_SNPS_FOR_CALLING = 20; - private final static int PADDING_AROUND_OTHERS_FOR_CALLING = 150; - - // the maximum extent into the full active region extension that we're willing to go in genotyping our events - private final static int MAX_DISCOVERY_ACTIVE_REGION_EXTENSION = 25; - private final static int MAX_GGA_ACTIVE_REGION_EXTENSION = 100; - private ActiveRegionTrimmer trimmer = null; private final static int maxReadsInRegionPerSample = 1000; // TODO -- should be an argument @@ -519,6 +544,10 @@ public class HaplotypeCaller extends ActiveRegionWalker, In ReferenceConfidenceModel referenceConfidenceModel = null; + // as determined experimentally Nov-Dec 2013 + protected final static GATKVCFIndexType OPTIMAL_GVCF_INDEX_TYPE = GATKVCFIndexType.LINEAR; + protected final static int OPTIMAL_GVCF_INDEX_PARAMETER = 128000; + 
//--------------------------------------------------------------------------------------------------------------- // // initialize @@ -534,10 +563,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // get all of the unique sample names Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); - final int nSamples = samples.size(); // initialize the UnifiedGenotyper Engine which is used to call into the exact model final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user - // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine // TODO -- why is this? + // HC GGA mode depends critically on EMIT_ALL_SITES being set for the UG engine UAC.OutputMode = SCAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES : UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); @@ -549,22 +577,18 @@ public class HaplotypeCaller extends ActiveRegionWalker, In simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling simpleUAC.CONTAMINATION_FRACTION = 0.0; - simpleUAC.CONTAMINATION_FRACTION_FILE=null; + simpleUAC.CONTAMINATION_FRACTION_FILE = null; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, 
GATKVariantContextUtils.DEFAULT_PLOIDY); - // Currently, per-sample contamination level is only implemented for UG - if( UAC.CONTAMINATION_FRACTION_FILE !=null) { - throw new UserException("Per-Sample contamination level not supported in Haplotype Caller at this point"); + if( UAC.CONTAMINATION_FRACTION_FILE != null ) { + UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, UAC.CONTAMINATION_FRACTION, samples, logger)); } - // when we do implement per-sample contamination for HC, this will probably be needed. - // UAC.setSampleContamination(AlleleBiasedDownsamplingUtils.loadContaminationFile(UAC.CONTAMINATION_FRACTION_FILE, samples, logger)); - // initialize the output VCF header - annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - Set headerInfo = new HashSet(); + Set headerInfo = new HashSet<>(); // all annotation fields from VariantAnnotatorEngine headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); @@ -589,6 +613,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + // a kluge to enforce the use of this indexing strategy + if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || + getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { + throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); + } + try { vcfWriter = new GVCFWriter(vcfWriter, 
GVCFGQBands); } catch ( IllegalArgumentException e ) { @@ -607,10 +637,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // create and setup the assembler - final int maxAllowedPathsForReadThreadingAssembler = Math.max(maxPathsPerSample * nSamples, MIN_PATHS_PER_GRAPH); - assemblyEngine = useDebruijnAssembler - ? new DeBruijnAssembler(minKmerForDebruijnAssembler, onlyUseKmerSizeForDebruijnAssembler) - : new ReadThreadingAssembler(maxAllowedPathsForReadThreadingAssembler, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); + assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, kmerSizes, dontIncreaseKmerSizesForCycles, numPruningSamples); assemblyEngine.setErrorCorrectKmers(errorCorrectKmers); assemblyEngine.setPruneFactor(MIN_PRUNE_FACTOR); @@ -626,7 +653,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if ( phredScaledGlobalReadMismappingRate < 0 ) phredScaledGlobalReadMismappingRate = -1; // configure the global mismapping rate - final double log10GlobalReadMismappingRate; if ( phredScaledGlobalReadMismappingRate < 0 ) { log10GlobalReadMismappingRate = - Double.MAX_VALUE; } else { @@ -635,7 +661,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // create our likelihood calculation engine - likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate ); + likelihoodCalculationEngine = createLikelihoodCalculationEngine(); final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); @@ -653,6 +679,26 @@ public class HaplotypeCaller extends ActiveRegionWalker, In getToolkit().getGenomeLocParser()); } + /** + * Instantiates the appropriate likelihood calculation engine. + * + * @return never {@code null}. 
+ */ + private LikelihoodCalculationEngine createLikelihoodCalculationEngine() { + switch (likelihoodEngineImplementation) { + case PairHMM: + return new PairHMMLikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel ); + case GraphBased: + return new GraphBasedLikelihoodCalculationEngine( (byte)gcpHMM,log10GlobalReadMismappingRate,heterogeneousKmerSizeResultion,DEBUG,debugGraphTransformations); + case Random: + return new RandomLikelihoodCalculationEngine(); + default: + //Note: we do not include in the error message list as it is of no grand public interest. + throw new UserException("Unsupported likelihood calculation engine '" + likelihoodCalculationEngine + + "'. Please use one of the following instead: 'PairHMM' and 'GraphBased'."); + } + } + //--------------------------------------------------------------------------------------------------------------- // // isActive @@ -750,7 +796,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } // run the local assembler, getting back a collection of information on how we should proceed - final AssemblyResult assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); + final AssemblyResultSet assemblyResult = assembleReads(originalActiveRegion, activeAllelesToGenotype); + final ActiveRegion regionForGenotyping = assemblyResult.getRegionForGenotyping(); // abort early if something is out of the acceptable range if( ! 
assemblyResult.isVariationPresent() ) { @@ -760,17 +807,26 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if (dontGenotype) return NO_CALLS; // user requested we not proceed // filter out reads from genotyping which fail mapping quality based criteria - final Collection filteredReads = filterNonPassingReads( assemblyResult.regionForGenotyping ); + final Collection filteredReads = filterNonPassingReads( regionForGenotyping ); final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); - if( assemblyResult.regionForGenotyping.size() == 0 ) { + if( regionForGenotyping.size() == 0 ) { // no reads remain after filtering so nothing else to do! return referenceModelForNoVariation(originalActiveRegion, false); } // evaluate each sample's reads against all haplotypes //logger.info("Computing read likelihoods with " + assemblyResult.regionForGenotyping.size() + " reads"); - final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( assemblyResult.haplotypes, splitReadsBySample( assemblyResult.regionForGenotyping.getReads() ) ); + final List haplotypes = assemblyResult.getHaplotypeList(); + final Map> reads = splitReadsBySample( regionForGenotyping.getReads() ); + + // Calculate the likelihoods: CPU intesive part. + final Map stratifiedReadMap = + likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); + + + + // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there // was a bad interaction between that selection and the marginalization that happens over each event when computing @@ -779,12 +835,12 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // in the genotyping, but we lose information if we select down to a few haplotypes. 
[EB] final GenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( UG_engine, - assemblyResult.haplotypes, + haplotypes, stratifiedReadMap, perSampleFilteredReadList, - assemblyResult.fullReferenceWithPadding, - assemblyResult.paddedReferenceLoc, - assemblyResult.regionForGenotyping.getLocation(), + assemblyResult.getFullReferenceWithPadding(), + assemblyResult.getPaddedReferenceLoc(), + regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), metaDataTracker, activeAllelesToGenotype ); @@ -792,9 +848,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // TODO -- must disable if we are doing NCT, or set the output type of ! presorted if ( bamWriter != null ) { haplotypeBAMWriter.writeReadsAlignedToHaplotypes( - assemblyResult.haplotypes, - assemblyResult.paddedReferenceLoc, - assemblyResult.haplotypes, + haplotypes, + assemblyResult.getPaddedReferenceLoc(), + haplotypes, calledHaplotypes.getCalledHaplotypes(), stratifiedReadMap); } @@ -802,50 +858,18 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } if ( emitReferenceConfidence() ) { - return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getRefHaplotype(), - calledHaplotypes.getCalledHaplotypes(), assemblyResult.paddedReferenceLoc, assemblyResult.regionForGenotyping, - stratifiedReadMap, calledHaplotypes.getCalls()); + if ( calledHaplotypes.getCalls().isEmpty() ) { + // no called all of the potential haplotypes + return referenceModelForNoVariation(originalActiveRegion, false); + } else + return referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), + calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, + stratifiedReadMap, calledHaplotypes.getCalls()); } else { return calledHaplotypes.getCalls(); } } - private final static 
class AssemblyResult { - final List haplotypes; - final ActiveRegion regionForGenotyping; - final byte[] fullReferenceWithPadding; - final GenomeLoc paddedReferenceLoc; - final boolean variationPresent; - final Haplotype refHaplotype; - - private AssemblyResult(List haplotypes, ActiveRegion regionForGenotyping, byte[] fullReferenceWithPadding, GenomeLoc paddedReferenceLoc, boolean variationPresent) { - this.haplotypes = haplotypes; - this.regionForGenotyping = regionForGenotyping; - this.fullReferenceWithPadding = fullReferenceWithPadding; - this.paddedReferenceLoc = paddedReferenceLoc; - this.variationPresent = variationPresent; - - Haplotype firstRefHaplotype = null; - for ( final Haplotype h : haplotypes ) { - if ( h.isReference() ) { - if ( firstRefHaplotype != null ) throw new IllegalArgumentException("Found two haplotypes marked as reference " + firstRefHaplotype + " and " + h); - firstRefHaplotype = h; - } - } - - if ( firstRefHaplotype == null ) throw new IllegalArgumentException("Couldn't find a reference haplotype in " + haplotypes); - this.refHaplotype = firstRefHaplotype; - } - - public Haplotype getRefHaplotype() { - return refHaplotype; - } - - public boolean isVariationPresent() { - return variationPresent && haplotypes.size() > 1; - } - } - /** * High-level function that runs the assembler on the active region reads, * returning a data structure with the resulting information needed @@ -855,9 +879,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) * @return the AssemblyResult describing how to proceed with genotyping */ - protected AssemblyResult assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { + protected AssemblyResultSet assembleReads(final ActiveRegion activeRegion, final List activeAllelesToGenotype) { // Create the reference haplotype which is the bases from the reference that make up the active region 
- finalizeActiveRegion(activeRegion); // merge overlapping fragments, clip adapter and low qual tails + finalizeActiveRegion(activeRegion); // handle overlapping fragments, clip adapter and low qual tails final byte[] fullReferenceWithPadding = activeRegion.getActiveRegionReference(referenceReader, REFERENCE_PADDING); final GenomeLoc paddedReferenceLoc = getPaddedLoc(activeRegion); @@ -866,17 +890,23 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // Create ReadErrorCorrector object if requested - will be used within assembly engine. ReadErrorCorrector readErrorCorrector = null; if (errorCorrectReads) - readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG,fullReferenceWithPadding); + readErrorCorrector = new ReadErrorCorrector(kmerLengthForReadErrorCorrection, MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION, minObservationsForKmerToBeSolid, DEBUG, fullReferenceWithPadding); try { - final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); - if ( ! emitReferenceConfidence() && ! dontTrimActiveRegions ) { - return trimActiveRegion(activeRegion, haplotypes, activeAllelesToGenotype, fullReferenceWithPadding, paddedReferenceLoc); - } else { - // we don't want to trim active regions, so go ahead and use the old one - return new AssemblyResult(haplotypes, activeRegion, fullReferenceWithPadding, paddedReferenceLoc, true); - } - } catch ( Exception e ) { + final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, paddedReferenceLoc, activeAllelesToGenotype,readErrorCorrector ); + assemblyResultSet.debugDump(logger); + + if ( ! 
dontTrimActiveRegions ) { + final ActiveRegion trimmedActiveRegion = trimActiveRegion(assemblyResultSet,activeAllelesToGenotype); + if (trimmedActiveRegion != null) + return trimAssemblyResultSet(assemblyResultSet, trimmedActiveRegion); + else { + assemblyResultSet.resetVariationPresent(); + return assemblyResultSet; + } + } else + return assemblyResultSet; + } catch ( final Exception e ) { // Capture any exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { final SAMFileWriter writer = ReadUtils.createSAMFileWriterWithCompression(getToolkit().getSAMFileHeader(), true, "assemblyFailure.bam", 5); @@ -946,73 +976,89 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return map; } - /** - * Trim down the active region to just enough to properly genotype the events among the haplotypes - * - * @param originalActiveRegion our full active region - * @param haplotypes the list of haplotypes we've created from assembly - * @param activeAllelesToGenotype additional alleles we might need to genotype (can be empty) - * @param fullReferenceWithPadding the reference bases over the full padded location - * @param paddedReferenceLoc the span of the reference bases - * @return an AssemblyResult containing the trimmed active region with all of the reads we should use - * trimmed down as well, and a revised set of haplotypes. If trimming down the active region results - * in only the reference haplotype over the non-extended active region, returns null. 
- */ - private AssemblyResult trimActiveRegion(final ActiveRegion originalActiveRegion, - final List haplotypes, - final List activeAllelesToGenotype, - final byte[] fullReferenceWithPadding, - final GenomeLoc paddedReferenceLoc) { - if ( DEBUG ) logger.info("Trimming active region " + originalActiveRegion + " with " + haplotypes.size() + " haplotypes"); - - EventMap.buildEventMapsForHaplotypes(haplotypes, fullReferenceWithPadding, paddedReferenceLoc, DEBUG); - final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypes); + private ActiveRegion trimActiveRegion(final AssemblyResultSet resultSet, final Collection activeAllelesToGenotype) { + if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes"); + final List haplotypeList = resultSet.getHaplotypeList(); + final ActiveRegion originalGenotypingRegion = resultSet.getRegionForGenotyping(); + EventMap.buildEventMapsForHaplotypes(haplotypeList, resultSet.getFullReferenceWithPadding(), resultSet.getPaddedReferenceLoc(), DEBUG); + final TreeSet allVariantsWithinFullActiveRegion = EventMap.getAllVariantContexts(haplotypeList); allVariantsWithinFullActiveRegion.addAll(activeAllelesToGenotype); - final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalActiveRegion, allVariantsWithinFullActiveRegion); + final ActiveRegion trimmedActiveRegion = trimmer.trimRegion(originalGenotypingRegion, allVariantsWithinFullActiveRegion,false); if ( trimmedActiveRegion == null ) { // there were no variants found within the active region itself, so just return null if ( DEBUG ) logger.info("No variation found within the active region, skipping the region :-)"); - return new AssemblyResult(haplotypes, originalActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, false); + return null; } - // trim down the haplotypes - final Set haplotypeSet = new HashSet<>(haplotypes.size()); - for ( final Haplotype h : 
haplotypes ) { - final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); - if ( trimmed != null ) { - haplotypeSet.add(trimmed); - } else if ( DEBUG ) { - logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + " because it starts with or ends with an insertion or deletion when trimmed to " + trimmedActiveRegion.getExtendedLoc()); - } - } - - // create the final list of trimmed haplotypes - final List trimmedHaplotypes = new ArrayList<>(haplotypeSet); - - // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM - Collections.sort( trimmedHaplotypes, new HaplotypeBaseComparator() ); - - if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + haplotypes.size() + " to only " + trimmedHaplotypes.size()); - if ( DEBUG ) { - for ( final Haplotype remaining: trimmedHaplotypes ) { - logger.info(" Remains: " + remaining + " cigar " + remaining.getCigar()); - } - } - - // trim down the reads and add them to the trimmed active region - final List trimmedReads = new ArrayList<>(originalActiveRegion.getReads().size()); - for( final GATKSAMRecord read : originalActiveRegion.getReads() ) { - final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); - if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { + final List trimmedReads = new ArrayList<>(originalGenotypingRegion.getReads().size()); + for( final GATKSAMRecord read : originalGenotypingRegion.getReads() ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipToRegion( read, + trimmedActiveRegion.getExtendedLoc().getStart(), trimmedActiveRegion.getExtendedLoc().getStop() ); + if( trimmedActiveRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) 
trimmedReads.add(clippedRead); - } } trimmedActiveRegion.clearReads(); trimmedActiveRegion.addAll(ReadUtils.sortReadsByCoordinate(trimmedReads)); - return new AssemblyResult(trimmedHaplotypes, trimmedActiveRegion, fullReferenceWithPadding, paddedReferenceLoc, true); + return trimmedActiveRegion; + } + + + /** + * Trims an assembly result set according to the active-region trimming. + * + * @param resultSet the original assembly result set. + * @param trimmedActiveRegion the trimmed active region to trim to. + * @return the trimmed assembly result set. + */ + private AssemblyResultSet trimAssemblyResultSet(final AssemblyResultSet resultSet, final ActiveRegion trimmedActiveRegion) { + if ( DEBUG ) logger.info("Trimming active region " + resultSet.getRegionForGenotyping() + " with " + resultSet.getHaplotypeCount() + " haplotypes"); + + final List haplotypeList = resultSet.getHaplotypeList(); + + // trim down the haplotypes + final Map originalByTrimmedHaplotypes = new HashMap<>(); + + for ( final Haplotype h : haplotypeList ) { + final Haplotype trimmed = h.trim(trimmedActiveRegion.getExtendedLoc()); + + if ( trimmed != null ) { + if (originalByTrimmedHaplotypes.containsKey(trimmed)) { + if (trimmed.isReference()) { + originalByTrimmedHaplotypes.remove(trimmed); + originalByTrimmedHaplotypes.put(trimmed, h); + } + } else + originalByTrimmedHaplotypes.put(trimmed,h); + } else if (h.isReference()) + throw new IllegalStateException("trimming eliminates the reference haplotype"); + else if ( DEBUG ) { + logger.info("Throwing out haplotype " + h + " with cigar " + h.getCigar() + + " because it starts with or ends with an insertion or deletion when trimmed to " + + trimmedActiveRegion.getExtendedLoc()); + } + } + + // create the final list of trimmed haplotypes + final List trimmedHaplotypes = new ArrayList<>(originalByTrimmedHaplotypes.keySet()); + + // re-sort the trimmed haplotypes. 
+ Collections.sort(trimmedHaplotypes,new HaplotypeSizeAndBaseComparator()); + final Map sortedOriginalByTrimmedHaplotypes = new LinkedHashMap<>(trimmedHaplotypes.size()); + for (final Haplotype trimmed : trimmedHaplotypes) + sortedOriginalByTrimmedHaplotypes.put(trimmed,originalByTrimmedHaplotypes.get(trimmed)); + + + if ( DEBUG ) logger.info("Trimmed region to " + trimmedActiveRegion.getLocation() + " size " + + trimmedActiveRegion.getLocation().size() + " reduced number of haplotypes from " + + haplotypeList.size() + " to only " + trimmedHaplotypes.size()); + if ( DEBUG ) + for ( final Haplotype remaining: trimmedHaplotypes ) + logger.info("Remains: " + remaining + " cigar " + remaining.getCigar()); + + return resultSet.trimTo(trimmedActiveRegion,sortedOriginalByTrimmedHaplotypes); } //--------------------------------------------------------------------------------------------------------------- @@ -1038,7 +1084,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In public void onTraversalDone(Integer result) { if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) ((GVCFWriter)vcfWriter).close(false); // GROSS -- engine forces us to close our own VCF writer since we wrapped it referenceConfidenceModel.close(); - likelihoodCalculationEngine.close(); + //TODO remove the need to call close here for debugging, the likelihood output stream should be managed + //TODO (open & close) at the walker, not the engine. 
+ //likelihoodCalculationEngine.close(); logger.info("Ran local assembly on " + result + " active regions"); } @@ -1049,6 +1097,8 @@ public class HaplotypeCaller extends ActiveRegionWalker, In //--------------------------------------------------------------------------------------------------------------- private void finalizeActiveRegion( final ActiveRegion activeRegion ) { + if (activeRegion.isFinalized()) return; + if( DEBUG ) { logger.info("Assembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } // Loop through the reads hard clipping the adaptor and low quality tails @@ -1084,11 +1134,19 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } + // TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code. + + final List downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart); + + // handle overlapping read pairs from the same fragment + cleanOverlappingReadPairs(downsampledReads); + activeRegion.clearReads(); - activeRegion.addAll(DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart)); + activeRegion.addAll(downsampledReads); + activeRegion.setFinalized(true); } - private Set filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private Set filterNonPassingReads( final ActiveRegion activeRegion ) { final Set readsToRemove = new LinkedHashSet<>(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { if( rec.getReadLength() < MIN_READ_LENGTH || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { @@ -1099,7 +1157,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return 
readsToRemove; } - private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { + private GenomeLoc getPaddedLoc( final ActiveRegion activeRegion ) { final int padLeft = Math.max(activeRegion.getExtendedLoc().getStart()-REFERENCE_PADDING, 1); final int padRight = Math.min(activeRegion.getExtendedLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getExtendedLoc().getContig()).getSequenceLength()); return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getExtendedLoc().getContig(), padLeft, padRight); @@ -1125,7 +1183,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In return returnMap; } - /** * Are we emitting a reference confidence in some form, or not? * @return true if we are @@ -1133,4 +1190,17 @@ public class HaplotypeCaller extends ActiveRegionWalker, In private boolean emitReferenceConfidence(){ return emitReferenceConfidence != ReferenceConfidenceMode.NONE; } -} \ No newline at end of file + + /** + * Clean up reads/bases that overlap within read pairs + * + * @param reads the list of reads to consider + */ + private void cleanOverlappingReadPairs(final List reads) { + for ( final List perSampleReadList : splitReadsBySample(reads).values() ) { + final FragmentCollection fragmentCollection = FragmentUtils.create(perSampleReadList); + for ( final List overlappingPair : fragmentCollection.getOverlappingPairs() ) + FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java new file mode 100644 index 000000000..5887864e3 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeRoute.java @@ -0,0 +1,129 @@ +/* +* By downloading the PROGRAM you agree to the following terms 
of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; + +import java.util.*; + +/** + * Graph route that represents a haplotype on the haplotype assembly graph. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class HaplotypeRoute extends Route { + + protected final Set vertexSet; + + protected final Map vertexOrder; + + protected final Set forkAndJoins; + + /** + * Constructs a HaplotypeRoute given its route. + * + * @param route the haplotype route. 
+ */ + public HaplotypeRoute(final Route route) { + super(route); + vertexOrder = new LinkedHashMap<>(route.length() + 1); + int nextOrder = 0; + vertexOrder.put(getFirstVertex(),nextOrder++); + for (final MultiSampleEdge edge : edgesInOrder) + vertexOrder.put(graph.getEdgeTarget(edge), nextOrder++); + Route currentRoute = this; + forkAndJoins = new HashSet<>(route.length()); + while (currentRoute != null) { + if (currentRoute.lastVertexIsForkOrJoin()) + forkAndJoins.add(currentRoute.getLastVertex()); + currentRoute = currentRoute.getPrefixRouteWithLastVertexThatIsForkOrJoin(); + } + vertexSet = Collections.unmodifiableSet(new HashSet<>(vertexOrder.keySet())); + } + + + + @SuppressWarnings("unused") + public Route subRoute(final MultiDeBruijnVertex start, final MultiDeBruijnVertex end) { + final Integer startOrder = vertexOrder.get(start); + final Integer endOrder = vertexOrder.get(end); + if (startOrder == null || endOrder == null) + return null; + else if (startOrder > endOrder) + return null; + else { + Route result = new Route<>(start,graph); + for (final MultiSampleEdge edge : edgesInOrder.subList(startOrder,endOrder)) + result = new Route(result,edge); + return result; + } + } + + /** + * Returns the set of vertices on the route. + * @return read only, never {@code null} vertex set. + */ + public Set vertexSet() { + return vertexSet; + } + + + /** + * Returns the position of the vertex in the route. + * + * @param vertex the query vertex. + * + * @throws NullPointerException if {@code vertex} is {@code null}. + * + * @return -1 if there is no such vertex in the route, otherwise a number between 0 and {@link #length()} - 1. + */ + public int getVertexPosition(final MultiDeBruijnVertex vertex) { + final Integer result = vertexOrder.get(vertex); + return result == null ? 
-1 : result; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java similarity index 68% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java index 0f66082c6..b56d942ae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnGraphBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HeterogeneousKmerSizeResolution.java @@ -46,105 +46,65 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; - /** - * Fast approach to building a DeBruijnGraph - * - * Follows the model: - * - * for each X that has bases for the final graph: - * addKmer pair (single kmer with kmer size + 1 spanning the pair) - * - * flushKmersToGraph - * - * User: depristo - * Date: 4/7/13 - * Time: 4:14 PM + * How to resolve the haplotype graph when haplotypes were generated from a mixture of different kmerSizes. */ -public class DeBruijnGraphBuilder { - /** The size of the kmer graph we want to build */ - private final int kmerSize; - - /** The graph we're going to add kmers to */ - private final DeBruijnGraph graph; - - /** keeps counts of all kmer pairs added since the last flush */ - private final KMerCounter counter; +public enum HeterogeneousKmerSizeResolution { /** - * Create a new builder that will write out kmers to graph + * Combine haplotypes using a haplotype graph with the largest kmerSize amongst the ones that generated some haplotype. 
+ */ + COMBO_MAX, + + /** + * Combine haplotypes using a haplotype graph with the smallest kmerSize amongst the ones that generated some haplotype. + */ + COMBO_MIN, + + /** + * Take just the haplotypes from the largest kmerSize that generated any. + */ + MAX_ONLY, + + /** + * Take just the haplotypes from the smallest kmerSize that generated any. + */ + @SuppressWarnings("unused") + MIN_ONLY; + + /** + * Indicates whether we should use the maximum kmerSize for the haplotypeGraph or not. * - * @param graph a non-null graph that can contain already added kmers + * @return true if we need to use the maximum, false otherwise. */ - public DeBruijnGraphBuilder(final DeBruijnGraph graph) { - if ( graph == null ) throw new IllegalArgumentException("Graph cannot be null"); - this.kmerSize = graph.getKmerSize(); - this.graph = graph; - this.counter = new KMerCounter(kmerSize + 1); - } - - /** - * The graph we're building - * @return a non-null graph - */ - public DeBruijnGraph getGraph() { - return graph; - } - - /** - * The kmer size of our graph - * @return positive integer - */ - public int getKmerSize() { - return kmerSize; - } - - /** - * Higher-level interface to #addKmersToGraph that adds a pair of kmers from a larger sequence of bytes to this - * graph. The kmers start at start (first) and start + 1 (second) have have length getKmerSize(). 
The - * edge between them is added with isRef and multiplicity - * - * @param sequence a sequence of bases from which we want to extract a pair of kmers - * @param start the start of the first kmer in sequence, must be between 0 and sequence.length - 2 - getKmerSize() - * @param multiplicity what's the multiplicity of the edge between these two kmers - */ - public void addKmerPairFromSeqToGraph( final byte[] sequence, final int start, final int multiplicity ) { - if ( sequence == null ) throw new IllegalArgumentException("Sequence cannot be null"); - if ( start < 0 ) throw new IllegalArgumentException("start must be >= 0 but got " + start); - if ( start + 1 + getKmerSize() > sequence.length ) throw new IllegalArgumentException("start " + start + " is too big given kmerSize " + getKmerSize() + " and sequence length " + sequence.length); - final Kmer kmerPair = new Kmer(sequence, start, getKmerSize() + 1); - addKmerPair(kmerPair, multiplicity); - } - - /** - * Add a single kmer pair to this builder - * @param kmerPair a kmer pair is a single kmer that has kmerSize + 1 bp, where 0 -> kmersize and 1 -> kmersize + 1 - * will have an edge added to this - * @param multiplicity the desired multiplicity of this edge - */ - public void addKmerPair(final Kmer kmerPair, final int multiplicity) { - if ( kmerPair.length() != kmerSize + 1 ) throw new IllegalArgumentException("kmer pair must be of length kmerSize + 1 = " + kmerSize + 1 + " but got " + kmerPair.length()); - counter.addKmer(kmerPair, multiplicity); - } - - /** - * Flushes the currently added kmers to the graph - * - * After this function is called the builder is reset to an empty state - * - * This flushing is expensive, so many kmers should be added to the builder before flushing. 
The most - * efficient workflow is to add all of the kmers of a particular class (all ref bases, or all read bases) - * then and do one flush when completed - * - * @param addRefEdges should the kmers present in the builder be added to the graph with isRef = true for the edges? - */ - public void flushKmersToGraph(final boolean addRefEdges) { - for ( final KMerCounter.CountedKmer countedKmer : counter.getCountedKmers() ) { - final byte[] first = countedKmer.getKmer().subKmer(0, kmerSize).bases(); - final byte[] second = countedKmer.getKmer().subKmer(1, kmerSize).bases(); - graph.addKmersToGraph(first, second, addRefEdges, countedKmer.getCount()); + public boolean useMaximum() { + switch (this) { + case COMBO_MAX: return true; + case MAX_ONLY: return true; + default: return false; } - counter.clear(); + } + + /** + * Indicates whether we should use the minimum kmerSize for the haplotypeGraph or not. + * + * @return true if we need to use the minimum, false otherwise. + */ + @SuppressWarnings("unused") + public boolean useMinimum() { + return ! useMaximum(); + } + + /** + * Tell whether this policy combines kmer-sizes or not. + * @return true iff it does. 
+ */ + public boolean combinesKmerSizes() { + switch (this) { + case COMBO_MAX: return true; + case COMBO_MIN: return true; + default: return false; + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java index 2e757722b..056f7991f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Kmer.java @@ -49,8 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Requires; import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; /** * Fast wrapper for byte[] kmers @@ -68,7 +66,7 @@ import java.util.Map; */ public class Kmer { // this values may be updated in the course of interacting with this kmer - private byte[] bases; + protected byte[] bases; protected int start; // two constants @@ -126,6 +124,16 @@ public class Kmer { this.hash = kmer.hash; } + public Kmer(final Kmer kmer, final byte nextChar) { + final byte[] sequence = new byte[kmer.length]; + System.arraycopy(kmer.bases,kmer.start + 1,sequence,0,kmer.length - 1); + sequence[kmer.length - 1] = nextChar; + bases = sequence; + start = 0; + length = kmer.length; + hash = myHashCode(bases,start,length); + } + /** * Create a derived shallow kmer that starts at newStart and has newLength bases * @param newStart the new start of kmer, where 0 means that start of the kmer, 1 means skip the first base @@ -144,6 +152,7 @@ public class Kmer { * @return a non-null byte[] containing length() bases of this kmer, regardless of how this kmer was created */ public byte[] bases() { + if ( start != 0 || bases.length != length ) { // update operation. 
Rip out the exact byte[] and update start so we don't ever do this again bases = Arrays.copyOfRange(bases, start, start + length); @@ -153,6 +162,44 @@ public class Kmer { return bases; } + + /** + * Copies kmer bytes into a byte array. + * + * @param start first position of the kmer to copy + * @param dest what array to copy into + * @param offset what position the first byte to copy should go into the destination array. + * @param length how many bytes to copy + * + * @throws IllegalArgumentException if start is negative or combined with length it goes + * beyond the end of the kmer. Also if length is negative. + * @throws NullPointerException if dest is null + * @throws ArrayIndexOutOfBoundsException if dest does not have capacity to received the data. + */ + public void copyTo(final int start, final byte[] dest, final int offset, final int length) { + if (start + length > this.length) { + throw new IllegalArgumentException("request goes beyond end of kmer"); + } + if (length < 0) { + throw new IllegalArgumentException("requested length cannot be negative"); + } + System.arraycopy(bases,this.start + start,dest,offset,length); + } + + /** + * Copies kmer bytes into a byte array. + * + * @param dest what array to copy into + * @param offset what position the first byte to copy should go into the destination array. + * + * @throws IllegalArgumentException if start is negative or combined with length it goes + * beyond the end of the kmer. Also if length is negative. + * @throws NullPointerException if dest is null + */ + public void copyTo(final byte[] dest, final int offset) { + System.arraycopy(bases,start,dest,offset,length); + } + /** * Backdoor method for fast base peeking: avoids copying like bases() and doesn't modify internal state. 
* Intended to be used for fast computation of neighboring kmers @@ -219,13 +266,13 @@ public class Kmer { @Override public String toString() { - return "Kmer{" + new String(bases()) + "}"; + return "Kmer{" + new String(bases,start,length) + "}"; } @Override public boolean equals(Object o) { if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (o == null || !Kmer.class.isAssignableFrom(o.getClass())) return false; final Kmer kmer = (Kmer) o; @@ -264,4 +311,23 @@ public class Kmer { return result; } + + public byte base(final int i) { + return bases[start + i]; + } + + public Kmer shift(final byte nextChar) { + if (bases.length > start + length && bases[start + length] == nextChar) { + return new Kmer(bases,start + 1,length); + } else { + final byte[] newBases = new byte[length]; + System.arraycopy(bases,start + 1,newBases,0,length - 1); + newBases[length - 1] = nextChar; + return new Kmer(newBases,0,length); + } + } + + public byte lastBase() { + return bases[start + length - 1]; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java new file mode 100644 index 000000000..a6c35bce0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequence.java @@ -0,0 +1,461 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + + +import com.sun.istack.internal.NotNull; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.lang.reflect.Array; +import java.util.*; + +/** + * Represent a sequence of kmers where any two consecutive kmers overlap in kmer length - 1 elements. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class KmerSequence implements List { + private final byte[] sequence; + private final int start; + private final int size; + private final int kmerSize; + private final int rawLength; + + /** + * Creates a kmer sequence from a read's sequence. + * + * @param read the read to represent as a sequence of kmers. + * @param kmerSize the kmer size. + */ + public KmerSequence(final SAMRecord read, final int kmerSize) { + this(read.getReadBases(), kmerSize); + } + + /** + * Creates a kmer sequence from a haplotype's sequence. + * + * @param hap the haplotype to represent as a sequence of kmers. + * @param kmerSize the kmer size. + */ + public KmerSequence(final Haplotype hap, final int kmerSize) { + this(hap.getBases(), kmerSize); + } + + /** + * Creates a kmer sequence out of a byte sequence. 
+ * + * @param sequence the byte array to represent as a kmer sequence. + * @param kmerSize the kmer size. + */ + public KmerSequence(final byte[] sequence, final int kmerSize) { + this(sequence,0,Math.max(0,sequence.length - kmerSize + 1),kmerSize, sequence.length); + } + + + /** + * Creates a kmer sequence out of a range of a byte array + * + * @param sequence the input array. + * @param start inclusive first position of the array that maps to the first position in the first kmer. + * @param size number kmers in the output. + * @param kmerSize kmer length in bases. + * @param rawLength the of the range in bases. + */ + protected KmerSequence(final byte[] sequence, final int start, final int size, final int kmerSize, final int rawLength) { + if (sequence == null) { + throw new IllegalArgumentException("start must be 0 or greater"); + } + if (rawLength > sequence.length - start) { + throw new IllegalArgumentException("the raw sequence length goes beyond the array capacity"); + } + if (size < 0) { + throw new IllegalArgumentException("the length cannot be negative"); + } + if (start < 0) { + throw new IllegalArgumentException("start must be 0 or greater"); + } + if (size > 0 && size + kmerSize - 1 > rawLength) { + throw new IllegalArgumentException( + String.format("the kmerSize (%d) + size (%d) - 1 cannot be larger than rawLength (%d)",kmerSize,size,rawLength) ); + } + this.sequence = sequence; + this.start = start; + this.size = size; + this.kmerSize = kmerSize; + this.rawLength = rawLength; + } + + public int kmerSize() { + return kmerSize; + } + + public KmerSequence subsequence(final int from, final int to) { + if (from < 0 || from > to) { + throw new IllegalArgumentException(); + } + if (to > size) { + throw new IllegalArgumentException(); + } + return new KmerSequence(sequence,this.start + from,to - from,kmerSize,rawLength - from - (size - to)); + } + + + @Override + public int size() { + return size; + } + + @Override + public boolean isEmpty() { + return 
size == 0; + } + + @Override + public boolean contains(final Object o) { + if (o instanceof Kmer) { + if (o instanceof MyKmer) { + final MyKmer k = (MyKmer) o; + if (k.bases == sequence && k.start >= start && k.length == kmerSize && k.start < start + size) { + return true; + } + } + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return false; + } + for (int i = 0; i < size; i++) { + int j; + for (j = 0; j < kmerSize; j++) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == kmerSize) { + return true; + } + } + return false; + } else { + return false; + } + } + + @Override + @NotNull + public Iterator iterator() { + return new Iterator() { + + private int offset = 0; + + @Override + public boolean hasNext() { + return offset < size; + } + + @Override + public Kmer next() { + return new Kmer(sequence,start + offset,kmerSize); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @NotNull + @Override + public Object[] toArray() { + return toArray(new Kmer[size()]); + } + + @Override + @NotNull + @SuppressWarnings("unchecked") + public T[] toArray(@NotNull final T[] a) { + if (a == null) { + throw new IllegalArgumentException(); + } else if (!a.getClass().getComponentType().isAssignableFrom(Kmer.class)) { + throw new IllegalArgumentException(); + } else { + T[] result; + if (a.length < size) { + result = (T[]) Array.newInstance(a.getClass().getComponentType(), size); + } else { + result = a; + } + for (int i = 0; i < size; i++) { + result[i] = (T) new Kmer(sequence,start + i,kmerSize); + } + return result; + } + } + + @Override + public boolean add(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean remove(final Object o) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean containsAll(final Collection c) { + for (final Object o : c) + if (!contains(o)) + return false; + return true; + } + + @Override 
+ public boolean addAll(final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean addAll(final int index, @NotNull final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean removeAll(@NotNull final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public boolean retainAll(@NotNull final Collection c) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + throw new UnsupportedOperationException(); + } + + @Override + public Kmer get(final int index) { + if (index < 0 || index >= size) { + throw new IllegalArgumentException(); + } + return new Kmer(sequence,start + index,kmerSize); + } + + @Override + public Kmer set(final int index, final Kmer element) { + throw new UnsupportedOperationException(); + } + + @Override + public void add(final int index, final Kmer element) { + throw new UnsupportedOperationException(); + } + + @Override + public Kmer remove(final int index) { + throw new UnsupportedOperationException(); + } + + @Override + public int indexOf(final Object o) { + if (o instanceof Kmer) { + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return -1; + } + for (int i = 0; i < size; i++) { + int j; + for (j = 0; j < kmerSize; j++) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == kmerSize) { + return i; + } + } + return -1; + } else { + return -1; + } + } + + @Override + public int lastIndexOf(final Object o) { + if (o instanceof Kmer) { + final Kmer k = (Kmer) o; + if (k.length != kmerSize) { + return -1; + } + for (int i = size - 1; i >= 0; i--) { + int j; + for (j = kmerSize - 1; j >= 0; j--) { + if (sequence[start + i + j] != k.bases[k.start + j]) { + break; + } + } + if (j == 0) { + return i; + } + } + return -1; + } else { + return -1; + } + } + + @Override + @NotNull + public ListIterator listIterator() { + return new MyListIterator(0); + } + + @Override 
+ @NotNull + public ListIterator listIterator(final int index) { + return new MyListIterator(index); + } + + @Override + @NotNull + public List subList(final int fromIndex, final int toIndex) { + return subsequence(fromIndex,toIndex); + } + + /** + * Returns the byte array representation of the kmer sequence. + * @return never {@code null}. + */ + @NotNull + public byte[] getBytes() { + if (start == 0 && rawLength == sequence.length) + return sequence; + else + return Arrays.copyOfRange(sequence, start, rawLength + start); + } + + /** + * Internal class that implements the {@link Kmer} more efficiently + * making reference to the sequence's own byte array. + */ + protected class MyKmer extends Kmer { + + /** + * Create a new instance give the offset in the byte array. + * @param start the start base offset for the kmer. + */ + public MyKmer(final int start) { + super(sequence,start,kmerSize); + } + } + + /** + * Iterator implementation of Kmer elements. + */ + private class MyListIterator implements ListIterator { + + private int i = 0; + + /** + * Creates a iterator at certain offset in the sequence. + * @param idx the start position or kmer offset. 
+ */ + private MyListIterator(final int idx) { + i = idx; + } + + @Override + public boolean hasNext() { + return i < size; + } + + @Override + public Kmer next() { + return new Kmer(sequence,start + i++,kmerSize); + } + + @Override + public boolean hasPrevious() { + return i > 0; + } + + @Override + public Kmer previous() { + return new Kmer(sequence,start + --i,kmerSize); + } + + @Override + public int nextIndex() { + return i; + } + + @Override + public int previousIndex() { + return i - 1; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public void set(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + @Override + public void add(final Kmer kmer) { + throw new UnsupportedOperationException(); + } + + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java new file mode 100644 index 000000000..d71f99116 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KmerSequenceGraphMap.java @@ -0,0 +1,209 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.BaseEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.BaseVertex; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.KmerSearchableGraph; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Collections; +import java.util.Arrays; +import java.util.ArrayList; + +/** + * Contains information as to how a kmer sequence maps to an (assembly) graph. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.com> + */ +public class KmerSequenceGraphMap { + + protected final KmerSequence sequence; + protected final KmerSearchableGraph graph; + protected final int kmerSize; + + private List vertexList; + + private List vertexMatchOnlyList; + + private Set vertexSet; + + private Map vertexOffset; + + //private List> vertexSegmentList; + + /** + * Constructs a new Kmer sequence graph map give the graph and sequence. + * + * @param g the graph to map to. + * @param s the sequence to map. + * @throws NullPointerException if either the graph or the sequence is null. 
+ * @throws IllegalArgumentException if the kmer sizes of the input graph and sequence are not the same. + */ + public KmerSequenceGraphMap(final KmerSearchableGraph g, final KmerSequence s) { + if (s.kmerSize() != g.getKmerSize()) { + throw new IllegalArgumentException("kmer size for the graph (" + g.getKmerSize() + ") and the sequence (" + s.kmerSize() + ") are different"); + } + sequence = s; + graph = g; + kmerSize = s.kmerSize(); + } + + /** + * Vertices that form part of the kmer sequence path along the graph. + * + *

The ith position in the resulting list corresponds to the ith kmer in the sequence

. + * + *

+ * The resulting list will contain null values for those kmers where there is no unique kmer match in the + * graph. + *

+ * + * @return never {@code null} + */ + public List vertexList() { + if (vertexList == null) + buildVertexCollections(); + return vertexList; + } + + /** + * Vertices that form part of the kmer sequence path along the graph. + * + *

Only contains unique kmer vertices where the non-unique ones have been sliced out from the list

+ * + * @return never {@code null} + */ + public List vertexMatchOnlyList() { + if (vertexMatchOnlyList == null) { + buildVertexCollections(); + } + return vertexMatchOnlyList; + } + + + /** + * Return a map from vertices to their kmer offset in the kmer sequence. + * @return never {@code null} + */ + public Map vertexOffset() { + if (vertexOffset == null) { + buildVertexCollections(); + } + return vertexOffset; + } + + /** + * Set of all vertices with unique kmers in the kmer sequence. + *

+ * This structure is more appropriate to query whether or not a vertex belongs to such a set. + *

+ * @return never {@code null}. + */ + public Set vertexSet() { + if (vertexSet == null) { + buildVertexCollections(); + } + return vertexSet; + } + + /** + * Updates vertex structures. + */ + protected void buildVertexCollections() { + @SuppressWarnings("unchecked") + final V[] result = (V[]) new BaseVertex[sequence.size()]; + final Set set = new HashSet<>(sequence.size()); + final Map posMap = new HashMap<>(sequence.size()); + @SuppressWarnings("unchecked") + final V[] matchOnly = (V[]) new BaseVertex[sequence.size()]; + int next = 0; + int matchOnlyNext = 0; + for (int i = 0; i < sequence.size(); i++) { + final Kmer k = sequence.get(i); + final V v = graph.findKmer(k); + if (v != null) { + set.add(v); + posMap.put(v,i); + matchOnly[matchOnlyNext++] = v; + } + result[next++] = v; + } + vertexList = Arrays.asList(result); + vertexMatchOnlyList = Arrays.asList(Arrays.copyOf(matchOnly,matchOnlyNext)); + vertexSet = Collections.unmodifiableSet(set); + vertexOffset = Collections.unmodifiableMap(posMap); + } + + /** + * Returns the list of kmers in the sequence that do not have a unique mapping on the graph. 
+ * @return never {@code null} + */ + @SuppressWarnings("unused") + public List missingKmers() { + if (vertexList == null) { + buildVertexCollections(); + } + if (vertexList.size() == vertexMatchOnlyList.size()) { + return Collections.emptyList(); + } else { + final List result = new ArrayList<>(vertexList.size() - vertexMatchOnlyList.size()); + final int size = sequence.size(); + for (int i = 0; i < vertexList.size(); i++) { + if (vertexList.get(i) == null) { + result.add(sequence.get(i)); + } + } + return result; + } + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 4a1a5993a..0626f2268 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -46,412 +46,47 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.haplotype.HaplotypeScoreComparator; -import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; -import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; -import org.broadinstitute.sting.utils.pairhmm.PairHMM; import 
org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.*; +import java.util.List; +import java.util.Map; -public class LikelihoodCalculationEngine { - private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); +/** + * Common interface for assembly-haplotype vs reads likelihood engines. + */ +public interface LikelihoodCalculationEngine { - private final byte constantGCP; - private final double log10globalReadMismappingRate; - private final boolean DEBUG; + enum Implementation { + /** + * Classic full pair-hmm all haplotypes vs all reads. + */ + PairHMM, - private final PairHMM.HMM_IMPLEMENTATION hmmType; + /** + * Graph-base likelihoods. + */ + GraphBased, - private final ThreadLocal pairHMM = new ThreadLocal() { - @Override - protected PairHMM initialValue() { - switch (hmmType) { - case EXACT: return new Log10PairHMM(true); - case ORIGINAL: return new Log10PairHMM(false); - case LOGLESS_CACHING: return new LoglessPairHMM(); - default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING."); - } - } - }; - - private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; - private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; - private final PrintStream likelihoodsStream; - - /** - * The expected rate of random sequencing errors for a read originating from its true haplotype. - * - * For example, if this is 0.01, then we'd expect 1 error per 100 bp. 
- */ - private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02; - - /** - * Create a new LikelihoodCalculationEngine using provided parameters and hmm to do its calculations - * - * @param constantGCP the gap continuation penalty to use with the PairHMM - * @param debug should we emit debugging information during the calculation? - * @param hmmType the type of the HMM to use - * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of - * -3 means that the chance that a read doesn't actually belong at this - * location in the genome is 1 in 1000. The effect of this parameter is - * to cap the maximum likelihood difference between the reference haplotype - * and the best alternative haplotype by -3 log units. So if the best - * haplotype is at -10 and this parameter has a value of -3 then even if the - * reference haplotype gets a score of -100 from the pairhmm it will be - * assigned a likelihood of -13. - */ - public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) { - this.hmmType = hmmType; - this.constantGCP = constantGCP; - this.DEBUG = debug; - this.log10globalReadMismappingRate = log10globalReadMismappingRate; - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - try { - likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME))); - } catch ( FileNotFoundException e ) { - throw new RuntimeException(e); - } - } else { - likelihoodsStream = null; - } + /** + * Random likelihoods, used to establish a baseline benchmark for other meaningful implementations. 
+ */ + Random } - public LikelihoodCalculationEngine() { - this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3); - } - - public void close() { - if ( likelihoodsStream != null ) likelihoodsStream.close(); - } - - /** - * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate + * Calculates the likelihood of reads across many samples evaluated against haplotypes resulting from the + * active region assembly process. * - * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples - * against the set of haplotypes + * @param assemblyResultSet the input assembly results. + * @param perSampleReadList the input read sets stratified per sample. * - * @param haplotypes a non-null list of haplotypes - * @param perSampleReadList a mapping from sample -> reads + * @throws NullPointerException if either parameter is {@code null}. + * + * @return never {@code null}, and with at least one entry for each input sample (keys in {@code perSampleReadList}). + * The value maps can be potentially empty though. 
*/ - private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { - int X_METRIC_LENGTH = 0; - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - for( final GATKSAMRecord read : sample.getValue() ) { - final int readLength = read.getReadLength(); - if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } - } - } - int Y_METRIC_LENGTH = 0; - for( final Haplotype h : haplotypes ) { - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } - } - - // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMM.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); - } - - public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { - // configure the HMM - initializePairHMM(haplotypes, perSampleReadList); - - // Add likelihoods for each sample's reads to our stratifiedReadMap - final Map stratifiedReadMap = new LinkedHashMap<>(); - for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { - // evaluate the likelihood of the reads given those haplotypes - final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); - - final List removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); -// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey()); -// for ( final GATKSAMRecord read : removedReads ) -// logger.info("\tRemoved " + read.getReadName()); - - stratifiedReadMap.put(sampleEntry.getKey(), map); - } - - return stratifiedReadMap; - } - - private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { - // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) - final int numHaplotypes = haplotypes.size(); - final Map alleleVersions = new 
LinkedHashMap<>(numHaplotypes); - Allele refAllele = null; - for ( final Haplotype haplotype : haplotypes ) { - final Allele allele = Allele.create(haplotype, true); - alleleVersions.put(haplotype, allele); - if ( haplotype.isReference() ) refAllele = allele; - } - - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); - for( final GATKSAMRecord read : reads ) { - final byte[] overallGCP = new byte[read.getReadLength()]; - Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? - // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read - final byte[] readQuals = read.getBaseQualities().clone(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - for( int kkk = 0; kkk < readQuals.length; kkk++ ) { - readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG - //readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated - //readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated - // TODO -- why is Q18 hard-coded here??? - readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? 
QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); - } - - // keep track of the reference likelihood and the best non-ref likelihood - double refLog10l = Double.NEGATIVE_INFINITY; - double bestNonReflog10L = Double.NEGATIVE_INFINITY; - - // iterate over all haplotypes, calculating the likelihood of the read for each haplotype - for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { - final Haplotype haplotype = haplotypes.get(jjj); - final boolean isFirstHaplotype = jjj == 0; - final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), - read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); - - if ( WRITE_LIKELIHOODS_TO_FILE ) { - likelihoodsStream.printf("%s %s %s %s %s %s %f%n", - haplotype.getBaseString(), - new String(read.getReadBases()), - SAMUtils.phredToFastq(readQuals), - SAMUtils.phredToFastq(readInsQuals), - SAMUtils.phredToFastq(readDelQuals), - SAMUtils.phredToFastq(overallGCP), - log10l); - } - - if ( haplotype.isNonReference() ) - bestNonReflog10L = Math.max(bestNonReflog10L, log10l); - else - refLog10l = log10l; - - perReadAlleleLikelihoodMap.add(read, alleleVersions.get(haplotype), log10l); - } - - // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global - // mismapping rate. This protects us from the case where the assembly has produced haplotypes - // that are very divergent from reference, but are supported by only one read. 
In effect - // we capping how badly scoring the reference can be for any read by the chance that the read - // itself just doesn't belong here - final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; - if ( refLog10l < (worstRefLog10Allowed) ) { - perReadAlleleLikelihoodMap.add(read, refAllele, worstRefLog10Allowed); - } - } - - return perReadAlleleLikelihoodMap; - } - - @Requires({"alleleOrdering.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, - final Map stratifiedReadMap, - final List alleleOrdering, - final boolean normalize ) { - return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); - } - - @Requires({"alleleOrdering.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, - final Map stratifiedReadMap, - final List alleleOrdering, - final boolean normalize) { - - final int numHaplotypes = alleleOrdering.size(); - final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); - } - - // compute the diploid haplotype likelihoods - for( int iii = 0; iii < numHaplotypes; iii++ ) { - final Allele iii_allele = alleleOrdering.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Allele jjj_allele = alleleOrdering.get(jjj); - double haplotypeLikelihood = 0.0; - for( final String sample : samples ) { - for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // First term is approximated by Jacobian log with table lookup. 
- haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) * - ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); - } - } - haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; - } - } - - // normalize the diploid likelihoods matrix - return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix; - } - - @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) - @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) - protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { - final int numHaplotypes = likelihoodMatrix.length; - double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; - int index = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ){ - genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj]; - } - } - genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); - index = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ){ - likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++]; - } - } - return likelihoodMatrix; - } - - // -------------------------------------------------------------------------------- - // - // System to compute the best N haplotypes for genotyping - // - // -------------------------------------------------------------------------------- - - /** - * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele - * @param map an annoying map object that moves us between the allele and haplotype representation - * @param haplotypeAsAllele the allele version of the haplotype - * @return the haplotype version, with its score incremented by 1 if its non-reference - */ - private Haplotype 
updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { - final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic - if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value - return h; - } - - /** - * Take the best N haplotypes and return them as a list - * - * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample - * as it's preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing - * order of score (so higher score haplotypes are preferred). The N we take is determined by - * - * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) - * - * where 2 * nSamples is the number of chromosomes in 2 samples including the reference, and our workload is - * bounded by maxNumHaplotypesInPopulation as that number can grow without bound - * - * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 - * @param nSamples the number of samples used to select the haplotypes - * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples - * @return a list of N or fewer haplotypes, with the reference haplotype first - */ - private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { - final List selectedHaplotypesList = new ArrayList(selectedHaplotypes); - Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); - final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; - final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); - final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? 
selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); - if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); - return bestHaplotypes; - } - - /** - * Select the best haplotypes for genotyping the samples in stratifiedReadMap - * - * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely - * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for - * all possible pairs of haplotypes, and the pair with the highest likelihood has each haplotype each get - * one extra count for each haplotype (so hom-var haplotypes get two counts). After performing this calculation - * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the - * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference. - * - * @param haplotypes a list of all haplotypes we're considering - * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype - * @param maxNumHaplotypesInPopulation the max. 
number of haplotypes we can select from haplotypes - * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation - */ - public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { - if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); - - if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes - - // all of the haplotypes that at least one sample called as one of the most likely - final Set selectedHaplotypes = new HashSet<>(); - selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected - - // our annoying map from allele -> haplotype - final Map allele2Haplotype = new HashMap<>(); - for ( final Haplotype h : haplotypes ) { - h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes - allele2Haplotype.put(Allele.create(h, h.isReference()), h); - } - - // for each sample, compute the most likely pair of haplotypes - for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { - // get the two most likely haplotypes under a diploid model for this sample - final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); - - if ( mla != null ) { // there was something to evaluate in this sample - // note that there must be at least 2 haplotypes - final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); - final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); - -// if ( DEBUG ) { -// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); -// } - - // add these two haplotypes to the set of haplotypes that have been selected - selectedHaplotypes.add(best); 
- selectedHaplotypes.add(second); - - // we've already selected all of our haplotypes, and we don't need to prune them down - if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) - break; - } - } - - // take the best N haplotypes forward, in order of the number of samples that choose them - final int nSamples = stratifiedReadMap.size(); - final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); - - if ( DEBUG ) { - logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); - for ( final Haplotype h : bestHaplotypes ) { - logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); - } - } - return bestHaplotypes; - } - - /** - * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found - * @param haplotypes non-null list of haplotypes - * @return the reference haplotype - */ - private static Haplotype findReferenceHaplotype( final List haplotypes ) { - for( final Haplotype h : haplotypes ) { - if( h.isReference() ) return h; - } - throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); - } -} \ No newline at end of file + public Map computeReadLikelihoods(AssemblyResultSet assemblyResultSet, + Map> perSampleReadList); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index 27178c78f..d0e28d878 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -77,7 +77,7 @@ public abstract class LocalAssemblyEngine { * If false, we will only write out a region around the reference source */ private final static boolean PRINT_FULL_GRAPH_FOR_DEBUGGING = true; - public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 8; + public static final byte DEFAULT_MIN_BASE_QUALITY_TO_USE = (byte) 10; private static final int MIN_HAPLOTYPE_REFERENCE_LENGTH = 30; protected final int numBestHaplotypesPerGraph; @@ -124,15 +124,16 @@ public abstract class LocalAssemblyEngine { * @param refLoc GenomeLoc object corresponding to the reference sequence with padding * @param activeAllelesToGenotype the alleles to inject into the haplotypes during GGA mode * @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used. 
- * @return a non-empty list of all the haplotypes that are produced during assembly + * @return the resulting assembly-result-set */ - public List runLocalAssembly(final ActiveRegion activeRegion, + public AssemblyResultSet runLocalAssembly(final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final ReadErrorCorrector readErrorCorrector) { if( activeRegion == null ) { throw new IllegalArgumentException("Assembly engine cannot be used with a null ActiveRegion."); } + if( activeRegion.getExtendedLoc() == null ) { throw new IllegalArgumentException("Active region must have an extended location."); } if( refHaplotype == null ) { throw new IllegalArgumentException("Reference haplotype cannot be null."); } if( fullReferenceWithPadding.length != refLoc.size() ) { throw new IllegalArgumentException("Reference bases and reference loc must be the same size."); } if( pruneFactor < 0 ) { throw new IllegalArgumentException("Pruning factor cannot be negative"); } @@ -153,26 +154,32 @@ public abstract class LocalAssemblyEngine { } final List nonRefGraphs = new LinkedList<>(); + final AssemblyResultSet resultSet = new AssemblyResultSet(); + resultSet.setRegionForGenotyping(activeRegion); + resultSet.setFullReferenceWithPadding(fullReferenceWithPadding); + resultSet.setPaddedReferenceLoc(refLoc); + final GenomeLoc activeRegionExtendedLocation = activeRegion.getExtendedLoc(); + refHaplotype.setGenomeLocation(activeRegionExtendedLocation); + resultSet.add(refHaplotype); + final Map assemblyResultByGraph = new HashMap<>(); // create the graphs by calling our subclass assemble method for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, activeAlleleHaplotypes) ) { if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) { // do some QC on the graph sanityCheckGraph(result.getGraph(), refHaplotype); // add it to graphs with meaningful 
non-reference features + assemblyResultByGraph.put(result.getGraph(),result); nonRefGraphs.add(result.getGraph()); } + } + findBestPaths (nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet); + // print the graphs if the appropriate debug option has been turned on if ( graphWriter != null ) { printGraphs(nonRefGraphs); } - if ( nonRefGraphs.isEmpty() ) { - // we couldn't assemble any meaningful graphs, so return just the reference haplotype - return Collections.singletonList(refHaplotype); - } else { - // find the best paths in the graphs and return them as haplotypes - return findBestPaths( nonRefGraphs, refHaplotype, refLoc, activeRegion.getExtendedLoc() ); - } + return resultSet; } /** @@ -198,8 +205,10 @@ public abstract class LocalAssemblyEngine { return new ArrayList<>(returnHaplotypes); } + @Ensures({"result.contains(refHaplotype)"}) - protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow) { + protected List findBestPaths(final List graphs, final Haplotype refHaplotype, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, + final Map assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes final Set returnHaplotypes = new LinkedHashSet<>(); returnHaplotypes.add( refHaplotype ); @@ -235,7 +244,9 @@ public abstract class LocalAssemblyEngine { h.setCigar(cigar); h.setAlignmentStartHapwrtRef(activeRegionStart); h.setScore(path.getScore()); + h.setGenomeLocation(activeRegionWindow); returnHaplotypes.add(h); + assemblyResultSet.add(h, assemblyResultByGraph.get(graph)); if ( debug ) logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize()); @@ -243,8 +254,6 @@ public abstract class LocalAssemblyEngine { } } - // add genome locs to the haplotypes - for ( final 
Haplotype h : returnHaplotypes ) h.setGenomeLocation(activeRegionWindow); if ( returnHaplotypes.size() < returnHaplotypes.size() ) logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against at " + refLoc); @@ -262,8 +271,8 @@ public abstract class LocalAssemblyEngine { } return new ArrayList<>(returnHaplotypes); - } + } /** * We use CigarOperator.N as the signal that an incomplete or too divergent bubble was found during bubble traversal * @param c the cigar to test @@ -301,9 +310,7 @@ public abstract class LocalAssemblyEngine { printDebugGraphTransform(seqGraph, new File("sequenceGraph.2.zipped.dot")); // now go through and prune the graph, removing vertices no longer connected to the reference chain - // IMPORTANT: pruning must occur before we call simplifyGraph, as simplifyGraph adds 0 weight - // edges to maintain graph connectivity. - seqGraph.pruneGraph(pruneFactor); + seqGraph.removeSingletonOrphanVertices(); seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); printDebugGraphTransform(seqGraph, new File("sequenceGraph.3.pruned.dot")); @@ -328,7 +335,6 @@ public abstract class LocalAssemblyEngine { seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); } printDebugGraphTransform(seqGraph, new File("sequenceGraph.5.final.dot")); - return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java new file mode 100644 index 000000000..b1db23d74 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -0,0 +1,622 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD 
INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.*; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.variant.variantcontext.*; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.*; + +public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculationEngine { + private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class); + + public static final byte 
BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual + + private final byte constantGCP; + private final double log10globalReadMismappingRate; + private final boolean DEBUG; + + private final PairHMM.HMM_IMPLEMENTATION hmmType; + private final boolean noFpga; + + private final ThreadLocal pairHMMThreadLocal = new ThreadLocal() { + @Override + protected PairHMM initialValue() { + switch (hmmType) { + case EXACT: return new Log10PairHMM(true); + case ORIGINAL: return new Log10PairHMM(false); + case LOGLESS_CACHING: + if (noFpga || !CnyPairHMM.isAvailable()) + return new LoglessPairHMM(); + else + return new CnyPairHMM(); + case ARRAY_LOGLESS: + if (noFpga || !CnyPairHMM.isAvailable()) + return new ArrayLoglessPairHMM(); + else + return new CnyPairHMM(); + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS."); + } + } + }; +// Attempted to do as below, to avoid calling pairHMMThreadLocal.get() later on, but it resulted in a NullPointerException +// private final PairHMM pairHMM = pairHMMThreadLocal.get(); + + private final static boolean WRITE_LIKELIHOODS_TO_FILE = false; + private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; + private final PrintStream likelihoodsStream; + + public enum PCR_ERROR_MODEL { + /** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */ + NONE, + /** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */ + AGGRESSIVE, + /** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */ + CONSERVATIVE + } + + private final PCR_ERROR_MODEL pcrErrorModel; + + /** + * The expected rate 
of random sequencing errors for a read originating from its true haplotype. + * + * For example, if this is 0.01, then we'd expect 1 error per 100 bp. + */ + private final static double EXPECTED_ERROR_RATE_PER_BASE = 0.02; + + /** + * Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations + * + * @param constantGCP the gap continuation penalty to use with the PairHMM + * @param debug should we emit debugging information during the calculation? + * @param hmmType the type of the HMM to use + * @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of + * -3 means that the chance that a read doesn't actually belong at this + * location in the genome is 1 in 1000. The effect of this parameter is + * to cap the maximum likelihood difference between the reference haplotype + * and the best alternative haplotype by -3 log units. So if the best + * haplotype is at -10 and this parameter has a value of -3 then even if the + * reference haplotype gets a score of -100 from the pairhmm it will be + * assigned a likelihood of -13. 
+ * @param noFpga disable FPGA acceleration + */ + public PairHMMLikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) { + this.hmmType = hmmType; + this.constantGCP = constantGCP; + this.DEBUG = debug; + this.log10globalReadMismappingRate = log10globalReadMismappingRate; + this.noFpga = noFpga; + this.pcrErrorModel = pcrErrorModel; + + initializePCRErrorModel(); + + if ( WRITE_LIKELIHOODS_TO_FILE ) { + try { + likelihoodsStream = new PrintStream(new FileOutputStream(new File(LIKELIHOODS_FILENAME))); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } + } else { + likelihoodsStream = null; + } + } + + public void close() { + if ( likelihoodsStream != null ) likelihoodsStream.close(); + } + + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ + if ( WRITE_LIKELIHOODS_TO_FILE ) { + likelihoodsStream.printf("%s %s %s %s %s %s %f%n", + haplotype.getBaseString(), + new String(processedRead.getReadBases() ), + SAMUtils.phredToFastq(processedRead.getBaseQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ), + SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ), + SAMUtils.phredToFastq(constantGCP), + log10l); + } + } + + private Map createAlleleMap(List haplotypes){ + final int numHaplotypes = haplotypes.size(); + final Map alleleMap = new LinkedHashMap<>(numHaplotypes); + for ( final Haplotype haplotype : haplotypes ) { + final Allele allele = Allele.create(haplotype, true); + alleleMap.put(allele, haplotype); + } + return alleleMap; + } + + private Map fillGCPArrays(List reads){ + final Map GCPArrayMap = new LinkedHashMap<>(); + for (GATKSAMRecord read: reads){ + byte [] GCPArray = new byte[read.getReadBases().length]; + Arrays.fill( GCPArray, constantGCP ); // Is there a 
way to derive empirical estimates for this from the data? + GCPArrayMap.put(read, GCPArray); + } + return GCPArrayMap; + } + + private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { + for( int kkk = 0; kkk < readQuals.length; kkk++ ) { + readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG + readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); + readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); + } + } + + /** + * Pre-processing of the reads to be evaluated at the current location from the current sample. + * We apply the PCR Error Model, and cap the minimum base, insertion, and deletion qualities of each read. + * Modified copies of reads are packed into a new list, while original reads are retained for downstream use + * + * @param reads The original list of unmodified reads + * @return processedReads. 
A new list of reads, in the same order, whose qualities have been altered by PCR error model and minimal quality thresholding + */ + private List modifyReadQualities(final List reads) { + List processedReads = new LinkedList<>(); + for ( GATKSAMRecord read : reads ) { + + final byte[] readBases = read.getReadBases(); + + // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read + final byte[] readQuals = read.getBaseQualities().clone(); + final byte[] readInsQuals = read.getBaseInsertionQualities().clone(); + final byte[] readDelQuals = read.getBaseDeletionQualities().clone(); + + applyPCRErrorModel(readBases, readInsQuals, readDelQuals); + capMinimumReadQualities(read, readQuals, readInsQuals, readDelQuals); + + // Create a new copy of the read and sets its base qualities to the modified versions. + // Pack this into a new list for return + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); + processedReads.add(processedRead); + } + return processedReads; + } + + /** + * Post-processing of the read/allele likelihoods. + * + * We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads. + * We wish to return a map containing the original, unmodified reads. + * + * At the same time, we want to effectively set a lower cap on the reference score, based on the global mis-mapping rate. + * This protects us from the case where the assembly has produced haplotypes + * that are very divergent from reference, but are supported by only one read. In effect + * we capping how badly scoring the reference can be for any read by the chance that the read + * itself just doesn't belong here + * + * @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. 
Contains the processed reads, the haplotype Alleles, and their log10ls + * @param reads Our original, unmodified reads + * @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls + * @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype + * + * @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls + */ + private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List reads, List processedReads, Map alleleHaplotypeMap){ + + // a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls + final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); + + Allele refAllele = null; + final int numReads = reads.size(); + for (int readIndex = 0; readIndex < numReads; readIndex++) { + + // Get the original and quality-modified read from their respective lists + // Note that this requires both lists to have reads in the same order + final GATKSAMRecord originalRead = reads.get(readIndex); + final GATKSAMRecord processedRead = processedReads.get(readIndex); + + // keep track of the reference likelihood and the best non-ref likelihood + double refLog10l = Double.NEGATIVE_INFINITY; + double bestNonReflog10L = Double.NEGATIVE_INFINITY; + + for ( Allele allele : alleleHaplotypeMap.keySet() ) { + final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele); + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + if ( haplotype.isNonReference() ) + bestNonReflog10L = Math.max(bestNonReflog10L, log10l); + else { + refAllele = allele; + refLog10l = log10l; + } + writeDebugLikelihoods(processedRead, haplotype, log10l); + + // add the ORIGINAL (non-capped) 
read to the final map, along with the current haplotype and associated log10l + processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l); + } + + // ensure that the reference haplotype is no worse than the best non-ref haplotype minus the global + // mismapping rate. This protects us from the case where the assembly has produced haplotypes + // that are very divergent from reference, but are supported by only one read. In effect + // we capping how badly scoring the reference can be for any read by the chance that the read + // itself just doesn't belong here + final double worstRefLog10Allowed = bestNonReflog10L + log10globalReadMismappingRate; + if ( refLog10l < (worstRefLog10Allowed) ) { + processedReadAlleleLikelihoodMap.add(originalRead, refAllele, worstRefLog10Allowed); + } + } + return processedReadAlleleLikelihoodMap; + } + + /** + * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate + * + * After calling this routine the PairHMM will be configured to best evaluate all reads in the samples + * against the set of haplotypes + * + * @param haplotypes a non-null list of haplotypes + * @param perSampleReadList a mapping from sample -> reads + */ + private void initializePairHMM(final List haplotypes, final Map> perSampleReadList) { + int X_METRIC_LENGTH = 0; + for( final Map.Entry> sample : perSampleReadList.entrySet() ) { + for( final GATKSAMRecord read : sample.getValue() ) { + final int readLength = read.getReadLength(); + if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } + } + } + int Y_METRIC_LENGTH = 0; + for( final Haplotype h : haplotypes ) { + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; } + } + + // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases + pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + } + + + @Override + 
public Map computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final Map> perSampleReadList ) { + + final List haplotypes = assemblyResultSet.getHaplotypeList(); + // configure the HMM + initializePairHMM(haplotypes, perSampleReadList); + + // Add likelihoods for each sample's reads to our stratifiedReadMap + final Map stratifiedReadMap = new LinkedHashMap<>(); + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { + // evaluate the likelihood of the reads given those haplotypes + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); + stratifiedReadMap.put(sampleEntry.getKey(), map); + } + + return stratifiedReadMap; + } + + + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { + + // Add likelihoods for each sample's reads to our stratifiedReadMap + final Map stratifiedReadMap = new LinkedHashMap<>(); + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { + // evaluate the likelihood of the reads given those haplotypes + final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); + + map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); + stratifiedReadMap.put(sampleEntry.getKey(), map); + } + + return stratifiedReadMap; + } + + private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { + + // Modify the read qualities by applying the PCR error model and capping the minimum base,insertion,deletion qualities + List processedReads = modifyReadQualities(reads); + + // Get alleles corresponding to our haplotypees + Map alleleHaplotypeMap = createAlleleMap(haplotypes); + + // Get an array containing the constantGCP for each read in our modified read list + Map GCPArrayMap = fillGCPArrays(processedReads); + + // Run the PairHMM to calculate the log10 likelihood of each (processed) reads' 
arising from each haplotype + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); + + // Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate + + return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); + } + + @Requires({"alleleOrdering.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, + final Map stratifiedReadMap, + final List alleleOrdering, + final boolean normalize ) { + return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); + } + + @Requires({"alleleOrdering.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, + final Map stratifiedReadMap, + final List alleleOrdering, + final boolean normalize) { + + final int numHaplotypes = alleleOrdering.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Allele iii_allele = alleleOrdering.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleleOrdering.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated 
by Jacobian log with table lookup. + haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) * + ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; + } + } + + // normalize the diploid likelihoods matrix + return normalize ? normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ) : haplotypeLikelihoodMatrix; + } + + @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) + @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) + protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { + final int numHaplotypes = likelihoodMatrix.length; + double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; + int index = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ){ + genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj]; + } + } + genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); + index = 0; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + for( int jjj = 0; jjj <= iii; jjj++ ){ + likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++]; + } + } + return likelihoodMatrix; + } + + // -------------------------------------------------------------------------------- + // + // System to compute the best N haplotypes for genotyping + // + // -------------------------------------------------------------------------------- +// +// /** +// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele +// * @param map an annoying map object that moves us between the allele and haplotype representation +// * @param haplotypeAsAllele the allele version of the haplotype +// * @return the haplotype version, with its score incremented by 1 if 
it's non-reference +// */ +// private Haplotype updateSelectHaplotype(final Map map, final Allele haplotypeAsAllele) { +// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic +// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value +// return h; +// } +// +// /** +// * Take the best N haplotypes and return them as a list +// * +// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample +// * as its preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing +// * order of score (so higher score haplotypes are preferred). The N we take is determined by +// * +// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation) +// * +// * where 2 * nSamples + 1 counts the chromosomes of all nSamples samples (two each) plus the reference, and our workload is +// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound +// * +// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1 +// * @param nSamples the number of samples used to select the haplotypes +// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples +// * @return a list of N or fewer haplotypes, with the reference haplotype first +// */ +// private List selectBestHaplotypesAccordingToScore(final Set selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) { +// final List selectedHaplotypesList = new ArrayList<>(selectedHaplotypes); +// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator()); +// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1; +// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation); +// final List bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? 
selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep); +// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list"); +// return bestHaplotypes; +// } +// +// /** +// * Select the best haplotypes for genotyping the samples in stratifiedReadMap +// * +// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely +// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for +// * all possible pairs of haplotypes, and each haplotype in the pair with the highest likelihood gets +// * one extra count (so hom-var haplotypes get two counts). After performing this calculation +// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the +// * haplotypes in order of score is returned, ensuring that at least one of the haplotypes is reference. +// * +// * @param haplotypes a list of all haplotypes we're considering +// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype +// * @param maxNumHaplotypesInPopulation the max. 
number of haplotypes we can select from haplotypes +// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation +// */ +// public List selectBestHaplotypesFromEachSample(final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation) { +// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes); +// +// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes +// +// // all of the haplotypes that at least one sample called as one of the most likely +// final Set selectedHaplotypes = new HashSet<>(); +// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected +// +// // our annoying map from allele -> haplotype +// final Map allele2Haplotype = new HashMap<>(); +// for ( final Haplotype h : haplotypes ) { +// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes +// allele2Haplotype.put(Allele.create(h, h.isReference()), h); +// } +// +// // for each sample, compute the most likely pair of haplotypes +// for ( final Map.Entry entry : stratifiedReadMap.entrySet() ) { +// // get the two most likely haplotypes under a diploid model for this sample +// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles(); +// +// if ( mla != null ) { // there was something to evaluate in this sample +// // note that there must be at least 2 haplotypes +// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele()); +// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele()); +// +//// if ( DEBUG ) { +//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey()); +//// } +// +// // add these two haplotypes to the set of 
haplotypes that have been selected +// selectedHaplotypes.add(best); +// selectedHaplotypes.add(second); +// +// // we've already selected all of our haplotypes, and we don't need to prune them down +// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation ) +// break; +// } +// } +// +// // take the best N haplotypes forward, in order of the number of samples that choose them +// final int nSamples = stratifiedReadMap.size(); +// final List bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation); +// +// if ( DEBUG ) { +// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples."); +// for ( final Haplotype h : bestHaplotypes ) { +// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype")); +// } +// } +// return bestHaplotypes; +// } +// +// /** +// * Find the haplotype that isRef(), or @throw ReviewedStingException if one isn't found +// * @param haplotypes non-null list of haplotypes +// * @return the reference haplotype +// */ +// private static Haplotype findReferenceHaplotype( final List haplotypes ) { +// for( final Haplotype h : haplotypes ) { +// if( h.isReference() ) return h; +// } +// throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" 
); +// } + + // -------------------------------------------------------------------------------- + // + // Experimental attempts at PCR error rate modeling + // + // -------------------------------------------------------------------------------- + + protected static final int MAX_STR_UNIT_LENGTH = 8; + protected static final int MAX_REPEAT_LENGTH = 20; + protected static final int MIN_ADJUSTED_QSCORE = 10; + protected static final double INITIAL_QSCORE = 40.0; + + private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1]; + private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + + private void initializePCRErrorModel() { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH); + + pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1]; + + final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0; + + for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ ) + pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor); + } + + protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) { + return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 )); + } + + protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) { + if ( pcrErrorModel == PCR_ERROR_MODEL.NONE ) + return; + + for ( int iii = 1; iii < readBases.length; iii++ ) { + final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond(); + readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]); + } + } +} diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java new file mode 100644 index 000000000..b8dba7b86 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java @@ -0,0 +1,82 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +/** + * Random likelihoods generator, used for testing/benchmarking purposes. + */ +public class RandomLikelihoodCalculationEngine implements LikelihoodCalculationEngine { + + @Override + public Map computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map> reads) { + final List haplotypes = assemblyResultSet.getHaplotypeList(); + final Map result = new HashMap<>(reads.size()); + final Map alleles = new HashMap<>(haplotypes.size()); + for (final Haplotype haplotype : haplotypes) + alleles.put(haplotype,Allele.create(haplotype,false)); + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + for (final String sample : reads.keySet()) { + final PerReadAlleleLikelihoodMap pralm = new PerReadAlleleLikelihoodMap(); + for (final GATKSAMRecord read : reads.get(sample)) + for (final Haplotype haplotype : haplotypes ) + pralm.add(read,alleles.get(haplotype),-Math.abs(rnd.nextDouble())); + result.put(sample,pralm); + } + + return result; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java new file mode 100644 index 000000000..b4d2e1a2f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadAnchoring.java @@ -0,0 +1,366 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL 
RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * Collects the information on how a read maps onto the haplotype graph that is needed to calculate its likelihood + * using the graph-based approach. + */ +public class ReadAnchoring { + + private static final Logger logger = Logger.getLogger(ReadAnchoring.class); + + /** The read whose anchoring on the haplotype graph this object describes. */ + protected final GATKSAMRecord read; + protected final Map uniqueKmerOffsets; + + /** + * Kmer offset on the read of the left anchor. + *

+ * {@code -1} if there is no left anchor. + *

+ */ + protected int leftAnchorIndex; + + /** + * Vertex in the graph where the left anchor falls. + *

+ * {@code null} if there is no left anchor. + *

+ */ + protected MultiDeBruijnVertex leftAnchorVertex; + + /** + * Kmer offset on the read of the right anchor. + * + *

+ * {@code -1} if there is no right anchor. + *

+ */ + protected int rightAnchorIndex; + + /** + * Vertex in the graph where the right anchor falls. + * + *

+ * {@code null} if there is no right anchor. + *

+ */ + protected MultiDeBruijnVertex rightAnchorVertex; + + /** + * Kmer sequence mapping information for the read sequence. + * + * Never {@code null}. + */ + protected final KmerSequenceGraphMap graphMap; + + /** + * Alignment of read kmers on the reference haplotype kmers. + * + *

+ * There is one entry for each base in the read. + * + *

+ *

+ * The i-th entry indicates which kmer in the reference haplotype corresponds to the kmer on the read starting + * at its i-th base. + *

+ * + *

+ * {@code -1} means that there is no match. + *

+ * + *

+ * The last kmerSize - 1 entries of the array are {@code -1}. + *

+ */ + protected final int[] referenceAlignment; + + /** + * Maps each reference path vertex found between the anchors (inclusive) to the number of + * {@code getNextReferenceVertex} steps that separate it from the left anchor; empty when there is no valid anchoring. + */ + protected final Map referenceWithinAnchorsMap; + + /** + * Creates the read's anchoring information for the haplotype-graph. The left anchor is the first read kmer with a reference alignment whose vertex is anchorable; the right anchor is the last such kmer, or the left anchor itself if there is no other. + * + * @param read the targeted read. + * @param haplotypeGraph the targeted graph. + * + * @throws NullPointerException if any argument is {@code null}. + * @throws IllegalArgumentException NOTE(review): not thrown directly by this constructor, which has no {@code anchorableVertices} argument; confirm whether a callee throws it or drop this tag. + */ + public ReadAnchoring(final GATKSAMRecord read, final HaplotypeGraph haplotypeGraph) { + this.read = read; + final byte[] readBases = read.getReadBases(); + final KmerSequence readKmers = new KmerSequence(read, haplotypeGraph.getKmerSize()); + graphMap = new KmerSequenceGraphMap<>(haplotypeGraph, readKmers); + final Map vertexOffset = graphMap.vertexOffset(); + referenceAlignment = calculateUniqueKmerAlignment(0, readBases.length, haplotypeGraph.getReferenceRoute(), vertexOffset, haplotypeGraph.getKmerSize()); + leftAnchorIndex = -1; + leftAnchorVertex = null; + for (int i = 0; i < readBases.length - haplotypeGraph.getKmerSize() + 1; i++) { + if (referenceAlignment[i] == -1) continue; + final MultiDeBruijnVertex candidate = haplotypeGraph.findKmer(readKmers.get(i)); + if (candidate != null && haplotypeGraph.getAnchorableVertices().contains(candidate)) { + leftAnchorIndex = i; + leftAnchorVertex = candidate; + break; + } + } + rightAnchorIndex = leftAnchorIndex; + rightAnchorVertex = leftAnchorVertex; + if (leftAnchorIndex != -1) { + for (int i = readBases.length - haplotypeGraph.getKmerSize(); i > leftAnchorIndex; i--) { + if (referenceAlignment[i] == -1) continue; + final MultiDeBruijnVertex candidate = haplotypeGraph.findKmer(readKmers.get(i)); + if (candidate != null && haplotypeGraph.getAnchorableVertices().contains(candidate)) { + rightAnchorIndex = i; + rightAnchorVertex =
candidate; + break; + } + } + } + referenceWithinAnchorsMap = buildReferenceWithinBoundariesMap(read, haplotypeGraph, + vertexOffset, leftAnchorVertex, rightAnchorVertex); + uniqueKmerOffsets = buildReadUniqueKmerOffsets(haplotypeGraph); + } + + /** + * For a given read, returns the reference path vertices of {@code haplotypeGraph} that fall between the two anchor vertices. + *

+ *

+ * The resulting map has as keys the vertices between those two boundaries (inclusive) and + * as values their 0-based position counted from the left anchor. + *

+ * + * @param read the target read; only used to compose the warning message. + * @param readVertexKmerOffset map between vertices and their kmer offset on the read; only used to compose the warning message. + * @param leftAnchorVertex left anchor vertex. + * @param rightAnchorVertex right anchor vertex. + * @return never {@code null}, but empty if there is no left anchor or the right anchor cannot be reached from the left one. + */ + private Map buildReferenceWithinBoundariesMap( + final GATKSAMRecord read, final HaplotypeGraph haplotypeGraph, + final Map readVertexKmerOffset, + final MultiDeBruijnVertex leftAnchorVertex, final MultiDeBruijnVertex rightAnchorVertex) { + if (leftAnchorVertex == null) + return Collections.emptyMap(); + + final Map result = new HashMap<>(); + MultiDeBruijnVertex nextVertex = leftAnchorVertex; + + int leftAnchorOffset = 0; + while (nextVertex != null) { + result.put(nextVertex, leftAnchorOffset++); + if (nextVertex == rightAnchorVertex) + break; + nextVertex = haplotypeGraph.getNextReferenceVertex(nextVertex); + } + if (nextVertex == null) { + logger.warn("unexpected event kmers out of order between read anchor kmers: " + read.getReadString() + + " Offending kmer offsets: " + readVertexKmerOffset.get(leftAnchorVertex) + " " + readVertexKmerOffset.get(rightAnchorVertex) + + " sequences: " + + read.getReadString().substring(readVertexKmerOffset.get(leftAnchorVertex), haplotypeGraph.getKmerSize() + readVertexKmerOffset.get(leftAnchorVertex)) + + " " + read.getReadString().substring(readVertexKmerOffset.get(rightAnchorVertex), haplotypeGraph.getKmerSize() + readVertexKmerOffset.get(rightAnchorVertex)) + + " Reference haplotype: " + haplotypeGraph.getReferenceHaplotype().getBaseString()); + return Collections.emptyMap(); + } + return result; + } + + /** + * Builds a map between the vertices of the read kmers aligned to the reference from anchor to anchor (inclusive) and their kmer offset in the read. + * + * @param haplotypeGraph the anchoring graph. + * + * @return never {@code null}; empty if there are no valid anchors.
+ */ + private Map buildReadUniqueKmerOffsets(final HaplotypeGraph haplotypeGraph) { + if (!hasValidAnchors()) + return Collections.emptyMap(); + final Map vertexOffset = graphMap.vertexOffset(); + final Set readUniqueKmerVertices = new HashSet<>(vertexOffset.size()); + readUniqueKmerVertices.add(leftAnchorVertex); + readUniqueKmerVertices.add(rightAnchorVertex); + for (int i = leftAnchorIndex + 1; i < rightAnchorIndex; i++) { + if (referenceAlignment[i] != -1) { + readUniqueKmerVertices.add(haplotypeGraph.findKmer(graphMap.sequence.get(i))); + } + } + final Map validVertexOffset = new HashMap<>(graphMap.vertexOffset()); + validVertexOffset.keySet().retainAll(readUniqueKmerVertices); + return validVertexOffset; + } + + /** + * Checks whether it has some anchoring kmer and these are valid, i.e. the left anchor is the same or precedes the right anchor in the reference path. + * @return {@code true} iff so. + */ + public boolean hasValidAnchors() { + return referenceWithinAnchorsMap.size() >= 1; + } + + /** + * Calculates an array indicating for each kmer in the read what is the offset of that kmer in a path. + *

+ *

+ * The result has {@code readEnd - readStart} entries. The i-th position holds the offset in the input path of the read kmer that + * starts at that position. Non-matching kmers have -1 instead. + *

+ * + * @param readStart inclusive first position of the read to consider. + * @param readEnd exclusive position after last to be considered; kmer offsets beyond {@code readEnd - kmerSize} are ignored. + * @param path the path to align against. + * @param readUniqueKmerOffset map of vertices to their kmer offset within the read. + * @return never {@code null}. + */ + private int[] calculateUniqueKmerAlignment(final int readStart, final int readEnd, final Path path, + final Map readUniqueKmerOffset, final int kmerSize) { + + final int[] result = new int[readEnd - readStart]; + Arrays.fill(result, -1); + int i = 0; + for (final MultiDeBruijnVertex v : path.getVertices()) { + final Integer kmerReadOffset = readUniqueKmerOffset.get(v); + if (kmerReadOffset != null) { + final int kro = kmerReadOffset; + if (kro >= readStart && kro < readEnd - kmerSize + 1) { + result[kro - readStart] = i; + } + } + i++; + } + // Now we remove conflicting mappings: + // A conflicting mapping is when two kmer mappings suggest that + // the same read position maps to two different bases in the path. + maskOutConflictingKmerAlignments(result,kmerSize); + return result; + } + + /** + * Marks with -1 those kmer matches that result in read base mapping conflicts. + * + * @param result in/out, modified in place; {@code kmerSize} is the span used to decide whether two matches overlap. + */ + @Requires("result != null") + private void maskOutConflictingKmerAlignments(final int[] result, final int kmerSize) { + int i; + int lastKmer = -1; + int lastKmerPos = -1; + for (i = 0; i < result.length; i++) { + final int kmer = result[i]; + if (kmer == -1) + continue; + if (lastKmer == -1) { + lastKmer = kmer; + lastKmerPos = i; + } else if (lastKmer + kmerSize - 1 >= kmer && (i - lastKmerPos) != (kmer - lastKmer)) { // kmer overlap. fixing by eliminating offending kmers alignments. + int iSkip = result.length; // iSkip will contain the next position minus 1 to visit in the next iteration of the enclosing loop. NOTE(review): as i = iSkip is followed by the loop's i++, the surviving entry at iSkip itself is not re-examined - confirm this is intended.
+ for (int j = i; j < result.length; j++) + if (result[j] != -1) { + if (lastKmer + kmerSize - 1 >= result[j]) + result[j] = -1; + else { + iSkip = j; + break; + } + } + // then backwards and do the same. + int j = lastKmerPos; + lastKmer = -1; + lastKmerPos = -1; + for (; j >= 0; j--) + if (result[j] != -1) { + if (result[j] + kmerSize - 1 >= kmer) + result[j] = -1; + else { + lastKmer = result[j]; + lastKmerPos = j; + break; + } + } + i = iSkip; + } else { + lastKmer = kmer; + lastKmerPos = i; + } + } + } + + /** + * Checks whether the read is anchored at all; currently equivalent to {@link #hasValidAnchors()}. + * + * @return {@code true} iff so. + */ + public boolean isAnchoredSomewhere() { + return hasValidAnchors(); + //return hasValidAnchors(); + } + + /** + * Whether the read is anchored perfectly: the anchors are valid, they sit at the first and last kmer of the read and the left anchor vertex has no ambiguous sequence. + * + * @return {@code true} iff so. + */ + public boolean isPerfectAnchoring() { + return hasValidAnchors() && leftAnchorIndex == 0 && rightAnchorIndex == read.getReadLength() - graphMap.kmerSize && + !leftAnchorVertex.hasAmbiguousSequence(); + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java new file mode 100644 index 000000000..d5f62a6a3 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadCost.java @@ -0,0 +1,112 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Comparator; + +/** + * A read paired with its accumulated likelihood (cost). + */ +public class ReadCost { + public final GATKSAMRecord read; + + /** + * Holds the cost value. Private: read it through {@link #getCost()} and update it through {@link #addCost(double)}. + */ + private double cost; + + /** + * Creates a new read cost object provided the read and its initial cost. + * + * @param r the read. + * @param initialCost the initial cost for the read before any read-segment alignment. + * + * @throws NullPointerException if {@code r} is {@code null}. + * @throws IllegalArgumentException if {@code initialCost} is NaN, infinite or greater than 0.
+ */ + public ReadCost(final GATKSAMRecord r, final double initialCost) { + if (r == null) throw new NullPointerException(); + if (Double.isNaN(initialCost) || Double.isInfinite(initialCost) || initialCost > 0) + throw new IllegalArgumentException("initial cost must be a finite 0 or negative value (" + initialCost + ")"); + read = r; + cost = initialCost; + } + + + /** + * Comparator that sorts ReadCosts by read name, with {@code /1} (first of pair) or {@code /2} appended to the name of paired reads. + */ + public static final Comparator COMPARATOR = new Comparator() { + @Override + public int compare(final ReadCost o1, final ReadCost o2) { + final String s1 = o1.read.getReadName() + (o1.read.getReadPairedFlag() ? (o1.read.getFirstOfPairFlag() ? "/1" : "/2") : ""); + final String s2 = o2.read.getReadName() + (o2.read.getReadPairedFlag() ? (o2.read.getFirstOfPairFlag() ? "/1" : "/2") : ""); + return s1.compareTo(s2); + } + }; + + + /** + * Adds to the cost; fails with {@link IllegalArgumentException} if the result would be greater than 0. + * @param value value to add. NOTE(review): a NaN value passes the check and would turn the cost into NaN - confirm callers never pass it. + */ + public void addCost(final double value) { + if (cost + value > 0) + throw new IllegalArgumentException("value brings cost over 0. Current cost " + cost + " value " + value); + cost += value; + } + + /** + * Returns the current cost. + * @return 0 or less. + */ + public double getCost() { + return cost; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java new file mode 100644 index 000000000..2efe42337 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentComparator.java @@ -0,0 +1,86 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc.
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import java.util.Comparator; + +/** + * Sorts read-segment costs ({@link ReadSegmentCost}). + *

+ * Costs are first sorted by their {@code bases} sequence, compared element by element; if one is a prefix of the other the shorter goes first. + *

+ * + *

+ * When these are the same, their unique ids {@link ReadSegmentCost#uniqueId()} are compared to break the tie. + *

+ * + */ +class ReadSegmentComparator implements Comparator { + + public static final Comparator INSTANCE = new ReadSegmentComparator(); + + @Override + public int compare(final ReadSegmentCost o1, final ReadSegmentCost o2) { + int minLength = Math.min(o1.bases.length, o2.bases.length); + for (int i = 0; i < minLength; i++) { + if (o1.bases[i] == o2.bases[i]) + continue; + else if (o1.bases[i] < o2.bases[i]) { + return -1; + } else { + return 1; + } + } + if (o1.bases.length < o2.bases.length) { + return -1; + } else if (o1.bases.length > o2.bases.length) { + return 1; + } else { + return Long.compare(o1.uniqueId(),o2.uniqueId()); + } + + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java new file mode 100644 index 000000000..b5544f1a2 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadSegmentCost.java @@ -0,0 +1,120 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1.
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.MultiSampleEdge; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Route; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * Path cost indicates the cost (alignment likelihood) of traversing a section of the graph using a segment of a read. + * + *

A path can be a whole haplotype path as well as just a smaller haplotype segment

. + * + *

We would generate many of these objects for each read. The final likelihood of a read vs each haplotype + * would be the summation of the path-cost of that read along the corresponding haplotype path.

+ */ +class ReadSegmentCost { + + public Route path; + public GATKSAMRecord read; + + /** + * Holds the cost value. It is private and non-final; read it with getCost() and update it with setCost(double). + */ + private double cost; + + /** + * Caches the path bases (the haplotype segment bases). + */ + protected byte[] bases; + + /** + * Construct a new path cost. + * @param read the corresponding read. + * @param path the corresponding path. NOTE(review): the contract below tests 'route', which is not a parameter of this constructor -- confirm it should read 'path'. + * @param cost initial cost estimate. Might be updated later. + */ + @Requires("route != null") + public ReadSegmentCost(final GATKSAMRecord read, + final Route path, double cost) { + this.read = read; + this.path = path; + setCost(cost); + } + + public double getCost() { + return cost; + } + + public void setCost(final double value) { + cost = value; + } + + /** + * Used to generate unique identifiers for path cost objects. + */ + private static final AtomicLong pathCostUniqueIdGenerator = new AtomicLong(); + + /** + * Holds the path cost unique identifier. + */ + private Long uniqueId; + + /** + * Returns this path-cost unique identifier, drawing it from the shared generator the first time it is requested. + * @return the identifier assigned to this object. + */ + public long uniqueId() { + if (uniqueId == null) + uniqueId = pathCostUniqueIdGenerator.incrementAndGet(); + return uniqueId; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java index ee7565282..8fb7afec7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RefVsAnyResult.java @@ -68,5 +68,13 @@ final class RefVsAnyResult { /** * @return Get the DP (sum of AD values) */ - public int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } + protected int getDP() { return AD_Ref_Any[0] + AD_Ref_Any[1]; } + + /** + * Cap the het and hom var likelihood values by the hom ref likelihood.
+ */ + protected void capByHomRefLikelihood() { + genotypeLikelihoods[1] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[1]); + genotypeLikelihoods[2] = Math.min(genotypeLikelihoods[0], genotypeLikelihoods[2]); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java index 98264d4c2..4ec56f706 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -61,6 +61,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; import org.broadinstitute.variant.vcf.VCFHeaderLine; @@ -81,10 +82,9 @@ import java.util.*; * Time: 12:52 PM */ public class ReferenceConfidenceModel { - public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; - public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - public final static String INDEL_INFORMATIVE_DEPTH = "CD"; + //public final static String INDEL_INFORMATIVE_DEPTH = "CD"; // temporarily taking this extra genotype level information out for now + public final static String ALTERNATE_ALLELE_STRING = "ALT"; // arbitrary alternate allele private final GenomeLocParser genomeLocParser; private final Set samples; @@ -94,6 +94,8 @@ public class ReferenceConfidenceModel { private final static boolean WRITE_DEBUGGING_BAM = 
false; private final SAMFileWriter debuggingWriter; + private final static byte REF_MODEL_DELETION_QUAL = (byte) 30; + /** * Create a new ReferenceConfidenceModel * @@ -124,6 +126,8 @@ public class ReferenceConfidenceModel { } else { debuggingWriter = null; } + + initializeIndelPLCache(); } /** @@ -132,8 +136,9 @@ public class ReferenceConfidenceModel { */ public Set getVCFHeaderLines() { final Set headerLines = new LinkedHashSet<>(); - headerLines.add(new VCFSimpleHeaderLine("ALT", NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); - headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); + // TODO - do we need a new kind of VCF Header subclass for specifying arbitrary alternate alleles? + headerLines.add(new VCFSimpleHeaderLine(ALTERNATE_ALLELE_STRING, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME, "Represents any possible alternative allele at this location")); + //headerLines.add(new VCFFormatHeaderLine(INDEL_INFORMATIVE_DEPTH, 1, VCFHeaderLineType.Integer, "Number of reads at locus that are informative about an indel of size <= " + indelInformativeDepthIndelSize)); return headerLines; } @@ -161,7 +166,7 @@ public class ReferenceConfidenceModel { * @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes * @param variantCalls calls made in this region. 
The return result will contain any variant call in this list in the * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence - * under any position is covers (for snps that 1 bp, but for deletion its the entire ref span) + * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) * @return an ordered list of variant contexts that spans activeRegion.getLoc() and includes both reference confidence * contexts as well as calls from variantCalls if any were provided */ @@ -181,7 +186,7 @@ public class ReferenceConfidenceModel { if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion location size " + activeRegion.getLocation().size() + " are different"); final GenomeLoc refSpan = activeRegion.getLocation(); - final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, refSpan, stratifiedReadMap); + final List refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); final byte[] ref = refHaplotype.getBases(); final List results = new ArrayList<>(refSpan.size()); final String sampleName = stratifiedReadMap.keySet().iterator().next(); @@ -201,9 +206,10 @@ public class ReferenceConfidenceModel { final int refOffset = offset + globalRefOffset; final byte refBase = ref[refOffset]; final RefVsAnyResult homRefCalc = calcGenotypeLikelihoodsOfRefVsAny(pileup, refBase, (byte)6, null); + homRefCalc.capByHomRefLikelihood(); final Allele refAllele = Allele.create(refBase, true); - final List refSiteAlleles = Arrays.asList(refAllele, NON_REF_SYMBOLIC_ALLELE); + final List refSiteAlleles = Arrays.asList(refAllele, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); final VariantContextBuilder vcb = new VariantContextBuilder("HC", curPos.getContig(), 
curPos.getStart(), curPos.getStart(), refSiteAlleles); final GenotypeBuilder gb = new GenotypeBuilder(sampleName, Arrays.asList(refAllele, refAllele)); gb.AD(homRefCalc.AD_Ref_Any); @@ -224,7 +230,7 @@ public class ReferenceConfidenceModel { gb.GQ((int) (-10 * leastConfidenceGLs.getLog10GQ(GenotypeType.HOM_REF))); gb.PL(leastConfidenceGLs.getAsPLs()); - gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); + //gb.attribute(INDEL_INFORMATIVE_DEPTH, nIndelInformativeReads); vcb.genotypes(gb.make()); results.add(vcb.make()); @@ -252,14 +258,21 @@ public class ReferenceConfidenceModel { * @return non-null GenotypeLikelihoods given N */ protected final GenotypeLikelihoods getIndelPLs(final int nInformativeReads) { - // TODO -- optimization -- this could easily be optimized with some caching - final double homRef = 0.0; - final double het = - LOG10_2 * nInformativeReads; - final double homVar = INDEL_ERROR_RATE * nInformativeReads; - return GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + return indelPLCache[nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? 
MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads]; + } + + protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway + private static final GenotypeLikelihoods[] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_INFORMATIVE_READS + 1]; + private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp + + private void initializeIndelPLCache() { + for( int nInformativeReads = 0; nInformativeReads <= MAX_N_INDEL_INFORMATIVE_READS; nInformativeReads++ ) { + final double homRef = 0.0; + final double het = MathUtils.LOG_ONE_HALF * nInformativeReads; + final double homVar = INDEL_ERROR_RATE * nInformativeReads; + indelPLCache[nInformativeReads] = GenotypeLikelihoods.fromLog10Likelihoods(new double[]{homRef, het, homVar}); + } } - private final static double LOG10_2 = Math.log10(2); - private final static double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp /** * Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt @@ -274,8 +287,8 @@ public class ReferenceConfidenceModel { final RefVsAnyResult result = new RefVsAnyResult(); for( final PileupElement p : pileup ) { - final byte qual = p.getQual(); - if( p.isDeletion() || qual > minBaseQual) { + final byte qual = (p.isDeletion() ? 
REF_MODEL_DELETION_QUAL : p.getQual()); + if( p.isDeletion() || qual > minBaseQual ) { int AA = 0; final int AB = 1; int BB = 2; if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) { AA = 2; @@ -283,9 +296,9 @@ public class ReferenceConfidenceModel { if( hqSoftClips != null && p.isNextToSoftClip() ) { hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); } - result.AD_Ref_Any[1]++; + result.AD_Ref_Any[1] += p.getRepresentativeCount(); } else { - result.AD_Ref_Any[0]++; + result.AD_Ref_Any[0] += p.getRepresentativeCount(); } result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); @@ -302,20 +315,37 @@ public class ReferenceConfidenceModel { private List getPileupsOverReference(final Haplotype refHaplotype, final Collection calledHaplotypes, final GenomeLoc paddedReferenceLoc, + final ActiveRegion activeRegion, final GenomeLoc activeRegionSpan, final Map stratifiedReadMap) { - final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); - final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); - writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves - writer.writeReadsAlignedToHaplotypes(calledHaplotypes.isEmpty() ? 
Collections.singleton(refHaplotype) : calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); - final List realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + + if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); + if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); + if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); + if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); + if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); + if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); + if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); + + List realignedReads; + + if( calledHaplotypes.size() == 1 ) { // only contains ref haplotype so an optimization is to just trust the alignments to the reference haplotype as provided by the aligner + realignedReads = activeRegion.getReads(); + } else { + final ReadDestination.ToList realignedReadsDest = new ReadDestination.ToList(header, "FOO"); + final HaplotypeBAMWriter writer = HaplotypeBAMWriter.create(HaplotypeBAMWriter.Type.CALLED_HAPLOTYPES, realignedReadsDest); + writer.setWriteHaplotypesAsWell(false); // don't write out reads for the haplotypes, as we only want the realigned reads themselves + writer.setOnlyRealignInformativeReads(true); + writer.writeReadsAlignedToHaplotypes(calledHaplotypes, paddedReferenceLoc, stratifiedReadMap); + realignedReads = ReadUtils.sortReadsByCoordinate(realignedReadsDest.getReads()); + } if ( debuggingWriter != null ) for ( final GATKSAMRecord read : realignedReads ) debuggingWriter.addAlignment(read); final 
LocusIteratorByState libs = new LocusIteratorByState(realignedReads.iterator(), LocusIteratorByState.NO_DOWNSAMPLING, - false, genomeLocParser, samples, false); + true, genomeLocParser, samples, false); final List pileups = new LinkedList<>(); final int startPos = activeRegionSpan.getStart(); @@ -378,7 +408,7 @@ public class ReferenceConfidenceModel { final byte refBase = refBases[refStart + i]; if ( readBase != refBase ) { sum += readQuals[readStart + i]; - if ( sum > maxSum ) + if ( sum > maxSum ) // abort early return sum; } } @@ -403,7 +433,10 @@ public class ReferenceConfidenceModel { final byte[] refBases, final int refStart, final int maxIndelSize) { - // todo -- fast exit when n bases left < maxIndelSize + // fast exit when n bases left < maxIndelSize + if( readBases.length - readStart < maxIndelSize || refBases.length - refStart < maxIndelSize ) { + return false; + } final int baselineMMSum = sumMismatchingQualities(readBases, readQuals, readStart, refBases, refStart, Integer.MAX_VALUE); @@ -445,12 +478,16 @@ public class ReferenceConfidenceModel { final int offset = p.getOffset(); // doesn't count as evidence - if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() ) + if ( p.isBeforeDeletionStart() || p.isBeforeInsertion() || p.isDeletion() ) continue; // todo -- this code really should handle CIGARs directly instead of relying on the above tests - if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize)) - nInformative++; + if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { + nInformative += p.getRepresentativeCount(); + if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { + return MAX_N_INDEL_INFORMATIVE_READS; + } + } } return nInformative; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index 2b37d90c2..edd8dbb16 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -68,10 +68,10 @@ import java.util.*; @Invariant("!this.isAllowingMultipleEdges()") public class BaseGraph extends DefaultDirectedGraph { protected final static Logger logger = Logger.getLogger(BaseGraph.class); - private final int kmerSize; + protected final int kmerSize; /** - * Construct a DeBruijnGraph with kmerSize + * Construct a TestGraph with kmerSize * @param kmerSize */ public BaseGraph(final int kmerSize, final EdgeFactory edgeFactory) { @@ -95,10 +95,13 @@ public class BaseGraph extends Default */ public boolean isReferenceNode( final V v ) { if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - for( final BaseEdge e : edgesOf(v) ) { - if( e.isRef() ) { return true; } + + for ( final BaseEdge e : edgesOf(v) ) { + if ( e.isRef() ) { return true; } } - return false; + + // edge case: if the graph only has one node then it's a ref node, otherwise it's not + return (vertexSet().size() == 1); } /** @@ -154,62 +157,46 @@ public class BaseGraph extends Default return v.getAdditionalSequence(isSource(v)); } - /** - * @param e the edge to test - * @return true if this edge is a reference source edge - */ - public boolean isRefSource( final E e ) { - if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); } - for( final E edgeToTest : incomingEdgesOf(getEdgeSource(e)) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - /** * @param v the vertex to test * @return true if this vertex is a reference source */ public boolean isRefSource( final V v ) { if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - 
for( final E edgeToTest : incomingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { return false; } + + // confirm that no incoming edges are reference edges + for ( final E edgeToTest : incomingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return false; } } - return true; + + // confirm that there is an outgoing reference edge + for ( final E edgeToTest : outgoingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return true; } + } + + // edge case: if the graph only has one node then it's a ref source, otherwise it's not + return (vertexSet().size() == 1); } /** - * @param e the edge to test - * @return true if this edge is a reference sink edge - */ - public boolean isRefSink( final E e ) { - if( e == null ) { throw new IllegalArgumentException("Attempting to test a null edge."); } - for( final E edgeToTest : outgoingEdgesOf(getEdgeTarget(e)) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - - /** - * // TODO -- the logic of this test is just wrong * @param v the vertex to test * @return true if this vertex is a reference sink */ public boolean isRefSink( final V v ) { if( v == null ) { throw new IllegalArgumentException("Attempting to test a null vertex."); } - for( final E edgeToTest : outgoingEdgesOf(v) ) { - if( edgeToTest.isRef() ) { return false; } - } - return true; - } - /** - * Is this both a refsink node and a reference node - * @param v a non-null vertex - * @return true if v is both a sink and a reference node - */ - public boolean isRefNodeAndRefSink(final V v) { - return isRefSink(v) && isReferenceNode(v); + // confirm that no outgoing edges are reference edges + for ( final E edgeToTest : outgoingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return false; } + } + + // confirm that there is an incoming reference edge + for ( final E edgeToTest : incomingEdgesOf(v) ) { + if ( edgeToTest.isRef() ) { return true; } + } + + // edge case: if the graph only has one node then it's a ref sink, otherwise it's not + return (vertexSet().size() == 
1); } /** @@ -217,7 +204,7 @@ public class BaseGraph extends Default */ public V getReferenceSourceVertex( ) { for( final V v : vertexSet() ) { - if( isReferenceNode(v) && isRefSource(v) ) { + if( isRefSource(v) ) { return v; } } @@ -229,7 +216,7 @@ public class BaseGraph extends Default */ public V getReferenceSinkVertex( ) { for( final V v : vertexSet() ) { - if( isReferenceNode(v) && isRefSink(v) ) { + if( isRefSink(v) ) { return v; } } @@ -472,28 +459,11 @@ public class BaseGraph extends Default } /** - * Prune all edges from this graph that have multiplicity <= pruneFactor and remove all orphaned singleton vertices as well - * - * @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed - */ - public void pruneGraph( final int pruneFactor ) { - final List edgesToRemove = new ArrayList<>(); - for( final E e : edgeSet() ) { - if( e.getPruningMultiplicity() <= pruneFactor && !e.isRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor - edgesToRemove.add(e); - } - } - removeAllEdges(edgesToRemove); - - removeSingletonOrphanVertices(); - } - - /** - * Prune all chains from this graph where all edges in the path have multiplicity <= pruneFactor + * Prune all chains from this graph where any edge in the path has multiplicity < pruneFactor * * @see LowWeightChainPruner for more information * - * @param pruneFactor all edges with multiplicity <= this factor that aren't ref edges will be removed + * @param pruneFactor all edges with multiplicity < this factor that aren't ref edges will be removed */ public void pruneLowWeightChains( final int pruneFactor ) { final LowWeightChainPruner pruner = new LowWeightChainPruner<>(pruneFactor); @@ -503,11 +473,11 @@ public class BaseGraph extends Default /** * Remove all vertices in the graph that have in and out degree of 0 */ - protected void removeSingletonOrphanVertices() { + public void removeSingletonOrphanVertices() { // Run through the graph 
and clean up singular orphaned nodes final List verticesToRemove = new LinkedList<>(); for( final V v : vertexSet() ) { - if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 && !isRefSource(v) ) { verticesToRemove.add(v); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java index 18a3ce1eb..63f087979 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseVertex.java @@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; + import java.util.Arrays; /** @@ -181,7 +183,7 @@ public class BaseVertex { /** * Set additional debugging information for this vertex - * @param info + * @param info the new info value. */ public void setAdditionalInfo(final String info) { if ( info == null ) throw new IllegalArgumentException("info cannot be null"); @@ -192,4 +194,32 @@ public class BaseVertex { * @return the additional information for display about this vertex */ public String additionalInfo() { return additionalInfo; } + + /** + * Checks whether the vertex sequence is ambiguous or not. + * + *

+ * Ambiguity may come about as a result of either: + *

    + *
  • by construction as the generating sequence (read or haplotype) had ambiguous bases
  • + *
  • or because this vertex is the result of merging two or more vertices with some variation upstream + * no more than kmerSize bases away (e.g. by executing {@link HaplotypeGraph#mergeCommonChains})
  • + *
+ *

+ * + * @return {@code true} iff so. + */ + public boolean hasAmbiguousSequence() { + for (final byte base : sequence) + switch (Character.toUpperCase(base)) { + case 'A' : + case 'T' : + case 'G' : + case 'C' : + continue; + default : + return true; + } + return false; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index 4d9441efe..ec2ccff20 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -78,7 +78,7 @@ public class DeBruijnVertex extends BaseVertex { * @return integer >= 1 */ @Ensures("result >= 1") - public int getKmer() { + public int getKmerSize() { return sequence.length; } @@ -100,7 +100,7 @@ public class DeBruijnVertex extends BaseVertex { * @return a byte */ public byte getSuffix() { - return sequence[getKmer() - 1]; + return sequence[getKmerSize() - 1]; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java new file mode 100644 index 000000000..94dc98b6c --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KmerSearchableGraph.java @@ -0,0 +1,74 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; + +/** + * Common interface for those graphs that implement vertex by kmer look-up. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public interface KmerSearchableGraph { + + /** + * Returns the vertex that represents or contains the last base of a given kmer. + * @param k the query kmer. + * + * @throws NullPointerException if {@code k} is {@code null}. + * @return {@code null} if there is no such a kmer in the graph or it is not unique. + */ + V findKmer(Kmer k); + + /** + * The kmer-size of indexed kmers. + * + * @return greater than 0. 
+ */ + int getKmerSize(); + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java index 27b6bd902..520267dee 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPruner.java @@ -96,7 +96,7 @@ public class LowWeightChainPruner { } /** - * Traverse the edges in the path and determine if any are either ref edges or have weight above + * Traverse the edges in the path and determine if any are either ref edges or have weight above or equal to * the pruning factor and should therefore not be pruned away. * * @param path the path in question diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 2e84e1d22..6901d16ef 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -72,17 +72,17 @@ public class Path { private final static Logger logger = Logger.getLogger(Path.class); // the last vertex seen in the path - private final T lastVertex; + protected final T lastVertex; // the list of edges comprising the path private Set edgesAsSet = null; - private final LinkedList edgesInOrder; + protected final ArrayList edgesInOrder; // the scores for the path - private final int totalScore; + protected final int totalScore; // the graph from which this path originated - private final BaseGraph graph; + protected final BaseGraph graph; // used in the bubble state machine to apply Smith-Waterman to the bubble sequence // these values were chosen via optimization 
against the NA12878 knowledge base @@ -99,7 +99,7 @@ public class Path { if ( ! graph.containsVertex(initialVertex) ) throw new IllegalArgumentException("Vertex " + initialVertex + " must be part of graph " + graph); lastVertex = initialVertex; - edgesInOrder = new LinkedList(); + edgesInOrder = new ArrayList<>(0); totalScore = 0; this.graph = graph; } @@ -114,11 +114,29 @@ public class Path { return path; } + /** + * Create a new path with the same field values. + * + * @param p the template path. + * + * @throws NullPointerException if {@code p} is {@code null}. + */ + protected Path(final Path p) { + this.edgesInOrder = p.edgesInOrder; + this.lastVertex = p.lastVertex; + this.edgesAsSet = p.edgesAsSet; + this.totalScore = p.totalScore; + this.graph = p.graph; + } + /** * Create a new Path extending p with edge * - * @param p the path to extend - * @param edge the edge to extend path by + * @param p the path to extend. + * @param edge the edge to extend path with. + * + * @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code p}'s graph, or {@code edge} does not have as a source the last vertex in {@code p}. */ public Path(final Path p, final E edge) { if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); @@ -128,11 +146,43 @@ public class Path { graph = p.graph; lastVertex = p.graph.getEdgeTarget(edge); - edgesInOrder = new LinkedList(p.getEdges()); + edgesInOrder = new ArrayList<>(p.length() + 1); + edgesInOrder.addAll(p.edgesInOrder); edgesInOrder.add(edge); totalScore = p.totalScore + edge.getMultiplicity(); } + /** + * Length of the path in edges. + * + * @return {@code 0} or greater. + */ + public int length() { + return edgesInOrder.size(); + } + + /** + * Prepend a path with an edge. + * + * @param edge the extending edge. + * @param p the original path. 
+ * + * @throws IllegalArgumentException if {@code p} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code p}'s graph, or {@code edge} does not have as a target the first vertex in {@code p}. + */ + public Path(final E edge, final Path p) { + if ( p == null ) throw new IllegalArgumentException("Path cannot be null"); + if ( edge == null ) throw new IllegalArgumentException("Edge cannot be null"); + if ( ! p.graph.containsEdge(edge) ) throw new IllegalArgumentException("Graph must contain edge " + edge + " but it doesn't"); + if ( ! p.graph.getEdgeTarget(edge).equals(p.getFirstVertex())) { throw new IllegalStateException("Edges added to path must be contiguous."); } + graph = p.graph; + lastVertex = p.lastVertex; + edgesInOrder = new ArrayList<>(p.length() + 1); + edgesInOrder.add(edge); + edgesInOrder.addAll(p.getEdges()); + totalScore = p.totalScore + edge.getMultiplicity(); + } + /** * Get the collection of edges leaving the last vertex of this path * @return a non-null collection @@ -168,6 +218,27 @@ public class Path { return getVertices().contains(v); } + /** + * Checks whether a given path is a suffix of this path. + * + * @param other the path to compare against. + * @throws IllegalArgumentException if other is null, or the come from + * different graphs. + * @return true if other is a suffix of this path. 
+ */ + public boolean isSuffix(final Path other) { + if ( other == null ) throw new IllegalArgumentException("path cannot be null"); + if (other.getGraph() != this.getGraph()) throw new IllegalArgumentException("the other path most belong to the same path"); + if (!lastVertex.equals(other.lastVertex)) + return false; + final ListIterator myIt = edgesInOrder.listIterator(edgesInOrder.size()); + final ListIterator otherIt = other.edgesInOrder.listIterator(other.edgesInOrder.size()); + while (myIt.hasPrevious() && otherIt.hasPrevious()) + if (otherIt.previous() != myIt.previous()) + return false; + return !otherIt.hasPrevious(); + } + /** * Check that two paths have the same edges and total score * @param path the other path we might be the same as @@ -182,13 +253,13 @@ public class Path { final StringBuilder b = new StringBuilder("Path{score=" + totalScore + ", path="); boolean first = true; for ( final T v : getVertices() ) { - if ( first ) { + if ( first ) first = false; - } else { + else b.append(" -> "); - } b.append(v.getSequenceString()); } + b.append('}'); return b.toString(); } @@ -249,7 +320,11 @@ public class Path { * @return a non-null vertex */ public T getFirstVertex() { - return getGraph().getEdgeSource(edgesInOrder.pollFirst()); + if (edgesInOrder.size() == 0) { + return lastVertex; + } else { + return getGraph().getEdgeSource(edgesInOrder.get(0)); + } } /** @@ -260,7 +335,7 @@ public class Path { public byte[] getBases() { if( getEdges().isEmpty() ) { return graph.getAdditionalSequence(lastVertex); } - byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.getFirst())); + byte[] bases = graph.getAdditionalSequence(graph.getEdgeSource(edgesInOrder.get(0))); for( final E e : edgesInOrder ) { bases = ArrayUtils.addAll(bases, graph.getAdditionalSequence(graph.getEdgeTarget(e))); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java new file mode 100644 index 000000000..1cf986c00 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Route.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + + +import java.util.List; +import java.util.ListIterator; + +/** + * Represents a route or path through a graph. + *

+ * In contrast with a {@link Path}, a route keeps track of the + * path taken at furcations in order to speed up some path comparisons like the + * one implemented by {@link #isSuffix}. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class Route extends Path { + + protected final Route previousRouteWithLastVertexThatIsForkOrJoin; + protected final boolean lastVertexIsForkOrJoin; + + /** + * Create a zero length route with a start in a particular vertex: + * + * @param initialVertex the first vertex of the route. + * @param graph the new route's graph. + * + * @throws IllegalArgumentException if {@code initialVertex} or {@code graph} are {@code null}. + * or if {@code initialVertex} does not belong to {@code graph}. + */ + public Route(final V initialVertex, final BaseGraph graph) { + super(initialVertex, graph); + previousRouteWithLastVertexThatIsForkOrJoin = null; + lastVertexIsForkOrJoin = graph.inDegreeOf(initialVertex) > 1; + } + + @Override + public boolean equals(final Object other) { + if (other == null) return false; + if (other == this) return true; + if (! (other instanceof Route)) return false; + @SuppressWarnings("unchecked") + final Route otherRoute = (Route) other; + return otherRoute.length() == this.length() && isSuffix(otherRoute); + } + + /** + * Extends a route into a new instance. + * + * @param prefix the route to extend. + * @param nextVertex the vertex to extend the route to. + * + * @throws IllegalArgumentException if {@code prefix} is {@code null} or {@code nextVertex} is {@code null} + * or {@code nextVertex} does not belong to {@code prefix}'s graph or there is no edge that in the graph + * that would connect {@code prefix}'s last vertex with {@code nextVertex} directly. + */ + public Route(final Route prefix, final V nextVertex) { + this(prefix,resolveSuffixEdge(prefix,nextVertex)); + } + + + /** + * Extends a route into a new instance. + * + * @param prevVertex the vertex to extend the route to. + * @param suffix the route to extend. 
+ * + * @throws IllegalArgumentException if {@code suffix} is {@code null} or {@code prevVertex} is {@code null} + * or {@code prevVertex} does not belong to {@code suffix}'s graph or there is no edge that in the graph + * that would connect {@code suffix}'s first vertex with {@code prevVertex} directly. + */ + public Route(final V prevVertex, final Route suffix) { + this(resolvePrefixEdge(prevVertex, suffix),suffix); + } + + /** + * Resolves the prefix edge as required by {@link Route(V,Route)}. + */ + private static E resolvePrefixEdge(final V prevVertex, final Route suffix) { + if (prevVertex == null) throw new NullPointerException(); + if (!suffix.getGraph().containsVertex(prevVertex)) throw new IllegalArgumentException(); + final E result = suffix.getGraph().getEdge(prevVertex,suffix.getFirstVertex()); + if (result == null) + throw new IllegalArgumentException("there is no such edge in the graph"); + return result; + } + + /** + * Resolves the suffix edge as required by {@link Route(Route,V)} + */ + private static E resolveSuffixEdge(final Route prefix, final V nextVertex) { + if (nextVertex == null) throw new IllegalArgumentException(); + if (!prefix.getGraph().containsVertex(nextVertex)) throw new IllegalArgumentException(); + final E result = prefix.getGraph().getEdge(prefix.getLastVertex(),nextVertex); + if (result == null) + throw new IllegalArgumentException("there is no such edge in the graph"); + return result; + } + + /** + * Extends a route by prefixing an edge. + * + * @param initialEdge the extending edge. + * @param suffix the original path. + * + * @throws IllegalArgumentException if {@code suffix} or {@code initialEdge} are {@code null}, or {@code initialEdge} is + * not part of {@code suffix}'s graph, or {@code initialEdge} does not have as a target the first vertex in {@code suffix}. 
+ */ + public Route(final E initialEdge, final Route suffix) { + super(initialEdge,suffix); + final V firstVertex = getFirstVertex(); + if(suffix.length() == 0) { + lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin || graph.outDegreeOf(firstVertex) > 1; + previousRouteWithLastVertexThatIsForkOrJoin = graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; + } else { + lastVertexIsForkOrJoin = suffix.lastVertexIsForkOrJoin; + if (suffix.previousRouteWithLastVertexThatIsForkOrJoin != null) + previousRouteWithLastVertexThatIsForkOrJoin = new Route<>(initialEdge,suffix.previousRouteWithLastVertexThatIsForkOrJoin); + else + previousRouteWithLastVertexThatIsForkOrJoin = graph.outDegreeOf(firstVertex) > 1 ? + new Route<>(new Route<>(firstVertex,graph),edgesInOrder.get(0)) : + graph.inDegreeOf(firstVertex) > 1 ? new Route<>(firstVertex,graph) : null; + } + } + + /** + * Create copy of an existing route. + * @param route the route to copy + * + * @throws NullPointerException if {@code route} is {@code null}. + */ + protected Route(final Route route) { + super(route); + lastVertexIsForkOrJoin = route.lastVertexIsForkOrJoin; + previousRouteWithLastVertexThatIsForkOrJoin = route.previousRouteWithLastVertexThatIsForkOrJoin; + } + + /** + * Create a new Route extending another one with an edge + * + * @param route the route to extend. + * @param edge the edge to extend the route with. + * + * @throws IllegalArgumentException if {@code route} or {@code edge} are {@code null}, or {@code edge} is + * not part of {@code route}'s graph, or {@code edge} does not have as a source the last vertex in {@code route}. + */ + public Route(final Route route, final E edge) { + super(route, edge); + lastVertexIsForkOrJoin = graph.outDegreeOf(route.lastVertex) > 1 || graph.inDegreeOf(lastVertex) > 1; + previousRouteWithLastVertexThatIsForkOrJoin = route.lastVertexIsForkOrJoin ? 
route : route.previousRouteWithLastVertexThatIsForkOrJoin; + } + + @Override + public boolean isSuffix(final Path other) { + if (other == this) + return true; + else if (other == null) + throw new IllegalArgumentException("other path must not be null"); + else if (getGraph() != other.getGraph()) + throw new IllegalArgumentException("other path must be part of the same graph"); + else if (other instanceof Route) + return isRouteSuffix((Route)other); + else + return super.isSuffix(other); + } + + @Override + public String toString() { + return super.toString().replace("Path{", "Route{"); + } + + /** + * Faster version when comparing with a route. + */ + protected boolean isRouteSuffix(final Route other) { + if (other.getGraph() != this.getGraph()) + throw new IllegalArgumentException("you cannot compare routes on different graphs"); + else if (lastVertex != other.lastVertex) // obvious case. + return false; + else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null + && other.previousRouteWithLastVertexThatIsForkOrJoin != null) // I am shorter or different path for sure. + return false; + else if (this.edgesInOrder.size() < other.edgesInOrder.size()) // I am shorter regardless of path, no way Jose! 
+ return false; + else if (this.previousRouteWithLastVertexThatIsForkOrJoin == null || other.previousRouteWithLastVertexThatIsForkOrJoin == null) { + final ListIterator myEdges = edgesInOrder.listIterator(edgesInOrder.size()); + final ListIterator otherEdges = other.edgesInOrder.listIterator(other.edgesInOrder.size()); + while (otherEdges.hasPrevious()) + if (myEdges.previous() != otherEdges.previous()) + return false; + return true; + } else + return (other.previousRouteWithLastVertexThatIsForkOrJoin == this.previousRouteWithLastVertexThatIsForkOrJoin) + || (previousRouteWithLastVertexThatIsForkOrJoin.lastVertex == other.previousRouteWithLastVertexThatIsForkOrJoin.lastVertex + && previousRouteWithLastVertexThatIsForkOrJoin.isRouteSuffix(other.previousRouteWithLastVertexThatIsForkOrJoin)); + } + + /** + * Checks whether the last vertex in the route is a fork or a joining vertex. + * @return {@code true} iff so. + */ + public boolean lastVertexIsForkOrJoin() { + return lastVertexIsForkOrJoin; + } + + /** + * Returns the longest prefix route that has as a last vertex a join or furcation vertex. + * + * @return the prefix route, or {@code null} when this route has no such prefix. + */ + public Route getPrefixRouteWithLastVertexThatIsForkOrJoin() { + return previousRouteWithLastVertexThatIsForkOrJoin; + } + + + + /** + * Splice out the first few vertices of the route. + * + * @param length how many vertices to splice out + * @return a new route without those spliced vertices. + * + * @throws IllegalArgumentException if {@code length} is equal to the route's length or greater or if it is negative. + * Notice that routes without any vertex are not legal routes.
+ */ + public Route splicePrefix(final int length) { + if (length == 0) + return this; + if (length >= length()) + throw new IllegalArgumentException("prefix slicing to long"); + if (length < 0) + throw new IllegalArgumentException("prefix cannot be negative"); + + final List resultEdges = getEdges().subList(length,length()); + Route result = new Route<>(graph.getEdgeSource(resultEdges.get(0)),this); + for (final E edge : resultEdges) + result = new Route<>(result,edge); + return result; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java similarity index 52% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java index a13618dae..ac6837d33 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblyGraphUnitTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteFinder.java @@ -1,124 +1,196 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. 
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KmerSequence; +import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; +import java.util.Stack; /** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 2/8/13 + * A collection of route building methods. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ +public class RouteFinder { -public class DeBruijnAssemblyGraphUnitTest { - private class GetReferenceBytesTestProvider extends BaseTest.TestDataProvider { - public byte[] refSequence; - public byte[] altSequence; - public int KMER_LENGTH; - public GetReferenceBytesTestProvider(String ref, String alt, int kmer) { - super(GetReferenceBytesTestProvider.class, String.format("Testing reference bytes. 
kmer = %d, ref = %s, alt = %s", kmer, ref, alt)); - refSequence = ref.getBytes(); - altSequence = alt.getBytes(); - KMER_LENGTH = kmer; - } - - public byte[] expectedReferenceBytes() { - return refSequence; - } - - public byte[] calculatedReferenceBytes() { - DeBruijnGraph graph = new DeBruijnGraph(); - graph.addSequenceToGraph(refSequence, KMER_LENGTH, true); - if( altSequence.length > 0 ) { - graph.addSequenceToGraph(altSequence, KMER_LENGTH, false); + /** + * Completes a path backwards in the graph that would explain the sequence of bytes ending in the vertex provided. + * + * @param graph the graph to build the path upon. + * @param sequence contains the sequence to backtrack. + * @param start inclusive start position of the sequence to backtrack. + * @param end exclusive end position of the sequence to backtrack. + * @param vertex final vertex of the resulting path. + * @return {@code null} if there is no such path, otherwise a path such that vertex is the last vertex of it + * and its sequence is sequence[start to end] + v.getSuffix(); + */ + private static Route extendRouteBackwards(final BaseGraph graph, + final byte[] sequence, + final int start, + final int end, + final V vertex) { + final Route emptyPath = new Route<>(vertex,graph); + if (end <= start) // trivial case. + return emptyPath; + final int kmerSize = graph.getKmerSize(); + final Stack,Integer>> stack = new Stack<>(); + stack.ensureCapacity(end - start + 1); + stack.push(new Pair<>(emptyPath,end)); + while (!stack.isEmpty()) { + final Pair,Integer> next = stack.pop(); + final Route nextRoute = next.getFirst(); + final int nextEnd = next.getSecond(); + if (nextEnd <= start) { + return nextRoute.splicePrefix(kmerSize - 1); // gotcha!!! 
+ } + final V nextFirstVertex = nextRoute.getFirstVertex(); + if (graph.isSource(nextFirstVertex)) { + final byte[] fullFirstVertexSequence = nextFirstVertex.getSequence(); + if (nextEnd - start != fullFirstVertexSequence.length - 1) { + continue; // you need to have the right length to accept a source vertex. + } + boolean mismatchFound = false; + for (int i = 0; i < fullFirstVertexSequence.length - 1; i++) { + if (fullFirstVertexSequence[i] != sequence[i + start]) { + mismatchFound = true; + break; + } + } + if (!mismatchFound) + return nextRoute; + } else { + final Integer newNextEnd = nextEnd - 1; + for (final E edge : graph.incomingEdgesOf(nextFirstVertex)) { + final V prevVertex = graph.getEdgeSource(edge); + final byte[] prevSequence = prevVertex.getSequence(); + final byte prevByte = prevSequence[prevSequence.length - 1]; + if (prevByte == sequence[newNextEnd]) { + stack.push(new Pair<>(new Route<>(edge,nextRoute),newNextEnd)); + } + } } - return graph.getReferenceBytes(graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex(), true, true); } + return null; } - @DataProvider(name = "GetReferenceBytesTestProvider") - public Object[][] GetReferenceBytesTests() { - new GetReferenceBytesTestProvider("GGTTAACC", "", 3); - new GetReferenceBytesTestProvider("GGTTAACC", "", 4); - new GetReferenceBytesTestProvider("GGTTAACC", "", 5); - new GetReferenceBytesTestProvider("GGTTAACC", "", 6); - new GetReferenceBytesTestProvider("GGTTAACC", "", 7); - new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "", 6); - new 
GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 66); - new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "", 76); + /** + * Completes a path forward in the graph that would explain the sequence if bytes starting by the prefix provided. + * + * @param sequence missing sequence we want to + * @param start inclusive first position in {@code sequence} that starts the extension + * @param end exclusive position after the last of bases to be added to the extension. + * @param prefix the seed prefix of the path. + * @return {@code null} if there is not such path, otherwise a path such that vertex is the last vertex of it + * and its sequence is prefix.getBases() + sequence[start to end]; + */ + private static Route extendRouteForwards( + final BaseGraph graph, final byte[] sequence, final int start, final int end, + final Route prefix) { + if (end <= start) // trivial case. 
+ return prefix; - new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 3); - new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 4); - new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 5); - new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 6); - new GetReferenceBytesTestProvider("GGTTAACC", "GGTTAACC", 7); - new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6); - new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66); - new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 
"AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76); - - new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 3); - new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 4); - new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 5); - new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 6); - new GetReferenceBytesTestProvider("GGTTAACC", "AAAAAAAAAAAAA", 7); - new GetReferenceBytesTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", "AAAAAAAAAAAAA", 6); - new GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 66); - new 
GetReferenceBytesTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", 76); - - return GetReferenceBytesTestProvider.getTests(GetReferenceBytesTestProvider.class); + final Stack,Integer>> stack = new Stack<>(); + stack.ensureCapacity(end - start + 1); + stack.push(new Pair<>(prefix,start)); + while (!stack.isEmpty()) { + final Pair,Integer> next = stack.pop(); + final Route nextRoute = next.getFirst(); + final int nextStart = next.getSecond(); + if (end <= nextStart) + return nextRoute; // gotcha!!! + final V lastVertex = nextRoute.getLastVertex(); + final Integer newNextStart = nextStart + 1; + for (final E edge : graph.outgoingEdgesOf(lastVertex)) { + final V nextVertex = graph.getEdgeTarget(edge); + final byte[] nextSequence = nextVertex.getSequence(); + final byte nextByte = nextSequence[nextSequence.length - 1]; + if (nextByte == sequence[nextStart]) { + stack.push(new Pair<>(new Route<>(nextRoute,edge),newNextStart)); + } + } + } + return null; } - @Test(dataProvider = "GetReferenceBytesTestProvider", enabled = true) - public void testGetReferenceBytes(GetReferenceBytesTestProvider cfg) { - Assert.assertEquals(cfg.calculatedReferenceBytes(), cfg.expectedReferenceBytes(), "Reference sequences do not match"); + /** + * Construct a new route object give a sequence using unique kmer mappings. 
+ * + * @param sequence base sequence. + * @return {@code null} if there is no such route on the graph or the start kmer is not unique. + */ + @SuppressWarnings("unchecked") + public static Route findRoute(final BaseGraph graph, + final byte[] sequence) { + if (graph == null) + throw new NullPointerException(); + if (!(graph instanceof KmerSearchableGraph)) + throw new IllegalArgumentException("the input graph must implement " + KmerSearchableGraph.class.getName()); + + final int kmerSize = graph.getKmerSize(); + final KmerSequence haplotypeKmers = new KmerSequence(sequence,kmerSize); + + if (haplotypeKmers.kmerSize() != graph.getKmerSize()) + throw new IllegalArgumentException("incompatible kmer sizes " + graph.getKmerSize() + " != " + haplotypeKmers.kmerSize()); + + V vertex = null; + int i; + for (i = 0; i < haplotypeKmers.size(); i++) + if ((vertex = ((KmerSearchableGraph)graph).findKmer(haplotypeKmers.get(i))) != null) + break; + if (vertex == null) + return null; + if (!graph.containsVertex(vertex)) + throw new IllegalStateException("vertex does not belong to graph."); + Route result = i == 0 ? 
new Route<>(vertex,graph) : + extendRouteBackwards(graph, sequence, 0, i + kmerSize - 1, vertex); + if (result == null) + return null; + result = extendRouteForwards(graph, sequence, i + kmerSize, sequence.length, result); + return result; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java index 36c515073..c8c6abb86 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraph.java @@ -62,7 +62,7 @@ import java.util.Set; * @author: depristo * @since 03/2013 */ -public final class SeqGraph extends BaseGraph { +public class SeqGraph extends BaseGraph { /** * Edge factory that creates non-reference multiplicity 1 edges */ @@ -89,13 +89,6 @@ public final class SeqGraph extends BaseGraph { */ private final static int MAX_REASONABLE_SIMPLIFICATION_CYCLES = 100; - /** - * Construct an empty SeqGraph - */ - public SeqGraph() { - this(11); - } - /** * Construct an empty SeqGraph where we'll add nodes based on a kmer size of kmer * @@ -294,10 +287,8 @@ public final class SeqGraph extends BaseGraph { // create the combined vertex, and add it to the graph // TODO -- performance problem -- can be optimized if we want - final List seqs = new LinkedList(); - for ( SeqVertex v : linearChain ) seqs.add(v.getSequence()); - final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{})); - final SeqVertex addedVertex = new SeqVertex( seqsCat ); + + final SeqVertex addedVertex = mergeLinearChainVertices(linearChain); addVertex(addedVertex); final Set inEdges = incomingEdgesOf(first); @@ -315,6 +306,13 @@ public final class SeqGraph extends BaseGraph { return true; } + protected SeqVertex mergeLinearChainVertices(final List vertices) { + final List seqs = new 
LinkedList(); + for ( SeqVertex v : vertices ) seqs.add(v.getSequence()); + final byte[] seqsCat = org.broadinstitute.sting.utils.Utils.concat(seqs.toArray(new byte[][]{})); + return new SeqVertex( seqsCat ); + } + /** * Get the sum of the edge weights on a linear chain of at least 2 elements * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java index 083747db4..ee1ece3f3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqVertex.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.Utils; + import java.util.Arrays; import java.util.concurrent.atomic.AtomicInteger; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java index 205d0027a..284062749 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitter.java @@ -189,7 +189,7 @@ public class SharedVertexSequenceSplitter { * Must be called before calling updateGraph */ public void split() { - splitGraph = new SeqGraph(); + splitGraph = new SeqGraph(outer.getKmerSize()); newMiddles = new LinkedList(); edgesToRemove = new LinkedList(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java 
similarity index 89% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java index 0200ce4a2..8c79a5efe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/TestGraph.java @@ -49,17 +49,16 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import org.jgrapht.EdgeFactory; -import java.util.Arrays; import java.util.HashMap; import java.util.Map; /** - * A DeBruijn kmer graph + * A Test kmer graph * * User: rpoplin * Date: 2/6/13 */ -public final class DeBruijnGraph extends BaseGraph { +public final class TestGraph extends BaseGraph { /** * Edge factory that creates non-reference multiplicity 1 edges */ @@ -71,33 +70,20 @@ public final class DeBruijnGraph extends BaseGraph { } /** - * Create an empty DeBruijnGraph with default kmer size + * Create an empty TestGraph with default kmer size */ - public DeBruijnGraph() { + public TestGraph() { this(11); } /** - * Create an empty DeBruijnGraph with kmer size + * Create an empty TestGraph with kmer size * @param kmerSize kmer size, must be >= 1 */ - public DeBruijnGraph(int kmerSize) { + public TestGraph(int kmerSize) { super(kmerSize, new MyEdgeFactory()); } - /** - * Pull kmers out of the given long sequence and throw them on in the graph - * @param sequence byte array holding the sequence with which to build the assembly graph - * @param KMER_LENGTH the desired kmer length to use - * @param isRef if true the kmers added to the graph will have reference edges linking them - */ - public void addSequenceToGraph( final byte[] sequence, final int KMER_LENGTH, final boolean isRef ) { - if( sequence.length < KMER_LENGTH + 1 ) { throw new 
IllegalArgumentException("Provided sequence is too small for the given kmer length"); } - final int kmersInSequence = sequence.length - KMER_LENGTH + 1; - for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { - addKmersToGraph(Arrays.copyOfRange(sequence, iii, iii + KMER_LENGTH), Arrays.copyOfRange(sequence, iii + 1, iii + 1 + KMER_LENGTH), isRef, 1); - } - } /** * Add edge to assembly graph connecting the two kmers @@ -129,7 +115,7 @@ public final class DeBruijnGraph extends BaseGraph { @Ensures({"result != null"}) public SeqGraph convertToSequenceGraph() { final SeqGraph seqGraph = new SeqGraph(getKmerSize()); - final Map vertexMap = new HashMap(); + final Map vertexMap = new HashMap<>(); // create all of the equivalent seq graph vertices for ( final DeBruijnVertex dv : vertexSet() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java new file mode 100644 index 000000000..2b58fcbf0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/VertexOrder.java @@ -0,0 +1,62 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** +* Indicates the topological order between two SeqGraph vertices. {@code inverse()} swaps {@code BEFORE} and {@code AFTER} and returns every other value unchanged. +*/ +public enum VertexOrder { + BEFORE, AFTER, SAME, PARALLEL; + + public VertexOrder inverse() { + switch (this) { + case BEFORE: return AFTER; + case AFTER: return BEFORE; + default: return this; + } + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java new file mode 100644 index 000000000..55ff2f978 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java @@ -0,0 +1,1016 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeRoute; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.SequenceComplexity; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.CountSet; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * + * Threading graph subclass used to "re-thread" haplotypes instead of reads. + * + * Created with IntelliJ IDEA. + * User: valentin + * Date: 8/23/13 + * Time: 2:42 PM + * To change this template use File | Settings | File Templates. + */ +public class HaplotypeGraph extends ReadThreadingGraph { + + /** + * Maximum repeat unit length considered when looking for repeats that should not be considered as + * possible read anchor places along the reference path. 
+ */ + protected static final int DEFAULT_MAX_REPEAT_UNIT_LENGTH = 4; + + /** + * Minimum repeat length, in repeat units, to consider a region a repeat that should not be considered as possible read anchor + * places along the reference path. + */ + protected static final int DEFAULT_MIN_REPEAT_LENGTH_IN_UNITS = 6; + + /** + * Reference haplotype + */ + private Haplotype referenceHaplotype; + + /** + * Reference haplotype bases + */ + private byte[] referenceBases; + + /** + * Set of haplotypes in the graph. + */ + private Set haplotypes; + + /** + * Route of the reference haplotype across the graph; {@code null} if there is none. + */ + private HaplotypeRoute referenceRoute; + + /** + * Set of vertices along the reference route. + */ + private Set referenceVertices; + + /** + * Holds haplotype routes by haplotype. + */ + private Map haplotypeRouteByHaplotype; + + /** + * Holds, for each vertex, the haplotype routes that pass through it. + */ + private Map> haplotypesByVertex; + + /** + * Reference to the logger for this class. + */ + private static final Logger logger = Logger.getLogger(HaplotypeGraph.class); + + /** + * What is the maximum STR unit length. + */ + private int maxRepeatUnitLength = DEFAULT_MAX_REPEAT_UNIT_LENGTH; + + /** + * What is the minimum length in units for a STR. + */ + private int minRepeatLengthInUnits = DEFAULT_MIN_REPEAT_LENGTH_IN_UNITS; + + + /** + * Indicates that the haplotype data structures need to be updated before they are queried. + */ + private boolean needToUpdateHaplotypeStructures = true; + private Set anchorableVertices; + + /** + * Constructs a haplotype graph from a describing string. + * + *

Used for testing

+ * @param string the string representation of the haplotype graph. + */ + public HaplotypeGraph(final String string) { + super(string); + haplotypes = new LinkedHashSet<>(10); + referenceVertices = Collections.emptySet(); + } + + /** + * Constructs a new haplotype graph given its kmerSize. + * + * @param kmerSize 1 or greater, the targeted kmerSize + * + * @throws IllegalArgumentException if {@code kmerSize} is 0 or negative. + */ + public HaplotypeGraph(final int kmerSize) { + super(kmerSize); + haplotypes = new LinkedHashSet<>(10); + referenceVertices = Collections.emptySet(); + } + + + /** + * Set of vertices along the reference haplotype path. + * + * @return never {@code null} but perhaps empty. + */ + public Set getReferenceVertices() { + updateHaplotypeStructures(); + return referenceVertices; + } + + /** + * Returns the haplotype route given a haplotype, building and caching it when it is not cached yet. + * @param haplotype query haplotype + * @throws NullPointerException if {@code haplotype} is {@code null}. + * @throws IllegalArgumentException if {@code haplotype} is not a supported haplotype in the graph. + * @return never {@code null}. + */ + public HaplotypeRoute getHaplotypeRoute(final Haplotype haplotype) { + updateHaplotypeStructures(); + if (!haplotypes.contains(haplotype)) + throw new IllegalArgumentException("input haplotype must be part of the haplotype graph haplotype set"); + HaplotypeRoute result = haplotypeRouteByHaplotype.get(haplotype); + if (result == null) + haplotypeRouteByHaplotype.put(haplotype,result = buildHaplotypeRoute(haplotype)); + return result; + } + + /** + * Creates a haplotype route from the route that {@code RouteFinder.findRoute} returns for the haplotype bases. + * @param haplotype the target haplotype + * @return {@code null} if there is no such route in the graph. + */ + private HaplotypeRoute buildHaplotypeRoute(final Haplotype haplotype) { + final Route route = RouteFinder.findRoute(this,haplotype.getBases()); + if (route == null) + return null; + else + return new HaplotypeRoute(route); + } + + /** + * Bases along the reference path. 
+ * + * @return {@code null} if there is no reference. + */ + @SuppressWarnings("unused") + public byte[] getReferenceBases() { + updateHaplotypeStructures(); + return referenceBases; + } + + /** + * Returns the reference haplotype. + * @return {@code null} if there is no such reference. + */ + public Haplotype getReferenceHaplotype() { + updateHaplotypeStructures(); + return referenceHaplotype; + } + + + + /** + * Construct a haplotype graph given the haplotype list and the elected kmerSize. + * The reference haplotype is added first as the reference sequence; non-reference haplotypes shorter than + * {@code kmerSize} are not added to the graph and a user warning is issued for each of them. + * + * @param haplotypes whose path to add to the graph. + * @param kmerSize the kmerSize used to compose the graph. + * @throws IllegalArgumentException if {@code haplotypes} contains no reference haplotype. + */ + public HaplotypeGraph(final int kmerSize, final List haplotypes) { + super(kmerSize); + referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); + this.haplotypes = new LinkedHashSet<>(haplotypes); + addSequence("anonymous", referenceHaplotype.getBases(), null, true); + for (final Haplotype h : haplotypes) { + if (h.isReference()) + continue; + if (h.length() < kmerSize) { + Utils.warnUser(logger, "haplotype shorter than kmerSize " + h.length() + " < " + kmerSize + " will be dropped"); + } else + addSequence("anonymous", h.getBases(), null, false); + + } + buildGraphIfNecessary(); + } + + /** + * Returns the reference haplotype within the input collection. + * + * @param haplotypes the query haplotype set. + * @throws IllegalArgumentException if there is no reference haplotype. + * @throws NullPointerException if {@code haplotypes} is {@code null} or contains some {@code null} value. + * @return never {@code null}, the first haplotype in {@code haplotypes} that is reference. + */ + private Haplotype findReferenceHaplotypeOrFail(final List haplotypes) { + for (final Haplotype h : haplotypes) + if (h.isReference()) + return h; + throw new IllegalArgumentException("no reference haplotype present"); + } + + /** + * Constructs a new haplotype graph given a template read-threading graph and set of haplotypes + * + * @param template the template read-threading graph. 
+ * @param haplotypes the haplotype set to consider; it must contain a reference haplotype. + * @throws IllegalArgumentException if {@code haplotypes} contains no reference haplotype. + */ + public HaplotypeGraph(final ReadThreadingGraph template, final List haplotypes) { + this(template.getKmerSize()); + referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); + this.haplotypes = new HashSet<>(haplotypes); + template.buildGraphIfNecessary(); + uniqueKmers = new HashMap<>(); + nonUniqueKmers = new HashSet<>(); + // Copy vertices over. + addVertices(template.vertexSet()); + // Copy edges over, carrying the reference flag and multiplicity of each template edge (the flag must be read from the template edge, not from the freshly created one). + for (final MultiSampleEdge edge : template.edgeSet()) { + final MultiSampleEdge newEdge = addEdge(template.getEdgeSource(edge), template.getEdgeTarget(edge)); + newEdge.setIsRef(edge.isRef()); + newEdge.setMultiplicity(edge.getMultiplicity()); + } + // Copy kmer lookup tables: + uniqueKmers.putAll(template.uniqueKmers); + nonUniqueKmers.addAll(template.nonUniqueKmers); + alreadyBuilt = true; + } + + /** + * Update the haplotype data structures based on current edges and vertices. + */ + private void updateHaplotypeStructures() { + if (!needToUpdateHaplotypeStructures) + return; + needToUpdateHaplotypeStructures = false; + haplotypeRouteByHaplotype = new LinkedHashMap<>(haplotypes.size()); + final Iterator haplotypeIterator = haplotypes.iterator(); + final Set nonFoundHaplotypes = new HashSet<>(haplotypes.size()); + while (haplotypeIterator.hasNext()) { + final Haplotype haplotype = haplotypeIterator.next(); + final HaplotypeRoute haplotypeRoute = buildHaplotypeRoute(haplotype); + if (haplotypeRoute == null) { + haplotypeIterator.remove(); + nonFoundHaplotypes.add(haplotype); + if (haplotype.isReference()) { + referenceHaplotype = null; + referenceRoute = null; + referenceVertices = Collections.emptySet(); + referenceBases = null; + } + } else { + if (haplotype.isReference()) { + referenceHaplotype = haplotype; + referenceRoute = haplotypeRoute; + referenceVertices = haplotypeRoute.vertexSet(); + referenceBases = haplotypeRoute.getBases(); + } + haplotypeRouteByHaplotype.put(haplotype, 
haplotypeRoute); + } + } + haplotypesByVertex = buildHaplotypesByVertex(); + anchorableVertices = calculateAnchorableVertexSet(); + // Only report dropped haplotypes when there actually are some. + if (!nonFoundHaplotypes.isEmpty()) + logger.debug("some haplotypes do not have a path across the haplotype graph " + nonFoundHaplotypes.size()); + } + + /** + * Builds a map from each vertex to the set of haplotype routes that pass through it. + */ + private Map> buildHaplotypesByVertex() { + final Map> result = new HashMap<>(referenceVertices.size()); + final Set allHaplotypeRoutes = new LinkedHashSet<>(haplotypeRouteByHaplotype.values()); + for (final HaplotypeRoute haplotypeRoute : allHaplotypeRoutes) { + final Set singleton = Collections.singleton(haplotypeRoute); + for (final MultiDeBruijnVertex vertex : haplotypeRoute.vertexSet()) + if (!result.containsKey(vertex)) + result.put(vertex, singleton); + else { + final Set currentHrs = result.get(vertex); + if (currentHrs.size() == haplotypes.size() - 1) + result.put(vertex, allHaplotypeRoutes); + else if (currentHrs.size() == 1) { + final Set newHrs = new LinkedHashSet<>(allHaplotypeRoutes.size()); + newHrs.addAll(currentHrs); + newHrs.add(haplotypeRoute); + result.put(vertex, newHrs); + } else + currentHrs.add(haplotypeRoute); + } + } + return result; + } + + + /** + * Debug convenience method to print a graph into a file in the .dot format. + * @param fileName name of the output file. + * @throws NullPointerException if {@code fileName} is {@code null}. + */ + public void printGraph(final String fileName) { + super.printGraph(new File(fileName), 10000); + } + + + + + @Override + public void printGraph(final PrintStream graphWriter, final boolean writeHeader, final int pruneFactor) { + if ( writeHeader ) + graphWriter.println("digraph assemblyGraphs {"); + + + for( final MultiSampleEdge edge : edgeSet() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() > 0 && edge.getMultiplicity() <= pruneFactor ? 
"style=dotted,color=grey," : "") + "label=\"" + edge.getDotLabel() + "\"];"); + if( edge.isRef() ) { + graphWriter.println("\t" + getEdgeSource(edge).toString() + " -> " + getEdgeTarget(edge).toString() + " [color=red];"); + } + } + + for( final MultiDeBruijnVertex v : vertexSet() ) + graphWriter.println("\t" + v.toString() + " [label=\"" + v.getId() + ":" + new String(getAdditionalSequence(v)) + v.additionalInfo() + "\",shape=box]"); + + if ( writeHeader ) + graphWriter.println("}"); + } + + + @Override + public Pair findStart(final SequenceForKmers seqForKmers) { + return getOrCreateKmerVertex(seqForKmers.sequence, 0, true); + } + + /** + * Checks whether the graph has some sources or sink vertices that are not reference vertices. + * + * @return {@code true} iff so. + */ + public boolean hasNonReferenceEnds() { + for (final MultiDeBruijnVertex end : getSources()) + if (!isReferenceNode(end)) return true; + for (final MultiDeBruijnVertex end : getSinks()) + if (!isReferenceNode(end)) return true; + return false; + } + + /** + * Merges vertices that share exactly the same set of outgoing vertices. + *

+ * This is done in reverse topological order and, since the graph is a DAG, it ensures that the resulting graph + * admits no further merges of this kind. I.e. there is no need to run this method more than once. + *

+ * Notice that we keep a record of the distinct unique kmers of the merged vertices, which now all map to the same + * merged vertex. Thus if vertices {@code X} and {@code Y} are merged then {@code findKmer(X.sequence) == findKmer(Y.sequence)}. + *

+ * Examples: + *

    + *
  • + * {@code AAA -> AAC, CAA -> AAC} would become {@code NAA -> AAC}. + *
  • + * {@code AAA -> AAC, AAA -> AAG, CAA -> AAC, CAA -> AAG} would become {@code NAA -> AAC, NAA -> AAG}. + *
  • + * {@code AAA -> AAC, AAA -> AAG, CAA -> AAC} would not change as {@code AAA} and {@code CAA} + * do not share {@code AAG} as outgoing vertex. + *
  • + *
  • + * {@code AAA -> AAC, AAC -> ACA, CAA -> AAC, GAC -> ACA } would become {@code NAA -> NAC, NAC -> ACA}. + *
  • + *
+ */ + public void mergeCommonChains() { + final int vertexCount = vertexSet().size(); + final Set refVertices = new HashSet<>(vertexCount); + final Map indexByVertex = new HashMap<>(vertexCount); + final int[] pendingChildren = new int[vertexCount]; + final Deque readyVertices = new LinkedList<>(); + final Set merged = new HashSet<>(1 + vertexCount / 10 ); + + // Initialize traversal data structures. + mergeCommonChainsInitialize(refVertices, indexByVertex, pendingChildren, readyVertices); + + // Traversal in inverted topological order where children nodes are processed before their parents. + while (!readyVertices.isEmpty()) { + final MultiDeBruijnVertex currentVertex = readyVertices.remove(); + if (merged.contains(currentVertex)) continue; + + final Set mergeSet = new HashSet<>(2); + MultiDeBruijnVertex refVertex = mergeCommonChainsComposeMergeSet(refVertices, currentVertex, mergeSet); + mergeVertices(refVertex,mergeSet,indexByVertex,pendingChildren,readyVertices); + merged.addAll(mergeSet); + } + needToUpdateHaplotypeStructures = true; + } + + /** + * Given a seed vertex, determines the merging set of nodes that will be collapsed into one. + * + * Candidates are the parents of the seed's children (or just the seed when it has no children). A candidate other + * than the seed is kept only if it agrees with the seed on being a source, has the same sequence string (sources) or the + * same suffix (non-sources), and has exactly the same children as the seed. + * + * @param refVertices reference path vertices + * @param currentVertex current vertex. + * @param mergeSet where to store the final merging set. + * @return the reference node if present that needs to be preserved as such. It might be {@code null} + */ + private MultiDeBruijnVertex mergeCommonChainsComposeMergeSet(final Set refVertices, + final MultiDeBruijnVertex currentVertex, + final Set mergeSet) { + final boolean currentIsSource = isSource(currentVertex); + final Set children = outgoingVerticesOf(currentVertex); + if (children.size() == 0) + mergeSet.add(currentVertex); + else + for (final MultiDeBruijnVertex child : children) + mergeSet.addAll(incomingVerticesOf(child)); + + MultiDeBruijnVertex refVertex = refVertices.contains(currentVertex) ? 
currentVertex : null; + final Iterator candidatesIt = mergeSet.iterator(); + while (candidatesIt.hasNext()) { + final MultiDeBruijnVertex candidate = candidatesIt.next(); + if (candidate == currentVertex) continue; + if (isSource(candidate) != currentIsSource) { + candidatesIt.remove(); + continue; + } + if (currentIsSource && !candidate.getSequenceString().equals(currentVertex.getSequenceString())) { + candidatesIt.remove(); + continue; + } + if (!currentIsSource && candidate.getSuffix() != currentVertex.getSuffix()) { + candidatesIt.remove(); + continue; + } + final Set candidateChildren = outgoingVerticesOf(candidate); + if (candidateChildren.size() != children.size()) + candidatesIt.remove(); + else { + boolean removed = false; + for (final MultiDeBruijnVertex candidateChild : candidateChildren) + if (!children.contains(candidateChild)) { + candidatesIt.remove(); + removed = true; + break; + } + if (refVertex == null && !removed && refVertices.contains(candidate)) refVertex = candidate; + } + } + return refVertex; + } + + /** + * Initialize data-structures for {@link #mergeCommonChains} + * + * @param refVertices will contain reference path vertices. + * @param indexByVertex map vertex -> index in {@code pendingChildren}. + * @param pendingChildren number of children of a node that have not yet been processed. + * @param readyVertices vertices that are ready to be processed (all children have been processed). + */ + private void mergeCommonChainsInitialize(final Set refVertices, + final Map indexByVertex, + final int[] pendingChildren, + final Deque readyVertices) { + int nextIndex = 0; + for (final MultiDeBruijnVertex v : vertexSet()) { + indexByVertex.put(v,nextIndex++); + if (isReferenceNode(v)) refVertices.add(v); + } + + for (final Map.Entry entry : indexByVertex.entrySet()) + if ((pendingChildren[entry.getValue()] = outDegreeOf(entry.getKey())) == 0) + readyVertices.add(entry.getKey()); + } + + /** + * Performs the actual merge: collapses all {@code vertices} into a single vertex that is kept in the graph. + * + * The kept vertex is {@code refVertex} when it is not {@code null}, otherwise the first vertex returned by the + * iterator of {@code vertices}. Positions where a merged vertex differs from the kept vertex are set to {@code 'N'} + * in the array returned by the kept vertex's {@code getSequence()}. Incoming edges of the removed vertices are + * re-attached to the kept vertex, adding up multiplicities when the parent already has an edge to it, and the + * unique kmers of all merged vertices are remapped to the kept vertex. Each parent's pending-children count is + * decremented and the parent is queued in {@code ready} once that count reaches zero. + * + * @param refVertex vertex to keep, or {@code null} to keep an arbitrary member of {@code vertices}. + * @param vertices vertices to merge; must contain the vertex to keep. + * @param indexByVertex map vertex -> index in {@code pendingChildrenCounts}. + * @param pendingChildrenCounts number of children of each vertex that have not yet been processed. + * @param ready queue of vertices that are ready to be processed. + * @throws IllegalArgumentException if {@code vertices} is empty, does not contain the vertex to keep, or holds + * vertices whose sequences differ in length. + */ 
+    private void mergeVertices(final MultiDeBruijnVertex refVertex, final Collection<MultiDeBruijnVertex> vertices, final Map<MultiDeBruijnVertex,Integer> indexByVertex, final int[] pendingChildrenCounts, final Deque<MultiDeBruijnVertex> ready) {
+        if (vertices.size() == 0)
+            throw new IllegalArgumentException();
+        // The reference vertex, when there is one, is the survivor; otherwise any member of the set will do.
+        final MultiDeBruijnVertex vertexToKeep = refVertex == null ? vertices.iterator().next() : refVertex;
+        final byte[] sequence = vertexToKeep.getSequence();
+        final Set<Kmer> uniqueKmersToUpdate = new HashSet<>(vertices.size());
+        final Set<MultiDeBruijnVertex> parentVertices = new HashSet<>(inDegreeOf(vertexToKeep) * 2);
+        parentVertices.addAll(incomingVerticesOf(vertexToKeep));
+        // The kept vertex counts as a processed child for each of its parents.
+        for (final MultiDeBruijnVertex p : parentVertices)
+            if (--pendingChildrenCounts[indexByVertex.get(p)] == 0)
+                ready.add(p);
+
+        final Kmer mergedKmer = new Kmer(sequence);
+        if (uniqueKmers.containsKey(mergedKmer)) {
+            // Clone the bases: 'sequence' may be overwritten with 'N' below.
+            uniqueKmersToUpdate.add(new Kmer(mergedKmer.bases().clone()));
+            uniqueKmers.remove(mergedKmer);
+        }
+        boolean foundMergedVertex = false;
+        for (final MultiDeBruijnVertex v : vertices)
+            if (v == vertexToKeep)
+                foundMergedVertex = true;
+            else {
+                final byte[] seq = v.getSequence();
+                final Kmer kmer = new Kmer(seq);
+                if (uniqueKmers.containsKey(kmer)) {
+                    uniqueKmersToUpdate.add(kmer);
+                    uniqueKmers.remove(kmer);
+                }
+                if (sequence.length != seq.length) throw new IllegalArgumentException("mismatched sizes " + sequence.length + " != "
+                        + seq.length + " " + new String(sequence) + " " + new String(seq));
+                // Positions where the merged vertices disagree become 'N'.
+                for (int i = sequence.length - 1; i >= 0; i--) {
+
+                    if (sequence[i] != seq[i]) sequence[i] = 'N';
+                }
+                for (final MultiDeBruijnVertex p : incomingVerticesOf(v)) {
+                    if (--pendingChildrenCounts[indexByVertex.get(p)] == 0)
+                        ready.add(p);
+                    if (!parentVertices.contains(p)) {
+                        // New parent: re-route its edge to the kept vertex.
+                        parentVertices.add(p);
+                        final MultiSampleEdge e = getEdge(p,v);
+                        addEdge(p,vertexToKeep,new MultiSampleEdge(e.isRef(),e.getMultiplicity(),1));
+                    } else {
+                        // Parent already linked to the kept vertex: accumulate the multiplicity.
+                        getEdge(p,vertexToKeep).incMultiplicity(getEdge(p,v).getMultiplicity());
+                    }
+                }
+                removeVertex(v);
+            }
+        if (!foundMergedVertex)
+            throw new IllegalArgumentException("merged vertex must be contained in the input set");
+        // All unique kmers of the merged vertices now resolve to the kept vertex.
+        for (final Kmer kmer : uniqueKmersToUpdate)
+            uniqueKmers.put(kmer,vertexToKeep);
+    }
+
+    /**
+     * Returns a read-only view of the unique kmer to vertex map.
+     * @return never {@code null}.
+     */
+    public Map<Kmer,MultiDeBruijnVertex> uniqueKmerMap() {
+        return Collections.unmodifiableMap(uniqueKmers);
+    }
+
+    @Override
+    public boolean equals(Object other) {
+        return (other instanceof HaplotypeGraph) && equals((HaplotypeGraph)other);
+    }
+
+
+    /**
+     * Simple debug representation of the haplotype graph.
+     * @return never {@code null}
+     */
+    @Override
+    public String toString() {
+        return getClass().getSimpleName() + "[ks=" + kmerSize + "](vs=" + vertexSet().size() + "," + edgeSet().size() + "){...}";
+    }
+
+    /**
+     * Returns set of valid haplotypes.
+     * @return never {@code null} but perhaps empty.
+     */
+    public Set<Haplotype> getHaplotypes() {
+        updateHaplotypeStructures();
+        return haplotypes;
+    }
+
+    /**
+     * Returns a map between valid haplotypes and corresponding routes in the graph.
+     * @return never {@code null} but perhaps empty.
+     */
+    public Map<Haplotype,HaplotypeRoute> getHaplotypeRouteMap() {
+        updateHaplotypeStructures();
+        return haplotypeRouteByHaplotype;
+    }
+
+    /**
+     * Returns set of haplotype routes that enclose a vertex.
+     * @param vertex the query vertex.
+     * @return never {@code null} but perhaps empty set.
+     */
+    public Set<HaplotypeRoute> getEnclosingHaplotypeRoutes(final MultiDeBruijnVertex vertex) {
+        updateHaplotypeStructures();
+        if (haplotypesByVertex == null)
+            return Collections.emptySet();
+        final Set<HaplotypeRoute> result = haplotypesByVertex.get(vertex);
+        if (result == null)
+            return Collections.emptySet();
+        else
+            return result;
+    }
+
+    /**
+     * Returns the reference route
+     *
+     * @return {@code null} if there is no valid reference haplotype.
+     */
+    public HaplotypeRoute getReferenceRoute() {
+        updateHaplotypeStructures();
+        return referenceRoute;
+    }
+
+    /***********************************************
+     * deep equals implementation, used in testing. *
+     ***********************************************/
+
+    /**
+     * Compares two haplotype threading graphs and determines whether they have the same structure.
+     * <p>
+     * This method goes a long way to figure out the equality or inequality of both graphs. However there
+     * are "pathological" cases in which it might fail to see a difference. This is due to the fact that there
+     * is no guarantee of the uniqueness of sequences at source vertices.
+     * <p>
+     * If there is more than one source vertex with the same sequence it tries to match source vertices between both
+     * graphs matching all possible paths emanating from every pair of sources.
+     * <p>
+     * Note: in practice this is only used for testing purposes
+     *
+     * @param other the other graph to compare against.
+     * @return {@code true} if no structural difference was found, {@code false} otherwise
+     *         (including when {@code other} is {@code null}).
+     */
+    public boolean equals(HaplotypeGraph other) {
+        updateHaplotypeStructures();
+        if (other == null) return false;
+        if (other == this) return true;
+
+        if (!equals$ReferencePaths(this, other)) return false;
+        final Map<String,MultiDeBruijnVertex> thisSourcesBySequence = equalsBuildSourceBySequenceMap(this);
+        final Map<String,MultiDeBruijnVertex> otherSourcesBySequence = equalsBuildSourceBySequenceMap(other);
+        if (thisSourcesBySequence.size() != otherSourcesBySequence.size()) return false;
+        final List<MultiDeBruijnVertex> unmatchedLeft = new LinkedList<>();
+        final List<MultiDeBruijnVertex> unmatchedRight = new LinkedList<>();
+        final List<Pair<MultiDeBruijnVertex,MultiDeBruijnVertex>> sourcePairs = equals$matchVertexBySequenceMaps(thisSourcesBySequence,otherSourcesBySequence,unmatchedLeft,unmatchedRight);
+        if (unmatchedLeft.size() > 0 || unmatchedRight.size() > 0) return false;
+
+
+        final Deque<Pair<MultiDeBruijnVertex,MultiDeBruijnVertex>> pending = new LinkedList<>(sourcePairs);
+        final Set<MultiDeBruijnVertex> visited = new HashSet<>(vertexSet().size());
+        while (!pending.isEmpty()) {
+            final Pair<MultiDeBruijnVertex,MultiDeBruijnVertex> pair = pending.removeFirst();
+            final MultiDeBruijnVertex leftVertex = pair.getFirst();
+            final MultiDeBruijnVertex rightVertex = pair.getSecond();
+            final List<Pair<MultiDeBruijnVertex,MultiDeBruijnVertex>> childrenPairs = equals$matchVertexBySequenceMaps(equalsBuildChildrenBySuffixMap(this, leftVertex),
+                    equalsBuildChildrenBySuffixMap(other, rightVertex), unmatchedLeft, unmatchedRight);
+            if (unmatchedLeft.size() > 0 || unmatchedRight.size() > 0) return false;
+            for (final Pair<MultiDeBruijnVertex,MultiDeBruijnVertex> childPair : childrenPairs) {
+                final MultiDeBruijnVertex leftChild = childPair.getFirst();
+                final MultiDeBruijnVertex rightChild = childPair.getSecond();
+                // Set.add returns true when the element was NOT present yet, hence the negation:
+                // without it every newly discovered pair was skipped and the comparison stopped at
+                // the children of the source vertices.
+                final boolean leftVisited = !visited.add(leftChild);
+                final boolean rightVisited = !visited.add(rightChild);
+                if (leftVisited != rightVisited) return false; // visited before in different matchings.
+                if (leftVisited) continue;
+                pending.add(childPair);
+            }
+        }
+        return true;
+    }
+
+    // Note: in practice only use by equals(HaplotypeGraph) for testing purposes.
+    // Walks both reference paths in lock-step; they must spell the same bases and have the same length.
+    private boolean equals$ReferencePaths(final HaplotypeGraph g1, final HaplotypeGraph g2) {
+        MultiDeBruijnVertex refVertex1 = g1.getReferenceSourceVertex();
+        MultiDeBruijnVertex refVertex2 = g2.getReferenceSourceVertex();
+        if (refVertex1 == null && refVertex2 == null)
+            return true;
+        if (refVertex1 == null || refVertex2 == null)
+            return false;
+
+        if (!refVertex1.getSequenceString().equals(refVertex2.getSequenceString()))
+            return false;
+
+        while (refVertex1 != null && refVertex2 != null) {
+            if (refVertex1.getSuffix() != refVertex2.getSuffix()) return false;
+            refVertex1 = g1.getNextReferenceVertex(refVertex1);
+            refVertex2 = g2.getNextReferenceVertex(refVertex2);
+
+        }
+        // At least one is null here; equal only if both paths ended at the same time.
+        return refVertex1 == refVertex2;
+    }
+
+    // Note: in practice only use by equals(HaplotypeGraph) for testing purposes.
+    // Indexes the children of a vertex by their suffix base (as a one-character string).
+    private static Map<String,MultiDeBruijnVertex> equalsBuildChildrenBySuffixMap(final HaplotypeGraph graph,
+                                                                                  final MultiDeBruijnVertex vertex) {
+        final Map<String,MultiDeBruijnVertex> result = new HashMap<>();
+        for (final MultiDeBruijnVertex child : graph.outgoingVerticesOf(vertex))
+            result.put(new String(new byte[]{child.getSuffix()}), child);
+        return result;
+    }
+
+    // Note: in practice only use by equals(HaplotypeGraph) for testing purposes.
+    // Pairs up the values of both maps that share a key; values whose key is missing on the
+    // other side are appended to unmatchedLeft / unmatchedRight respectively.
+    private static List<Pair<MultiDeBruijnVertex,MultiDeBruijnVertex>> equals$matchVertexBySequenceMaps(
+            final Map<String,MultiDeBruijnVertex> left, final Map<String,MultiDeBruijnVertex> right,
+            final Collection<MultiDeBruijnVertex> unmatchedLeft, final Collection<MultiDeBruijnVertex> unmatchedRight) {
+        final List<Pair<MultiDeBruijnVertex,MultiDeBruijnVertex>> result = new LinkedList<>();
+        for (final Map.Entry<String,MultiDeBruijnVertex> leftEntry : left.entrySet())
+            if (right.containsKey(leftEntry.getKey()))
+                result.add(new Pair<>(leftEntry.getValue(),right.get(leftEntry.getKey())));
+            else
+                unmatchedLeft.add(leftEntry.getValue());
+        for (final Map.Entry<String,MultiDeBruijnVertex> rightEntry : right.entrySet())
+            if (!left.containsKey(rightEntry.getKey()))
+                unmatchedRight.add(rightEntry.getValue());
+        return result;
+    }
+
+    // Note: in practice only use by equals(HaplotypeGraph) for testing purposes.
+    // Indexes the source vertices of a graph by sequence. Sources sharing the same sequence are
+    // re-keyed as "<extended sequence>-<rank>", ranked by their extended sequence.
+    private static Map<String,MultiDeBruijnVertex> equalsBuildSourceBySequenceMap(final HaplotypeGraph other) {
+
+        final Set<MultiDeBruijnVertex> sources = other.getSources();
+        final Map<String,MultiDeBruijnVertex> result = new HashMap<>(sources.size());
+        final Map<String,List<MultiDeBruijnVertex>> collisions = new HashMap<>(sources.size());
+        for (final MultiDeBruijnVertex v : sources) {
+            final String sequence = v.getSequenceString();
+            final List<MultiDeBruijnVertex> collisionList = collisions.get(sequence);
+            if (collisionList != null) {
+                collisionList.add(v);
+            } else if (result.containsKey(sequence)) { // we need to handle collision due to lack of uniqueness.
+                // Seed the list with the vertex that got there first; it used to be dropped from
+                // the result altogether once the collision was resolved.
+                final List<MultiDeBruijnVertex> newCollisionList = new LinkedList<>();
+                newCollisionList.add(result.get(sequence));
+                newCollisionList.add(v);
+                collisions.put(sequence,newCollisionList);
+            } else {
+                result.put(sequence,v);
+            }
+        }
+        if (collisions.size() == 0)
+            return result;
+        // Iterate over the entries without removing from 'collisions': removing while iterating
+        // its key set throws ConcurrentModificationException when several sequences collide.
+        for (final Map.Entry<String,List<MultiDeBruijnVertex>> collision : collisions.entrySet()) {
+            result.remove(collision.getKey());
+            int number = 0;
+            final List<Pair<MultiDeBruijnVertex,String>> extendedSequences = new LinkedList<>();
+            for (final MultiDeBruijnVertex vertice : collision.getValue())
+                extendedSequences.add(new Pair<>(vertice, equalsCollisionResolverExtendedSequence(other, vertice)));
+            Collections.sort(extendedSequences,new Comparator<Pair<MultiDeBruijnVertex,String>>(){
+                @Override
+                public int compare(final Pair<MultiDeBruijnVertex,String> p1, final Pair<MultiDeBruijnVertex,String> p2) {
+                    return p1.getSecond().compareTo(p2.getSecond());
+                }
+            });
+            for (final Pair<MultiDeBruijnVertex,String> p : extendedSequences)
+                result.put(p.getSecond() + '-' + (number++),p.getFirst());
+        }
+        return result;
+
+    }
+
+    // Note: in practice only use by equals(HaplotypeGraph) for testing purposes.
+    // Builds a string signature for a source vertex by walking the graph depth-first from it,
+    // visiting parents and children in suffix order, so that sources with the same sequence
+    // can be told apart and ranked.
+    private static String equalsCollisionResolverExtendedSequence(final HaplotypeGraph graph, final MultiDeBruijnVertex source) {
+        final StringBuilder buffer = new StringBuilder(1000);
+        final Set<MultiDeBruijnVertex> visited = new HashSet<>(graph.vertexSet().size());
+        // 'pending' and 'position' are parallel stacks: every push/pop must happen on both.
+        final Stack<MultiDeBruijnVertex> pending = new Stack<>();
+        final Stack<Integer> position = new Stack<>();
+        position.ensureCapacity(graph.vertexSet().size());
+        pending.ensureCapacity(graph.vertexSet().size());
+        pending.add(source);
+        position.add(0);
+        final Comparator<MultiDeBruijnVertex> bySuffix = new Comparator<MultiDeBruijnVertex>() {
+            @Override
+            public int compare(final MultiDeBruijnVertex o1, final MultiDeBruijnVertex o2) {
+                return Byte.compare(o1.getSuffix(),o2.getSuffix());
+            }
+        };
+        int lastPos = -1;
+        while (!pending.isEmpty()) {
+            final MultiDeBruijnVertex next = pending.pop();
+            // Pop the position before the visited check; skipping it left the two stacks out of step.
+            final int pos = position.pop();
+            if (!visited.add(next)) continue;
+            final CharSequence sequence;
+            if (graph.isSource(next)) {
+                if (next == source) {
+                    sequence = new String(next.getSequence());
+                } else {
+                    // Other sources reached by the walk are written reversed and terminated with '$'.
+                    sequence = new StringBuffer(next.getSequence().length).append(new String(next.getSequence())).reverse().append('$');
+                }
+            } else {
+                sequence = new String(new byte[] { next.getSuffix()});
+            }
+
+            // Mark a jump whenever this vertex does not directly continue the previous output.
+            if (pos != lastPos + 1) {
+                buffer.append('[').append(Math.abs(pos)).append(']');
+            }
+            buffer.append(sequence);
+            lastPos = pos + sequence.length() - 1;
+
+            final List<MultiDeBruijnVertex> parents = new LinkedList<>(graph.incomingVerticesOf(next));
+            Collections.sort(parents,bySuffix);
+            for (final MultiDeBruijnVertex parent : parents) {
+                pending.push(parent);
+                position.push(lastPos + 1);
+            }
+
+            // Children come from the outgoing side and are pushed in sorted order. The sorted
+            // list used to be built from the incoming vertices and was never used.
+            final List<MultiDeBruijnVertex> children = new LinkedList<>(graph.outgoingVerticesOf(next));
+            Collections.sort(children,bySuffix);
+            for (final MultiDeBruijnVertex child : children) {
+                pending.push(child);
+                position.push(lastPos + 1);
+            }
+        }
+
+        return buffer.toString();
+    }
+
+
+    /**
+     * Calculates the subset of reference path vertices that are amenable to be anchoring vertices.
+     *

+ *

+ * For a vertex to be anchorable: + *

    + *
  • Should not include bases from a repeat
  • , + *
  • It should not be in the middle of an event block
  • + *
+ *

+     *
+     * @return never {@code null}.
+     */
+    private Set<MultiDeBruijnVertex> calculateAnchorableVertexSet() {
+        updateHaplotypeStructures();
+        if (referenceBases == null)
+            return Collections.emptySet();
+
+        // We first check what bases in the reference path bases are part of a repeat.
+        final boolean[] nonAnchorableDueToRepeats = SequenceComplexity.findBasesInShortUnitRepeats(
+                referenceBases, maxRepeatUnitLength, minRepeatLengthInUnits);
+
+        final Set<MultiDeBruijnVertex> result = new HashSet<>(100);
+        final Map<MultiDeBruijnVertex,CountSet> expectedRejoins = new HashMap<>();
+
+
+        MultiDeBruijnVertex currentVertex = getReferenceRoute().getFirstVertex();
+        final int sourceSequenceLength = currentVertex.getSequence().length;
+
+        // Determine whether the reference source vertex is anchorable discarding repeats:
+        boolean sourceIsAnchorable = true;
+        for (int i = 0; i < sourceSequenceLength; i++)
+            if (nonAnchorableDueToRepeats[i]) {
+                sourceIsAnchorable = false;
+                break;
+            }
+
+        // Update the nonAnchorableDueToRepeats array accordingly: from here on 'index' is the position
+        // of the last base of the current vertex, so the source verdict is stored at its last base.
+        int index = sourceSequenceLength - 1;
+        nonAnchorableDueToRepeats[index] = !sourceIsAnchorable;
+
+
+        // We keep record on all alternative path lengths:
+        final CountSet pathLengths = new CountSet(haplotypes.size());
+        pathLengths.setTo(0);
+
+        // Now we go through the reference path and determine which vertices are not part of event block.
+        // We keep track of open divergent paths in expectedRejoins. Thus only those vertices traversed
+        // when expectedRejoins is empty can be anchorable:
+        while (currentVertex != null) {
+            // A vertex with several parents is where divergent paths rejoin the reference.
+            if (inDegreeOf(currentVertex) > 1)
+                expectedRejoins.remove(currentVertex);
+            if (expectedRejoins.isEmpty() && !nonAnchorableDueToRepeats[index]) {
+                currentVertex.setAdditionalInfo(currentVertex.additionalInfo() + "*");
+                result.add(currentVertex);
+            }
+            final Set<MultiSampleEdge> nextEdges = outgoingEdgesOf(currentVertex);
+            MultiDeBruijnVertex nextReferenceVertex = null;
+            for (final MultiSampleEdge e : nextEdges) {
+                final MultiDeBruijnVertex nextVertex = getEdgeTarget(e);
+                if (e.isRef() && referenceVertices.contains(nextVertex))
+                    nextReferenceVertex = nextVertex;
+                else
+                    // Non-reference branch: record where it is expected to rejoin the reference path.
+                    calculateRejoins(nextVertex, expectedRejoins, referenceVertices, pathLengths, false, false);
+            }
+            currentVertex = nextReferenceVertex;
+            index++;
+        }
+        return result;
+    }
+
+
+
+    /**
+     * Returns those vertices that can be used as anchors along the reference route.
+     * @return never {@code null} but perhaps empty if there is no such a vertex.
+     */
+    public Set<MultiDeBruijnVertex> getAnchorableVertices() {
+        updateHaplotypeStructures();
+        return anchorableVertices;
+    }
+
+    /**
+     * Finds non-reference wandering paths that will rejoin the reference path from a particular node.
+     *

+ *

+ * It only considers those paths that rejoin within the anchor points of a read. + *

+ *

+ *

+ * Rather than explicitly reporting the sequence of path vertices, this method reports the lengths of the paths + * found. These are dumped into {@code expectedRejoins} where the keys are the reference path vertices where paths rejoin + * and the values are the sets of path lengths. + *

+ *

+ *

The path lengths are calculated as the length from the startVertex plus the prefix sizes {@code prefixSizes}

+ *

+ *

You can also ask the method to exhaustively find all paths ({@code exhaustive == true}) or just consider + * intermediate nodes once ({@code exhaustive == false}). In the latter case only the shortest paths are considered.

+ *

+ *

Finally you also can check on paths backwards ({@code backwards == true}) or forwards ({@code backwards == false})

+ * + * @param startVertex the origin node for those paths. + * @param expectedRejoins map where to place the found paths in a form of the rejoining non-reference vertex (key) and + * set of path lengths (value). + * @param referenceWithinBoundaries reference vertices found between read anchors. The key are the vertices, the values are + * the kmer's offset in the read. + * @param prefixSizes prefix path sizes to be added to the rejoin path sizes. + * @param exhaustive whether all paths should be considered or we only care about find out the rejoining vertices. + * @param backwards whether we want to find backward paths (inverse edge traversal). + * + * Note: it is marked as deprecated as this method signature may change in the future. It is public just because + * is currently shared by several other classes, however it would not be surprising if + * it gets refactored out at some point. So use with care. + */ + @Deprecated + public void calculateRejoins(final MultiDeBruijnVertex startVertex, final Map expectedRejoins, + final Set referenceWithinBoundaries, final CountSet prefixSizes, + final boolean exhaustive, final boolean backwards) { + Queue queue = new LinkedList<>(); + Queue depths = new LinkedList<>(); + queue.add(startVertex); + depths.add(prefixSizes); + + final Set visited = new HashSet<>(); + if (!exhaustive) visited.add(startVertex); + while (!queue.isEmpty()) { + final CountSet depth = depths.remove(); + final MultiDeBruijnVertex v = queue.remove(); + if (referenceVertices.contains(v)) { + if (referenceWithinBoundaries.contains(v)) { + final CountSet previous = expectedRejoins.get(v); + if (previous == null) + expectedRejoins.put(v, depth.clone()); + else + previous.addAll(depth); + } + } else { + final CountSet depthPlusOne = depth.clone(); + depthPlusOne.incAll(1); + final Set nextEdges = backwards ? incomingEdgesOf(v) : outgoingEdgesOf(v); + for (final MultiSampleEdge e : nextEdges) { + final MultiDeBruijnVertex w = backwards ? 
getEdgeSource(e) : getEdgeTarget(e); + if (visited.contains(w)) // avoid repetitive work. + continue; + if (!exhaustive) visited.add(w); + queue.add(w); + depths.add(depthPlusOne); + } + } + } + } + +} + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java index 5752583c7..ef8c74d0b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/MultiDeBruijnVertex.java @@ -64,7 +64,7 @@ import java.util.concurrent.atomic.AtomicInteger; * Date: 4/17/13 * Time: 3:20 PM */ -final class MultiDeBruijnVertex extends DeBruijnVertex { +public final class MultiDeBruijnVertex extends DeBruijnVertex { private final static boolean KEEP_TRACK_OF_READS = false; // Note that using an AtomicInteger is critical to allow multi-threaded HaplotypeCaller @@ -116,6 +116,10 @@ final class MultiDeBruijnVertex extends DeBruijnVertex { @Override public String additionalInfo() { - return KEEP_TRACK_OF_READS ? (! reads.contains("ref") ? "__" + Utils.join(",", reads) : "") : ""; + return super.additionalInfo() + (KEEP_TRACK_OF_READS ? (! reads.contains("ref") ? 
"__" + Utils.join(",", reads) : "") : ""); + } + + int getId() { + return id; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index d575f14a5..f33a4883f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -181,7 +181,7 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { return null; } - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.0.raw_readthreading_graph.dot")); + printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot")); // go through and prune all of the chains where all edges have <= pruneFactor. This must occur // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering @@ -190,25 +190,28 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if // we can recover them by merging some N bases from the chain back into the reference - if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(); + if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor); // remove all heading and trailing paths if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); - printDebugGraphTransform(rtgraph, new File("sequenceGraph.0.1.cleaned_readthreading_graph.dot")); + printDebugGraphTransform(rtgraph, new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." 
+ kmerSize + ".0.1.cleaned_readthreading_graph.dot")); final SeqGraph initialSeqGraph = rtgraph.convertToSequenceGraph(); + if (debugGraphTransformations) initialSeqGraph.printGraph(new File("" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"),10000); // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph if ( justReturnRawGraph ) return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph); - if ( debug ) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); - printDebugGraphTransform(initialSeqGraph, new File("sequenceGraph.0.2.initial_seqgraph.dot")); + if (debug) logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); + printDebugGraphTransform(initialSeqGraph, new File( "" + refHaplotype.getGenomeLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot")); initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? 
AssemblyResult.Status.FAILED : cleaned.getStatus(); - return new AssemblyResult(status, cleaned.getGraph()); + final AssemblyResult result = new AssemblyResult(status, cleaned.getGraph()); + result.setThreadingGraph(rtgraph); + return result; } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index 7d7df2c06..dc057294e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -58,14 +58,18 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; import org.jgrapht.EdgeFactory; import org.jgrapht.alg.CycleDetector; import java.io.File; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ReadThreadingGraph extends BaseGraph implements KmerSearchableGraph { -public class ReadThreadingGraph extends BaseGraph { /** * Edge factory that encapsulates the numPruningSamples assembly parameter */ @@ -93,6 +97,9 @@ public class ReadThreadingGraph extends BaseGraph nonUniqueKmers; + protected Set nonUniqueKmers; /** * A map from kmers -> their corresponding vertex in the graph */ - private Map uniqueKmers = new LinkedHashMap<>(); + protected Map uniqueKmers = new LinkedHashMap<>(); /** * */ - final int kmerSize; + final boolean debugGraphTransformations; final byte minBaseQualityToUseInAssembly; @@ -125,16 +132,42 @@ public class 
ReadThreadingGraph extends BaseGraph getNextVertices(final MultiDeBruijnVertex v, final byte b) { + if (v == null) throw new IllegalArgumentException("the input vertex cannot be null"); + if (!vertexSet().contains(v)) throw new IllegalArgumentException("the vertex must be present in the graph"); + final List result = new LinkedList<>(); + for (final MultiDeBruijnVertex w : outgoingVerticesOf(v)) { + if (w.getSuffix() == b) + result.add(w); + } + switch (result.size()) { + case 0: return Collections.emptySet(); + case 1: return Collections.singleton(result.get(0)); + default: + return new HashSet<>(result); + } + } + /** * Create a new ReadThreadingAssembler using kmerSize for matching * @param kmerSize must be >= 1 @@ -143,7 +176,6 @@ public class ReadThreadingGraph extends BaseGraph 0"); // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths - final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex); + final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex, pruneFactor); // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path if ( danglingTailMergeResult == null || ! cigarIsOkayToMerge(danglingTailMergeResult.cigar) ) @@ -301,13 +334,14 @@ public class ReadThreadingGraph extends BaseGraph elements = cigar.getCigarElements(); + final int numElements = elements.size(); // don't allow more than a couple of different ops - if ( elements.size() > 3 ) + if ( numElements > MAX_CIGAR_COMPLEXITY ) return false; // the last element must be an M - if ( elements.get(elements.size() - 1).getOperator() != CigarOperator.M ) + if ( elements.get(numElements - 1).getOperator() != CigarOperator.M ) return false; // TODO -- do we want to check whether the Ms mismatch too much also? 
@@ -334,8 +368,17 @@ public class ReadThreadingGraph extends BaseGraph altPath = findPathToLowestCommonAncestorOfReference(vertex); - if ( altPath == null || isRefSource(altPath.get(0)) ) + final List altPath = findPathToLowestCommonAncestorOfReference(vertex, pruneFactor); + if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH ) return null; // now get the reference path from the LCA @@ -361,24 +405,32 @@ public class ReadThreadingGraph extends BaseGraph findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex) { + protected List findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { final LinkedList path = new LinkedList<>(); MultiDeBruijnVertex v = vertex; while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { - path.addFirst(v); - v = getEdgeSource(incomingEdgeOf(v)); + final MultiSampleEdge edge = incomingEdgeOf(v); + // if it has too low a weight, don't use it (or previous vertexes) for the path + if ( edge.getPruningMultiplicity() < pruneFactor ) + path.clear(); + // otherwise it is safe to use + else + path.addFirst(v); + v = getEdgeSource(edge); } path.addFirst(v); @@ -434,6 +486,33 @@ public class ReadThreadingGraph extends BaseGraph verticesToRemove = new LinkedList<>(); + for( final MultiDeBruijnVertex v : vertexSet() ) { + if( inDegreeOf(v) == 0 && outDegreeOf(v) == 0 ) { + verticesToRemove.add(v); + } + } + this.removeVertex(null); + removeAllVertices(verticesToRemove); } /** @@ -453,15 +532,20 @@ public class ReadThreadingGraph extends BaseGraph uniqueKmers.size(); } - public void recoverDanglingTails() { + /** + * Try to recover dangling tails + * + * @param pruneFactor the prune factor to use in ignoring chain pieces + */ + public void recoverDanglingTails(final int pruneFactor) { if ( ! 
alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); int attempted = 0; int nRecovered = 0; for ( final MultiDeBruijnVertex v : vertexSet() ) { - if ( outDegreeOf(v) == 0 && ! isRefNodeAndRefSink(v) ) { + if ( outDegreeOf(v) == 0 && ! isRefSink(v) ) { attempted++; - nRecovered += recoverDanglingChain(v); + nRecovered += recoverDanglingChain(v, pruneFactor); } } @@ -566,6 +650,7 @@ public class ReadThreadingGraph extends BaseGraph vertexMap = new HashMap(); + // create all of the equivalent seq graph vertices for ( final MultiDeBruijnVertex dv : vertexSet() ) { final SeqVertex sv = new SeqVertex(dv.getAdditionalSequence(isSource(dv))); @@ -610,7 +695,7 @@ public class ReadThreadingGraph extends BaseGraph findStart(final SequenceForKmers seqForKmers) { + protected Pair findStart(final SequenceForKmers seqForKmers) { final int uniqueStartPos = seqForKmers.isRef ? 0 : findUniqueStartPosition(seqForKmers.sequence, seqForKmers.start, seqForKmers.stop); if ( uniqueStartPos == -1 ) @@ -642,7 +727,7 @@ public class ReadThreadingGraph extends BaseGraph getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) { + protected Pair getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) { final Kmer kmer = new Kmer(sequence, start, kmerSize); final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, allowRefSource); if ( vertex != null ) { @@ -660,9 +745,11 @@ public class ReadThreadingGraph extends BaseGraph= kmerSize ) { // if the sequence is long enough to get some value out of, add it to the graph final String name = read.getReadName() + "_" + start + "_" + end; - addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, stop, reducedReadCounts, false); + addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false); } lastGood = -1; // reset the last good base @@ -782,4 +868,178 
@@ public class ReadThreadingGraph extends BaseGraph + * Note: only used for testing. + * Checkout {@link HaplotypeGraphUnitTest} for examples. + *

+ * @param s the string representation of the graph {@code null}. + */ + public ReadThreadingGraph(final String s) { + super(kmerSizeFromString(s),new MyEdgeFactory(1)); + debugGraphTransformations = false; + minBaseQualityToUseInAssembly = 0; + applyString(s); + alreadyBuilt = true; + } + + /** + * Obtain the kmer size for the string representation. + * @param str the source string representation. + * @return 1 or greater. + * @throws IllegalArgumentException if {@code} str does not contain a valid representation. + */ + private static int kmerSizeFromString(final String str) { + final Matcher matcher = KMERSIZE_EXTRACTOR_PATTERN.matcher(str); + if (matcher.find()) { + return Integer.parseInt(matcher.group(2)); + } else + throw new IllegalArgumentException("the input graph spec does not indicate the kmerSize"); + } + + /** + * Apply description string into the graph. + * + *

+ * Note: this is done just for testing purposes. + * Checkout {@link HaplotypeGraphUnitTest} for examples. + *

+ * @param str the string representation. + */ + private void applyString(final String str) { + final Matcher propertiesSectionMatcher = PROPERTIES_PATTERN.matcher(str); + final int pathStart = propertiesSectionMatcher.find() ? propertiesSectionMatcher.end() : 0; + + final String pathString = str.substring(pathStart); + final Matcher pathMatcher = PATH_PATTERN.matcher(pathString); + + boolean referenceFound = false; + final Map vertexById = new HashMap<>(); + + // Loop between path strings and add them one by one. + while (pathMatcher.find()) { + final String label = pathMatcher.group(2); + final boolean isReference = (label != null && label.equals("REF")); + if (referenceFound) { + if (isReference) + throw new IllegalArgumentException("there are two reference paths"); + + } else + referenceFound |= isReference; + + // Divide each path into its elements getting a list of sequences and labels if applies: + final String elementsString = pathMatcher.group(3); + final String[] elements = elementsString.split("\\s*->\\s*"); + if (elements.length == 0) + throw new IllegalArgumentException("empty path not allowed"); + final String[] seqs = new String[elements.length]; + final String[] ids = new String[elements.length]; + for (int i = 0; i < elements.length; i++) { + ids[i] = pathElementId(elements[i]); + seqs[i] = pathElementSeq(elements[i]); + if (seqs[i].isEmpty() && ids[i] == null) + throw new IllegalArgumentException("path with empty element without an id"); + } + final boolean isSource = ids[0] == null || !vertexById.containsKey(ids[0]); + if (isSource && seqs[0].length() != kmerSize) + throw new IllegalArgumentException("source sequence length must be the same as the kmerSize " + + ids[0] + " " + seqs[0] + " " + pathMatcher.group()); + final MultiDeBruijnVertex firstVertex; + if (ids[0] != null && vertexById.containsKey(ids[0])) + firstVertex = vertexById.get(ids[0]); + else { + firstVertex = new MultiDeBruijnVertex(seqs[0].getBytes()); + addVertex(firstVertex); + 
if (ids[0] != null) + vertexById.put(ids[0],firstVertex); + } + if (!seqs[0].isEmpty() && + ((isSource && !firstVertex.getSequenceString().equals(seqs[0])) + || (!isSource && firstVertex.getSuffix() != seqs[0].getBytes()[0]))) + throw new IllegalArgumentException("mismatched first element sequence"); + + MultiDeBruijnVertex lastVertex = firstVertex; + for (int i = 1; i < elements.length; i++) { + if (seqs[i].length() > 1) + throw new IllegalArgumentException("non-source vertex sequence must have length 1"); + final MultiDeBruijnVertex nextVertex; + if (ids[i] == null || !vertexById.containsKey(ids[i])) { + final Set nextVertices = getNextVertices(lastVertex,seqs[i].getBytes()[0]); + if (nextVertices.size() == 0) { + nextVertex = new MultiDeBruijnVertex(extendSequence(lastVertex.getSequence(),seqs[i].getBytes()[0])); + addVertex(nextVertex); + } else { + nextVertex = nextVertices.iterator().next(); + } + if (ids[i] != null) + vertexById.put(ids[i],nextVertex); + } else { + nextVertex = vertexById.get(ids[i]); + } + final MultiSampleEdge edge = addEdge(lastVertex,nextVertex); + if (isReference) edge.setIsRef(true); + lastVertex = nextVertex; + } + } + } + + private static String pathElementId(final String element) { + final int parentesysPos = element.indexOf('('); + + if (parentesysPos == -1) + return null; + + final int closeParentesysPos = element.lastIndexOf(')'); + if (closeParentesysPos == -1) + throw new IllegalArgumentException("non-closed id parantesys found in element: " + element); + final String result = element.substring(parentesysPos + 1,closeParentesysPos).trim(); + if (result.isEmpty()) + throw new IllegalArgumentException("empty id found in element: " + element); + return result; + } + + /** + * Returns the lenght of a path element in the string representation. + * @param element the query element. + * @return 0 or greater. 
+ */ + private static String pathElementSeq(final String element) { + final int parentesysPos = element.indexOf('('); + + if (parentesysPos == -1) + return element.trim(); + + return element.substring(0,parentesysPos).trim(); + } + + /** + * Add a base to the end of a byte sequence. + * @param sequence sequence where to add the base to. + * @param b base to add. + * @return never {@code null}, a new array each time. + */ + private static byte[] extendSequence(final byte[] sequence, final byte b) { + final byte[] result = new byte[sequence.length]; + System.arraycopy(sequence,1,result,0,sequence.length - 1); + result[result.length - 1] = b; + return result; + } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index c77557da6..a273cf01d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -48,11 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.Log10PairHMM; import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM; import org.broadinstitute.sting.utils.pairhmm.PairHMM; @@ -64,10 +65,9 @@ import org.broadinstitute.variant.variantcontext.Allele; 
import java.util.Arrays; import java.util.LinkedHashMap; +import java.util.LinkedList; import java.util.Map; -//import org.broadinstitute.sting.utils.pairhmm.LoglessCachingPairHMM; - public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; @@ -120,8 +120,11 @@ public class PairHMMIndelErrorModel { case LOGLESS_CACHING: pairHMM = new LoglessPairHMM(); break; + case ARRAY_LOGLESS: + pairHMM = new ArrayLoglessPairHMM(); + break; default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT or LOGLESS_CACHING."); + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL, EXACT, LOGLESS_CACHING, or ARRAY_LOGLESS."); } // fill gap penalty table, affine naive model: @@ -202,6 +205,39 @@ public class PairHMMIndelErrorModel { } } + private LinkedHashMap trimHaplotypes(final LinkedHashMap haplotypeMap, + long startLocationInRefForHaplotypes, + long stopLocationInRefForHaplotypes, + final ReferenceContext ref){ + + final LinkedHashMap trimmedHaplotypeMap = new LinkedHashMap<>(); + for (final Allele a: haplotypeMap.keySet()) { + + final Haplotype haplotype = haplotypeMap.get(a); + + if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) + stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + + if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) + startLocationInRefForHaplotypes = haplotype.getStartPosition(); + else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) + startLocationInRefForHaplotypes = haplotype.getStopPosition(); + + final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); + final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); + + if (DEBUG) + 
System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d\n", + indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); + + // get the trimmed haplotype-bases array and create a new haplotype based on it. Pack this into the new map + final byte[] trimmedHaplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); + final Haplotype trimmedHaplotype = new Haplotype(trimmedHaplotypeBases, haplotype.isReference()); + trimmedHaplotypeMap.put(a, trimmedHaplotype); + } + return trimmedHaplotypeMap; + } + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, final LinkedHashMap haplotypeMap, @@ -218,6 +254,28 @@ public class PairHMMIndelErrorModel { } + /** + * Should we clip a downstream portion of a read because it spans off the end of a haplotype? + * + * @param read the read in question + * @param refWindowStop the end of the reference window + * @return true if the read needs to be clipped, false otherwise + */ + protected static boolean mustClipDownstream(final GATKSAMRecord read, final int refWindowStop) { + return ( !read.isEmpty() && read.getSoftStart() < refWindowStop && read.getSoftStart() + read.getReadLength() > refWindowStop ); + } + + /** + * Should we clip a upstream portion of a read because it spans off the end of a haplotype? 
+ * + * @param read the read in question + * @param refWindowStart the start of the reference window + * @return true if the read needs to be clipped, false otherwise + */ + protected static boolean mustClipUpstream(final GATKSAMRecord read, final int refWindowStart) { + return ( !read.isEmpty() && read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart ); + } + @Ensures("result != null && result.length == pileup.getNumberOfElements()") public synchronized double[][] computeGeneralReadHaplotypeLikelihoods(final ReadBackedPileup pileup, final LinkedHashMap haplotypeMap, @@ -227,6 +285,8 @@ public class PairHMMIndelErrorModel { final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; + final LinkedList readList = new LinkedList<>(); + final Map readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; for (PileupElement p: pileup) { // > 1 when the read is a consensus read representing multiple independent observations @@ -245,9 +305,8 @@ public class PairHMMIndelErrorModel { // in them - a value of 1 will in theory do but we use a slightly higher one just for safety sake, mostly // in case bases at edge of reads have lower quality. 
final int trailingBases = 3; - final int extraOffset = Math.abs(eventLength); - final int refWindowStart = ref.getWindow().getStart()+(trailingBases+extraOffset); - final int refWindowStop = ref.getWindow().getStop()-(trailingBases+extraOffset); + final int refWindowStart = ref.getWindow().getStart() + trailingBases; + final int refWindowStop = ref.getWindow().getStop() - trailingBases; if (DEBUG) { System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); @@ -255,11 +314,13 @@ public class PairHMMIndelErrorModel { GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead()); - if (!read.isEmpty() && (read.getSoftEnd() > refWindowStop && read.getSoftStart() < refWindowStop)) - read = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, refWindowStop); + // if the read extends beyond the downstream (right) end of the reference window, clip it + if ( mustClipDownstream(read, refWindowStop) ) + read = ReadClipper.hardClipByReadCoordinates(read, read.getSoftStart() + read.getReadLength() - refWindowStop + 1, read.getReadLength() - 1); - if (!read.isEmpty() && (read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart)) - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail (read, refWindowStart); + // if the read extends beyond the upstream (left) end of the reference window, clip it + if ( mustClipUpstream(read, refWindowStart) ) + read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, refWindowStart); if (read.isEmpty()) continue; @@ -297,8 +358,9 @@ public class PairHMMIndelErrorModel { * trailingBases is a padding constant(=3) and we additionally add abs(eventLength) to both sides of read to be able to * differentiate context between two haplotypes */ - long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - 
ReadUtils.getFirstInsertionOffset(read)-extraOffset, 0); - long stopLocationInRefForHaplotypes = readEnd -numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read)+extraOffset; + final int absEventLength = Math.abs(eventLength); + long startLocationInRefForHaplotypes = Math.max(readStart + numStartSoftClippedBases - trailingBases - ReadUtils.getFirstInsertionOffset(read) - absEventLength, 0); + long stopLocationInRefForHaplotypes = readEnd - numEndSoftClippedBases + trailingBases + ReadUtils.getLastInsertionOffset(read) + absEventLength; if (DEBUG) System.out.format("orig Start:%d orig stop: %d\n", startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes); @@ -365,52 +427,30 @@ public class PairHMMIndelErrorModel { baseDeletionQualities = contextLogGapOpenProbabilities; } - boolean firstHap = true; - for (Allele a: haplotypeMap.keySet()) { + // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM + final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); + readList.add(processedRead); - Haplotype haplotype = haplotypeMap.get(a); + // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM + readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); - if (stopLocationInRefForHaplotypes > haplotype.getStopPosition()) - stopLocationInRefForHaplotypes = haplotype.getStopPosition(); + // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations + final Map trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); - if (startLocationInRefForHaplotypes < haplotype.getStartPosition()) - startLocationInRefForHaplotypes = haplotype.getStartPosition(); - else if (startLocationInRefForHaplotypes > haplotype.getStopPosition()) 
- startLocationInRefForHaplotypes = haplotype.getStopPosition(); - - final long indStart = startLocationInRefForHaplotypes - haplotype.getStartPosition(); - final long indStop = stopLocationInRefForHaplotypes - haplotype.getStartPosition(); - - double readLikelihood; - if (DEBUG) - System.out.format("indStart: %d indStop: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d C:%s\n", - indStart, indStop, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength(), read.getCigar().toString()); - - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), (int)indStart, (int)indStop); - - // it's possible that the indel starts at the last base of the haplotypes - if ( haplotypeBases.length == 0 ) { - readLikelihood = -Double.MAX_VALUE; - } else { - if (firstHap) { - //no need to reallocate arrays for each new haplotype, as length won't change - pairHMM.initialize(readBases.length, haplotypeBases.length); - firstHap = false; - } - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, - baseInsertionQualities, baseDeletionQualities, contextLogGapContinuationProbabilities, firstHap); - } - - if (DEBUG) { - System.out.println("H:"+new String(haplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - } + // Get the likelihoods for our clipped read against each of our trimmed haplotypes. 
+ final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); + // Pack the original pilup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array + for (Allele a: trimmedHaplotypeMap.keySet()){ + double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } + // The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually + readList.remove(processedRead); + + // The same is true for the read/GCP-array map + readGCPArrayMap.remove(processedRead); } } readIdx++; @@ -434,16 +474,16 @@ public class PairHMMIndelErrorModel { return !((read.getAlignmentStart() >= eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) || (read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)); } - private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { - if (b1.length != b2.length) - return 0; // sanity check - - for (int i=0; i < b1.length; i++ ){ - if ( b1[i]!= b2[i] ) - return i; - } - return b1.length; - } +// private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { +// if (b1.length != b2.length) +// return 0; // sanity check +// +// for (int i=0; i < b1.length; i++ ){ +// if ( b1[i]!= b2[i] ) +// return i; +// } +// return b1.length; +// } private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index 7bbc4e981..707bf2722 
100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.HelpConstants; @@ -839,9 +838,14 @@ public class PhaseByTransmission extends RodWalker, HashMa return metricsCounters; final VariantContext vc = tracker.getFirstValue(variantCollection.variants, context.getLocation()); - if (vc == null || !vc.isBiallelic()) + if ( vc == null ) return metricsCounters; + if ( !vc.isBiallelic() ) { + vcfWriter.add(vc); + return metricsCounters; + } + final VariantContextBuilder builder = new VariantContextBuilder(vc); final GenotypesContext genotypesContext = GenotypesContext.copy(vc.getGenotypes()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java index 13daee8c9..25f6f874d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; 
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -133,6 +132,11 @@ public class AssessReducedQuals extends LocusWalker implem return reportLocus ? ref.getLocus() : null; } + /** + * Get the quals separated by version and strand + * @param readPileup the pileup + * @return 2x2 array with sum of quals separated by version in 1st dimension and strand in the 2nd + */ private int[] getPileupQuals(final ReadBackedPileup readPileup) { final int[] quals = new int[2]; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index e15b99824..4b5237087 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -46,10 +46,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; +import org.apache.commons.math.util.MathUtils; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -112,6 +110,9 @@ import java.util.*; @PartitionBy(PartitionType.LOCUS) public class ApplyRecalibration extends RodWalker implements TreeReducible { + public static final String LOW_VQSLOD_FILTER_NAME = "LOW_VQSLOD"; + private final double DEFAULT_VQSLOD_CUTOFF = 0.0; + ///////////////////////////// // Inputs ///////////////////////////// @@ -122,7 +123,7 @@ public class ApplyRecalibration extends RodWalker implements T public List> input; 
@Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) protected RodBinding recal; - @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) + @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=false) protected File TRANCHES_FILE; ///////////////////////////// @@ -134,19 +135,29 @@ public class ApplyRecalibration extends RodWalker implements T ///////////////////////////// // Command Line Arguments ///////////////////////////// + @Advanced @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) - protected double TS_FILTER_LEVEL = 99.0; - @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) + protected Double TS_FILTER_LEVEL = null; + @Advanced + @Argument(fullName="lodCutoff", shortName="lodCutoff", doc="The VQSLOD score below which to start filtering", required=false) + protected Double VQSLOD_CUTOFF = null; + + /** + * For this to work properly, the -ignoreFilter argument should also be applied to the VariantRecalibration command. + */ + @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified, the recalibration will be applied to variants marked as filtered by the specified filter name in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; + @Argument(fullName="excludeFiltered", shortName="ef", doc="Don't output filtered loci after applying the recalibration", required=false) + protected boolean EXCLUDE_FILTERED = false; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) 
SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP; ///////////////////////////// // Private Member Variables ///////////////////////////// - final private List tranches = new ArrayList(); - final private Set inputNames = new HashSet(); - final private Set ignoreInputFilterSet = new TreeSet(); + final private List tranches = new ArrayList<>(); + final private Set inputNames = new HashSet<>(); + final private Set ignoreInputFilterSet = new TreeSet<>(); //--------------------------------------------------------------------------------------------------------------- // @@ -155,13 +166,15 @@ public class ApplyRecalibration extends RodWalker implements T //--------------------------------------------------------------------------------------------------------------- public void initialize() { - for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) { - if ( t.ts >= TS_FILTER_LEVEL ) { - tranches.add(t); + if( TS_FILTER_LEVEL != null ) { + for ( final Tranche t : Tranche.readTranches(TRANCHES_FILE) ) { + if ( t.ts >= TS_FILTER_LEVEL ) { + tranches.add(t); + } + logger.info(String.format("Read tranche " + t)); } - logger.info(String.format("Read tranche " + t)); + Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity) } - Collections.reverse(tranches); // this algorithm wants the tranches ordered from best (lowest truth sensitivity) to worst (highest truth sensitivity) for( final RodBinding rod : input ) { inputNames.add( rod.getName() ); @@ -172,25 +185,38 @@ public class ApplyRecalibration extends RodWalker implements T } // setup the header fields - final Set hInfo = new HashSet(); + final Set hInfo = new HashSet<>(); 
hInfo.addAll(GATKVCFUtils.getHeaderFields(getToolkit(), inputNames)); addVQSRStandardHeaderLines(hInfo); - final TreeSet samples = new TreeSet(); + final TreeSet samples = new TreeSet<>(); samples.addAll(SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)); - if( tranches.size() >= 2 ) { - for( int iii = 0; iii < tranches.size() - 1; iii++ ) { - final Tranche t = tranches.get(iii); - hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level for " + t.model.toString() + " model at VQS Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod))); + if( TS_FILTER_LEVEL != null ) { + // if the user specifies both ts_filter_level and lodCutoff then throw a user error + if( VQSLOD_CUTOFF != null ) { + throw new UserException("Arguments --ts_filter_level and --lodCutoff are mutually exclusive. Please only specify one option."); } - } - if( tranches.size() >= 1 ) { - hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level for " + tranches.get(0).model.toString() + " model at VQS Lod < " + tranches.get(0).minVQSLod))); - } else { - throw new UserException("No tranches were found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL); - } - logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1)); + if( tranches.size() >= 2 ) { + for( int iii = 0; iii < tranches.size() - 1; iii++ ) { + final Tranche t = tranches.get(iii); + hInfo.add(new VCFFilterHeaderLine(t.name, String.format("Truth sensitivity tranche level for " + t.model.toString() + " model at VQS Lod: " + t.minVQSLod + " <= x < " + tranches.get(iii+1).minVQSLod))); + } + } + if( tranches.size() >= 1 ) { + hInfo.add(new VCFFilterHeaderLine(tranches.get(0).name + "+", String.format("Truth sensitivity tranche level for " + tranches.get(0).model.toString() + " model at VQS Lod < " + tranches.get(0).minVQSLod))); + } else { + throw new UserException("No tranches were 
found in the file or were above the truth sensitivity filter level " + TS_FILTER_LEVEL); + } + + logger.info("Keeping all variants in tranche " + tranches.get(tranches.size()-1)); + } else { + if( VQSLOD_CUTOFF == null ) { + VQSLOD_CUTOFF = DEFAULT_VQSLOD_CUTOFF; + } + hInfo.add(new VCFFilterHeaderLine(LOW_VQSLOD_FILTER_NAME, "VQSLOD < " + VQSLOD_CUTOFF)); + logger.info("Keeping all variants with VQSLOD >= " + VQSLOD_CUTOFF); + } final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); @@ -240,7 +266,6 @@ public class ApplyRecalibration extends RodWalker implements T } VariantContextBuilder builder = new VariantContextBuilder(vc); - String filterString = null; // Annotate the new record with its VQSLOD and the worst performing annotation builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); @@ -250,21 +275,7 @@ public class ApplyRecalibration extends RodWalker implements T if ( recalDatum.hasAttribute(VariantRecalibrator.NEGATIVE_LABEL_KEY)) builder.attribute(VariantRecalibrator.NEGATIVE_LABEL_KEY, true); - for( int i = tranches.size() - 1; i >= 0; i-- ) { - final Tranche tranche = tranches.get(i); - if( lod >= tranche.minVQSLod ) { - if( i == tranches.size() - 1 ) { - filterString = VCFConstants.PASSES_FILTERS_v4; - } else { - filterString = tranche.name; - } - break; - } - } - - if( filterString == null ) { - filterString = tranches.get(0).name+"+"; - } + final String filterString = generateFilterString(lod); if( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { builder.passFilters(); @@ -272,7 +283,10 @@ public class ApplyRecalibration extends RodWalker implements T builder.filters(filterString); } - vcfWriter.add( builder.make() ); + final VariantContext outputVC = builder.make(); + if( !EXCLUDE_FILTERED || outputVC.isNotFiltered() ) { + vcfWriter.add( outputVC ); + } } else { // valid VC but not compatible with this mode, so just emit the variant untouched vcfWriter.add( vc ); } @@ -281,6 +295,36 @@ public class 
ApplyRecalibration extends RodWalker implements T return 1; // This value isn't used for anything } + /** + * Generate the VCF filter string for this record based on the provided lod score + * @param lod non-null double + * @return the String to use as the VCF filter field + */ + protected String generateFilterString( final double lod ) { + String filterString = null; + if( TS_FILTER_LEVEL != null ) { + for( int i = tranches.size() - 1; i >= 0; i-- ) { + final Tranche tranche = tranches.get(i); + if( lod >= tranche.minVQSLod ) { + if( i == tranches.size() - 1 ) { + filterString = VCFConstants.PASSES_FILTERS_v4; + } else { + filterString = tranche.name; + } + break; + } + } + + if( filterString == null ) { + filterString = tranches.get(0).name+"+"; + } + } else { + filterString = ( lod < VQSLOD_CUTOFF ? LOW_VQSLOD_FILTER_NAME : VCFConstants.PASSES_FILTERS_v4 ); + } + + return filterString; + } + private static VariantContext getMatchingRecalVC(final VariantContext target, final List recalVCs) { for( final VariantContext recalVC : recalVCs ) { if ( target.getEnd() == recalVC.getEnd() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index efc24d5f9..9e36e5dbe 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -77,7 +77,7 @@ public class GaussianMixtureModel { public GaussianMixtureModel( final int numGaussians, final int numAnnotations, final double shrinkage, final double dirichletParameter, final double priorCounts ) { - gaussians = new ArrayList( numGaussians ); + gaussians = new ArrayList<>( numGaussians ); for( int iii = 0; iii < numGaussians; iii++ ) { final MultivariateGaussian gaussian = new 
MultivariateGaussian( numAnnotations ); gaussians.add( gaussian ); @@ -267,7 +267,7 @@ public class GaussianMixtureModel { public double evaluateDatumMarginalized( final VariantDatum datum ) { int numRandomDraws = 0; double sumPVarInGaussian = 0.0; - final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization + final int numIterPerMissingAnnotation = 20; // Trade off here between speed of computation and accuracy of the marginalization final double[] pVarInGaussianLog10 = new double[gaussians.size()]; // for each dimension for( int iii = 0; iii < datum.annotations.length; iii++ ) { @@ -283,7 +283,7 @@ public class GaussianMixtureModel { } // add this sample's probability to the pile in order to take an average in the end - sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) + sumPVarInGaussian += Math.pow(10.0, nanTolerantLog10SumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) numRandomDraws++; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java index 3adec4108..1b1656a10 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/MultivariateGaussian.java @@ -77,7 +77,7 @@ public class MultivariateGaussian { public MultivariateGaussian( final int numAnnotations ) { mu = new double[numAnnotations]; sigma = new Matrix(numAnnotations, numAnnotations); - pVarInGaussian = new ExpandingArrayList(); + pVarInGaussian = new ExpandingArrayList<>(); } public void zeroOutMu() { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java index 30377b63e..ab6b4adda 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java @@ -160,11 +160,11 @@ public class TrancheManager { } } - public static List findTranches( final ArrayList data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) { + public static List findTranches( final List data, final double[] tranches, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model ) { return findTranches( data, tranches, metric, model, null ); } - public static List findTranches( final ArrayList data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) { + public static List findTranches( final List data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) { logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size())); Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); @@ -172,7 +172,7 @@ public class TrancheManager { if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); } - List tranches = new ArrayList(); + List tranches = new ArrayList<>(); for ( double trancheThreshold : trancheThresholds ) { Tranche t = findTranche(data, metric, trancheThreshold, model); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index 40032a886..ac4654f73 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; +import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -70,47 +71,47 @@ import java.util.*; */ public class VariantDataManager { - private ExpandingArrayList data; - private final double[] meanVector; - private final double[] varianceVector; // this is really the standard deviation - public final List annotationKeys; + private List data; + private double[] meanVector; + private double[] varianceVector; // this is really the standard deviation + public List annotationKeys; private final VariantRecalibratorArgumentCollection VRAC; protected final static Logger logger = Logger.getLogger(VariantDataManager.class); protected final List trainingSets; public VariantDataManager( final List annotationKeys, final VariantRecalibratorArgumentCollection VRAC ) { this.data = null; - this.annotationKeys = new ArrayList( annotationKeys ); + this.annotationKeys = new ArrayList<>( annotationKeys ); this.VRAC = VRAC; meanVector = new double[this.annotationKeys.size()]; varianceVector = new double[this.annotationKeys.size()]; - trainingSets = new ArrayList(); + trainingSets = new ArrayList<>(); } - public void setData( final ExpandingArrayList data ) { + public void setData( final List data ) { this.data = data; } - public ExpandingArrayList getData() { + public List getData() { return data; } public void normalizeData() { boolean foundZeroVarianceAnnotation = false; for( int iii = 0; iii < meanVector.length; iii++ ) { - final double theMean = mean(iii); - final double theSTD = standardDeviation(theMean, iii); + final double theMean = mean(iii, 
true); + final double theSTD = standardDeviation(theMean, iii, true); logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); if( Double.isNaN(theMean) ) { throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpConstants.forumPost("discussion/49/using-variant-annotator")); } - foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6); + foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-5); meanVector[iii] = theMean; varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { // Transform each data point via: (x - mean) / standard deviation - datum.annotations[iii] = ( datum.isNull[iii] ? GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); + datum.annotations[iii] = ( datum.isNull[iii] ? 
0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); } } if( foundZeroVarianceAnnotation ) { @@ -125,6 +126,74 @@ public class VariantDataManager { } datum.failingSTDThreshold = remove; } + + // re-order the data by increasing standard deviation so that the results don't depend on the order things were specified on the command line + // standard deviation over the training points is used as a simple proxy for information content, perhaps there is a better thing to use here + final List theOrder = calculateSortOrder(meanVector); + annotationKeys = reorderList(annotationKeys, theOrder); + varianceVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(varianceVector), theOrder)); + meanVector = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(meanVector), theOrder)); + for( final VariantDatum datum : data ) { + datum.annotations = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.annotations), theOrder)); + datum.isNull = ArrayUtils.toPrimitive(reorderArray(ArrayUtils.toObject(datum.isNull), theOrder)); + } + logger.info("Annotations are now ordered by their information content: " + annotationKeys.toString()); + } + + /** + * Get a list of indices which give the ascending sort order of the data array + * @param inputVector the data to consider + * @return a non-null list of integers with length matching the length of the input array + */ + protected List calculateSortOrder(final double[] inputVector) { + final List theOrder = new ArrayList<>(inputVector.length); + final List toBeSorted = new ArrayList<>(inputVector.length); + int count = 0; + for( int iii = 0; iii < inputVector.length; iii++ ) { + toBeSorted.add(new MyDoubleForSorting(-1.0 * Math.abs(inputVector[iii] - mean(iii, false)), count++)); + } + Collections.sort(toBeSorted); + for( final MyDoubleForSorting d : toBeSorted ) { + theOrder.add(d.originalIndex); // read off the sort order by looking at the index field + } + 
return theOrder; + } + + // small private class to assist in reading off the new ordering of the annotation array + private class MyDoubleForSorting implements Comparable { + final Double myData; + final int originalIndex; + + public MyDoubleForSorting(final double myData, final int originalIndex) { + this.myData = myData; + this.originalIndex = originalIndex; + } + + @Override + public int compareTo(final MyDoubleForSorting other) { + return myData.compareTo(other.myData); + } + } + + /** + * Convenience connector method to work with arrays instead of lists. See ##reorderList## + */ + private T[] reorderArray(final T[] data, final List order) { + return reorderList(Arrays.asList(data), order).toArray(data); + } + + /** + * Reorder the given data list to be in the specified order + * @param data the data to reorder + * @param order the new order to use + * @return a reordered list of data + */ + private List reorderList(final List data, final List order) { + final List returnList = new ArrayList<>(data.size()); + for( final int index : order ) { + returnList.add( data.get(index) ); + } + return returnList; } /** @@ -147,6 +216,10 @@ public class VariantDataManager { trainingSets.add( trainingSet ); } + public List getAnnotationKeys() { + return annotationKeys; + } + public boolean checkHasTrainingSet() { for( final TrainingSet trainingSet : trainingSets ) { if( trainingSet.isTraining ) { return true; } @@ -161,96 +234,77 @@ public class VariantDataManager { return false; } - public boolean checkHasKnownSet() { - for( final TrainingSet trainingSet : trainingSets ) { - if( trainingSet.isKnown ) { return true; } - } - return false; - } - - public ExpandingArrayList getTrainingData() { - final ExpandingArrayList trainingData = new ExpandingArrayList(); + public List getTrainingData() { + final List trainingData = new ExpandingArrayList<>(); for( final VariantDatum datum : data ) { - if( datum.atTrainingSite && !datum.failingSTDThreshold && datum.originalQual > 
VRAC.QUAL_THRESHOLD ) { + if( datum.atTrainingSite && !datum.failingSTDThreshold ) { trainingData.add( datum ); } } logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." ); if( trainingData.size() < VRAC.MIN_NUM_BAD_VARIANTS ) { logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); + } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) { + logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." ); + Collections.shuffle(trainingData); + return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA); } return trainingData; } - public ExpandingArrayList selectWorstVariants( double bottomPercentage, final int minimumNumber ) { - // The return value is the list of training variants - final ExpandingArrayList trainingData = new ExpandingArrayList(); + public List selectWorstVariants() { + final List trainingData = new ExpandingArrayList<>(); - // First add to the training list all sites overlapping any bad sites training tracks for( final VariantDatum datum : data ) { - if( datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) { - trainingData.add( datum ); - } - } - final int numBadSitesAdded = trainingData.size(); - logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." ); - - // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants - Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); - final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); - if( numToAdd > data.size() ) { - throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. 
One can attempt to lower the --minNumBadVariants arugment but this is unsafe." ); - } else if( numToAdd == minimumNumber - trainingData.size() ) { - logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); - bottomPercentage = ((float) numToAdd) / ((float) data.size()); - } - int index = 0, numAdded = 0; - while( numAdded < numToAdd && index < data.size() ) { - final VariantDatum datum = data.get(index++); - if( datum != null && !datum.atAntiTrainingSite && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) ) { + if( datum != null && !datum.failingSTDThreshold && !Double.isInfinite(datum.lod) && datum.lod < VRAC.BAD_LOD_CUTOFF ) { datum.atAntiTrainingSite = true; trainingData.add( datum ); - numAdded++; } } - logger.info( "Additionally training with worst " + String.format("%.3f", (float) bottomPercentage * 100.0f) + "% of passing data --> " + (trainingData.size() - numBadSitesAdded) + " variants with LOD <= " + String.format("%.4f", data.get(index).lod) + "." ); + + logger.info( "Training with worst " + trainingData.size() + " scoring variants --> variants with LOD <= " + String.format("%.4f", VRAC.BAD_LOD_CUTOFF) + "." 
); + return trainingData; } - public ExpandingArrayList getRandomDataForPlotting( int numToAdd ) { - numToAdd = Math.min(numToAdd, data.size()); - final ExpandingArrayList returnData = new ExpandingArrayList(); - for( int iii = 0; iii < numToAdd; iii++) { - final VariantDatum datum = data.get(GenomeAnalysisEngine.getRandomGenerator().nextInt(data.size())); - if( !datum.failingSTDThreshold ) { - returnData.add(datum); + public List getEvaluationData() { + final List evaluationData = new ExpandingArrayList<>(); + + for( final VariantDatum datum : data ) { + if( datum != null && !datum.failingSTDThreshold && !datum.atTrainingSite && !datum.atAntiTrainingSite ) { + evaluationData.add( datum ); } } - // Add an extra 5% of points from bad training set, since that set is small but interesting - for( int iii = 0; iii < Math.floor(0.05*numToAdd); iii++) { - final VariantDatum datum = data.get(GenomeAnalysisEngine.getRandomGenerator().nextInt(data.size())); - if( datum.atAntiTrainingSite && !datum.failingSTDThreshold ) { returnData.add(datum); } - else { iii--; } - } + return evaluationData; + } + public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { + final List returnData = new ExpandingArrayList<>(); + Collections.shuffle(trainingData); + Collections.shuffle(antiTrainingData); + Collections.shuffle(evaluationData); + returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size()))); + returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size()))); + returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size()))); + Collections.shuffle(returnData); return returnData; } - private double mean( final int index ) { + protected double mean( final int index, final boolean trainingData ) { double sum = 0.0; int numNonNull = 0; for( final VariantDatum datum : data ) { - if( datum.atTrainingSite && !datum.isNull[index] ) { 
sum += datum.annotations[index]; numNonNull++; } + if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += datum.annotations[index]; numNonNull++; } } return sum / ((double) numNonNull); } - private double standardDeviation( final double mean, final int index ) { + protected double standardDeviation( final double mean, final int index, final boolean trainingData ) { double sum = 0.0; int numNonNull = 0; for( final VariantDatum datum : data ) { - if( datum.atTrainingSite && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; } + if( (trainingData == datum.atTrainingSite) && !datum.isNull[index] ) { sum += ((datum.annotations[index] - mean)*(datum.annotations[index] - mean)); numNonNull++; } } return Math.sqrt( sum / ((double) numNonNull) ); } @@ -275,12 +329,9 @@ public class VariantDataManager { try { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HRUN") ) { // Integer valued annotations must be jittered a bit to work in this GMM - value += -0.25 + 0.5 * GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - } - - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.0001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } - if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.001) == 0 ) { value = -0.2 + 0.4*GenomeAnalysisEngine.getRandomGenerator().nextDouble(); } + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); 
} + if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } } catch( Exception e ) { value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 824ef1f6e..1c32b852b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -79,14 +80,14 @@ import java.util.*; * Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants. * *

- * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker. + * This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with the ApplyRecalibration walker. *

* *

* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. - * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. + * You can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the the probability that a SNP is a true genetic + * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the probability that a SNP is a true genetic * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the @@ -94,12 +95,7 @@ import java.util.*; * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. *

* - *

- * NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). - * See http://www.r-project.org for more info on how to download and install R. - *

- * - *

Input

+ *

Inputs

*

* The input raw variants to be recalibrated. *

@@ -127,6 +123,17 @@ import java.util.*; * -rscriptFile path/to/output.plots.R * * + *

Caveat

+ * + *
    + *
  • The values used in the example above are only meant to show how the command lines are composed. + * They are not meant to be taken as specific recommendations of values to use in your own work, and they may be + * different from the values cited elsewhere in our documentation. For the latest and greatest recommendations on + * how to set parameter values for your own analyses, please read the Best Practices section of the documentation.
  • + * + *
  • In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). + * See http://www.r-project.org for more info on how to download and install R.
  • + *
*/ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARDISC, extraDocs = {CommandLineGATK.class} ) @@ -136,7 +143,7 @@ public class VariantRecalibrator extends RodWalker> resource = Collections.emptyList(); @@ -175,7 +182,8 @@ public class VariantRecalibrator extends RodWalker replicate = new ArrayList<>(); ///////////////////////////// // Debug Arguments @@ -213,7 +227,7 @@ public class VariantRecalibrator extends RodWalker ignoreInputFilterSet = new TreeSet(); + private final Set ignoreInputFilterSet = new TreeSet<>(); private final VariantRecalibratorEngine engine = new VariantRecalibratorEngine( VRAC ); //--------------------------------------------------------------------------------------------------------------- @@ -222,8 +236,9 @@ public class VariantRecalibrator extends RodWalker(Arrays.asList(USE_ANNOTATIONS)), VRAC ); + dataManager = new VariantDataManager( new ArrayList<>(Arrays.asList(USE_ANNOTATIONS)), VRAC ); if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) Utils.warnUser(logger, String.format( @@ -252,9 +267,13 @@ public class VariantRecalibrator extends RodWalker hInfo = new HashSet(); + final Set hInfo = new HashSet<>(); ApplyRecalibration.addVQSRStandardHeaderLines(hInfo); recalWriter.writeHeader( new VCFHeader(hInfo) ); + + for( int iii = 0; iii < REPLICATE * 2; iii++ ) { + replicate.add(GenomeAnalysisEngine.getRandomGenerator().nextDouble()); + } } //--------------------------------------------------------------------------------------------------------------- @@ -263,8 +282,9 @@ public class VariantRecalibrator extends RodWalker map( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) { - final ExpandingArrayList mapList = new ExpandingArrayList(); + final ExpandingArrayList mapList = new ExpandingArrayList<>(); if( tracker == null ) { // For some reason RodWalkers get map calls with null trackers return mapList; @@ -284,7 +304,7 @@ public class VariantRecalibrator extends 
RodWalker reduceInit() { - return new ExpandingArrayList(); + return new ExpandingArrayList<>(); } + @Override public ExpandingArrayList reduce( final ExpandingArrayList mapValue, final ExpandingArrayList reduceSum ) { reduceSum.addAll( mapValue ); return reduceSum; } + @Override public ExpandingArrayList treeReduce( final ExpandingArrayList lhs, final ExpandingArrayList rhs ) { rhs.addAll( lhs ); return rhs; @@ -321,30 +344,23 @@ public class VariantRecalibrator extends RodWalker reduceSum ) { dataManager.setData( reduceSum ); dataManager.normalizeData(); // Each data point is now (x - mean) / standard deviation // Generate the positive model using the training data and evaluate each variant - final GaussianMixtureModel goodModel = engine.generateModel( dataManager.getTrainingData() ); + final List positiveTrainingData = dataManager.getTrainingData(); + final GaussianMixtureModel goodModel = engine.generateModel( positiveTrainingData, VRAC.MAX_GAUSSIANS ); engine.evaluateData( dataManager.getData(), goodModel, false ); // Generate the negative model using the worst performing data and evaluate each variant contrastively - final ExpandingArrayList negativeTrainingData = dataManager.selectWorstVariants( VRAC.PERCENT_BAD_VARIANTS, VRAC.MIN_NUM_BAD_VARIANTS ); - GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData ); + final List negativeTrainingData = dataManager.selectWorstVariants(); + final GaussianMixtureModel badModel = engine.generateModel( negativeTrainingData, Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS)); engine.evaluateData( dataManager.getData(), badModel, true ); - // Detect if the negative model failed to converge because of too few points and/or too many Gaussians and try again - while( badModel.failedToConverge && VRAC.MAX_GAUSSIANS > 4 ) { - logger.info("Negative model failed to converge. 
Retrying..."); - VRAC.MAX_GAUSSIANS--; - badModel = engine.generateModel( negativeTrainingData ); - engine.evaluateData( dataManager.getData(), goodModel, false ); - engine.evaluateData( dataManager.getData(), badModel, true ); - } - if( badModel.failedToConverge || goodModel.failedToConverge ) { - throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)"); + throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minNumBadVariants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --maxGaussians 4, for example).") ); } engine.calculateWorstPerformingAnnotation( dataManager.getData(), goodModel, badModel ); @@ -355,31 +371,28 @@ public class VariantRecalibrator extends RodWalker tranches = TrancheManager.findTranches( dataManager.getData(), TS_TRANCHES, metric, VRAC.MODE ); tranchesStream.print(Tranche.tranchesString( tranches )); - // Find the filtering lodCutoff for display on the model PDFs. Red variants are those which were below the cutoff and filtered out of the final callset. - double lodCutoff = 0.0; - for( final Tranche tranche : tranches ) { - if( MathUtils.compareDoubles(tranche.ts, TS_FILTER_LEVEL, 0.0001) == 0 ) { - lodCutoff = tranche.minVQSLod; - } - } - logger.info( "Writing out recalibration table..." 
); dataManager.writeOutRecalibrationTable( recalWriter ); if( RSCRIPT_FILE != null ) { logger.info( "Writing out visualization Rscript file..."); - createVisualizationScript( dataManager.getRandomDataForPlotting( 6000 ), goodModel, badModel, lodCutoff ); + createVisualizationScript( dataManager.getRandomDataForPlotting( 1000, positiveTrainingData, negativeTrainingData, dataManager.getEvaluationData() ), goodModel, badModel, 0.0, dataManager.getAnnotationKeys().toArray(new String[USE_ANNOTATIONS.length]) ); } - // Execute the RScript command to plot the table of truth values - RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class)); - executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV); - // Print out the command line to make it clear to the user what is being executed and how one might modify it - logger.info("Executing: " + executor.getApproximateCommandLine()); - executor.exec(); + if(VRAC.MODE == VariantRecalibratorArgumentCollection.Mode.INDEL) { + // Print out an info message to make it clear why the tranches plot is not generated + logger.info("Tranches plot will not be generated since we are running in INDEL mode"); + } else { + // Execute the RScript command to plot the table of truth values + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(PLOT_TRANCHES_RSCRIPT, VariantRecalibrator.class)); + executor.addArgs(TRANCHES_FILE.getAbsoluteFile(), TARGET_TITV); + // Print out the command line to make it clear to the user what is being executed and how one might modify it + logger.info("Executing: " + executor.getApproximateCommandLine()); + executor.exec(); + } } - private void createVisualizationScript( final ExpandingArrayList randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff ) { + private void createVisualizationScript( final List randomData, final GaussianMixtureModel 
goodModel, final GaussianMixtureModel badModel, final double lodCutoff, final String[] annotationKeys ) { PrintStream stream; try { stream = new PrintStream(RSCRIPT_FILE); @@ -399,11 +412,11 @@ public class VariantRecalibrator extends RodWalker fakeData = new ExpandingArrayList(); + final List fakeData = new ExpandingArrayList<>(); double minAnn1 = 100.0, maxAnn1 = -100.0, minAnn2 = 100.0, maxAnn2 = -100.0; for( final VariantDatum datum : randomData ) { minAnn1 = Math.min(minAnn1, datum.annotations[iii]); @@ -412,8 +425,9 @@ public class VariantRecalibrator extends RodWalker data ) { - final GaussianMixtureModel model = new GaussianMixtureModel( VRAC.MAX_GAUSSIANS, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS ); + public GaussianMixtureModel generateModel( final List data, final int maxGaussians ) { + final GaussianMixtureModel model = new GaussianMixtureModel( maxGaussians, data.get(0).annotations.length, VRAC.SHRINKAGE, VRAC.DIRICHLET_PARAMETER, VRAC.PRIOR_COUNTS ); variationalBayesExpectationMaximization( model, data ); return model; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java new file mode 100644 index 000000000..3d1a9da57 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CalculateGenotypePosteriors.java @@ -0,0 +1,264 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.ArgumentCollection; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.sting.commandline.*; +import 
org.broadinstitute.variant.vcf.*; +import java.util.*; + +/** + * Calculates genotype posterior likelihoods given panel data + * + *

+ * Given a VCF with genotype likelihoods from the HaplotypeCaller, UnifiedGenotyper, or another source which provides + * -unbiased- GLs, calculate the posterior genotype state and likelihood given allele frequency information from + * both the samples themselves and input VCFs describing allele frequencies in related populations. + * + * VCFs to use for informing the genotype likelihoods (e.g. a population-specific VCF from 1000 genomes) should have + * at least one of: + * - AC field and AN field + * - MLEAC field and AN field + * - genotypes + * + * The AF field will not be used in this calculation as it does not provide a way to estimate the confidence interval + * or uncertainty around the allele frequency, while AN provides this necessary information. This uncertainty is + * modeled by a Dirichlet distribution: that is, the frequency is known up to a Dirichlet distribution with + * parameters AC1+q,AC2+q,...,(AN-AC1-AC2-...)+q, where "q" is the global frequency prior (typically q << 1). The + * genotype priors applied then follow a Dirichlet-Multinomial distribution, where 2 alleles per sample are drawn + * independently. This assumption of independent draws is the assumption of Hardy-Weinberg Equilibrium. Thus, HWE is + * imposed on the likelihoods as a result of CalculateGenotypePosteriors. + * + *

Input

+ *

+ * A VCF with genotype likelihoods, and optionally genotypes, AC/AN fields, or MLEAC/AN fields + *

+ * + *

+ * A collection of VCFs to use for informing allele frequency priors. Each VCF must have one of + * - AC field and AN field + * - MLEAC field and AN field + * - genotypes + *

+ * + *

Output

+ *

+ * A new VCF with: + * 1) Genotype posteriors added to the genotype fields ("GP") + * 2) Genotypes and GQ assigned according to these posteriors + * 3) Per-site genotype priors added to the INFO field ("PG") + *

+ * + *

Examples

+ *
+ * Inform the genotype assignment of NA12878 using the 1000G Euro panel
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -V NA12878.wgs.HC.vcf \
+ *   -VV 1000G_EUR.genotypes.combined.vcf \
+ *   -o NA12878.wgs.HC.posteriors.vcf
+ *
+ * Refine the genotypes of a large panel based on the discovered allele frequency
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -V input.vcf \
+ *   -o output.withPosteriors.vcf
+ *
+ * Apply frequency and HWE-based priors to the genotypes of a family without including the family allele counts
+ * in the allele frequency estimates
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -V input.vcf \
+ *   -o output.withPosteriors.vcf \
+ *   --ignoreInputSamples
+ *
+ * Calculate the posterior genotypes of a callset, and impose that a variant *not seen* in the external panel
+ * is tantamount to being AC=0, AN=100 within that panel
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CalculateGenotypePosteriors \
+ *   -VV external.panel.vcf \
+ *   -V input.vcf \
+ *   -o output.withPosteriors.vcf \
+ *   --numRefSamplesIfNoCall 100
+ *
+ * 
+ * + */ +public class CalculateGenotypePosteriors extends RodWalker { + + /** + * The input VCF (posteriors will be calculated for these samples, and written to the output) + */ + @ArgumentCollection + protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + /** + * Supporting external panels. Allele counts from these panels (taken from AC,AN or MLEAC,AN or raw genotypes) will + * be used to inform the frequency distribution underlying the genotype priors. + */ + @Input(fullName="supporting", shortName = "VV", doc="Other callsets to use in generating genotype posteriors", required=false) + public List> supportVariants = new ArrayList>(); + + /** + * The global prior of a variant site -- i.e. the expected allele frequency distribution knowing only that N alleles + * exist, and having observed none of them. This is the "typical" 1/x trend, modeled here as not varying + * across alleles. The calculation for this parameter is (Effective population size) * (steady state mutation rate) + * + */ + @Argument(fullName="globalPrior",shortName="G",doc="The global Dirichlet prior parameters for the allele frequency",required=false) + public double globalPrior = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + + /** + * When a variant is not seen in a panel, whether to infer (and with what effective strength) that only reference + * alleles were ascertained at that site. E.g. "If not seen in 1000Genomes, treat it as AC=0, AN=2000". 
This is + * applied across all external panels, so if numRefIfMissing = 10, and the variant is absent in two panels, this + * confers evidence of AC=0,AN=20 + */ + @Argument(fullName="numRefSamplesIfNoCall",shortName="nrs",doc="The number of homozygous reference to infer were " + + "seen at a position where an \"other callset\" contains no site or genotype information",required=false) + public int numRefIfMissing = 1; + + /** + * Rather than looking for the MLEAC field first, and then falling back to AC; first look for the AC field and then + * fall back to MLEAC or raw genotypes + */ + @Argument(fullName="defaultToAC",shortName="useAC",doc="Use the AC field as opposed to MLEAC. Does nothing if VCF lacks MLEAC field",required=false) + public boolean defaultToAC = false; + + /** + * Do not use the [MLE] allele count from the input samples (the ones for which you're calculating posteriors) + * in the site frequency distribution; only use the AC and AN calculated from external sources. + */ + @Argument(fullName="ignoreInputSamples",shortName="ext",doc="Use external information only; do not inform genotype priors by "+ + "the discovered allele frequency in the callset whose posteriors are being calculated. Useful for callsets containing "+ + "related individuals.",required=false) + public boolean ignoreInputSamples = false; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + private final boolean NO_EM = false; + + public void initialize() { + // Get list of samples to include in the output + final List rodNames = Arrays.asList(variantCollection.variants.getName()); + + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit(), rodNames); + + if ( vcfRods.size() > 1 ) + throw new IllegalStateException("Somehow more than one variant was bound?"); + + final VCFHeader header = new ArrayList<>(vcfRods.values()).get(0); // pure laziness + + if ( ! 
header.hasGenotypingData() ) { + throw new UserException("VCF has no genotypes"); + } + + if ( header.hasInfoLine(VCFConstants.MLE_ALLELE_COUNT_KEY) ) { + final VCFInfoHeaderLine mleLine = header.getInfoHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY); + if ( mleLine.getCountType() != VCFHeaderLineCount.A ) { + throw new UserException("VCF does not have a properly formatted MLEAC field: the count type should be \"A\""); + } + + if ( mleLine.getType() != VCFHeaderLineType.Integer ) { + throw new UserException("VCF does not have a properly formatted MLEAC field: the field type should be \"Integer\""); + } + } + + final TreeSet vcfSamples = new TreeSet<>(SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + // Initialize VCF header + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_POSTERIORS_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Posterior Genotype Likelihoods")); + headerLines.add(new VCFInfoHeaderLine("PG", VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Genotype Likelihood Prior")); + headerLines.add(new VCFHeaderLine("source", "CalculateGenotypePosteriors")); + + vcfWriter.writeHeader(new VCFHeader(headerLines, vcfSamples)); + } + + public Integer reduceInit() { return 0; } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null || context == null || ref == null ) { + return 0; + } + + final Collection vcs = tracker.getValues(variantCollection.variants, ref.getLocus()); + + final Collection otherVCs = tracker.getValues(supportVariants, context.getLocation()); + + final int missing = supportVariants.size() - otherVCs.size(); + + for ( VariantContext vc : vcs ) { + vcfWriter.add(PosteriorLikelihoodsUtils.calculatePosteriorGLs(vc, otherVCs, missing * numRefIfMissing, globalPrior, !ignoreInputSamples, NO_EM, defaultToAC)); + } + + return 1; + } + + 
public Integer reduce(Integer l, Integer r) { return r + l; } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java new file mode 100644 index 000000000..c9e4e44f0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtils.java @@ -0,0 +1,261 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; + +import java.util.*; + +public class PosteriorLikelihoodsUtils { + + public static VariantContext calculatePosteriorGLs(final VariantContext vc1, + final Collection resources, + final int numRefSamplesFromMissingResources, + final double globalFrequencyPriorDirichlet, + final boolean useInputSamples, + final boolean useEM, + final boolean useAC) { + if ( useEM ) + throw new IllegalArgumentException("EM loop for posterior GLs not yet implemented"); + + final Map totalAlleleCounts = new HashMap<>(); + for ( final VariantContext resource : resources ) { + addAlleleCounts(totalAlleleCounts,resource,useAC); + } + + if ( useInputSamples ) { + addAlleleCounts(totalAlleleCounts,vc1,useAC); + } + + totalAlleleCounts.put(vc1.getReference(),totalAlleleCounts.get(vc1.getReference())+numRefSamplesFromMissingResources); + + // now extract the counts of the alleles present within vc1, and in order + final double[] alleleCounts = new double[vc1.getNAlleles()]; + int alleleIndex = 0; + for ( final Allele allele : vc1.getAlleles() ) { + + alleleCounts[alleleIndex++] = globalFrequencyPriorDirichlet + ( totalAlleleCounts.containsKey(allele) ? + totalAlleleCounts.get(allele) : 0 ); + } + + final List likelihoods = new ArrayList<>(vc1.getNSamples()); + for ( final Genotype genotype : vc1.getGenotypes() ) { + likelihoods.add(genotype.hasLikelihoods() ? 
genotype.getLikelihoods().getAsVector() : null ); + } + + final List posteriors = calculatePosteriorGLs(likelihoods,alleleCounts,vc1.getMaxPloidy(2)); + + final GenotypesContext newContext = GenotypesContext.create(); + for ( int genoIdx = 0; genoIdx < vc1.getNSamples(); genoIdx ++ ) { + final GenotypeBuilder builder = new GenotypeBuilder(vc1.getGenotype(genoIdx)); + if ( posteriors.get(genoIdx) != null ) { + GATKVariantContextUtils.updateGenotypeAfterSubsetting(vc1.getAlleles(), builder, + GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles()); + builder.attribute(VCFConstants.GENOTYPE_POSTERIORS_KEY, + Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs())); + + } + newContext.add(builder.make()); + } + + final List priors = Utils.listFromPrimitives( + GenotypeLikelihoods.fromLog10Likelihoods(getDirichletPrior(alleleCounts, vc1.getMaxPloidy(2))).getAsPLs()); + + return new VariantContextBuilder(vc1).genotypes(newContext).attribute("PG",priors).make(); + } + + /** + * Given genotype likelihoods and known allele counts, calculate the posterior likelihoods + * over the genotype states + * @param genotypeLikelihoods - the genotype likelihoods for the individual + * @param knownAlleleCountsByAllele - the known allele counts in the population. 
For AC=2 AN=12 site, this is {10,2} + * @param ploidy - the ploidy to assume + * @return - the posterior genotype likelihoods + */ + protected static List calculatePosteriorGLs(final List genotypeLikelihoods, + final double[] knownAlleleCountsByAllele, + final int ploidy) { + if ( ploidy != 2 ) { + throw new IllegalStateException("Genotype posteriors not yet implemented for ploidy != 2"); + } + + final double[] genotypePriorByAllele = getDirichletPrior(knownAlleleCountsByAllele,ploidy); + final List posteriors = new ArrayList<>(genotypeLikelihoods.size()); + for ( final double[] likelihoods : genotypeLikelihoods ) { + double[] posteriorLikelihoods = null; + + if ( likelihoods != null ) { + if ( likelihoods.length != genotypePriorByAllele.length ) { + throw new IllegalStateException(String.format("Likelihoods not of correct size: expected %d, observed %d", + knownAlleleCountsByAllele.length*(knownAlleleCountsByAllele.length+1)/2,likelihoods.length)); + } + + posteriorLikelihoods = new double[genotypePriorByAllele.length]; + for ( int genoIdx = 0; genoIdx < likelihoods.length; genoIdx ++ ) { + posteriorLikelihoods[genoIdx] = likelihoods[genoIdx] + genotypePriorByAllele[genoIdx]; + } + + posteriorLikelihoods = MathUtils.toLog10(MathUtils.normalizeFromLog10(posteriorLikelihoods)); + + } + + posteriors.add(posteriorLikelihoods); + } + + return posteriors; + } + + // convenience function for a single genotypelikelihoods array. Just wraps. + protected static double[] calculatePosteriorGLs(final double[] genotypeLikelihoods, + final double[] knownAlleleCountsByAllele, + final int ploidy) { + return calculatePosteriorGLs(Arrays.asList(genotypeLikelihoods),knownAlleleCountsByAllele,ploidy).get(0); + } + + + /** + * Given known allele counts (whether external, from the sample, or both), calculate the prior distribution + * over genotype states. 
This assumes + * 1) Random sampling of alleles (known counts are unbiased, and frequency estimate is Dirichlet) + * 2) Genotype states are independent (Hardy-Weinberg) + * These assumptions give rise to a Dirichlet-Multinomial distribution of genotype states as a prior + * (the "number of trials" for the multinomial is simply the ploidy) + * @param knownCountsByAllele - the known counts per allele. For an AC=2, AN=12 site this is {10,2} + * @param ploidy - the number of chromosomes in the sample. For now restricted to 2. + * @return - the Dirichlet-Multinomial distribution over genotype states + */ + protected static double[] getDirichletPrior(final double[] knownCountsByAllele, final int ploidy) { + if ( ploidy != 2 ) { + throw new IllegalStateException("Genotype priors not yet implemented for ploidy != 2"); + } + + // multi-allelic format is + // AA AB BB AC BC CC AD BD CD DD ... + final double sumOfKnownCounts = MathUtils.sum(knownCountsByAllele); + final double[] priors = new double[knownCountsByAllele.length*(knownCountsByAllele.length+1)/2]; + int priorIndex = 0; + for ( int allele2 = 0; allele2 < knownCountsByAllele.length; allele2++ ) { + for ( int allele1 = 0; allele1 <= allele2; allele1++) { + final int[] counts = new int[knownCountsByAllele.length]; + counts[allele1] += 1; + counts[allele2] += 1; + priors[priorIndex++] = MathUtils.dirichletMultinomial(knownCountsByAllele,sumOfKnownCounts,counts,ploidy); + } + } + + return priors; + } + + private static void addAlleleCounts(final Map counts, final VariantContext context, final boolean useAC) { + final int[] ac; + if ( context.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && ! 
useAC ) { + ac = extractInts(context.getAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY)); + } else if ( context.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + ac = extractInts(context.getAttribute(VCFConstants.ALLELE_COUNT_KEY)); + } else { + ac = new int[context.getAlternateAlleles().size()]; + int idx = 0; + for ( final Allele allele : context.getAlternateAlleles() ) { + ac[idx++] = context.getCalledChrCount(allele); + } + } + + for ( final Allele allele : context.getAlleles() ) { + final int count; + if ( allele.isReference() ) { + if ( context.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) { + count = context.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY,-1) - (int) MathUtils.sum(ac); + } else { + count = context.getCalledChrCount() - (int) MathUtils.sum(ac); + } + } else { + count = ac[context.getAlternateAlleles().indexOf(allele)]; + } + if ( ! counts.containsKey(allele) ) { + counts.put(allele,0); + } + counts.put(allele,count + counts.get(allele)); + } + } + + public static int[] extractInts(final Object integerListContainingVCField) { + List mleList = null; + if ( integerListContainingVCField instanceof List ) { + if ( ((List) integerListContainingVCField).get(0) instanceof String ) { + mleList = new ArrayList<>(((List) integerListContainingVCField).size()); + for ( Object s : ((List)integerListContainingVCField)) { + mleList.add(Integer.parseInt((String) s)); + } + } else { + mleList = (List) integerListContainingVCField; + } + } else if ( integerListContainingVCField instanceof Integer ) { + mleList = Arrays.asList((Integer) integerListContainingVCField); + } else if ( integerListContainingVCField instanceof String ) { + mleList = Arrays.asList(Integer.parseInt((String)integerListContainingVCField)); + } + if ( mleList == null ) + throw new IllegalArgumentException(String.format("VCF does not have properly formatted "+ + VCFConstants.MLE_ALLELE_COUNT_KEY+" or "+VCFConstants.ALLELE_COUNT_KEY)); + + final int[] mle = new int[mleList.size()]; + + if ( ! 
( mleList.get(0) instanceof Integer ) ) { + throw new IllegalStateException("BUG: The AC values should be an Integer, but was "+mleList.get(0).getClass().getCanonicalName()); + } + + for ( int idx = 0; idx < mle.length; idx++) { + mle[idx] = mleList.get(idx); + } + + return mle; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java b/protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java new file mode 100644 index 000000000..8040ca62a --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/SequenceComplexity.java @@ -0,0 +1,100 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
/**
 * Helper class to handle sequence complexity analyses.
 *
 * @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
 */
public class SequenceComplexity {

    /**
     * Indicates which positions in a base sequence are found in homopolymers or short tandem repeats (STRs).
     *
     * <p>
     * The result is a boolean array with as many positions as the input base array.
     * </p>
     * <p>
     * Each entry of the result refers to the base at the same position in the input, where {@code true}
     * means that it forms part of a repeat.
     * </p>
     * <p>
     * For a unit length {@code j}, a repeat is a maximal run of positions {@code i} with
     * {@code bases[i] == bases[i - j]}, together with the {@code j} bases preceding the run. It is reported
     * when its length in bases divided by {@code j} (integer division) is at least
     * {@code minRepeatLengthInUnits}.
     * </p>
     *
     * @param bases the input bases.
     * @param maxRepeatUnitLength what is the largest repeat unit to consider.
     * @param minRepeatLengthInUnits what is minimum length of a repeat in units to consider it significantly long. Shorter
     *                               repeats won't be considered as such.
     * @return never {@code null} but an array with the same length as {@code bases}.
     * @throws IllegalArgumentException if {@code bases} is {@code null} or {@code maxRepeatUnitLength} is negative.
     */
    public static boolean[] findBasesInShortUnitRepeats(final byte[] bases, final int maxRepeatUnitLength,
                                                        final int minRepeatLengthInUnits) {
        if (bases == null) {
            throw new IllegalArgumentException("the input bases cannot be null");
        }
        if (maxRepeatUnitLength < 0) {
            throw new IllegalArgumentException("the maximum repeat unit length cannot be negative: " + maxRepeatUnitLength);
        }

        final boolean[] result = new boolean[bases.length];

        // repeatAbsoluteLengthCount[j - 1] holds the length in bases of the run currently open for unit
        // length j; a run always includes its leading unit, hence the initial value j.
        final int[] repeatAbsoluteLengthCount = new int[maxRepeatUnitLength];
        for (int i = 0; i < maxRepeatUnitLength; i++) {
            repeatAbsoluteLengthCount[i] = i + 1;
        }

        for (int i = 0; i < bases.length; i++) {
            for (int j = 1; j <= maxRepeatUnitLength; j++) {
                final int prevI = i - j;
                if (prevI < 0) {
                    break; // no base one unit back for this unit length, nor for any longer one.
                }
                if (bases[prevI] == bases[i]) { // repeat continuation.
                    repeatAbsoluteLengthCount[j - 1]++;
                } else {
                    if (minRepeatLengthInUnits <= (repeatAbsoluteLengthCount[j - 1] / j)) { // end of a long enough repeat.
                        for (int k = i - repeatAbsoluteLengthCount[j - 1]; k < i; k++) {
                            result[k] = true;
                        }
                    }
                    // whether or not it was long enough, a new run starts with a single unit.
                    repeatAbsoluteLengthCount[j - 1] = j;
                }
            }
        }

        // Do the marking for the runs still open at the last position in bases. Unit lengths longer than
        // the input never saw a complete unit: their counters still hold the initial value j, which
        // exceeds bases.length and would produce a negative start index, so they are skipped.
        final int longestCompleteUnit = Math.min(maxRepeatUnitLength, bases.length);
        for (int j = 1; j <= longestCompleteUnit; j++) {
            if (minRepeatLengthInUnits <= (repeatAbsoluteLengthCount[j - 1] / j)) {
                for (int k = bases.length - repeatAbsoluteLengthCount[j - 1]; k < bases.length; k++) {
                    result[k] = true;
                }
            }
        }
        return result;
    }

}
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.collections; + +import com.google.java.contract.Requires; +import com.sun.istack.internal.NotNull; + +import java.lang.reflect.Array; +import java.util.*; + +/** + * Efficient implementation for a small set of integer primitive values. + *

/**
 * Efficient implementation for a small set of integer primitive values.
 * <p>
 * It includes an increment operation, {@link #incAll(int)}, which is convenient when analyzing the
 * read-threading graphs. Nevertheless it can also be used for general purposes.
 * </p>
 * <p>
 * It does not provide O(1) look-up of its elements though. These are kept in a sorted array so look-up is
 * implemented using a binary search, O(log n). Therefore it might not be optimal for problems that require
 * large integer sets.
 * </p>
 * <p>
 * Also note that addition can be costly for large sets unless done in order: O(n).
 * </p>
 * <p>
 * {@code equals} and {@code hashCode} are not overridden, so instances compare by identity. Iterators are not
 * fail-fast: modifying the set other than through {@link Iterator#remove()} while iterating is unsupported.
 * </p>
 *
 * @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
 */
public class CountSet implements Cloneable, Set<Integer> {

    /**
     * The size of the set.
     */
    private int size;

    /**
     * Holds the elements of the set within the subrange [0 .. size - 1] in strictly ascending order.
     */
    private int[] elements;

    /**
     * Creates a copy of an existing int-set.
     * @param template the int-set to copy values from.
     * @throws NullPointerException if {@code template} is {@code null}.
     */
    public CountSet(final CountSet template) {
        elements = template.elements.clone();
        size = template.size;
    }

    /**
     * Creates a new, empty set indicating the expected maximum number of elements it will contain.
     * @param initialCapacity the desired initial capacity of the set.
     * @throws IllegalArgumentException if {@code initialCapacity} is negative.
     */
    public CountSet(int initialCapacity) {
        if (initialCapacity < 0)
            throw new IllegalArgumentException();
        elements = new int[initialCapacity];
        size = 0;
    }

    /**
     * Set the set contents to a single integer value.
     * @param value the integer value to set the set to.
     */
    public void setTo(int value) {
        ensureCapacity(1);
        size = 1;
        elements[0] = value;
    }

    /**
     * Set the content of this set to a collection of integers. Repeated values are stored only once.
     * @param values the new values to be included in the set.
     * @throws NullPointerException if {@code values} is {@code null}.
     */
    public void setTo(int ... values) {
        final int count = values.length;
        ensureCapacity(count);
        System.arraycopy(values, 0, elements, 0, count);
        Arrays.sort(elements, 0, count);
        // Compact in place so that duplicates in the input do not break the set invariant.
        int unique = Math.min(count, 1);
        for (int i = 1; i < count; i++)
            if (elements[i] != elements[unique - 1])
                elements[unique++] = elements[i];
        size = unique;
    }

    /**
     * Increase (or decrease) all elements in the set by a number.
     * <p>
     * No overflow check is performed on the individual additions.
     * </p>
     * @param delta the number to add (or subtract if negative) to all elements.
     *
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean incAll(final int delta) {
        if (size == 0 || delta == 0)
            return false;
        for (int i = 0; i < size; i++)
            elements[i] += delta;
        return true;
    }

    /**
     * Returns the smallest integer value in the set.
     *
     * @throws NoSuchElementException if the set is empty (thus there is no minimum).
     * @return the smallest integer value in the set.
     */
    public int min() {
        if (size == 0)
            throw new NoSuchElementException("cannot have a min from an empty set");
        return elements[0];
    }

    /**
     * Returns the largest integer value in the set.
     *
     * @throws NoSuchElementException if the set is empty (thus there is no maximum).
     * @return the largest integer value in the set.
     */
    public int max() {
        if (size == 0)
            throw new NoSuchElementException("cannot have a max from an empty set");
        return elements[size - 1];
    }

    /**
     * Adds a range of integer values to the collection.
     *
     * This method avoids the need to explicitly indicate all values in that range. Notice that the range is
     * fully inclusive. You can indicate a decreasing range (fromValue &gt; toValue).
     *
     * @param fromValue the first value to add in the set (inclusive).
     * @param toValue the last value to add to the set (inclusive).
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean addRange(final int fromValue, final int toValue) {
        final int lowEnd = Math.min(fromValue, toValue);
        final int highEnd = Math.max(fromValue, toValue);

        //TODO to be optimized to add missing sub-ranges in one go:
        boolean result = false;
        // A long counter is used so that the loop terminates when highEnd == Integer.MAX_VALUE.
        for (long v = lowEnd; v <= highEnd; v++)
            result = add((int) v) | result;
        return result;
    }

    /**
     * Add an integer value to the set.
     * @param value to add to the set.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean add(final int value) {
        final int pos = Arrays.binarySearch(elements, 0, size, value);
        if (pos >= 0)
            return false;
        // binarySearch encodes the insertion point of a missing key as (-(insertion point) - 1).
        final int insertPos = -pos - 1;
        ensureCapacity(size + 1);
        System.arraycopy(elements, insertPos, elements, insertPos + 1, size - insertPos);
        elements[insertPos] = value;
        size++;
        return true;
    }

    /**
     * Add an arbitrary number of integers to the set.
     *
     * @param values integers to add to the set.
     * @throws NullPointerException if {@code values} is {@code null}.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean addAll(final int ... values) {
        ensureCapacity(size + values.length);
        boolean result = false;
        for (final int v : values)
            result = add(v) | result;
        return result;
    }

    /**
     * Add all the numbers in a collection to the set, using {@link Number#intValue()} of each one.
     *
     * @param numbers the values to add.
     * @throws NullPointerException if {@code numbers} is {@code null} or contains a {@code null}.
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    @Override
    public boolean addAll(final Collection<? extends Integer> numbers) {
        ensureCapacity(size + numbers.size());
        boolean result = false;
        for (final Number n : numbers)
            result = add(n.intValue()) | result;
        return result;
    }

    /**
     * Add all values within a range in an integer array.
     * <p>
     * The range is validated before the set is touched, so a failed call leaves the set unchanged. An empty or
     * inverted range ({@code fromIndex >= toIndex}) adds nothing.
     * </p>
     *
     * @param source array where the values to add are found.
     * @param fromIndex first position from source to add (inclusive).
     * @param toIndex index after the last position in source to add (thus exclusive).
     * @throws NullPointerException if {@code source} is {@code null}.
     * @throws ArrayIndexOutOfBoundsException if the requested non-empty range is not within
     *   [0 .. source.length].
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean addAll(final int[] source, final int fromIndex, final int toIndex) {
        final int available = source.length;
        if (fromIndex >= toIndex)
            return false;
        if (fromIndex < 0 || toIndex > available)
            throw new ArrayIndexOutOfBoundsException("range [" + fromIndex + "," + toIndex
                    + ") is outside [0," + available + "]");
        ensureCapacity(size + (toIndex - fromIndex));
        boolean result = false;
        for (int i = fromIndex; i < toIndex; i++)
            result = add(source[i]) | result;
        return result;
    }

    /**
     * Add all elements present in an int-set.
     *
     * @param other the other int-set.
     *
     * @throws NullPointerException if {@code other} is {@code null}.
     * @return {@code true} if this set changed due to this operation, {@code false} otherwise.
     */
    public boolean addAll(final CountSet other) {
        return addAll(other.elements, 0, other.size);
    }

    /**
     * Checks whether an integer value is included in the set.
     * @param value the value to check.
     * @return {@code true} if the value is inside the set, {@code false} otherwise.
     */
    public boolean contains(final int value) {
        return Arrays.binarySearch(elements, 0, size, value) >= 0;
    }

    /**
     * Make sure that this int-set has capacity to handle a number of elements.
     * <p>
     * If the set has already that or greater capacity nothing would be changed. Otherwise the backing array
     * grows to the larger of twice its current length and the requested capacity.
     * </p>
     *
     * @param capacity the requested capacity.
     */
    private void ensureCapacity(final int capacity) {
        if (elements.length >= capacity)
            return;
        final int newLength = Math.max(elements.length << 1, capacity);
        elements = Arrays.copyOf(elements, newLength);
    }

    @Override
    public int size() {
        return size;
    }

    @Override
    public boolean isEmpty() {
        return size() == 0;
    }

    /**
     * Checks whether an object is in the set; only {@link Integer} instances can be.
     */
    @Override
    public boolean contains(final Object o) {
        return o instanceof Integer && contains(((Integer) o).intValue());
    }

    /**
     * Returns an iterator over the values in ascending order. Never {@code null}.
     */
    @Override
    public Iterator<Integer> iterator() {
        return new MyIterator();
    }

    /**
     * Returns the values, boxed, in ascending order. Never {@code null}.
     */
    @Override
    public Object[] toArray() {
        final Integer[] result = new Integer[size];
        for (int i = 0; i < size; i++)
            result[i] = elements[i];
        return result;
    }

    /**
     * Copies the values, boxed and in ascending order, into {@code a} or into a new array of the same component
     * type if {@code a} is too short. If {@code a} is longer than the set, the slot right after the last value
     * is set to {@code null}, as specified by {@link Collection#toArray(Object[])}.
     *
     * @throws NullPointerException if {@code a} is {@code null}.
     * @throws ArrayStoreException if the component type of {@code a} cannot hold an {@link Integer}.
     */
    @Override
    @SuppressWarnings("unchecked")
    public <T> T[] toArray(final T[] a) {
        if (a == null)
            throw new NullPointerException();

        final Class<?> componentClass = a.getClass().getComponentType();
        if (!componentClass.isAssignableFrom(Integer.class))
            throw new ArrayStoreException();

        final T[] dest = (a.length < size) ? (T[]) Array.newInstance(componentClass, size) : a;

        for (int i = 0; i < size; i++)
            dest[i] = (T) (Integer) elements[i];
        if (dest.length > size)
            dest[size] = null;
        return dest;
    }

    /**
     * Copies the content of the set into an integer array. The result can be freely modified by the invoker.
     * @return never {@code null} but a zero-length array if the set is empty.
     */
    public int[] toIntArray() {
        return Arrays.copyOf(elements, size);
    }

    /**
     * Copy the content of the set into an array.
     * @param dest the destination array.
     * @param offset where to store the first element of the set.
     * @throws NullPointerException if {@code dest} is {@code null}.
     * @throws ArrayIndexOutOfBoundsException if offset is out of range or there is not enough
     *   space after offset in the destination array to hold all values in the set.
     */
    public void copyTo(final int[] dest, int offset) {
        if (dest == null)
            throw new NullPointerException();
        if (dest.length < (size + offset))
            throw new ArrayIndexOutOfBoundsException("destination is to short");
        System.arraycopy(elements, 0, dest, offset, size);
    }

    /**
     * Copy the content of the set into the beginning of an array.
     * @param dest the destination array.
     * @throws NullPointerException if {@code dest} is {@code null}.
     * @throws ArrayIndexOutOfBoundsException if there is not enough
     *   space in the destination array to hold all values in the set.
     */
    public void copyTo(final int[] dest) {
        copyTo(dest, 0);
    }

    /**
     * Boxed counterpart of {@link #add(int)}.
     * @throws NullPointerException if {@code integer} is {@code null}.
     */
    @Override
    public boolean add(final Integer integer) {
        return add(integer.intValue());
    }

    /**
     * Removes an object from the set; anything that is not an {@link Integer} is never present.
     */
    @Override
    public boolean remove(final Object o) {
        return o instanceof Integer && remove(((Integer) o).intValue());
    }

    /**
     * Removes a single integer value from the set.
     * @param i the value to remove.
     * @return {@code true} if the set has changed as a result of this invocation, {@code false} otherwise.
     */
    public boolean remove(final int i) {
        final int pos = Arrays.binarySearch(elements, 0, size, i);
        if (pos < 0)
            return false;
        removeIndex(pos);
        return true;
    }

    @Override
    public boolean containsAll(final Collection<?> c) {
        for (final Object o : c)
            if (!contains(o))
                return false;
        return true;
    }

    /**
     * Keeps only the values that are also present in {@code c}.
     * <p>
     * Each element of {@code c} is located by binary search and its position marked; unmarked positions are then
     * squeezed out in a single pass.
     * </p>
     *
     * @return {@code true} if the set changed as a result of this invocation, {@code false} otherwise.
     */
    @Override
    public boolean retainAll(final Collection<?> c) {
        if (size == 0)
            return false;
        final boolean[] keep = new boolean[size];
        int keepCount = 0;
        for (final Object o : c) {
            if (!(o instanceof Integer))
                continue;
            final int pos = Arrays.binarySearch(elements, 0, size, ((Integer) o).intValue());
            if (pos >= 0 && !keep[pos]) {
                keep[pos] = true;
                keepCount++;
            }
        }
        if (keepCount == size)
            return false;
        int write = 0;
        for (int read = 0; read < size; read++)
            if (keep[read])
                elements[write++] = elements[read];
        size = write;
        return true;
    }

    @Override
    public boolean removeAll(final Collection<?> c) {
        if (c == this) {
            // Iterating over ourselves while shrinking would skip values; removing everything is a clear.
            final boolean changed = size > 0;
            size = 0;
            return changed;
        }
        boolean result = false;
        for (final Object o : c)
            result = remove(o) | result;
        return result;
    }

    /**
     * Removes the value stored at a position of {@link #elements}, shifting the following ones down.
     * @param idx the position to remove; must be within [0 .. size - 1].
     */
    private void removeIndex(int idx) {
        assert idx >= 0 && idx < size;
        System.arraycopy(elements, idx + 1, elements, idx, size - idx - 1);
        size--;
    }

    @Override
    public void clear() {
        size = 0;
    }

    /**
     * Returns a copy of this set which can be changed without modifying the original one.
     * @return never {@code null}.
     */
    @Override
    public CountSet clone() {
        // Implemented through the copy constructor rather than super.clone().
        return new CountSet(this);
    }

    /**
     * Returns the values in ascending order, comma separated and enclosed in braces, e.g. <code>{1,2,3}</code>;
     * <code>{}</code> for the empty set.
     */
    @Override
    public String toString() {
        if (size == 0)
            return "{}";
        final StringBuilder sb = new StringBuilder(2 + size * 10);
        sb.append('{').append(elements[0]);
        for (int i = 1; i < size; i++)
            sb.append(',').append(elements[i]);
        return sb.append('}').toString();
    }

    /**
     * Custom iterator class for {@link CountSet CountSets}.
     */
    private class MyIterator implements Iterator<Integer> {
        /** Position of the value the following call to {@link #next()} will return. */
        private int next = 0;

        /** Whether {@link #next()} has been called since the last {@link #remove()}. */
        private boolean canRemove = false;

        @Override
        public boolean hasNext() {
            return next < size;
        }

        @Override
        public Integer next() {
            if (next >= size)
                throw new NoSuchElementException();
            canRemove = true;
            return elements[next++];
        }

        @Override
        public void remove() {
            if (!canRemove)
                throw new IllegalStateException();
            // The value just returned sits right before the cursor; step back so nothing is skipped.
            removeIndex(--next);
            canRemove = false;
        }
    }

}
b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java index 282e49217..ebd167a31 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -69,10 +69,11 @@ import java.util.List; */ final class HomRefBlock { private final VariantContext startingVC; - int stop; + private int stop; private final int minGQ, maxGQ; - private List GQs = new ArrayList<>(100); - private List DPs = new ArrayList<>(100); + private int[] minPLs = null; + final private List GQs = new ArrayList<>(100); + final private List DPs = new ArrayList<>(100); private final Allele ref; /** @@ -116,9 +117,23 @@ final class HomRefBlock { public void add(final int pos, final Genotype g) { if ( g == null ) throw new IllegalArgumentException("g cannot be null"); if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); + if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); + if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs = PL.clone(); + } + } else { // otherwise take the min with the provided genotype's PLs + final int[] PL = g.getPL(); + if( PL.length == 3 ) { + minPLs[0] = Math.min(minPLs[0], PL[0]); + minPLs[1] = Math.min(minPLs[1], PL[1]); + minPLs[2] = Math.min(minPLs[2], PL[2]); + } + } stop = pos; GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. 
of 99 emission DPs.add(g.getDP()); @@ -141,6 +156,8 @@ final class HomRefBlock { public int getMinDP() { return MathUtils.arrayMin(DPs); } /** Get the median DP observed within this band */ public int getMedianDP() { return MathUtils.median(DPs); } + /** Get the min PLs observed within this band, can be null if no PLs have yet been observed */ + public int[] getMinPLs() { return minPLs; } protected int getGQUpperBound() { return maxGQ; } protected int getGQLowerBound() { return minGQ; } diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java b/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java index 4609c3209..ba58c3ae8 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeLDCalculator.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.utils.haplotype; import com.google.java.contract.Requires; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.variantcontext.Allele; @@ -100,7 +100,7 @@ public class HaplotypeLDCalculator { final Map map = new HashMap(haplotypes.size()); for( final Haplotype h : haplotypes ) { // count up the co-occurrences of the events for the R^2 calculation - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0]; + final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0]; map.put(h, 
haplotypeLikelihood); } haplotypeLikelihoodsPerSample.add(map); diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java index c298485f6..4d48ca82a 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/CalledHaplotypeBAMWriter.java @@ -46,15 +46,17 @@ package org.broadinstitute.sting.utils.haplotypeBAMWriter; -import net.sf.samtools.SAMFileWriter; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; /** * Writes a BAM containing just the reads in stratifiedReadMap aligned to their @@ -99,7 +101,9 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter { for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for ( final Map.Entry> entry : readAlleleLikelihoodMap.getLikelihoodReadMap().entrySet() ) { final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue(), allelesOfCalledHaplotypes); - writeReadAgainstHaplotype(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), bestAllele.isInformative()); + final Haplotype haplotype = alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()); + if (haplotype == null) continue; + 
writeReadAgainstHaplotype(entry.getKey(), haplotype, paddedReferenceLoc.getStart(), bestAllele.isInformative()); } } } diff --git a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 509399fd9..6d839a832 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -77,8 +77,9 @@ public abstract class HaplotypeBAMWriter { protected final static String READ_GROUP_ID = "ArtificialHaplotype"; protected final static String HAPLOTYPE_TAG = "HC"; - final ReadDestination output; - boolean writeHaplotypesAsWell = true; + private final ReadDestination output; + private boolean writeHaplotypesAsWell = true; + private boolean onlyRealignInformativeReads = false; /** * Possible modes for writing haplotypes to BAMs @@ -181,9 +182,21 @@ public abstract class HaplotypeBAMWriter { final Haplotype haplotype, final int referenceStart, final boolean isInformative) { - final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); - if ( alignedToRef != null ) - output.add(alignedToRef); + if( onlyRealignInformativeReads && !isInformative ) { + if( originalRead != null ) { + output.add(originalRead); + } + } else if (haplotype == null) { + output.add(originalRead); + return; + } else { + final GATKSAMRecord alignedToRef = createReadAlignedToRef(originalRead, haplotype, referenceStart, isInformative); + if ( alignedToRef != null ) { + output.add(alignedToRef); + } else { + output.add(originalRead); + } + } } /** @@ -305,7 +318,15 @@ public abstract class HaplotypeBAMWriter { return writeHaplotypesAsWell; } - public void setWriteHaplotypesAsWell(boolean writeHaplotypesAsWell) { + public void setWriteHaplotypesAsWell(final boolean writeHaplotypesAsWell) 
{ this.writeHaplotypesAsWell = writeHaplotypesAsWell; } + + public boolean getOnlyRealignInformativeReads() { + return onlyRealignInformativeReads; + } + + public void setOnlyRealignInformativeReads(final boolean onlyRealignInformativeReads) { + this.onlyRealignInformativeReads = onlyRealignInformativeReads; + } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java new file mode 100644 index 000000000..a693ec22d --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/ArrayLoglessPairHMM.java @@ -0,0 +1,450 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. 
+ * User: bradt + * Date: 6/11/13 + * + * PairHMM implementation that walks the read/haplotype alignment one diagonal at a time, keeping three generations + * (current, parent, grandparent) of match/insert/delete arrays, and caching mid-alignment state so that a + * following haplotype sharing a common prefix can reuse it. + */ +public class ArrayLoglessPairHMM extends PairHMM { + // Large scaling constant seeded into the delete arrays by padDeleteArrays; its log10 is subtracted from the final result + private static final double INITIAL_CONDITION = Math.pow(2, 1020); + private static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); + + // we divide the base error probability by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + + // Column indices into each row of the transition cache + private static final int matchToMatch = 0; + private static final int indelToMatch = 1; + private static final int matchToInsertion = 2; + private static final int insertionToInsertion = 3; + private static final int matchToDeletion = 4; + private static final int deletionToDeletion = 5; + + protected double[][] transition = null; // The transition probabilities cache + protected double[][] prior = null; // The prior probabilities cache + + // Array declarations for arrays implementation + private double[] currentMatchArray = null; + private double[] currentDeleteArray = null; + private double[] currentInsertArray = null; + private double[] parentMatchArray = null; + private double[] parentDeleteArray = null; + private double[] parentInsertArray = null; + private double[] grandparentMatchArray = null; + private double[] grandparentDeleteArray = null; + private double[] grandparentInsertArray = null; + + // When successive haplotypes have a common prefix, these arrays store cached info from the previous haplotype; for reading + private double[] matchCacheArray = null; + private double[] deleteCacheArray = null; + private double[] insertCacheArray = null; + + // These arrays store cache info for use with the next haplotype; for writing + private double[] nextMatchCacheArray = null; + private double[] nextDeleteCacheArray = null; + private double[] nextInsertCacheArray = null; + + // Used when caching to store our intermediate sum at the point of first difference between successive haplotypes + private double partialSum; + + + /** + * {@inheritDoc} + */ + @Override + 
public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { + super.initialize(readMaxLength, haplotypeMaxLength); + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + // Initialize all arrays + // Final Cell of array is a padding cell, initialized to zero. + currentMatchArray = new double[paddedMaxReadLength]; + currentDeleteArray = new double[paddedMaxReadLength]; + currentInsertArray = new double[paddedMaxReadLength]; + + parentMatchArray = new double[paddedMaxReadLength]; + parentDeleteArray = new double[paddedMaxReadLength]; + parentInsertArray = new double[paddedMaxReadLength]; + + grandparentMatchArray = new double[paddedMaxReadLength]; + grandparentDeleteArray = new double[paddedMaxReadLength]; + grandparentInsertArray = new double[paddedMaxReadLength]; + + // Initialize the special arrays used for caching when successive haplotypes have a common prefix + matchCacheArray = new double[paddedMaxReadLength]; + deleteCacheArray = new double[paddedMaxReadLength]; + insertCacheArray = new double[paddedMaxReadLength]; + + nextMatchCacheArray = new double[paddedMaxReadLength]; + nextDeleteCacheArray = new double[paddedMaxReadLength]; + nextInsertCacheArray = new double [paddedMaxReadLength]; + + } + + + /** + * {@inheritDoc} + */ + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + + if ( ! 
constantsAreInitialized) { + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + + // note that we initialized the constants + constantsAreInitialized = true; + } + initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + + // NOTE(review): initializePriors above already ran with the caller-supplied hapStartIndex, which is only reset to 0 below; confirm callers pass 0 whenever recacheReadValues is true + // Some housekeeping to be done if we are starting a new read + if (recacheReadValues) { + hapStartIndex = 0; + + initializeProbabilities(transition, insertionGOP, deletionGOP, overallGCP); + // note that we initialized the constants + constantsAreInitialized = true; + + // Read length may have changed, so we need to set zero-value padding at the appropriate position. + padMatchAndInsertArrays(readBases.length); + } + + // if we have not cached from a previous haplotype, clear any info we may have accumulated in a previous HMM iteration + if (hapStartIndex == 0) { + clearPreviouslyCachedInfo(readBases.length); + + // Haplotype length may have changed, so we need to set initial-value padding at the appropriate position. + padDeleteArrays(haplotypeBases.length, readBases.length); + } + + // We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. 
+ clearArraySolutionPosition(); + + // Some parameters to control behavior during the dynamic programming loop + final int maxDiagonals = readBases.length + haplotypeBases.length - hapStartIndex - 1; // Number of diagonals for a matrix = rows + cols - 1; + int startFill; // The lower bound (inclusive) of the array indices we want to over-write + int endFill; // The upper bound (exclusive) of the array indices we want to over-write + final int cacheSumIndex = nextHapStartIndex - hapStartIndex + readBases.length - 1; // The diagonal at which the running sum is saved as the partial sum to cache for the next haplotype + double finalArraySumProbabilities = partialSum; // The final answer prior to log10 correction + + // Perform dynamic programming using arrays, as if over diagonals of a hypothetical read/haplotype alignment matrix + for (int i = 1; i <= maxDiagonals; i++) { + // set the bounds for cells we wish to fill in the arrays + startFill = Math.max(readBases.length - i, 0); + endFill = Math.min(maxDiagonals - i + 1, readBases.length); + + // apply any previously cached array information + if (i <= readBases.length) + applyPreviouslyCachedInfo(startFill); + + // fill in the cells for our current arrays + updateArrays(readBases.length, hapStartIndex, nextHapStartIndex, startFill, endFill, i); + + // final probability is the sum (still in scaled linear space; log10 is taken at the end) of element [0] of the Match and Insertion state arrays + // this way we ignore all paths that ended in deletions! (huge) + // but we have to sum all the paths ending in the M and I arrays, because they're no longer extended. + // Once i >= readBases.length, array[0] corresponds to the bottom row of a [read] x [haplotype] matrix. Before this, it carries the 0's we set above. 
+ finalArraySumProbabilities += currentInsertArray[0] + currentMatchArray[0]; + + // Partial sum for caching the next haplotype: + // At the position of the last similar base between this haplotype and the next one... + // ...remember the partial sum, so that we can start here on the next hap. 
+ if (i == cacheSumIndex) + partialSum = finalArraySumProbabilities; + + rotateArrayReferences(); + } + // The cache arrays we wrote for this haplotype will be read for the next haplotype. + rotateCacheArrays(); + + // return result, removing the INITIAL_CONDITION scaling in log10 space + return Math.log10(finalArraySumProbabilities) - INITIAL_CONDITION_LOG10; + } + + /** + * Initializes the matrix that holds all the constants related to the editing + * distance between the read and the haplotype. + * + * @param haplotypeBases the bases of the haplotype + * @param readBases the bases of the read + * @param readQuals the base quality scores of the read + * @param startIndex first haplotype position whose prior column is recomputed; columns before it are left as they were (used when this haplotype shares a prefix with the previous one) + */ + public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + + // fill the prior matrix for all combinations of read x haplotype bases, from haplotype position startIndex onwards + // Entries are written at [i+1][j+1], so row 0 and column 0 are never written here and keep the 0.0 java initializes arrays with. + + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = startIndex; j < haplotypeBases.length; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + /** + * Initializes the matrix that holds all the constants related to quality scores. 
+ * + * NOTE(review): the @Ensures contract names constantsAreInitialized, but this method is static and does not set it itself; confirm the contract is intended. + * + * @param transition the transition cache to fill; row i+1 receives the six values for read position i + * @param insertionGOP insertion quality scores of the read + * @param deletionGOP deletion quality scores of the read + * @param overallGCP overall gap continuation penalty + */ + @Requires({ + "insertionGOP != null", + "deletionGOP != null", + "overallGCP != null" + }) + @Ensures("constantsAreInitialized") + protected static void initializeProbabilities(final double[][] transition, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + for (int i = 0; i < insertionGOP.length; i++) { + final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); + transition[i+1][matchToMatch] = QualityUtils.qualToProb((byte) qualIndexGOP); + transition[i+1][indelToMatch] = QualityUtils.qualToProb(overallGCP[i]); + transition[i+1][matchToInsertion] = QualityUtils.qualToErrorProb(insertionGOP[i]); + transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); + transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); + transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); + } + } + + /** + * Pad the ends of the Match and Insert arrays with 0. + * Analogous to setting zeros in the first row in the Match, Insert matrices of N2MemoryPairHMM. + * + * @param padPosition Which index in the arrays we wish to pad + */ + private void padMatchAndInsertArrays(final int padPosition) { + grandparentMatchArray[padPosition] = 0; + grandparentInsertArray[padPosition] = 0; + parentMatchArray[padPosition] = 0; + parentInsertArray[padPosition] = 0; + currentMatchArray[padPosition] = 0; + currentInsertArray[padPosition] = 0; + matchCacheArray[padPosition] = 0; + insertCacheArray[padPosition] = 0; + nextMatchCacheArray[padPosition] = 0; + nextInsertCacheArray[padPosition] = 0; + } + + /** + * Pad the Delete arrays with an initial value. Lets us have free deletions at the beginning of the alignment. 
+ * Analogous to padding the first row of the Delete matrix of N2MemoryPairHMM. + * + * @param haplotypeLength The length of the present haplotype. The padding value is INITIAL_CONDITION divided by this length + * @param padPosition Which index in the arrays we wish to pad + */ + private void padDeleteArrays(final int haplotypeLength, final int padPosition) { + final double initialValue = INITIAL_CONDITION / haplotypeLength; + + // Pad the deletion arrays. Akin to padding the first row in the deletion matrix + parentDeleteArray[padPosition] = initialValue; + grandparentDeleteArray[padPosition] = initialValue; + currentDeleteArray[padPosition] = initialValue; + deleteCacheArray[padPosition] = initialValue; + nextDeleteCacheArray[padPosition] = initialValue; + } + + /** + * We build up our solution by looking at position [0] in the match, insert arrays. Need to set these to 0 before we start. + * + */ + private void clearArraySolutionPosition() { + grandparentMatchArray[0] = 0; + grandparentInsertArray[0] = 0; + parentMatchArray[0] = 0; + parentInsertArray[0] = 0; + currentMatchArray[0] = 0; + currentInsertArray[0] = 0; + } + + /** + * Clears cached information saved from the last haplotype (and resets the cached partial sum), + * allowing us to start at the beginning of the present haplotype with initial values of 0. + * + * @param fillLength How many leading elements of each cache array we need to zero + */ + private void clearPreviouslyCachedInfo(final int fillLength) { + Arrays.fill(matchCacheArray, 0, fillLength, 0); + Arrays.fill(deleteCacheArray, 0, fillLength, 0); + Arrays.fill(insertCacheArray, 0, fillLength, 0); + + partialSum = 0; + } + + /** + * Applies cached information saved from the last haplotype, + * allowing us to start in the middle of the present haplotype. 
+ * + * @param indK the index in the arrays we wish to update with cached info; the parent arrays are seeded at indK and the grandparent arrays at indK + 1 + */ + private void applyPreviouslyCachedInfo(int indK) { + // apply caching info necessary for calculating current DELETE array values + parentMatchArray[indK] = matchCacheArray[indK]; + parentDeleteArray[indK] = deleteCacheArray[indK]; + + // apply caching info necessary for calculating current MATCH array values + grandparentMatchArray[indK + 1] = matchCacheArray[indK + 1]; + grandparentDeleteArray[indK + 1] = deleteCacheArray[indK + 1]; + grandparentInsertArray[indK + 1] = insertCacheArray[indK + 1]; + } + + /** + * Records the mid-process state of one location in the read/haplotype alignment. + * Copies the current match, delete and insert values at that index into the cache arrays written for the next haplotype we see. + * + * @param indK the index in the cache arrays we wish to store information in + */ + private void recordNewCacheInfo(int indK) { + nextMatchCacheArray[indK] = currentMatchArray[indK]; + nextDeleteCacheArray[indK] = currentDeleteArray[indK]; + nextInsertCacheArray[indK] = currentInsertArray[indK]; + } + + /** + * Update the HMM arrays for the current diagonal. 
+ * + * @param readLength The length of the read + * @param hapStartIndex An offset that tells us if we are starting in the middle of the present haplotype + * @param nextHapStartIndex An offset that tells us which base in the NEXT haplotype we need to look at to record new caching info + * @param startFill The lower bound (inclusive) of the array indices we want to over-write + * @param endFill The upper bound (exclusive) of the array indices we want to over-write + * @param iii The index indicating which diagonal of the read/haplotype alignment we are working on + */ + private void updateArrays(final int readLength, + final int hapStartIndex, + final int nextHapStartIndex, + final int startFill, + final int endFill, + final int iii) { + + // The coordinate in our priors and transition matrices corresponding to a given position in the read/haplotype alignment + int matrixRow; + int matrixCol; + + int arrayIndex; + for (arrayIndex = startFill; arrayIndex < endFill; arrayIndex++) { + // translate the array position into a row, column in the priors and transition matrices + matrixRow = readLength - arrayIndex - 1; + matrixCol = iii - matrixRow - 1 + hapStartIndex; + + // update cell for each of our current arrays. 
Prior, transition matrices are padded +1 row,col + updateArrayCell(arrayIndex, prior[matrixRow+1][matrixCol+1], transition[matrixRow+1]); + + // Set up caching for the next haplotype + // At the position of the final similar base between this haplotype and the next one, remember the mid-array values + if (matrixCol == nextHapStartIndex - 1) + recordNewCacheInfo(arrayIndex); + } + } + + /** + * Updates a cell in the HMM arrays + * + * @param indK index in the arrays to update + * @param prior the prior value for this cell (a single entry of the read x haplotype prior matrix) + * @param transition an array with the six transition probabilities relevant to this location + */ + private void updateArrayCell( final int indK, final double prior, final double[] transition) { + currentMatchArray[indK] = prior * ( grandparentMatchArray[indK + 1] * transition[matchToMatch] + + grandparentInsertArray[indK + 1] * transition[indelToMatch] + + grandparentDeleteArray[indK + 1] * transition[indelToMatch] ); + currentInsertArray[indK] = parentMatchArray[indK + 1] * transition[matchToInsertion] + parentInsertArray[indK + 1] * transition[insertionToInsertion]; + currentDeleteArray[indK] = parentMatchArray[indK] * transition[matchToDeletion] + parentDeleteArray[indK] * transition[deletionToDeletion]; + } + + /** + * To prepare for the next diagonal in our loop, each array must be bumped to an older generation. + * The old grandparent arrays are recycled as the new current arrays. + * + */ + private void rotateArrayReferences() { + double[] tempMatchArray = grandparentMatchArray; + double[] tempDeleteArray = grandparentDeleteArray; + double[] tempInsertArray = grandparentInsertArray; + + grandparentMatchArray = parentMatchArray; + grandparentDeleteArray = parentDeleteArray; + grandparentInsertArray = parentInsertArray; + + parentMatchArray = currentMatchArray; + parentDeleteArray = currentDeleteArray; + parentInsertArray = currentInsertArray; + + currentMatchArray = tempMatchArray; + currentDeleteArray = tempDeleteArray; + currentInsertArray = tempInsertArray; + } + + /** + * To prepare for 
the next haplotype, the caching info we wrote is copied into the cache-read arrays. + * The copy is made in place into the existing read arrays (both sets are allocated with the same length in initialize), so no new arrays are allocated per haplotype. + * + */ + private void rotateCacheArrays() { + System.arraycopy(nextMatchCacheArray, 0, matchCacheArray, 0, nextMatchCacheArray.length); + System.arraycopy(nextDeleteCacheArray, 0, deleteCacheArray, 0, nextDeleteCacheArray.length); + System.arraycopy(nextInsertCacheArray, 0, insertCacheArray, 0, nextInsertCacheArray.length); + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java new file mode 100644 index 000000000..b80036bb2 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CnyPairHMM.java @@ -0,0 +1,302 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.io.File; +import java.lang.reflect.Field; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +public final class CnyPairHMM extends PairHMM implements BatchPairHMM { + private static class HmmInput { + public byte[] readBases; + public byte[] readQuals; + public byte[] insertionGOP; + public byte[] deletionGOP; + public byte[] overallGCP; + public List haplotypes; + } + + public static class ResultQueue { + private int offset; + private List batchResults; + + public ResultQueue() { + batchResults = new LinkedList<>(); + offset = 0; + } + + public void push(double[] results) { + batchResults.add(results); + } + + public double pop() { + double[] results = batchResults.get(0); + double top = results[offset++]; + if (offset == results.length) { + batchResults.remove(0); + offset = 0; + } + return top; + } + } + + final static String libPath = "/opt/convey/personalities/32100.1.1.1.0"; + final static String libName = "gmvhdl_gatk_hmm"; + + private static boolean loaded = false; + private List batchRequests = new LinkedList<>(); + private ResultQueue resultQueue = new ResultQueue(); + + static public boolean isAvailable() { + if (!loaded) { + File library = new File(libPath + "/lib" + libName + ".so"); + return library.exists(); + } + return true; + } + + private native void initFpga(); + private native int dequeueRequirement(int reflen, int readlen); + private native int enqueue(byte[] haplotypeBases, + byte[] readBases, + byte[] readQuals, + byte[] insertionGOP, + byte[] deletionGOP, + byte[] overallGCP, + int hapStartIndex, + boolean recacheReadValues); + private native int flushQueue(); + 
private native int dequeue(double[] results); + private native double softHmm(byte[] haplotypeBases, + byte[] readBases, + byte[] readQuals, + byte[] insertionGOP, + byte[] deletionGOP, + byte[] overallGCP, + int hapStartIndex, + boolean recacheReadValues); + + public native void reportStats(); + + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + if (!loaded) { + addLibraryPath(libPath); + System.loadLibrary(libName); + initFpga(); + loaded = true; + System.out.println("FPGA HMM Initialized"); + } + } + + public void batchAdd(final List haplotypes, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP) { + final int numHaplotypes = haplotypes.size(); + HmmInput test = new HmmInput(); + test.readBases = readBases; + test.readQuals = readQuals; + test.insertionGOP = insertionGOP; + test.deletionGOP = deletionGOP; + test.overallGCP = overallGCP; + test.haplotypes = haplotypes; + batchRequests.add(test); + for (int jjj = 0; jjj < numHaplotypes; jjj++) { + final boolean recacheReadValues = (jjj == 0); + final Haplotype haplotype = haplotypes.get(jjj); + enqueuePrepare(haplotype.getBases(), readBases); + if (enqueue(haplotype.getBases(), readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, recacheReadValues) == 0) + throw new RuntimeException("FPGA queue overflow in batchAdd"); + } + } + + public double[] batchGetResult() { + double[] results; + + int n = flushQueue(); + if (n > 0) { + results = new double[n]; + if (dequeue(results) != n) + System.out.println("queue underflow in enqueuePrepare"); + resultQueue.push(results); + } + + final HmmInput test = batchRequests.remove(0); + final int numHaplotypes = test.haplotypes.size(); + results = new double[numHaplotypes]; + for (int jjj = 0; jjj < numHaplotypes; jjj++) { + results[jjj] = resultQueue.pop(); + if (results[jjj]<-60.0) { + final Haplotype haplotype = test.haplotypes.get(jjj); + 
results[jjj]=softHmm(haplotype.getBases(), test.readBases, test.readQuals, test.insertionGOP, test.deletionGOP, test.overallGCP, 0, true); + } + } + return results; + } + + /** + * {@inheritDoc} + */ + @Override + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap){ + + // initialize the pairHMM if necessary + if (! initialized) { + int readMaxLength = findMaxReadLength(reads); + int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + initialize(readMaxLength, haplotypeMaxLength); + } + + // Pass the read bases/quals, and the haplotypes as a list into the HMM + performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); + + // Get the log10-likelihoods for each read/haplotype ant pack into the results map + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap); + + return likelihoodMap; + } + + private void collectLikelihoodResults(List reads, Map alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { + for(final GATKSAMRecord read : reads){ + final double[] likelihoods = batchGetResult(); + int jjj = 0; + for (Allele allele : alleleHaplotypeMap.keySet()){ + final double log10l = likelihoods[jjj]; + likelihoodMap.add(read, allele, log10l); + jjj++; + } + } + } + + private void performBatchAdditions(List reads, Map alleleHaplotypeMap, Map GCPArrayMap) { + final List haplotypeList = getHaplotypeList(alleleHaplotypeMap); + for(final GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); + } + } + + + protected double 
subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex) { + return 0.0; + } + + private List getHaplotypeList(Map alleleHaplotypeMap){ + final List haplotypeList = new LinkedList<>(); + for (Allele a : alleleHaplotypeMap.keySet()){ + haplotypeList.add(alleleHaplotypeMap.get(a)); + } + return haplotypeList; + } + + private void enqueuePrepare(byte[] haplotypeBases, byte[] readBases) { + double[] results = null; + int n = dequeueRequirement(haplotypeBases.length, readBases.length); + if (n>0) { + results = new double[n]; + if (dequeue(results)!=n) + System.out.println("queue underflow in enqueuePrepare"); + } else if (n<0) { + n = flushQueue(); + if (n > 0) { + results = new double[n]; + if (dequeue(results) != n) + System.out.println("queue underflow in enqueuePrepare"); + } + } + + if (results != null) + resultQueue.push(results); + } + + public static void addLibraryPath(String pathToAdd) { + try { + final Field usrPathsField = ClassLoader.class.getDeclaredField("usr_paths"); + usrPathsField.setAccessible(true); + + //get array of paths + final String[] paths = (String[])usrPathsField.get(null); + + //check if the path to add is already present + for(String path : paths) { + if(path.equals(pathToAdd)) { + return; + } + } + + //add the new path + final String[] newPaths = Arrays.copyOf(paths, paths.length + 1); + newPaths[newPaths.length-1] = pathToAdd; + usrPathsField.set(null, newPaths); + } catch (Exception ex) { + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java new file mode 100644 index 000000000..fb9dda8b2 --- /dev/null +++ 
b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FastLoglessPairHMM.java @@ -0,0 +1,820 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +/** + * Fast partial PairHMM backed on the standard Logless PairHMM + * + */ +public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { + + + /** + * Initial read length capacity. + */ + private static final int INITIAL_READ_LENGTH_CAPACITY = 200; + + /** + * Initial haplotype length capacity. + */ + private static final int INITIAL_HAPLOTYPE_LENGTH_CAPACITY = 400; + + + /** + * Holds the current read capacity. + *

It can only go up over time.

+ */ + private int readCapacity = INITIAL_READ_LENGTH_CAPACITY; + + /** + * Holds the current haplotype length capacity. + *

It can only go up over time.

+ */ + private int haplotypeCapacity = INITIAL_HAPLOTYPE_LENGTH_CAPACITY; + + private int maxToCol; + private int haplotypeLength; + + /** + * Returns the currently loaded read base qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + public byte[] getReadQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readQuals; + } + + /** + * Returns the currently loaded read insertion qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadInsQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readInsQuals; + } + + /** + * Returns the currently loaded read deletion qualities. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadDelQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readDelQuals; + } + + /** + * Returns the currently loaded read gap extension penalty.. + * + * @throws IllegalStateException if no read was previously loaded using {@link #loadRead}. + * @return never {@code null}. + */ + @SuppressWarnings("unused") + public byte[] getReadGepQuals() { + if (readQuals == null) + throw new IllegalStateException("no read was loaded onto the pairhmm calculator"); + return readGepQuals; + } + + + /** + * Creates a new pair-hmm calculator instance give the gap continuation penalty. + * + * @param gcp the gap-continuation penalty. 
+ */ + public FastLoglessPairHMM(final byte gcp) { + constantGCP = gcp; + initialize(readCapacity,haplotypeCapacity); + } + + @Override + public byte getGapExtensionPenalty() { + return constantGCP; + } + + + @Override + public double subComputeReadLikelihoodGivenHaplotypeLog10(final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, final int nextHapStartIndex) { + this.readBases = readBases; + this.haplotypeBases = haplotypeBases; + this.haplotypeLength = haplotypeBases.length; + return super.subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases,readBases,readQuals, + insertionGOP,deletionGOP,overallGCP,hapStartIndex,recacheReadValues,nextHapStartIndex); + } + + /** + * Implement the last step summation to calculate the total likelihood. + * + * @param row number of the last row of the pair-hmm where the likelihood values are present. + * @param fromCol inclusive first column to include in the summation. + * @param toCol exclusive last column to include in the summation. + * @return 0 or less. + */ + protected double finalLikelihoodCalculation(final int row, + final int fromCol, final int toCol) { + + final double divider = Math.max(1,2 *(toCol - fromCol)); + final double dividerInverse = 1.0 / divider; + double finalLikelihood = 0; + + for (int j = fromCol; j < toCol; j++) { + finalLikelihood += matchMatrix[row][j] * dividerInverse; + finalLikelihood += insertionMatrix[row][j] * dividerInverse; + } + return StrictMath.log10(finalLikelihood) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + + /** + * Initialize the matrix values for a problem including the trailing end of the read. + * + *

+ * Notice that you can improve performance by omitting filling reusable values from + * previous haplotype calculations. You can set {@code haplotypeStartOffset} to skip + * those columns. + *

+ * + * @param readStart inclusive first position of the read used in the calculations. + * @param readEnd exclusive last position of the read considered in the calculations. + * @param haplotypeStartOffset offset of the haplotype right after the reusable prefix + * from previous calls. + * + * + */ + protected void initializeMatrixValuesForTrailingProblem(final int readStart, final int readEnd, + final int haplotypeStartOffset) { + + @SuppressWarnings("all") + final int zeroRow = readStart; + final int toRow = readEnd + 1; + final int toCol = haplotypeLength + 1; + + // fill first row with -Inf fot M and I but not for Deletion if leading + // to allow for free deletions at the beginning. + if (readStart == 0) { + // First row initialization: + Arrays.fill(matchMatrix[zeroRow],haplotypeStartOffset,toCol,0); + Arrays.fill(deletionMatrix[zeroRow],haplotypeStartOffset,toCol,INITIAL_CONDITION); + + if (haplotypeStartOffset == 0) + for (int i = zeroRow + 1; i < toRow; i++) + insertionMatrix[i][0] = matchMatrix[i][0] = deletionMatrix[i][0] = 0; + + } else { + Arrays.fill(matchMatrix[zeroRow], Math.max(1,haplotypeStartOffset), toCol,0); + Arrays.fill(insertionMatrix[zeroRow], haplotypeStartOffset, toCol,0); + if (haplotypeStartOffset == 0) { + matchMatrix[zeroRow][0] = INITIAL_CONDITION; + deletionMatrix[zeroRow][0] = 0; + } + if (haplotypeStartOffset <= 1) deletionMatrix[zeroRow][1] = matchMatrix[zeroRow][1] * transition[zeroRow][matchToDeletion]; + for (int i = Math.max(haplotypeStartOffset,2); i < toCol; i++) { + deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] + * transition[zeroRow][deletionToDeletion]; + } + + if (haplotypeStartOffset == 0) { + matchMatrix[zeroRow + 1][0] = deletionMatrix[zeroRow + 1][0] = 0; + insertionMatrix[zeroRow + 1][0] = matchMatrix[zeroRow][0] * transition[zeroRow + 1][matchToInsertion]; + + + for (int i = zeroRow + 2; i < toRow; i++) { + matchMatrix[i][0] = deletionMatrix[i][0] = 0; + insertionMatrix[i][0] = insertionMatrix[i - 
1][0] + * transition[i][insertionToInsertion]; + } + } + } + } + + /** + * Initializes calculation matrices give the characteristics of the next and previous problems. + * @param currentProblem reference to the Lk calculation problem we are dealing currently. + * @param previousProblem reference to the Lk calculation problem that has been solved just before. + * + */ + protected void initializeMatrixValues(final Problem currentProblem, final Problem previousProblem) { + if (previousProblem != null && + previousProblem.readStart == currentProblem.readStart && + previousProblem.hapStart == currentProblem.hapStart && + maxToCol >= currentProblem.hapEnd + 1) + return; + + final int zeroRow = currentProblem.readStart; + final int zeroCol = currentProblem.hapStart; + final int toRow = currentProblem.readEnd + 1; + final int toCol = currentProblem.hapEnd + 1; + maxToCol = toCol; + + // fill first row with -Inf fot M and I but not for Deletion if leading + // to allow for free deletions at the beginning. + if (currentProblem.leading) { + // First row initialization: + Arrays.fill(matchMatrix[zeroRow],zeroCol,toCol,0); + Arrays.fill(deletionMatrix[zeroRow],zeroCol,toCol,INITIAL_CONDITION); + + for (int i = zeroRow + 1; i < toRow; i++) + insertionMatrix[i][zeroCol] = matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; + + } else { // If not leading set the appropriate matching 1.0 prob and + // deletion + extension. 
+ + Arrays.fill(matchMatrix[zeroRow], zeroCol + 1, toCol,0); + Arrays.fill(insertionMatrix[zeroRow], zeroCol, toCol,0); + matchMatrix[zeroRow][zeroCol] = INITIAL_CONDITION; + deletionMatrix[zeroRow][zeroCol] = 0; + deletionMatrix[zeroRow][zeroCol + 1] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow][matchToDeletion]; + for (int i = zeroCol + 2; i < toCol; i++) { + deletionMatrix[zeroRow][i] = deletionMatrix[zeroRow][i - 1] + * transition[zeroRow][deletionToDeletion]; + } + + matchMatrix[zeroRow + 1][zeroCol] = deletionMatrix[zeroRow + 1][zeroCol] = 0; + insertionMatrix[zeroRow + 1][zeroCol] = matchMatrix[zeroRow][zeroCol] * transition[zeroRow + 1][matchToInsertion]; + + for (int i = zeroRow + 2; i < toRow; i++) { + matchMatrix[i][zeroCol] = deletionMatrix[i][zeroCol] = 0; + insertionMatrix[i][zeroCol] = insertionMatrix[i - 1][zeroCol] + * transition[i][insertionToInsertion]; + } + } + } + + /** + * Constant gap-continuation-penalty. + */ + private final byte constantGCP; + + /** + * Currently loaded haplotype base sequence. + */ + private byte[] haplotypeBases; + + /** + * Currently loaded read base sequence. + */ + private byte[] readBases; + + /** + * Read qualities. + */ + private byte[] readQuals; + + /** + * Read insertion qualities. + */ + private byte[] readInsQuals; + + /** + * Read deletion qualities. + */ + private byte[] readDelQuals; + + /** + * Read gap-extension-penalties. + */ + private byte[] readGepQuals; + + /** + * Cached results. + */ + private Map cachedResults = new HashMap<>(); + + /** + * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. + * + * @param read the target read. + * @throws NullPointerException if {@code read} is null. 
+ */ + @Override + public void loadRead(final GATKSAMRecord read) { + loadRead(read.getReadBases(),read.getBaseQualities(),read.getBaseInsertionQualities(),read.getBaseDeletionQualities(),read.getMappingQuality()); + } + + /** + * Loads the read that is going to be evaluated in following calls to {@link #calculateLocalLikelihoods}. + * + * @param readBases the read bases. + * @param readQuals the read base call quality scores. + * @param readInsQuals the read insertion quality scores. + * @param readDelQuals the read deletion quality scores. + * @param mq the read mapping quality score. + * @throws NullPointerException if any of the arrays passed is {@code null}. + * @throws IllegalArgumentException if the arrays passed have incompatible sizes. + */ + public void loadRead(final byte[] readBases, final byte[] readQuals, final byte[] readInsQuals, final byte[] readDelQuals, int mq) { + // TODO This is a copy&paste from PairHMM*Engine read data preparation code. + // TODO It is simply to difficult to share the code without changing that class and I don't want + // TODO to do so for now. 
+ if (readBases.length != readQuals.length) throw new IllegalArgumentException("the read quality array length does not match the read base array length"); + if (readBases.length != readInsQuals.length) throw new IllegalArgumentException("the read insert quality array length does not match the read base array length"); + if (readBases.length != readDelQuals.length) throw new IllegalArgumentException("the read deletion quality length does not match the read base array length"); + maxToCol = 0; + + if (readBases.length > readCapacity) { + readCapacity = readBases.length; + initialize(readCapacity,haplotypeCapacity); + } + paddedReadLength = readBases.length + 1; + final byte[] overallGCP = new byte[readBases.length]; + Arrays.fill(overallGCP, constantGCP); // Is there a way to derive + + for (int kkk = 0; kkk < readQuals.length; kkk++) { + readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], + mq); // cap base quality by mapping + readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? 
QualityUtils.MIN_USABLE_Q_SCORE + : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); + readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); + readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); + } + this.readBases = readBases; + this.readQuals = readQuals; + this.readInsQuals = readInsQuals; + this.readDelQuals = readDelQuals; + this.readGepQuals = overallGCP; + initializeProbabilities(transition,readInsQuals, readDelQuals, overallGCP); + if (haplotypeBases != null) + fillPriorsTable(0); + cachedResults.clear(); + } + + @Override + public void loadHaplotypeBases(final byte[] haplotypeBases) { + if (readBases == null) + throw new IllegalStateException( + "no read was loaded before the haplotype"); + this.haplotypeBases = haplotypeBases.clone(); + haplotypeLength = haplotypeBases.length; + paddedHaplotypeLength = haplotypeLength; + if (haplotypeCapacity < haplotypeLength) { + haplotypeCapacity = haplotypeLength; + initialize(readCapacity,haplotypeCapacity); + initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); + } + initializePriors(this.haplotypeBases, readBases, readQuals, 0); + } + + + /** + * Changes only the suffix of the currently loaded haplotype. + * + *

+ * If from is 0, this is equivalent to calling {@link #loadHaplotypeBases(byte[])} directly. + *

+ * @param from first position on the current haplotype to substitute with the new suffix. + * It can be up to the length of the haplotype in such case this operation is in + * effect just extending that haplotype. + * @param suffix the new bases for the end part of the current haplotype. + * @param suffixFrom inclusive first position of the actual suffix within the {@code suffix} array. + * @param suffixTo exclusive last position of the actual suffix within the {@code suffix} array. + * + * @throws IllegalStateException if no read was loaded with {@link #loadRead}. + * @throws IllegalArgumentException if from is more than 0 but no haplotype was loaded previously or if indices passed are inconsistent. + * @throws ArrayIndexOutOfBoundsException if indices passed are outside valid ranges. + */ + public void changeHaplotypeSuffix(final int from, final byte[] suffix, final int suffixFrom, final int suffixTo) { + if (readBases == null) + throw new IllegalStateException( + "no read was loaded before the haplotype"); + if (haplotypeBases == null && from > 0) + throw new IllegalArgumentException("from cannot be larger than 0 if no haplotype bases was previously loaded"); + if (suffixFrom < 0) + throw new ArrayIndexOutOfBoundsException("the suffix from index cannot be negative"); + if (suffixTo > suffix.length) + throw new ArrayIndexOutOfBoundsException("the suffix to index cannot be larger than the suffix array length"); + if (suffixFrom > suffixTo) + throw new IllegalArgumentException("the suffix to index cannot be smaller than the suffix from index"); + if (from > haplotypeLength) + throw new IllegalArgumentException("the from index cannot be greater than the current haplotype length"); + if (from < 0) + throw new IllegalArgumentException("the from index cannot be negative"); + + int startIndex = from; + if (haplotypeBases == null) { + haplotypeBases = Arrays.copyOfRange(suffix,suffixFrom,suffixTo); + haplotypeLength = suffixTo - suffixFrom; + } else { + final int 
newLength = from + suffixTo - suffixFrom; + if (haplotypeBases.length < newLength) + haplotypeBases = Arrays.copyOf(haplotypeBases,newLength); + System.arraycopy(suffix,suffixFrom,haplotypeBases,from,newLength - from); + haplotypeLength = newLength; + } + paddedHaplotypeLength = haplotypeLength + 1; + if (haplotypeCapacity < haplotypeLength) { + haplotypeCapacity = haplotypeLength; + initialize(readCapacity,haplotypeCapacity); + initializeProbabilities(transition, readInsQuals, readDelQuals, readGepQuals); + startIndex = 0; + } + //startIndex = 0; + fillPriorsTable(startIndex); + } + + /** + * Returns the bases of the current haplotype. + * + * @throws IllegalStateException if no haplotype was loaded previously + * @return never {@code null} + */ + public byte[] getHaplotypeBases() { + if (haplotypeBases == null) + throw new IllegalStateException(); + return Arrays.copyOfRange(haplotypeBases,0,haplotypeLength); + } + + /** + * Returns a debug representation of the pair-hmm. + * @return never {@code null}. + */ + public String toString() { + return "" + haplotypeLength + ":" + new String(Arrays.copyOfRange(haplotypeBases,0,haplotypeLength)); + } + + @Override + protected void initializePriors(final byte[] hapBases, final byte[] readBases, final byte[] baseQuals, final int idx) { + haplotypeBases = hapBases; + haplotypeLength = haplotypeBases.length; + this.readBases = readBases; + this.readQuals = baseQuals; + fillPriorsTable(idx); + } + + /** + * Fills the prior table up. + * + *

+ * It accepts an argument to save unnecessary prefix filling up. + *

+ * + * @param idx first position in the haplotype to start filling from. + */ + protected void fillPriorsTable(final int idx) { + for (int i = 0; i < readBases.length; i++) { + final byte x = readBases[i]; + final byte qual = readQuals[i]; + for (int j = idx; j < haplotypeLength; j++) { + final byte y = haplotypeBases[j]; + prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); + } + } + } + + + /** + * Decorates haplotype set with their likelihoods as compared with the currently loaded read. + * + * + * @param readStart inclusive start position of the targeted section of the read. + * @param readEnd exclusive end position just beyond the targeted section of the read. + * @param haplotypes in/out set of haplotypes. + */ + public void calculateLocalLikelihoods(final int readStart, final int readEnd, final PairHMMReadyHaplotypes haplotypes) { + final PairHMMReadyHaplotypes.Iterator entryIterator = haplotypes.iterator(); + boolean isFirst = true; + while (entryIterator.hasNext()) { + entryIterator.next(); + final int startIndex = entryIterator.startIndex(); + final byte[] bases = entryIterator.bases(); + changeHaplotypeSuffix(startIndex,bases,startIndex,bases.length); + final double likelihood = calculateLikelihood(readStart, readEnd, startIndex, isFirst); + isFirst = false; + entryIterator.setLikelihood(likelihood); + } + } + + + + @Override + public double calculateLocalLikelihood(final int readStart, final int readEnd, + final int hapStart, final int hapEnd, final boolean kmerMatch) { + if (readBases == null || haplotypeBases == null) + throw new IllegalStateException("read or haplotype was not loaded"); + final int hapSegmentLength = hapEnd - hapStart; + final int readSegmentLength = readEnd - readStart; + // trivial case when there is a single base match. 
+ if (kmerMatch) { + return calculateLocalLikelihoodsExactMatch(readStart, hapStart, hapSegmentLength, readSegmentLength); + } else if (hapSegmentLength == readSegmentLength) { + if (hapSegmentLength == 0) { + return calculateLocalLikelihoodEmptySquare(readStart, readEnd); + } else if (hapSegmentLength == 1) { + return calculateLocalLikelihoodSingleBase(readStart, readEnd, hapStart); + } else { // general (slower) solution. + return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); + } + } else if (hapSegmentLength == 0) { // must be full insertion we + return calculateLocalLikelihoodInsertion(readStart, readEnd); + } else if (readSegmentLength == 0) { // full deletion. + return calculateLocalLikelihoodDeletion(readStart, hapStart, hapEnd); + } else { // general (slower) solution. + return calculateLocalLikelihoodsGeneral(readStart, readEnd, hapStart, hapEnd); + } + } + + /** + * Fast likelihood when the pair-hmm represents a deletion in the read. + */ + private double calculateLocalLikelihoodDeletion(final int readStart, final int hapStart, final int hapEnd) { + double result = INITIAL_CONDITION; + if (readStart > 0) { // no penalty if at the beginning. + result *= transition[readStart][matchToDeletion]; + result *= + StrictMath.pow(transition[readStart][deletionToDeletion],hapEnd - hapStart - 1); + result *= transition[readStart][indelToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + + /** + * Fast likelihood when the pair-hmm represents a insertion in the read. 
+ */ + private double calculateLocalLikelihoodInsertion(final int readStart, final int readEnd) { + double result = INITIAL_CONDITION; + result *= transition[readStart + 1][matchToInsertion]; + for (int i = readStart + 1; i < readEnd; i++) { + result *= transition[i + 1][insertionToInsertion]; + } + if (readEnd < readBases.length) { + result *= transition[readEnd + 1][indelToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Single base mismatch fast likelihood calculation. + */ + private double calculateLocalLikelihoodSingleBase(final int readStart, final int readEnd, final int hapStart) { + double result = INITIAL_CONDITION; + result *= prior[readStart + 1][hapStart + 1]; + if (readStart > 0) { + result *= transition[readStart + 1][matchToMatch]; + } + if (readEnd < readBases.length) { + result *= transition[readEnd + 1][matchToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Empty square Pair-hmm. + */ + private double calculateLocalLikelihoodEmptySquare(final int readStart, final int readEnd) { + double result = INITIAL_CONDITION; + if (readStart > 0 && readEnd < readBases.length) { + result *= transition[readStart + 1][matchToMatch]; + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Likelihood assuming that there is a exact match between both sequences: read and haplotype + */ + private double calculateLocalLikelihoodsExactMatch(final int readStart, final int hapStart, final int hapSegmentLength, final int readSegmentLength) { + double result = INITIAL_CONDITION; + if (hapSegmentLength == 1) { + result *= prior[readStart + 1][hapStart + 1]; + } else { + for (int i = 0; i < readSegmentLength; i++) { + result *= prior[readStart + i + 1][hapStart + i + 1]; + if (i > 0) { + result *= transition[readStart + i + 1][matchToMatch]; + } + } + } + return StrictMath.log10(result) - INITIAL_CONDITION_LOG10; + } + + /** + * Revert to a general pair-hmm solution. 
+ */ + private double calculateLocalLikelihoodsGeneral(final int readStart, final int readEnd, final int hapStart, final int hapEnd) { + final Problem p = new Problem(readStart, readEnd, hapStart, hapEnd); + final Double cachedCost = cachedResults.get(p); + if (cachedCost != null) { + return cachedCost; + } + double cost = calculateLocalLikelihoodGeneral(p); + cachedResults.put(p, cost); + return cost; + } + + /** + * Resolve the regular full pair-hmm. + * + *

+ * With the possibility of reuse the previous haplotype common prefix by using + * a startIndex which is greater than 0. + */ + private double calculateLikelihood(final int readStart, final int readEnd, final int startIndex, final boolean initializeEdges) { + final int edgeStart = initializeEdges ? 0 : startIndex + 1; + initializeMatrixValuesForTrailingProblem(readStart, readEnd, edgeStart); + updateTable(readStart + 1, readEnd + 1, startIndex + 1, haplotypeLength + 1); + if (readEnd == readBases.length) + return finalLikelihoodCalculation(readEnd,0,haplotypeLength + 1) - (readStart == 0 ? StrictMath.log10(haplotypeLength) : 0); + else { + final double divider = 3.0; + final double dividerInverted = 1.0 / divider; + return StrictMath.log10(matchMatrix[readEnd][haplotypeLength] + * transition[readEnd][matchToMatch] * dividerInverted + + insertionMatrix[readEnd][haplotypeLength] + * transition[readEnd][indelToMatch] * dividerInverted + + deletionMatrix[readEnd][haplotypeLength] + * transition[readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + } + + + private double calculateLocalLikelihoodGeneral(final Problem p) { + + initializeMatrixValues(p,null); + // int fromCol = p.hapStart + 1; + // if (previousProblem == null) { + // fromCol = p.hapStart + 1; + // } else { + // final int sharedPrefix = previousProblem.followerStartIndex(p); + // if (sharedPrefix >= 0) + // fromCol = sharedPrefix + 1; + // else + // fromCol = p.hapStart + 1; + // } + // previousProblem = p; + + updateTable(p.readStart + 1, p.readEnd + 1, + p.hapStart + 1, p.hapEnd + 1); + + if (p.trailing) { + return finalLikelihoodCalculation(p.readEnd,p.hapStart,p.hapEnd + 1) + - (p.leading ? 
StrictMath.log10(p.hapEnd - p.hapStart) : 0); + } else { + final double divider = 3.0; + final double dividerInverted = 1.0 / divider; + return StrictMath.log10(matchMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][matchToMatch] * dividerInverted + + insertionMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][indelToMatch] * dividerInverted + + deletionMatrix[p.readEnd][p.hapEnd] + * transition[p.readEnd][indelToMatch] * dividerInverted) - INITIAL_CONDITION_LOG10 + StrictMath.log10(divider); + } + } + + private void updateTable(final int rowFrom, final int rowTo, + final int colFrom, final int colTo) { + + for (int i = rowFrom; i < rowTo; i++) { + for (int j = colFrom; j < colTo; j++) { + updateCell(i, j, prior[i][j], transition[i]); + } + } + + } + + /** + * Holds the properties of a pair-hmm computational problem. + */ + public class Problem { + private final byte[] haplotypeSegment; + private final int readStart; + private final int readEnd; + private final int hapStart; + private final int hapEnd; + private final int hashCode; + private final boolean trailing; + private final boolean leading; + + /** + * Construct a new project object. + * @param start inclusive start position on the read to consider. + * @param end exclusive after last position on the read to consider. + * @param hapStart inclusive start position on the haplotype to consider. + * @param hapEnd exclusive after last position on the haplotype to consider. 
+ */ + public Problem(final int start, final int end, final int hapStart, + final int hapEnd) { + if (start < 0 || start > readBases.length) + throw new IllegalArgumentException("bad start index " + start); + if (end < start || end > readBases.length) + throw new IllegalArgumentException("bad end index " + end + " < " + start + " or " + end + " > " + readBases.length); + if (hapStart < 0 || hapStart > haplotypeLength) + throw new IllegalArgumentException("bad hap start index " + + hapStart + " is larger than the haplotypeLength " + haplotypeLength); + if (hapEnd < hapStart || hapEnd > haplotypeLength) + throw new IllegalArgumentException("bad hap end index " + + hapEnd + " outside [" + hapStart + "," + + haplotypeLength + "]"); + + haplotypeSegment = Arrays.copyOfRange(haplotypeBases, hapStart, hapEnd); + readStart = start; + readEnd = end; + this.hapStart = hapStart; + this.hapEnd = hapEnd; + trailing = readEnd == readBases.length; + leading = readStart == 0; + + hashCode = ((start * 31 + end) * 31 + Arrays.hashCode(haplotypeSegment) * 31); + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public boolean equals(Object o) { + if (o == this) + return true; + else if (o == null) + return false; + else if (o.getClass() != this.getClass()) + return false; + else { + final Problem p = (Problem) o; + return (p.hashCode == this.hashCode) && (p.readStart == this.readStart) && (p.readEnd == this.readEnd) && Arrays.equals(haplotypeSegment, p.haplotypeSegment); + } + } + + + } + + /** + * Returns the currently loaded read base calls. + * @return {@code never null}. 
+ */ + public byte[] getReadBases() { + if (readBases == null) + throw new IllegalStateException("no read was previously loaded."); + return readBases; + } + + +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java new file mode 100644 index 000000000..152274947 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/FlexibleHMM.java @@ -0,0 +1,105 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * API for the fast (partial) HMM calculation engine. + */ +public interface FlexibleHMM { + + /** + * Load a read into the HMM calculation matrices. + * @param read the read record to load into the HMM calculating engine. + */ + public void loadRead(GATKSAMRecord read); + + /** + * Returns the current read bases. + * + * @return never null. 
+ */ + public byte[] getReadBases(); + + /** + * Loads the haplotype bases into the HMM calculation matrices. + * @param haplotype the haplotype sequence. + * + * @throws IllegalStateException if no read has been previously loaded. + * @throws NullPointerException if {@code haplotype} is {@code null}. + */ + public void loadHaplotypeBases(byte[] haplotype); + + /** + * Resolve the partial Fast PairHMM for a section of the read and haplotype. + * @param readFrom inclusive offset of the first position on the read. + * @param readTo exclusive offset of the last position on the read. + * @param haplotypeFrom inclusive offset of the first position on the haplotype. + * @param haplotypeTo exclusive offset of the last position on the haplotype. + * @param treatAsMatch can assume that both pieces are the same sequence. + * @return the cost of the sub-HMM. + */ + public double calculateLocalLikelihood(int readFrom, int readTo, int haplotypeFrom, int haplotypeTo, boolean treatAsMatch); + + /** + * Load a read given its relevant pieces of information separately. + * @param bases read bases. + * @param bq base qualities. + * @param iq insertion qualities. + * @param dq deletion qualities. + * @param mq read mapping quality. + */ + public void loadRead(byte[] bases, byte[] bq, byte[] iq, byte[] dq, int mq); + + + /** + * Returns the constant gap extension penalty in Phred scale. + * @return the Phred-scaled gap extension penalty.
+ */ + byte getGapExtensionPenalty(); +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java index 184a2689d..0725e24b4 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java +++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessPairHMM.java @@ -55,28 +55,20 @@ import org.broadinstitute.sting.utils.QualityUtils; * User: rpoplin, carneiro * Date: 10/16/12 */ -public final class LoglessPairHMM extends N2MemoryPairHMM { +public class LoglessPairHMM extends N2MemoryPairHMM { protected static final double INITIAL_CONDITION = Math.pow(2, 1020); protected static final double INITIAL_CONDITION_LOG10 = Math.log10(INITIAL_CONDITION); - private static final int matchToMatch = 0; - private static final int indelToMatch = 1; - private static final int matchToInsertion = 2; - private static final int insertionToInsertion = 3; - private static final int matchToDeletion = 4; - private static final int deletionToDeletion = 5; + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected static final double TRISTATE_CORRECTION = 3.0; + protected static final int matchToMatch = 0; + protected static final int indelToMatch = 1; + protected static final int matchToInsertion = 2; + protected static final int insertionToInsertion = 3; + protected static final int matchToDeletion = 4; + protected static final int deletionToDeletion = 5; - /** - * {@inheritDoc} - */ - @Override - public void initialize(final int readMaxLength, final int haplotypeMaxLength ) { - super.initialize(readMaxLength, haplotypeMaxLength); - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; - } /** * {@inheritDoc} @@ -89,7 +81,8 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { final byte[] deletionGOP, final byte[] 
overallGCP, final int hapStartIndex, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final int nextHapStartIndex) { if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { final double initialValue = INITIAL_CONDITION / haplotypeBases.length; @@ -135,7 +128,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { * @param readQuals the base quality scores of the read * @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read) */ - public void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { + protected void initializePriors(final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final int startIndex) { // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2. @@ -146,7 +139,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { for (int j = startIndex; j < haplotypeBases.length; j++) { final byte y = haplotypeBases[j]; prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) ); + QualityUtils.qualToProb(qual) : (QualityUtils.qualToErrorProb(qual) / (doNotUseTristateCorrection ? 1.0 : TRISTATE_CORRECTION)) ); } } } @@ -173,6 +166,11 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { transition[i+1][insertionToInsertion] = QualityUtils.qualToErrorProb(overallGCP[i]); transition[i+1][matchToDeletion] = QualityUtils.qualToErrorProb(deletionGOP[i]); transition[i+1][deletionToDeletion] = QualityUtils.qualToErrorProb(overallGCP[i]); + //TODO it seems that it is not always the case that matchToMatch + matchToDeletion + matchToInsertion == 1. + //TODO We have detected cases of 1.00002 which can cause problems downstream. 
This are typically masked + //TODO by the fact that we always add a indelToMatch penalty to all PairHMM likelihoods (~ -0.1) + //TODO This is in fact not well justified and although it does not have any effect (since is equally added to all + //TODO haplotypes likelihoods) perhaps we should just remove it eventually and fix this != 1.0 issue here. } } @@ -187,7 +185,7 @@ public final class LoglessPairHMM extends N2MemoryPairHMM { * @param prior the likelihood editing distance matrix for the read x haplotype * @param transition an array with the six transition relevant to this location */ - private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { + protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { matchMatrix[indI][indJ] = prior * ( matchMatrix[indI - 1][indJ - 1] * transition[matchToMatch] + insertionMatrix[indI - 1][indJ - 1] * transition[indelToMatch] + diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java index 9672bc5f3..943e87461 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RepeatCovariate.java @@ -74,6 +74,11 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; } + public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) { + this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; + this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; + } + @Override public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { // store the original bases and then write Ns over low quality ones @@ -103,7 +108,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate { 
} - private Pair findTandemRepeatUnits(byte[] readBases, int offset) { + public Pair findTandemRepeatUnits(byte[] readBases, int offset) { int maxBW = 0; byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { diff --git a/protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java b/protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java new file mode 100644 index 000000000..e42f109b2 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/utils/sam/ClippedGATKSAMRecord.java @@ -0,0 +1,107 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.sam; + +import java.util.Arrays; + +/** + * Represents a hard-clipped view of a read. + */ +public class ClippedGATKSAMRecord extends GATKSAMRecord { + + private byte[] insertionQuals; + + private byte[] deletionQuals; + + /** + * Creates a hard-clipped view on a existing read record. + * @param read the underlying unclipped read. + * @param start inclusive first position in {@code read} included in the clipped view. 
+ * @param end inclusive last position in {@code read} included in the clipped view. + */ + public ClippedGATKSAMRecord(final GATKSAMRecord read, int start, int end) { + super(read.getHeader(), read.getReferenceIndex(), read.getAlignmentStart() + start, (short) read.getReadNameLength(), + (short) 100, -1, read.getCigarLength(), read.getFlags(), end - start, + read.getMateReferenceIndex(), read.getMateAlignmentStart(), read.getInferredInsertSize(), + new byte[0]); + this.setReadBases(Arrays.copyOfRange(read.getReadBases(), start, end)); + this.setBaseQualities(Arrays.copyOfRange(read.getBaseQualities(),start,end)); + this.setReadName(read.getReadName()); + insertionQuals = Arrays.copyOfRange(read.getBaseInsertionQualities(),start,end); + deletionQuals = Arrays.copyOfRange(read.getBaseDeletionQualities(),start,end); + } + + @Override + public byte[] getBaseDeletionQualities() { + return deletionQuals; + } + + @Override + public byte[] getBaseInsertionQualities() { + return insertionQuals; + } + + @Override + public int getMappingQuality() { + return 100; + } + + @Override + public int hashCode() { + return getReadName().hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o instanceof GATKSAMRecord) { + return getReadName().equals(((GATKSAMRecord)o).getReadName()); + } else { + return false; + } + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 37dc7adba..58c3bb9bd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -46,14 +46,25 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broad.tribble.readers.LineIterator; +import 
org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.Arrays; public class VariantAnnotatorIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static String baseTestString() { return "-T VariantAnnotator -R " + b36KGReference + " --no_cmdline_in_header -o %s"; } @@ -290,4 +301,96 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("Testing InbreedingCoeff annotation with PED file", spec); } + @Test + public void testStrandBiasBySample() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("testStrandBiasBySample", spec).getFirst().get(0); + + final String baseNoFS = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA FisherStrand -A StrandBiasBySample"; + final WalkerTestSpec specNoFS = new WalkerTestSpec(baseNoFS, 1, Arrays.asList("")); + specNoFS.disableShadowBCF(); + final File outputVCFNoFS = executeTest("testStrandBiasBySample component stand bias annotation", specNoFS).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoFS.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A FisherStrand"; + final 
WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testStrandBiasBySample re-annotation of FisherStrand", specAnn).getFirst().get(0); + + // confirm that the FisherStrand values are identical for the two pipelines + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + Assert.assertTrue(vc.hasAttribute("FS")); + Assert.assertTrue(vcAnn.hasAttribute("FS")); + Assert.assertEquals(vc.getAttributeAsDouble("FS", 0.0), vcAnn.getAttributeAsDouble("FS", -1.0)); + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } + + @Test + public void testQualByDepth() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("testQualByDepth", spec).getFirst().get(0); + + final String baseNoQD = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, CEUTRIO_BAM) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -XA 
QualByDepth"; + final WalkerTestSpec specNoQD = new WalkerTestSpec(baseNoQD, 1, Arrays.asList("")); + specNoQD.disableShadowBCF(); + final File outputVCFNoQD = executeTest("testQualByDepth calling without QD", specNoQD).getFirst().get(0); + + final String baseAnn = String.format("-T VariantAnnotator -R %s -V %s", REF, outputVCFNoQD.getAbsolutePath()) + " --no_cmdline_in_header -o %s -L 20:10130000-10134800 -A QualByDepth"; + final WalkerTestSpec specAnn = new WalkerTestSpec(baseAnn, 1, Arrays.asList("139a4384f5a7c1f49ada67f416642249")); + specAnn.disableShadowBCF(); + final File outputVCFAnn = executeTest("testQualByDepth re-annotation of QD", specAnn).getFirst().get(0); + + // confirm that the QD values are present in the new file for all biallelic variants + // QD values won't be identical because some filtered reads are missing during re-annotation + + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + + final VCFCodec codecAnn = new VCFCodec(); + final FileInputStream sAnn = new FileInputStream(outputVCFAnn); + final LineIterator lineIteratorAnn = codecAnn.makeSourceFromStream(new PositionalBufferedStream(sAnn)); + codecAnn.readHeader(lineIteratorAnn); + + while( lineIterator.hasNext() && lineIteratorAnn.hasNext() ) { + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + + final String lineAnn = lineIteratorAnn.next(); + Assert.assertFalse(lineAnn == null); + final VariantContext vcAnn = codecAnn.decode(lineAnn); + + if( vc.isBiallelic() ) { + Assert.assertTrue(vc.hasAttribute("QD")); + Assert.assertTrue(vcAnn.hasAttribute("QD")); + } + } + + Assert.assertFalse(lineIterator.hasNext()); + Assert.assertFalse(lineIteratorAnn.hasNext()); + } } diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index 32791dd97..4f5b7477c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -97,15 +97,15 @@ public class HeaderElementUnitTest extends BaseTest { HeaderElement headerElement = new HeaderElement(1000, 0); // first test that if we add and then remove it, we have no data - headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip); + headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); headerElement.addInsertionToTheRight(); - headerElement.removeBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip); + headerElement.removeBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); headerElement.removeInsertionToTheRight(); testHeaderIsEmpty(headerElement); // now, test that the data was added as expected for ( int i = 0; i < 10; i++ ) - headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip); + headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); testHeaderData(headerElement, test); // test the insertion adding functionality @@ -115,8 +115,8 @@ public class HeaderElementUnitTest extends BaseTest { } private void testHeaderIsEmpty(final HeaderElement headerElement) { - Assert.assertFalse(headerElement.hasConsensusData()); - 
Assert.assertFalse(headerElement.hasFilteredData()); + Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS)); + Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED)); Assert.assertFalse(headerElement.hasInsertionToTheRight()); Assert.assertTrue(headerElement.isEmpty()); } @@ -125,9 +125,9 @@ public class HeaderElementUnitTest extends BaseTest { Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip); Assert.assertFalse(headerElement.isEmpty()); Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertEquals(headerElement.hasConsensusData(), test.MQ >= minMappingQual); - Assert.assertEquals(headerElement.hasFilteredData(), test.MQ < minMappingQual); - Assert.assertEquals(headerElement.hasConsensusData() ? headerElement.getConsensusBaseCounts().getRMS() : headerElement.getFilteredBaseCounts().getRMS(), (double)test.MQ); + Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS), test.MQ >= minMappingQual); + Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED), test.MQ < minMappingQual); + Assert.assertEquals(headerElement.getBaseCounts(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) ? 
SlidingWindow.ConsensusType.POSITIVE_CONSENSUS : SlidingWindow.ConsensusType.FILTERED).getRMS(), (double)test.MQ); Assert.assertFalse(headerElement.isVariantFromMismatches(0.05, 0.05)); Assert.assertEquals(headerElement.isVariant(0.05, 0.05, 0.05), test.isClip); } @@ -145,7 +145,7 @@ public class HeaderElementUnitTest extends BaseTest { @DataProvider(name = "alleles") public Object[][] createAllelesData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 }; @@ -174,7 +174,7 @@ public class HeaderElementUnitTest extends BaseTest { for ( int i = 0; i < test.counts.length; i++ ) { final BaseIndex base = BaseIndex.values()[i]; for ( int j = 0; j < test.counts[i]; j++ ) - headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false); + headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false, false); } final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue, test.pvalue); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 4fbbe1d0c..067f36d58 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -46,9 +46,14 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; +import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; @@ -70,6 +75,8 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam"; final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057"; final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam"; + final String HIGH_COVERAGE_BAM = privateTestDir + "NA20313.highCoverageRegion.bam"; + final String HIGH_COVERAGE_L = " -L 1:1650830-1650870"; final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; final String INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; @@ -158,44 +165,44 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "fa1cffc4539e0c20b818a11da5dba5b9", false); + RRTest("testDefaultCompression ", L, "0e503f7b79ace4c89d74f0943a0de1c0", false); } @Test(enabled = true) public void testDefaultCompressionWithKnowns() { - RRTest("testDefaultCompressionWithKnowns ", L, "d1b5fbc402810d9cdc020bb3503f1325", true); + RRTest("testDefaultCompressionWithKnowns ", L, "6db7ce2733d006f8bd61c42a40d23728", true); } private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; @Test(enabled = true) public void testMultipleIntervals() { - RRTest("testMultipleIntervals ", intervals, "7e9dcd157ad742d4ebae7e56bc4af663", false); + RRTest("testMultipleIntervals ", intervals, "207f2c6d3db956e19412a45a231ca367", false, "043b2838c27d8f9580379b54c18ff40a"); } @Test(enabled = true) public void 
testMultipleIntervalsWithKnowns() { - RRTest("testMultipleIntervalsWithKnowns ", intervals, "dbb1e95e1bcad956701142afac763717", true); + RRTest("testMultipleIntervalsWithKnowns ", intervals, "f3b11a8a7673b301e27137936fafc6b6", true, "043b2838c27d8f9580379b54c18ff40a"); } @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "8f8fd1a53fa0789116f45e4cf2625906", false); + RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "dcc3716b3665aa1c2dbe6b22d6534aef", false); } @Test(enabled = true) public void testHighCompressionWithKnowns() { - RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "52fd2a77802a4677b604abb18e15d96a", true); + RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "97ae655bf0e483ea227b1aac67ced024", true); } @Test(enabled = true) public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "79c6543d5ce84ebc2ca74404498edbd1", false); + RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a1377eb922e0b09a03a280b691b0b3ff", false); } @Test(enabled = true) public void testLowCompressionWithKnowns() { - RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "271aec358b309603291a974b5ba3bd60", true); + RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "bd7c5b0b210694f364ca6a41f5b89870", true); } @Test(enabled = true) @@ -207,7 +214,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testIndelCompression() { - final String md5 = "d20e6012300898a0315c795cab7583d8"; + final String md5 = 
"9c9305eda5e4e7f22246ec8a4b242c97"; RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); } @@ -215,27 +222,25 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("e5da09662708f562c0c617ba73cf4763")), "4f916da29d91852077f0a2fdbdd2c7f6"); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("1bda512143be1016dfaca1f7020b6398")), "4f916da29d91852077f0a2fdbdd2c7f6"); } - private static final String COREDUCTION_QUALS_TEST_MD5 = "26d84a2bd549a01a63fcebf8847a1b7d"; - @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5f4d2c1d9c010dfd6865aeba7d0336fe")), COREDUCTION_QUALS_TEST_MD5); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("58c2bae5a339af2ea3c22a46ce8faa68"))); } @Test(enabled = true) public void testCoReductionWithKnowns() { String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("ca48dd972bf57595c691972c0f887cb4")), COREDUCTION_QUALS_TEST_MD5); + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5c251932b49d99a810581e3a6f762878"))); } 
@Test(enabled = true) public void testInsertionsAtEdgeOfConsensus() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("760500a5b036b987f84099f45f26a804"))); + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("c10653a8c21fb32b5cf580d3704b0edd"))); } /** @@ -249,7 +254,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("67f8a3a647f8ec5212104bdaafd8c862")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fddbec29d0945afbbb34b42994614c15")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); } /** @@ -260,7 +265,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("4f0ef477c0417d1eb602b323474ef377"))); + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7dfe2647992ce1154db340fc742d523a"))); } /** @@ -270,7 +275,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " 
-o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("0ce693b4ff925998867664e4099f3248"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("595e5812c37189930cae93e45765def4"))); } /** @@ -280,7 +285,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7e7b358443827ca239db3b98f299aec6")), "2af063d1bd3c322b03405dbb3ecf59a9"); + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b005727119eee27995705959a637085e")), "2af063d1bd3c322b03405dbb3ecf59a9"); } /** @@ -301,5 +306,42 @@ public class ReduceReadsIntegrationTest extends WalkerTest { String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null"; executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class)); } + + /** + * Confirm that compression is not capping coverage counts to max byte + */ + @Test(enabled = true) + public void testCompressionWorksForHighDepth() { + final String base = String.format("-T ReduceReads -npt -R %s -I %s %s", b37KGReference, HIGH_COVERAGE_BAM, HIGH_COVERAGE_L) + " -o %s"; + final File outputBam = executeTestWithoutAdditionalRRTests("testCompressionWorksForHighDepth", + new WalkerTestSpec(base, 1, Arrays.asList(""))).first.get(0); // No MD5s; we only want to check the coverage + + boolean sawHighCoveragePosition = false; + final SAMFileReader reader = new SAMFileReader(outputBam); + reader.setSAMRecordFactory(new GATKSamRecordFactory()); + + for ( final SAMRecord rawRead : 
reader ) { + final GATKSAMRecord read = (GATKSAMRecord)rawRead; + read.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, rawRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); + + if ( ! read.isReducedRead() ) + continue; + + final int[] decodedCounts = read.getReducedReadCounts(); + for ( final int count : decodedCounts ) { + if ( count > Byte.MAX_VALUE ) { + sawHighCoveragePosition = true; + break; + } + } + + if ( sawHighCoveragePosition ) + break; + } + + reader.close(); + + Assert.assertTrue(sawHighCoveragePosition, "No positions were found with coverage over max byte (127); the coverage is incorrectly being capped somewhere!"); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index bd0a8933c..c49a671e2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -318,7 +318,7 @@ public class SlidingWindowUnitTest extends BaseTest { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; - this.description = String.format("%d %d %d", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage); + this.description = String.format("%d %d %d %b %b", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage, readsShouldBeLowQuality, variantBaseShouldBeLowQuality); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -390,40 +390,40 @@ public class SlidingWindowUnitTest extends BaseTest { List tests = new 
ArrayList(); // test high quality reads and bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 1, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 9, 6, 5 + DEEP_COVERAGE_ITERATIONS)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc320), false, false, 11, 10, 4 + (6 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 11, 8, 7 + DEEP_COVERAGE_ITERATIONS)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc320), false, false, 13, 12, 6 + (6 * DEEP_COVERAGE_ITERATIONS))}); // test low quality reads - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 2, 2, 2)}); - tests.add(new Object[]{new 
ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 3, 3, 3)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 3, 3, 3)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 3, 3, 3)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 3, 3, 3)}); // test low quality bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 1, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 1, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 1, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 1, 1, 1)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 1, 1, 1)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 2, 2, 2)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 2, 2, 2)}); // test mixture - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 1, 1, 1)}); + tests.add(new Object[]{new 
ConsensusCreationTest(Arrays.asList(loc1100), true, false, 3, 3, 3)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 2, 2, 2)}); // test I/D operators - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9, 2 + (7 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 9, 9, 2 + (7 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 10, 10, 2 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 11, 11, 2 + (9 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), 
CigarOperator.I, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); return tests.toArray(new Object[][]{}); } @@ -517,6 +517,39 @@ public class SlidingWindowUnitTest extends BaseTest { Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all } + @Test + public void testConsensusCreationForInsertions() { + + final int totalNumReads = 7; + final ObjectList myReads = new ObjectArrayList<>(totalNumReads); + + // add reads, one with a SNP and one with a SNP and insertion + for ( int i = 0; i < totalNumReads; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + + final byte[] bases = Utils.dupBytes((byte) 'A', readLength); + if ( i < 2 ) + bases[20] = 'C'; + if ( i == 0 ) + bases[80] = 'C'; + read.setReadBases(bases); + + if ( i == 0 ) + read.setCigarString("80M1I19M"); + + myReads.add(read); + } + + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + final Pair, CompressionStash> result = slidingWindow.close(null); + Assert.assertEquals(result.getFirst().size(), 3); // no compression at all for SNPs + } + @Test public void testAddingReadPairWithSameCoordinates() { final SlidingWindow 
slidingWindow = new SlidingWindow("1", 0, 10); @@ -586,6 +619,76 @@ public class SlidingWindowUnitTest extends BaseTest { Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size())); } + @DataProvider(name = "DownsamplingFromClose") + public Object[][] createDownsamplingFromCloseTestData() { + + final ObjectList myReads = new ObjectArrayList<>(20); + for ( int i = 0; i < 21; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); + final byte[] bases = Utils.dupBytes((byte) 'A', readLength); + if ( i < 5 ) + bases[50] = 'C'; + read.setReadBases(bases); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + myReads.add(read); + } + + List tests = new ArrayList<>(); + + for ( int i = 1; i < 25; i++ ) + tests.add(new Object[]{myReads, i}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DownsamplingFromClose", enabled = true) + public void testDownsamplingTestFromClose(final ObjectList myReads, final int dcov) { + + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); // no het compression + + Assert.assertEquals(result.getFirst().size(), Math.min(dcov, myReads.size()), "Down-sampling was not performed correctly"); + } + + @DataProvider(name = "NoDownsamplingForConsensusReads") + public Object[][] createNoDownsamplingForConsensusReadsData() { + + final ObjectList myReads = new ObjectArrayList<>(20); + for ( int i = 0; i < 30; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); + final byte[] bases = 
Utils.dupBytes((byte) 'A', readLength); + if ( i < 10 ) + bases[50] = 'C'; + read.setReadBases(bases); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); + read.setReadNegativeStrandFlag(i % 2 == 0); + myReads.add(read); + } + + List tests = new ArrayList<>(); + + for ( int i = 0; i < 5; i++ ) + tests.add(new Object[]{myReads, i}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "NoDownsamplingForConsensusReads", enabled = true) + public void testNoDownsamplingForConsensusReads(final ObjectList myReads, final int dcov) { + + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + Pair, CompressionStash> result = slidingWindow.close(null); // allow het compression (so we expect 4 reads) + + Assert.assertEquals(result.getFirst().size(), 4, "Down-sampling was performed on consensus reads!"); + } ////////////////////////////////////////////////////////////// //// This section tests the consensus base quals accuracy //// @@ -739,21 +842,45 @@ public class SlidingWindowUnitTest extends BaseTest { read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); read.setMappingQuality(30); + read.setReadNegativeStrandFlag(false); // add the read final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); for ( int i = 0; i < start; i++ ) - Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); + 
Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); for ( int i = 0; i < readLength; i++ ) - Assert.assertEquals(windowHeader.get(start + i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 1); + Assert.assertEquals(windowHeader.get(start + i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1); for ( int i = start + readLength; i < currentHeaderLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); + Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); // now remove the read slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, true, start); for ( int i = 0; i < currentHeaderLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getConsensusBaseCounts().countOfBase(BaseUtils.Base.A.base), 0); + Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); + } + + @Test + public void testUpdateHeaderForReadWithHighMQ() { + + // set up the window header + final int currentHeaderStart = 100; + final LinkedList windowHeader = new LinkedList<>(); + for ( int i = 0; i < readLength; i++ ) + windowHeader.add(new HeaderElement(currentHeaderStart + i)); + + // set up the read + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setMappingQuality(180); + read.setReadNegativeStrandFlag(false); + + // add the read and make sure it's not filtered because of low MQ (byte vs. 
int) + final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); + slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); + for ( int i = 0; i < readLength; i++ ) + Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1); } ////////////////////////////////////////////////////////////////////////////////// diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java index 52e385957..0259e9685 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/diagnosetargets/DiagnoseTargetsIntegrationTest.java @@ -66,11 +66,11 @@ public class DiagnoseTargetsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSingleSample() { - DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "1771e95aed2b3b240dc353f84e19847d"); + DTTest("testSingleSample ", "-I " + singleSample + " -max 75", "6ca3d3917a7b65eaa877aa3658d80912"); } @Test(enabled = true) public void testMultiSample() { - DTTest("testMultiSample ", "-I " + multiSample, "c7f1691dbe5f121c4a79be823d3057e5"); + DTTest("testMultiSample ", "-I " + multiSample, "f50c6b9bef9f63f0a8b32ae9a9bdd51a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java new file mode 100644 index 000000000..7ab891bd0 --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/missing/QualifyMissingIntervalsUnitTest.java @@ -0,0 +1,131 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.missing; + +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import java.util.List; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Created with IntelliJ IDEA. 
+ * User: carneiro + * Date: 9/20/13 + * Time: 3:59 PM + */ +public class QualifyMissingIntervalsUnitTest extends BaseTest { + @Test(enabled = true) + public void testInterpretation() { + final QualifyMissingIntervals tool = new QualifyMissingIntervals(); + + final Metrics unmappable = new Metrics(0.5, 7500.0, 0.0, 2500, 20); + final Metrics highGC = new Metrics(0.99, 0.0, 0.0, 0, 20); + final Metrics lowGC = new Metrics(0.09, 0.0, 0.0, 0, 20); + final Metrics unsequenceable = new Metrics(0.5, 3.0, 1200.0, 10, 20); + final Metrics noData = new Metrics(0.5, 0.0, 0.0, 0, 20); + final Metrics unknown = new Metrics(0.5, 30.0, 120000.0, 2500, 20); + + final Metrics[] array = {unmappable, highGC, lowGC, unsequenceable, noData, unknown}; + + final GenomeLoc testInterval = new UnvalidatingGenomeLoc("chr1", 0, 10000, 20000); + final GenomeLoc smallInterval = new UnvalidatingGenomeLoc("chr1", 0, 1, 4); + + + Assert.assertNotEquals(tool.checkMappability(unmappable), ""); + Assert.assertNotEquals(tool.checkGCContent(highGC), ""); + Assert.assertNotEquals(tool.checkGCContent(lowGC), ""); + Assert.assertNotEquals(tool.checkContext(unsequenceable), ""); + + Assert.assertEquals(tool.interpret(unmappable, testInterval), QualifyMissingIntervals.Interpretation.UNMAPPABLE.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(noData, testInterval), QualifyMissingIntervals.Interpretation.NO_DATA.toString()); + Assert.assertEquals(tool.interpret(unknown, testInterval), QualifyMissingIntervals.Interpretation.UNKNOWN.toString()); + + for (Metrics m : array) + Assert.assertEquals(tool.interpret(m, smallInterval), 
QualifyMissingIntervals.Interpretation.SMALL_INTERVAL.toString()); + } + + @Test(enabled = true) + void testGetPositionInTarget () { + final UnvalidatingGenomeLoc target = new UnvalidatingGenomeLoc("a", 0, 30, 50); + final List targets = new ObjectArrayList<>(1); + targets.add(target); + + // left overlap + UnvalidatingGenomeLoc interval = new UnvalidatingGenomeLoc("a", 0, 10, 50); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -20); + + // right overlap + interval = new UnvalidatingGenomeLoc("a", 0, 40, 60); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -10); + + // interval > target with short right tail + interval = new UnvalidatingGenomeLoc("a", 0, 10, 60); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -10); + + // interval > target with short left tail + interval = new UnvalidatingGenomeLoc("a", 0, 10, 80); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), -30); + + // interval < target with short right tail + interval = new UnvalidatingGenomeLoc("a", 0, 32, 40); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), 2); + + // interval < target with short left tail + interval = new UnvalidatingGenomeLoc("a", 0, 40, 42); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, targets), 8); + + // no overlap + interval = new UnvalidatingGenomeLoc("a", 0, 40, 42); + Assert.assertEquals(QualifyMissingIntervals.getPositionInTarget(interval, new ObjectArrayList()), Integer.MIN_VALUE); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java index 77c9f96c9..fd1f0de8a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java 
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java @@ -48,49 +48,24 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.collections.Pair; +import org.junit.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; import java.util.Random; public class BiasedDownsamplingIntegrationTest extends WalkerTest { - private final static String baseCommand1 = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommand2 = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:1,000,000-5,000,000"; - private final static String baseCommand3 = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000"; + private final static String baseCommandUG = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000"; + private final static String baseCommandHC = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:4,000,000-5,000,000" + " --useFilteredReadsForAnnotations"; + private final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - // -------------------------------------------------------------------------------------------------------------- - // - // testing UnifiedGenotyper contamination down-sampling - // - // -------------------------------------------------------------------------------------------------------------- - - @Test(enabled = false) - public void testContaminationDownsamplingFlat() { - 
WalkerTestSpec spec = new WalkerTestSpec( - baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1, - Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb")); - executeTest("test contamination_percentage_to_filter 0.20", spec); - } - - @Test(enabled = false) - public void testContaminationDownsamplingFlatAndPerSample() { - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_per_sample_file " + ArtificalBAMLocation + "NA12878.NA19240.contam.txt --contamination_fraction_to_filter 0.10", 1, - Arrays.asList("53395814dd6990448a01a294ccd69bd2")); - executeTest("test contamination_percentage_to_filter per-sample and .20 overall", spec); - } - - @Test(enabled = false) - public void testContaminationDownsamplingPerSampleOnly() { - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand1 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contaminationFile " + ArtificalBAMLocation + "NA19240.contam.txt", 1, - Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); - executeTest("test contamination_percentage_to_filter per-sample", spec); - } - // -------------------------------------------------------------------------------------------------------------- // @@ -98,150 +73,49 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - @Test(enabled = false) + @Test private void testDefaultContamination() { final String bam1 = "NA11918.with.1.NA12842.reduced.bam"; final String bam2 = "NA12842.with.1.NA11918.reduced.bam"; WalkerTestSpec spec = new WalkerTestSpec( - baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s ", 1, - 
Arrays.asList("e2e5a8dd313f8d7e382e7d49dfac59a2")); - executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with default downsampling.", spec); + baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination .05 ", 1, + Arrays.asList("b13612312ff991cf40ddc44255e76ecd")); + executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with .05 downsampling.", spec); } - private void testFlatContamination(final String bam1, final String bam2, final Double downsampling, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination " + downsampling.toString(), 1, - Arrays.asList(md5)); - executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec); - } - - @Test(enabled = false) - public void testFlatContaminationCase1() { - testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "e2e5a8dd313f8d7e382e7d49dfac59a2"); - } - - @Test(enabled = false) - public void testFlatContaminationCase2() { - testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "549737002f98775fea8f46e7ea174dde"); - } - - @Test(enabled = false) - public void testFlatContaminationCase3() { - testFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "529d82c2a33fcc303a5dc55de2d56979"); - } - - @Test(enabled = false) - public void testFlatContaminationCase4() { - testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.1, "b5689972fbb7d230a372ee5f0da1c6d7"); - } - - @Test(enabled = false) - public void testFlatContaminationCase5() { - 
testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.2, "9dceee2e921b53fbc1ce137a7e0b7b74"); - } - - @Test(enabled = false) - public void testFlatContaminationCase6() { - testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.2.NA11918.reduced.bam", 0.3, "d6a74061033503af80dcaea065bfa075"); - } - - @Test(enabled = false) - public void testFlatContaminationCase7() { - testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "7d1b5efab58a1b8f9d99fcf5af82f15a"); - } - - @Test(enabled = false) - public void testFlatContaminationCase8() { - testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "a7f8d5c79626aff59d7f426f79d8816e"); - } - - @Test(enabled = false) - public void testFlatContaminationCase9() { - testFlatContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.3, "fcf482398b7c908e3e2d1e4d5da6377b"); - } - - private void testPerSampleContamination(String bam1, String bam2, String persampleFile, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand2 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contaminationFile " + persampleFile, 1, - Arrays.asList(md5)); - executeTest("test contamination on Artificial Contamination (per-sample) on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase1() { - testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "e00278527a294833259e9e411728e395"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase2() { - testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", 
"a443e793f0b0e2ffce1b751634d706e2"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase3() { - testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "e11d83a7815ce757afbcf7689568cb25"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase4() { - testPerSampleContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "615042eeeffe042bd1c86279d34f80b6"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase5() { - testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.1.txt", "9bc99fc79ca34744bf26cb19ee4ef44d"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase6() { - testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.2.txt", "143626fe5fce765d6c997a64f058a813"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase7() { - testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.3.txt", "f2593674cef894eda4e0be9cf3158f57"); - } - - @Test(enabled = false) - public void testPerSampleContaminationCase8() { - testPerSampleContamination("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.4.txt", "fb7ce0740767ae3896b3e552026da1e4"); - } - - - private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling, final String md5) { - final String command = baseCommand3 + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; - - WalkerTestSpec spec = new 
WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList(md5)); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); - - spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList(md5)); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); - - } // verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level - @Test(enabled = false) - public void testPerSampleEqualsFlatContaminationCase1() { - testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0, ""); + + @DataProvider(name="PerSampleEqualFlatContamBams") + public Object[][] makePerSampleEqualFlatContamBams() { + final List tests = new LinkedList(); + tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0}) ; + tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ; + tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ; + + return tests.toArray(new Object[][]{}); } - @Test(enabled = false) - public void testPerSampleEqualsFlatContaminationCase2() { - testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", 
"NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15, ""); - } + @Test(dataProvider = "PerSampleEqualFlatContamBams") + private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling) { + final String command = baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; - @Test(enabled = false) - public void testPerSampleEqualsFlatContaminationCase3() { - testPerSampleEqualsFlat("NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3, ""); - } + WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList("")); + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result + Pair, List> test1 = executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); + + spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList("")); + + rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result + Pair, List> test2 = executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); + + //verify that the md5s match up. 
+ Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0)); + } // -------------------------------------------------------------------------------------------------------------- // @@ -250,50 +124,39 @@ public class BiasedDownsamplingIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- - @Test(enabled = false) - public void testHCContaminationDownsamplingFlat() { - final String baseCommand = "-T HaplotypeCaller -R " + b36KGReference + " --no_cmdline_in_header --dbsnp " + b36dbSNP129; - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -contamination 0.20", 1, - Arrays.asList("c3a253467ead7b1cfe9fd9dd310828b1")); - executeTest("HC calling with contamination_percentage_to_filter 0.20", spec); - } - // HaplotypeCaller can only (currently) use flat contamination reduction, not per-sample. 
Until that is implemented, this test - @Test(enabled = false) - public void testHCCannotProcessPerSampleContamination() { - final String baseCommand = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:3,000,000-5,000,000"; - final String bam1 = "NA11918.with.1.NA12842.reduced.bam"; - final String perSampleFile = ArtificalBAMLocation + "contamination.case.1.txt"; - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand + " -I " + ArtificalBAMLocation + bam1 + " -o %s -contaminationFile " + perSampleFile, 1, - UserException.class); - executeTest("HC should fail on per-Sample contamination removal.", spec); + @DataProvider(name="PerSampleEqualFlatContamBamsHC") + public Object[][] makePerSampleEqualFlatContamBamsHC() { + final List tests = new LinkedList(); + tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0 }) ; + tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ; + tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ; + + return tests.toArray(new Object[][]{}); } - private void testHCFlatContamination(final String bam1, final String bam2, final Double downsampling, final String md5) { - final String baseCommand = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:3,000,000-5,000,000"; + @Test(dataProvider = "PerSampleEqualFlatContamBamsHC") + private void testPerSampleEqualsFlatHC(final String bam1, final String bam2, final String persampleFile, final Double downsampling) { + final String command = baseCommandHC + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; + + WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, 
Arrays.asList("")); + final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + + rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result + + Pair, List> test1= executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); + + WalkerTestSpec spec2 = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList("")); + + rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result + Pair, List> test2=executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); + + //verify that the md5s match up. + Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0)); - WalkerTestSpec spec = new WalkerTestSpec( - baseCommand + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination " + downsampling.toString(), 1, - Arrays.asList(md5)); - executeTest("HC test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " downsampling " + downsampling.toString(), spec); } - @Test(enabled = false) - public void testHCFlatContaminationCase1() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.05, "c3e695381d8627e3922d8c642b66c3ce"); - } - @Test(enabled = false) - public void testHCFlatContaminationCase2() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.1, "002d2b45336d88d7c04e19f9f26e29d9"); - } - @Test(enabled = false) - public void testHCFlatContaminationCase3() { - testHCFlatContamination("NA11918.with.1.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", 0.2, "1809a33ac112d1a3bd7a071c566794dd"); - } - -} +} \ No newline at end of file diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index 355a47cbc..445864380 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -129,7 +129,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { } private List getConsensusAlleles(int eventLength, boolean isInsertion, int minCnt, double minFraction, String altBases) { - final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(pileupProvider.genomeLocParser, true, minCnt, minFraction); + final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(true, minCnt, minFraction); return counter.computeConsensusAlleles(pileupProvider.referenceContext, pileupProvider.getAlignmentContextFromAlleles(isInsertion?eventLength:-eventLength,altBases,numReadsPerAllele), AlignmentContextUtils.ReadOrientation.COMPLETE); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index aaa3b1284..460b80121 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -79,6 +79,6 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", 
"98f4d78aad745c6e853b81b2e4e207b4"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "4dd1b38f0389e339ce8a05956956aa8a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index 0eb89adc7..48f36ccc6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -58,7 +58,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","25902d7a6a0c00c60c2d5845dfaa1a4c"); + executor.PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","39f559996f8d429839c585bbab68dbde"); } @Test(enabled = true) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java index 65a569cdc..9556f9bf1 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIndelCallingIntegrationTest.java @@ -56,8 +56,8 @@ import java.util.List; public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { - private final static String baseCommandIndels = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 
20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommandIndels = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- // @@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("ef8151aa699da3272c1ae0986d16ca21")); + Arrays.asList("3c8727ee6e2a6f10ab728c4869dd5b92")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("7f88229ccefb74513efb199b61183cb8")); + Arrays.asList("0cbe889e03bab6512680ecaebd52c536")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("1928ad48bcd0ca180e046bc235cfb3f4")); + Arrays.asList("3d12bdb816d27bf7c9efb4c13dc2aec7")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " 
--genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("6663e434a7b549f23bfd52db90e53a1a")); + Arrays.asList("475f8148123792064130faf9f9030fec")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("581c552664e536df6d0f102fb0d10e5a")); + Arrays.asList("a7e4e1bd128424d46cffdd538b220074")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1, - Arrays.asList("5596851d19582dd1af3901b7d703ae0a")); + Arrays.asList("a2c8e83f37cd1e114b42af4b873f57bc")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("862d82c8aa35f1da4f9e67b5b48dfe52")); + Arrays.asList("d3721bee5edaa31fdd35edd7aa75feb3")); executeTest("test minIndelFraction 0.0", spec); } @@ -184,7 +184,7 @@ public class 
UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("8d9fc96be07db791737ac18135de4d63")); + Arrays.asList("a5b6d7b32953500d936d3dff512a6254")); executeTest("test minIndelFraction 0.25", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 532982853..2cdddd49f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -51,12 +51,16 @@ import org.broad.tribble.readers.AsciiLineReader; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; import java.util.Arrays; import java.util.Collections; +import java.util.List; // ********************************************************************************** // // Note that this class also serves as an integration test for the VariantAnnotator! 
// @@ -64,8 +68,8 @@ import java.util.Collections; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; // -------------------------------------------------------------------------------------------------------------- // @@ -85,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("4aa226c00a242047cf427d0919003048")); + Arrays.asList("bc8a4e4ceb46776169b47146805c882a")); executeTest("test SLOD", spec); } @@ -101,7 +105,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 
+ " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("50937942e3d228614d2531c3be237709")); + Arrays.asList("21185d9a7519356ba672757f5a522971")); executeTest("test using comp track", spec); } @@ -164,7 +168,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void emitPLsAtAllSites() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --output_mode EMIT_ALL_SITES -allSitePLs", 1, - Arrays.asList("7cc55db8693759e059a05bc4398f6f69")); + Arrays.asList("552aced1b1ef7e4a554223f4719f9560")); + // GDA: TODO: BCF encoder/decoder doesn't seem to support non-standard values in genotype fields. IE even if there is a field defined in FORMAT and in the header the BCF2 encoder will still fail + spec1.disableShadowBCF(); + executeTest("test all site PLs 1", spec1); } @@ -175,12 +182,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "3b66f82dbb746875638e076bf51a1583" ); + testHeterozosity( 0.01, "2f3051caa785c7c1e2a8b23fa4da90b1" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "714c1795334c7c62c046a75479381ae6" ); + testHeterozosity( 1.0 / 1850, "228df9e38580d8ffe1134da7449fa35e" ); } private void testHeterozosity(final double arg, final String md5) { @@ -196,7 +203,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "6f79205f7ed8006470f056f6805db6c8"; + private final static String COMPRESSED_OUTPUT_MD5 = 
"eebec02fdde9937bffaf44902ace6207"; @Test public void testCompressedOutput() { @@ -217,24 +224,25 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "d408b4661b820ed86272415b8ea08780"; + String md5 = "1f3fad09a63269c36e871e7ee04ebfaa"; + final String myCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, Arrays.asList(md5)); executeTest("test parallelization (single thread)", spec1); GenomeAnalysisEngine.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, Arrays.asList(md5)); executeTest("test parallelization (2 threads)", spec2); GenomeAnalysisEngine.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, + myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " 
+ validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, Arrays.asList(md5)); executeTest("test parallelization (4 threads)", spec3); } @@ -252,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("31be725b2a7c15e9769391ad940c0587")); + Arrays.asList("150b31ba05113ca1996b548be5170d6d")); executeTest(String.format("test multiple technologies"), spec); } @@ -271,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("dcc5cec42730567982def16da4a7f286")); + Arrays.asList("7d0ee85cd89f4addd84c5511daaaa5c5")); executeTest(String.format("test calling with BAQ"), spec); } @@ -320,4 +328,58 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { nLines++; Assert.assertTrue(nLines > 0); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing only emit samples + // + // -------------------------------------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testOnlyEmitSample() throws Exception { + final String base = "-T UnifiedGenotyper -R " + b37KGReference + " -I " + + privateTestDir + "AFR.complex.variants.bam --disableDithering" + + " -o %s -L 20:10,000,000-10,100,000"; + final WalkerTestSpec specAllSamples = new WalkerTestSpec(base, 1, Arrays.asList("")); + specAllSamples.disableShadowBCF(); + final File allSamplesVCF = executeTest("testOnlyEmitSampleAllSamples", specAllSamples).first.get(0); + final List allSampleVCs = GATKVCFUtils.readVCF(allSamplesVCF).getSecond(); + + final WalkerTestSpec onlyHG01879 = new WalkerTestSpec(base + " -onlyEmitSamples HG01879", 1, Arrays.asList("")); + onlyHG01879.disableShadowBCF(); + final File onlyHG01879VCF = 
executeTest("testOnlyEmitSample", onlyHG01879).first.get(0); + final List onlyHG01879VCs = GATKVCFUtils.readVCF(onlyHG01879VCF).getSecond(); + + Assert.assertEquals(allSampleVCs.size(), onlyHG01879VCs.size()); + for ( int i = 0; i < allSampleVCs.size(); i++ ) { + final VariantContext allSampleVC = allSampleVCs.get(i); + final VariantContext onlyHG01879VC = onlyHG01879VCs.get(i); + + if ( allSampleVC == null ) { + Assert.assertNull(onlyHG01879VC); + } else { + Assert.assertNotNull(onlyHG01879VC); + + Assert.assertTrue(allSampleVC.getGenotypes().size() > 1, "All samples should have had more than 1 genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.getGenotypes().size(), 1, "Should have found a single sample genotype, but didn't"); + Assert.assertEquals(onlyHG01879VC.hasGenotype("HG01879"), true); + + Assert.assertEquals(allSampleVC.getStart(), onlyHG01879VC.getStart()); + Assert.assertEquals(allSampleVC.getChr(), onlyHG01879VC.getChr()); + Assert.assertEquals(allSampleVC.getEnd(), onlyHG01879VC.getEnd()); + Assert.assertEquals(allSampleVC.getFilters(), onlyHG01879VC.getFilters()); + Assert.assertEquals(allSampleVC.getAlleles(), onlyHG01879VC.getAlleles()); + Assert.assertEquals(allSampleVC.getAttributes(), onlyHG01879VC.getAttributes()); + Assert.assertEquals(allSampleVC.getPhredScaledQual(), onlyHG01879VC.getPhredScaledQual()); + + final Genotype allG = allSampleVC.getGenotype("HG01879"); + final Genotype onlyG = onlyHG01879VC.getGenotype("HG01879"); + Assert.assertEquals(allG.getAD(), onlyG.getAD()); + Assert.assertEquals(allG.getDP(), onlyG.getDP()); + Assert.assertEquals(allG.getAlleles(), onlyG.getAlleles()); + Assert.assertEquals(allG.getPL(), onlyG.getPL()); + Assert.assertEquals(allG.toString(), onlyG.toString()); + } + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 1bfbbac17..18554e157 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -53,7 +53,7 @@ import java.util.Arrays; public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ - private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; // -------------------------------------------------------------------------------------------------------------- // @@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("a9466c1e3ce1fc4bac83086b25a6df54")); + Arrays.asList("ec0977e3fd3e2ac29c9821f0ca830455")); executeTest("test MultiSample Pilot1", spec); } @@ -88,22 +88,22 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("aaadb2a355d87344eabb6ac4495a11e4")); + Arrays.asList("02b521fe88a6606a29c12c0885c3debd")); executeTest("test SingleSample 
Pilot2", spec); } @Test public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("06c85e8eab08b67244cf38fc785aca22")); + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, + Arrays.asList("dd5ad3beaa75319bb2ef1434d2dd9f73")); executeTest("test Multiple SNP alleles", spec); } @Test public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, Arrays.asList("d915535c1458733f09f82670092fcab6")); executeTest("test bad read", spec); } @@ -111,16 +111,16 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ @Test public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("f3da1ff1e49a831af055ca52d6d07dd7")); + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I 
" + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("a973298b2801b80057bea88507e2858d")); executeTest("test reverse trim", spec); } @Test public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("20ff311f363c51b7385a76f6f296759c")); + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, + Arrays.asList("8d91d98c4e79897690d3c6918b6ac761")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java index 33810e255..3b5690046 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, 
Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f")); executeTest("test calling on a ReducedRead BAM", spec); } @@ -74,13 +74,13 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "4b4902327fb132f9aaab3dd5ace934e1"); + testReducedCalling("INDEL", "942930038cf7fc9a80b969461aaa9aa6"); } private void testReducedCalling(final String model, final String md5) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, + "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, Arrays.asList(md5)); executeTest("test calling on a ReducedRead BAM with " + model, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java new file mode 100644 index 000000000..d8c3a3ebd --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java @@ -0,0 +1,425 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.caliper.Param; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: valentin + * Date: 8/13/13 + * Time: 2:48 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class ActiveRegionTestDataSetUnitTest extends BaseTest { + + + + @Test(dataProvider="activeRegionTestDataSets") + public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + Assert.assertNotNull(as); + Assert.assertEquals(as.assemblyResultSet().getMaximumKmerSize(),kmerSize); + final List reads = as.readList(); + Assert.assertEquals(reads.size(),readCount); + for (final GATKSAMRecord r : reads) { + Assert.assertEquals(r.getReadLength(),readLength); + } + + final List haplotypes = as.haplotypeList(); + final List haplotypeCivars = Civar.fromCharSequence(variation).optionalizeAll().unroll(); + + Assert.assertEquals(haplotypes.size(),haplotypeCivars.size()); + Assert.assertTrue(haplotypeCivars.size() > 1); + int variants = 0; + for (int i = 0; i < variation.length(); i++) { + final char c = variation.charAt(i); + switch (c) { + case 'W': + case 'T': + case 'C': + case 'D': + case 'I': + variants++; + default: + + } + } + + Assert.assertEquals(haplotypes.size(),(int) Math.pow(2,variants)); + + final Map haplotypeNumberByString = new HashMap<>(); + for (int i = 0; i < haplotypes.size(); i++) { + final Haplotype hap = haplotypes.get(i); + final Civar civar = haplotypeCivars.get(i); + Assert.assertEquals(hap.getBaseString(),civar.applyTo(as.getReference())); + if (i == 0) { + Assert.assertEquals(hap.getBaseString(), as.getReference()); + } else { + Assert.assertNotEquals(hap.getBaseString(),as.getReference()); + } + Assert.assertFalse(haplotypeNumberByString.containsKey(hap.getBaseString())); + haplotypeNumberByString.put(hap.getBaseString(), i); + } + + final int[] hapReadsNotInReference = new int[haplotypes.size()]; + + for (int i = 0; i < readCount; i++) { + final GATKSAMRecord r = as.readList().get(i); + + final int hapNumber = i % haplotypes.size(); + final int offset = i % 
(haplotypes.get(hapNumber).length() - readLength); + Assert.assertEquals(r.getReadString(),haplotypes.get(hapNumber).getBaseString().substring(offset,offset+readLength)); + if (as.getReference().indexOf(r.getReadString()) == -1) { + hapReadsNotInReference[hapNumber]++; + } + } + + Assert.assertEquals(hapReadsNotInReference[0],0); + + for (int i = 1; i < hapReadsNotInReference.length; i++) { + Assert.assertNotEquals(hapReadsNotInReference[i],0); + } + + } + + /** + * Constructs a test data-set based on the given parameters. + * @param kmerSize length of the kmer. + * @param readLength length of the read. + * @param variation variation in that active region. + * @param readCount number of reads in the active region. + * @param regionSize Active region size (~ size of the haplotype(s)) + * @param bq Base quality value common to all base-calls. + * @param iq Insertion quality value common to all read positions. + * @param dq Deletion quality value common to all read positions. + * @return never null. + */ + public static ActiveRegionTestDataSet createActiveRegionTestDataSet(final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + + final String reference = REF.substring(0, regionSize); + + final ActiveRegionTestDataSet result = new ActiveRegionTestDataSet(kmerSize, reference, + new String[]{"Civar:" + variation}, + new String[]{"*:" + readCount + ":" + readLength}, byteRepeat(bq, readLength), byteRepeat(dq, readLength), byteRepeat(iq, readLength)); + + + return result; + } + + @DataProvider(name="activeRegionTestDataSets") + public Iterator activeRegionTestDataSets() { + return new java.util.Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < ACTIVE_REGION_TEST_DATA_SET_PARAMETERS.length; + } + + @Override + public Object[] next() { + + final Object[] params = ACTIVE_REGION_TEST_DATA_SET_PARAMETERS[i++]; + final int kmerSize = (Integer) 
params[0]; + final int readLength = (Integer) params[1]; + final String variation = (String) params[2]; + final int readCount = (Integer) params[3]; + final int regionSize = (Integer) params[4]; + final ActiveRegionTestDataSet dataSet = createActiveRegionTestDataSet(kmerSize, readLength, variation, readCount, regionSize, (byte) 20, (byte) 35, (byte) 35); + return new Object[] { dataSet , kmerSize, readLength, variation, readCount, regionSize, (byte)20, (byte) 35, (byte) 35}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + private static int[] KMER_SIZES = new int[] { 10 }; + private static int[] READ_COUNTS = new int[] { 1000 }; + private static int[] READ_LENGTHS = new int[] { 100 }; + private static String[] VARIATION_CIVARS = new String[] { + "*1T*", + "*3Iacg*", + "*30Igctcggatgccttgcggggctccagagtcc*", + "*3D*", + "*30D*", + "*1T3=3Iacg*", + "*1T*3Iacg*", + "*1T8=1T8=1T8=1T8=1T*", + "*1T*1T*1T*1T*1T*" + }; + + private static int[] REGION_SIZE = new int[] { 300 }; + + static { + try { + if (KMER_SIZES == null) KMER_SIZES = intValues(HCLikelihoodCalculationEnginesBenchmark.class.getDeclaredField("kmerSize").getAnnotation(Param.class).value()); + if (READ_COUNTS == null) READ_COUNTS = intValues(HCLikelihoodCalculationEnginesBenchmark.class.getDeclaredField("readCount").getAnnotation(Param.class).value()); + if (READ_LENGTHS == null) READ_LENGTHS = intValues(HCLikelihoodCalculationEnginesBenchmark.class.getDeclaredField("readLength").getAnnotation(Param.class).value()); + if (VARIATION_CIVARS == null) VARIATION_CIVARS = HCLikelihoodCalculationEnginesBenchmark.class.getDeclaredField("variation").getAnnotation(Param.class).value(); + if (REGION_SIZE == null) REGION_SIZE = intValues(HCLikelihoodCalculationEnginesBenchmark.class.getDeclaredField("regionSize").getAnnotation(Param.class).value()); + + } catch (NoSuchFieldException e) { + throw new ExceptionInInitializerError(e); //To change body of catch statement 
use File | Settings | File Templates. + } + } + + private static int[] intValues(final String[] kmerSizes) { + final int[] result = new int[kmerSizes.length]; + for (int i = 0; i < result.length; i++) + result[i] = Integer.parseInt(kmerSizes[i]); + return result; + } + + private static final Object[][] ACTIVE_REGION_TEST_DATA_SET_PARAMETERS; + + static { + final int totalLength = KMER_SIZES.length * READ_COUNTS.length * READ_LENGTHS.length * VARIATION_CIVARS.length * REGION_SIZE.length; + ACTIVE_REGION_TEST_DATA_SET_PARAMETERS = new Object[totalLength][]; + int next = 0; + for (final int ks : KMER_SIZES) + for (final int rc : READ_COUNTS) + for (final int rl : READ_LENGTHS) + for (final String v : VARIATION_CIVARS) + for (final int rs : REGION_SIZE) + ACTIVE_REGION_TEST_DATA_SET_PARAMETERS[next++] = new Object[] { ks, rl, v, rc, rs }; + + } + + private static byte[] byteRepeat(final byte bq, final int readLength) { + final byte[] result = new byte[readLength]; + Arrays.fill(result, bq); + return result; + } + + + + private static final String REF = + "TCGAGAAATTTGTATCCCGCCCCCGCAGCTTGCCAGCTCTTTCAGTATCATGGAGCCCAT" + + "GGTTGAATGAGTCCAATAACGAACTTCGACATGATAAAATCCCCCCCTCGCGACTTCCAG" + + "AGAAGAAGACTACTGACTTGAGCGTTCCCAGCACTTCAGCCAAGGAAGTTACCAATTTTT" + + "TGTTTCCGAATGACACGCGTCTCCTTGCGGGTAGATCGCCGACCGCAGAACTTACGAGCC" + + "AGGGGAAACAGTAAGGCCTAATTAGGTAAAGGGAGTAAGTGCTCGAACGCTTCAGATGTA" + + "ACCATATACTTACGCTGGATCTTCTCCCGCGAATTTTAACCCTCACCAACTACGAGATTT" + + "GAGGTAAACCAAATAAGCACGTAGTGGCGCTATCCGACTGTTCCCAAATTGTAACTTATC" + + "GTTCCGTGAAGGCCAGAGTTACTTCCCGGCCCTTTCCATGCGCGCACCATACCCTCCTAG" + + "TTCCCCGGTTATCTCTCCGAGGAGGGAGTGAGCGATCCTCCGTTTACGTTTTGTTACCAA" + + "TGACGTAGCTATGTATTTTGTACAGGTTGCCAACGGGTTTCACAATTCACAGATAGTGGG" + + "GATCCCGGCAAAGGGCCTATATTTGCGGTCCAACTTAGGCGTAAACTACGATGGTACCTA" + + "CTCAGACCCAGCTCGCGCGGCGTAAATAACGCACTCATCCCAGCTGATTCTCGGCGATCT" + + "ACGCAGCGACATGATTATCAACAGCTGTCTGGCAGCTCTAATCTTTTACCATGGTCGTAA" + + 
"AAGCCTCCAAGAGTTAGATCATACCTAACGCCACAAAAGTGACACGACGCCGATGGGTAC" + + "CGGACTTTAGGTCGACCACAGTTCGGTAAGGGAGAGGCCCTGCGGCGTACTTCATTTTGT" + + "ATATGCAACGTGCCCAAGTGGCGCCAGGCAAGTCTCAGCTGGTTCCTGTGTTAGCTCGAG" + + "GCTAGGCATGGGAGCTGATTGAACATGGGTTGGGGGCCTCGAACCGTCGAGGACCCCATA" + + "GTACCTCGGACACCAAGTAGGGCAGCCTATAGTTTGAAGCAGTACTATTTCAGGGGGGGA" + + "GCCCTCATGGTCTCTTCTACTGATGACTCAACACGCTAGGGACGTGAAGTCGATTCCTTC" + + "GATGGTTATAAATCAAAGGCTCAGAGTGCAGTCTGGAGCGCCCATCTAACGGTACGCATC" + + "TCGATTGCTCGGTCGCCTTTCACACTCCGCGAAAATTCATACCGCTCATTCACTAGGTTG" + + "CGAAGCCTACACTGATATATGAATCCAAGCTAGAGCAGGGCTCTTAAAATTCGGAGTTGT" + + "AGATGCTCAATACTCCAATCGGTTTTTTCGTGCACCACCGCGGGTGGCTGACAAGGGTTT" + + "GACATCGAGAAACAAGGCAGTTCCGGGCTGAAAGTAGCGCCGGGTAAGGTACGCGCCTGG" + + "TATGGCAGGACTATGAAGCCAATACAAAGGCTACATCCTCACTCGGGTGGACGGAAACGC" + + "AGAATTATGGTTACTTTTTGGATACGTGAAACATGTCCCATGGTAGCCCAAAGACTTGGG" + + "AGTCTATCACCCCTAGGGCCCATTTCTGGATATAGACGCCAGGTTGAATCCGTATTTGGA" + + "GGTACGATGGATCAGTCTGGGTGGGACGTGCTCCATTTATACCCTGCGCAGGCTGGACCG" + + "AGGACCGCAAGATGCGACGGTGCACAAGTAATTGACAACAAACCATCGTGTTTTCATTAT" + + "GGTACCAGGATCTTCAAGCCGAGTCAATCAAGCTCGGATTACAGTGTTTACCGCGTCTTG" + + "CGGTTACTCACAAACTGTAATCCACCACAAGTCAAGCCATTGCCTCTCTGAGACGCCGTA" + + "TGAATTAATATGTAAACTTTGCGCGGGTTCACTGCGATCCGTTCAGTCTCGTCCAAGGGC" + + "ACAATCGAATTCCCATTTGTATGTTCGGCTAACTTCTACCCATCCCCCGAAGTTTAGCAG" + + "GTCGTGAGGTGTCATGGAGGCTCTCGTTCATCCCGTGGGACATCAAGCTTCGCCTTGATA" + + "AAGCACCCCGCTCGGGTGTAGCAGAGAAGACGCCTACTGAATTGTGCGATCCCTCCACCT" + + "CAGCTAAGGTAGCTACCAATATTTAGTTTTTTAGCCTTGCGACAGACCTCCTACTTAGAT" + + "TGCCACGCATTGAGCTAGCGAGTCAGCGATAAGCATGACGCGCTTTCAAGCGTCGCGAGT" + + "ATGTGAACCAAGGCTCCGGACAGGACTATATACTTGGGTTTGATCTCGCCCCGACAACTG" + + "CAAACCTCAACATTTATAGATTATAAGGTTAGCCGAAATTGCACGTGGTGGCGCCCGCCG" + + "ACTGCTCCCCGAGTGTGGCTCTTTGATCTGACAACGCGCGACCTCCATCGCGGCCGATTG" + + "TTTCTGCGGACCATGTCGTCCTCATAGTTTGGGCATGTTTCCGTTGTAGGAGTGAAGCCA" + + "CTTAGCTTTGCGCCGTAGTCCCAATGAAAAACCTATGGACTTTGTTTTGGGTAGCATCAG" + + 
"GAATCTGAACCCTGTGAATGTGGGGGTCGCGCGCATAGACCTTTATCTCCGGTTCAAGTT" + + "AGGCATGAGGCTGCATGCTACGTTGTCACACCTACACTGCTCGAAGTAAATATGGGAAGC" + + "GCGCGGCCTGGCCCGAGGCGTTCCGCGCCGCCACGTGTTCGTTAACTGTTGATTGGTGGC" + + "ACATAAGCAATACCGTAGTCCCTCAAATTCAGCTCTGTTATCTCGAGCGTTATGTGTCAA" + + "ATGGCGTAGAACGGGATTGACTGTTTGACACTAGCTGGTGTTCGGTTCGGTAACGGAGAA" + + "TCTGTGGGGCTATGTCACTAATACTTTCGAAACGCCCCGTACCGATGCTGAACAAGTCGA" + + "TGCAGGCTCCCGTCTTTGAATAGGGGTAAACATACAAGTCGATAGAAGATGGGTAGGGGC" + + "CTCCAATTCATCCAACACTCTACGCCTTCTCCAAGAGCTAGTAGGGCACCCTGCAGTTGG" + + "AAAGGGAACTATTTCGTAGGGCGAGCCCATACCGTCTCTCTTGCGGAAGACTTAACACGA" + + "TAGGAAGCTGGAATAGTTTCGAACGATGGTTATTAATCCTAATAACGGAACGCTGTCTGG" + + "AGGATGAGTGTGACGGAGTGTAACTCGATGAGTTACCCGCTAATCGAACTGGGCGAGAGA" + + "TCCCAGCGCTGATGCACTCGATCCCGAGGCCTGACCCGACATATCAGCTCAGACTAGAGC" + + "GGGGCTGTTGACGTTTGGGGTTGAAAAAATCTATTGTACCAATCGGCTTCAACGTGCTCC" + + "ACGGCTGGCGCCTGAGGAGGGGCCCACACCGAGGAAGTAGACTGTTGCACGTTGGCGATG" + + "GCGGTAGCTAACTAAGTCGCCTGCCACAACAACAGTATCAAAGCCGTATAAAGGGAACAT" + + "CCACACTTTAGTGAATCGAAGCGCGGCATCAGAATTTCCTTTTGGATACCTGATACAAAG" + + "CCCATCGTGGTCCTTAGACTTCGTGCACATACAGCTGCACCGCACGCATGTGGAATTAGA" + + "GGCGAAGTACGATTCCTAGACCGACGTACGATACAACTATGTGGATGTGACGAGCTTCTT" + + "TTATATGCTTCGCCCGCCGGACCGGCCTCGCGATGGCGTAGCTGCGCATAAGCAAATGAC" + + "AATTAACCACTGTGTACTCGTTATAACATCTGGCAGTTAAAGTCGGGAGAATAGGAGCCG" + + "CAATACACAGTTTACCGCATCTAGACCTAACTGAGATACTGCCATAGACGACTAGCCATC" + + "CCTCTGGCTCTTAGATAGCCCGATACAGTGATTTTGAAAGGTTTGCGGGGCACAGCTATG" + + "ACTTGCTTAGCTACGTGTGAGGGAAGGAACTTTTGCGTATTTGTATGTTCACCCGTCTAC" + + "TACGCATGCGGGCAGATTATGTAGGTTGAGAGATGCGGGAGAAGTTCTCGACCTTCCCGT" + + "GGGACGTGAACCTATCCCCTAATAGAGCATTCCGTTCGAGCATGGCAGTAAGTACGCCTT" + + "CTCAATTGTGCTAACCTTCATCCCTATCAAAGCTTGGAGCCAATGATCAGGGTTATTCCC" + + "TTGGGACAGACTTCCTACTCACAGTCGGTCACATTGGGCTACTCCATGGGTCTTCAGCTT" + + "GACCCGGTCTGTTGGGCCGCGATTACGTGAGTTAGGGCCCCGGACTGCGCTGTATAGTCG" + + "ATTCTCATCCGGCCCCCACATCTGGAAACCCCAACTTATTTAGATAACATGATTAGCCGA" + + 
"AGTTGCACGGCGTGTCCACCGTGGAGTCCTCCCCGGGTGTCCCTCCTTCATTTGACGATA" + + "AGCAGCGGCTACCACCATTGATTAACACAAGGAACGGTGATGTTAACATAGATTCGGCAC" + + "ATTACTCTTGTAGGTGTGGAATCACTTAGCTACGCGGCGAAGCCTTATGGCAAAACCGAT" + + "GGGCAATGATTCGGGTAGCGCTAAAAGTCCATAGCACGTGCATCCCAACGTGGCGTGCGT" + + "ACAGCTTGACCACCGCTTCACGCTAAGGTGCTGGCCACATGCTAAATTGATGCGCCTGCA" + + "CTGCTCAAAGGATAATTACGAAGCGGGCGGCCTGGCGGGAGCACTACCCCATCGACGCGT" + + "ACTCGAATACTGTTTATTGCTCACACATGAACAAATTAGTAGAGTGCCACTTTCAGCCCT" + + "CTTGTCGTCGGCGATGTGTGTAAAATGGCGTTGATGTGGATCGACTCTATAAAGGTATCT" + + "ACTGATGCGTAGGGAGATCCGGAATCTATTGGCCTATGTCACTGAAACTATCCAAACACC" + + "CCATGTCGATACTGAACGTATCGACGCATACCTCCTTCCTTGAAAACGCACAATCATACA" + + "ACTGGGCACATAATGCGTACGCCCATCTAGTACACCCATCTCTGTGGGTCCAGTTCAAGA" + + "GCTGGAAGAGCACCCTCCACAAGGTCAAGTGGTATCCTGGTAAGGTAAGCTCGTACCGTG" + + "ATTCATGCGACAGGGGTAAGACCATCAGTAGTAGGGATAGTGCCAAACCTCACTCACCAC" + + "TGCCAATAAGGGGTCCTTACCTGAAGAATAAGTGTCAGCCAGTGTAACCCGATGAGGAAC" + + "CCAAAAGGCGAACCGGGCCAGACAACCCGGCGGTATCGCACTCAAAGCCGGGACACGACG" + + "CGTCACAGCCGGTAAGAGTAACCCCGGAGTGAAGACCTATGGGGCTGGATAAAACTGCCG" + + "TGGTAACCGCCTTCAACAACCCGAATACGTGGCACTTCAGGAGGCGCCCGGAGGGGGGAT" + + "GTTTTCTACTATTCGAGGCCGTTCGTTATAACTAGTTGCGTTCCTAGCCGCTATAATTGT" + + "CTCTTTGCCGACTAATGAGAACAACCACACCATAGCGATTTGACGCGGCGCCTCGGAATA" + + "CCGTTTCAGCAGGCGCTTGGTAAGGCCATCGCGAATACCAGGTATCGTGTAAGTAGCGTA" + + "GGCCCGCACGCAAGATAAACTGCTAGGGAACCGCGTTTCCACGACCGGTGCACGATTTAA" + + "TTTCGCCGACGTGATGACATTCCAGGCAGTGCCTCTGCCGCCGGACCCCTCTCGTGATTG" + + "GGTAGCTGGACATGCCCTTGTAAGATATAACAAGAGCCTGCCTGTCTAATGATCTCACGG" + + "CGAAAGTCGGGGAGACAGCAGCGGCTGCAGACATTATACCGCAACAACACTAAGGTGAGA" + + "TAACTCCGTAATTGACTACGCGTTCCTCTAGACCTTACTTGACCGGATACAGTGTCTTTG" + + "ACACGTTTATGGGTTACAGCAATCACATCCAAGACTGGCTATGCACGAAGCAACTCTTGA" + + "GTGTTAAAATGTTGACCCCTGTATTTGGGATGCGGGTAGTAGATGAGTGCAGGGACTCCG" + + "AGGTCAAGTACATTACCCTCTCATAGGGGGCGTTCTAGATCACGTTACCACCATATCATT" + + "CGAGCATGACACCATCTCCGCTGTGCCCATCCTAGTAGTCATTATTCCTATCACGCTTTC" + + 
"GAGTGTCTGGTGGCGGATATCCCCCACGAATGAAAATGTTTTTCGCTGACAGTCATATTG" + + "GGGTGCTCCTAAGCTTTTCCACTTGGCTGGGTCAGCTAGGCCTCCGTGCCCGGAGTTTCG" + + "GCGCAGTGCTGCCGACAGCCGGCCATTGTCTTTGGGGCCTCATTCGAGGGTACCCGGACC" + + "TATCTTGTCGGGACCACCCGGGGTAGTCGTTGGGCTTATGCACCGAAAAGCCCTGCGCCG" + + "GCCTCCCCGCTACGGAAGGTGATAAGCTCCGGCAAGCAATTATGAACAACGCAAGGATCG" + + "CGGATATAAACAGAGAAACGGCTGATTACACCTGTTCGTGTGGTATCGGTAAATAGCCTC" + + "GCGGAGCCTTATGCCATACTCGTCCGCGGAGCACTCTGGTAATGCATATGGTCCACAGGA" + + "CATTCGTCGCTTCCGGGTATGCGCTCTATTTGACGGTCCTTTGGCGCACAGATGCTGGCC" + + "ACCATTTAAATTAGAGCGACTCCACATCTGTAAGGTCCGCCACGCAGACGACAGCCCAGG" + + "GAGACCACTGACCGATCTACCTGAACGGCAACCTTCTGTATCGTACTGGGGCGGAGAGAT" + + "AACTACAGTGCCGCTTACAGCCCCTCTGTCGTCGCCGACGTCTGTAGTCTAGCCTCATTA" + + "TGATTGCACGCTATTGAGGCATTGACTGATGCCGGAAGACATCTGAAATGAACTGGTCTA" + + "TGCGACAGAAACCGTGCACCTACCAAATCTCCTTAGTGTAGGTTCTGACCGATTCGTGCT" + + "TCGTTGAGAACTCACATTTTAACAACAGAGGACATATGCCCTACCTCCATGATCTACTGA" + + "CGTCCCTGAGGCTGCAATTCATGTAATGGGGCAGTATCCGCGGCAAGTCCTAGTGCAATG" + + "GCGGTTTTTTACCCTCGTTCTGAAGAAGAGGCGACGCGGGTGCGGTCATCACTAATGTGG" + + "AAATTGGGAAGACTCTCGGGCCTCCGCCTTTAGGCGGTGCTTACTCTTTCATAAAGGGGC" + + "TGTTAGTTATGCCCCGCGAGGATTCGAAAAGGTGAGCCAACTCGGCCGATCCGGAGAGAC" + + "GGGCTTCAAAGCTGCCTGACGACGGTTGTGGGCCCGTAACAAAATCCTCCCAATAAGCCC" + + "CCGTGAGCGTCGGTTGAACAGCCCTGGTCGGCCCGACCAGAAGCCCGAATATATCGCTTT" + + "ACGGCTCTTGGGCCGGGGTGCGTTACCTTGCAGAAATCGAGGCCGTCCGTTAATTCCTGT" + + "TGCATTCATACCGCGTATATTTGTCTCTTTACCCGCTTACTTGGATAAGCATGGCATAGC" + + "TTTTTATCGGAGCGCCTCCGTACACGGTACGATCGCACGCCTCGTGAGATCAATACGTAT" + + "ACCAGGTGTCCTGTGAGCAGCGAAAGCCTAAACGGGAGATACACCGCCAAAAGTCCGTGT" + + "GAATACGAGTCGTGGCAAATTTGGTCTGGCTGTGATCTAGATATTCCAGGCGGTACGTCT" + + "GCTCTCGCGTGCCTCTAGTGGCTCGCTAGATAGTCTAGCCGCTGGTAAACACTCCATGAC" + + "CCCGGCTCTCCATTGATGCCACGGCGATTGTTGGAGAGCCAGCAGCGACTGCAAACGTCA" + + "GATCAGAGTAATACTAGCAAGCGATAAGTCCCTAACTGGTTGTGGCCTTCTGTAGAGTGA" + + "ACTTCACCACATATGCTGTCTCTGGCACGTGGATGGTTTGGAGAAATCAGATTCAAGTCT" + + 
"GATCAACCTTCAAACAGATCTAGAGTCTAAAACAGTGATCTCCTGCGTGCGAGATAGAAA" + + "TACTAGGTAACTACAGGGACTGCGACGTTTTAAACGTTGGTCCGTCAGAAGCGCCATTCA" + + "GGATCACGTTACCCCGAAAAAAAGGTACCAGGAGCTCTTCTCCTCTGCAGTCAGGTCTAT" + + "AGAAACTACACCATTAACCTTCCTGAGAACCGGGAGGTGGGAATCCGTCACATATGAGAA" + + "GGTATTTGCCCGATAATCAATACTCCAGGCTTCTAACTTTTTCCACTCGCTTGAGCCGGC" + + "TTGGCCTTTCTGCCTGAAGATTCGTTGGACTGGTGCCAACGCGCAGGCATAGTTCCAGGA" + + "GAATTATCCGGGGGCAGTGACAACCAACATCTCGGGTCTTGCCCAACCGGTCTACACGCT" + + "GATATAGCGAATCACCGAGAACCCGGCGCCACGCAATGGAACGTCCTTAACTCTGGCAGG" + + "CAATTAAAGGGAACGTATATATAACGCAAAAAAACTGGAAAATTGGCGAGAGAATCTTCT" + + "CTCTGTCTATCGAAGAATGGCCACGCGGAGGCATGCGTCATGCTAGCGTGCGGGGTACTC" + + "TTGCTATCCATTTGGGTCACAGGACACTCGCTGTTTTCGAATTTACCCTTTATGCGCCGG" + + "TATTGAACCACGCTTATGCCCAGCATCGTTACAACCAGACTGATACTAGATGTATAATGT" + + "CCGCCATGCAGACGAAACCAGTCGGAGATTACCGAGCATTCTATCACGTCGGCGACCACT" + + "AGTGAGCTACTGGAGCCGAGGGGTAACGATGATGCCCCTAAGAACCTCTCGGTCGACGCA" + + "AGCGATTACACTCCTGTCACATCATAATCGTTTGCTATTCAGGGGTTGACCAACACCGGA" + + "TAGCTTTTCACTTGAAGTATTATGCACGACAGGGTGCGTGTACCAACTAAACCTGTTTTA" + + "ACTTACCTCAGACTAGTTGGAAGTGTGGCTAGATCTTAGCTTTCGTCACTAGAGGGCCCA" + + "CGCTTAGTTTTTATGATCCATTGATCTCCTAGACGCTGCAAGATTTGCAACCAGGCAGAC" + + "TTAGCGGTAGGTCCTAGTGCAGCGGGACTTTTTTTCTATAGTCGTTGAGAGGAGGAGTCG" + + "TCAGACCAGATACCTTTGATGTCCTGATTGGAAGGACCGTTGGCCCCCGACCCTTAGACA" + + "GTGTACTCAGTTCTATAAACGAGCTATTAGATATGAGATCCGTAGATTGAAAAGGGTGAC" + + "GGAATTCGCCCGGACGCAAAAGACGGACAGCTAGGTATCCTGAGCACGGTTGCGCGTCCG" + + "AATCAAGCTCCTCTTTACAGGCCCCGGTTTCTGTTGGTCGTAGAGCGCAGAACGGATTGG" + + "GGGGATGTACGACAATATCTCTTAGTCACCTTTGGGTCACGGTCTGCTACCTTACAGGAA" + + "TTCAGACCGTCCTTTAATTTCCCTTGCATATATGTTGCGTTTCTTCGACCTTCTAACCGC" + + "ACCCTTAGGACGAAGACAGATACGTTCTTACCCATACTCCACCGTTGGCAGCGGGATCGC" + + "ATGTCCCACGTGAAACATTGCTAAACCCTCAGGCCTCTGAGCGACAAAAGCTTTAAAGGG" + + "AAATTCGCGCCCATAACTTGGTCCGAATACGGGTTCTAGCATCGTTCGTCTGAGTTTGTT" + + "CTATATAAAACGGGCGCAATGTCTGCTTTGATCAACCTCCAATACCTCGTATGATTGTGC" + + 
"ACCCGCCGGTGACCACTCAATGATGTGGGGTCCCCGTTGCAACTACGAGGATTTATTGAG" + + "ACCGACCTACGTTCGGCATTGTGGGCAGAGTGAAGTATTGGCAAACGTTAAGTGCCGAAC" + + "TAGATCTGACCTAACGGTAAGAGAGTTTCATAATACGTCCAGCCGCATGCGCAGGGTACA" + + "TTTGGACAGTATTGAATGGACTCTGATCAACCTTCACACCGATCTAGAAACGAGTGCGTA" + + "GATCAGCCAGGTGCAAACCAAAAATTCTAGGTTACTAGAAGTTTTGCGACGTTCTAAGAG" + + "TTGGACGAAATGTTTCGCGACCTAGGATGAGGTCGCCCTAGAAAATAGATTTCTGCTACT" + + "CTCCTCATAAGCAGTCCGGTGTATCGAAAGTACAAGACTAGCCTTGCTAGCAACCGCGGG" + + "CTGGGAGCCTAAGGCATCACTCAAGATACAGGCTCGGTAACGTACGCTCTAGCCATCTAA" + + "CTATCCCCTATGTCTTATAGGGACCTACGTTATCTGCCTG"; + + protected final static String REF_MD5 = Utils.calcMD5(REF); + +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java new file mode 100644 index 000000000..564a475b0 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java @@ -0,0 +1,249 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.RandomDNA; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Tests for {@link AssemblyResultSet}. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class AssemblyResultSetUnitTest extends BaseTest +{ + private GenomeLocParser genomeLocParser; + private SAMFileHeader header; + + @BeforeClass + public void init() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + + @Test + public void testEmptyResultSet() { + final AssemblyResultSet subject = new AssemblyResultSet(); + + Assert.assertEquals(subject.getHaplotypeList().size(), 0); + Assert.assertEquals(subject.getHaplotypeCount(),0); + Assert.assertEquals(subject.getReferenceHaplotype(),null); + Assert.assertEquals(subject.getFullReferenceWithPadding(),null); + Assert.assertEquals(subject.getPaddedReferenceLoc(),null); + Assert.assertEquals(subject.getRegionForGenotyping(),null); + Assert.assertEquals(subject.getUniqueReadThreadingGraph(10),null); + Assert.assertFalse(subject.hasMultipleKmerSizes()); + } + + @Test + public void 
testAddReferenceHaplotype() { + + final Haplotype ref = new Haplotype("ACGT".getBytes(),true); + ref.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,ref.length() + 1 )); + final AssemblyResultSet subject = new AssemblyResultSet(); + + Assert.assertTrue(subject.add(ref)); + Assert.assertFalse(subject.add(ref)); + + Assert.assertEquals(subject.getReferenceHaplotype(),ref); + Assert.assertEquals(subject.getHaplotypeCount(),1); + Assert.assertEquals(subject.getHaplotypeList().size(),1); + } + + @Test(dataProvider="assemblyResults") + public void testAddManyHaplotypes(final java.util.List assemblyResults, + final java.util.List> haplotypes) { + final AssemblyResultSet subject = new AssemblyResultSet(); + for (int i = 0; i < haplotypes.size(); i++) { + final int haplotypeCountBefore = subject.getHaplotypeCount(); + final java.util.List haplos = haplotypes.get(i); + final AssemblyResult ar = assemblyResults.get(i); + for (final Haplotype h : haplos) { + Assert.assertTrue(subject.add(h, ar)); + Assert.assertFalse(subject.add(h,ar)); + if (h.isReference()) + Assert.assertEquals(subject.getReferenceHaplotype(),h); + } + final int haplotypeCountAfter = subject.getHaplotypeCount(); + Assert.assertEquals(haplos.size(),haplotypeCountAfter - haplotypeCountBefore); + Assert.assertTrue(subject.getMaximumKmerSize() >= ar.getKmerSize()); + Assert.assertTrue(subject.getMinimumKmerSize() <= ar.getKmerSize()); + Assert.assertEquals(subject.getUniqueReadThreadingGraph(ar.getKmerSize()), ar.getThreadingGraph()); + } + } + + @Test(dataProvider="trimmingData") + public void testTrimTo(final Map haplotypesAndResultSets, final ActiveRegion original) { + final AssemblyResultSet subject = new AssemblyResultSet(); + for (final Map.Entry entry : haplotypesAndResultSets.entrySet()) + subject.add(entry.getKey(),entry.getValue()); + subject.setRegionForGenotyping(original); + final GenomeLoc originalLocation = original.getExtendedLoc(); + final int length = originalLocation.size(); + 
final GenomeLoc newLocation = originalLocation.setStop(originalLocation.setStart(originalLocation,originalLocation.getStart() + length / 2),originalLocation.getStop() - length / 2); + final ActiveRegion newRegion = original.trim(newLocation); + + final Map originalHaplotypesByTrimmed = new HashMap<>(haplotypesAndResultSets.size()); + for (final Haplotype h : haplotypesAndResultSets.keySet()) + originalHaplotypesByTrimmed.put(h.trim(newRegion.getExtendedLoc()), h); + + final AssemblyResultSet trimmed = subject.trimTo(newRegion, originalHaplotypesByTrimmed); + + Assert.assertFalse(subject.wasTrimmed()); + Assert.assertTrue(trimmed.wasTrimmed()); + + for (final Haplotype h : trimmed.getHaplotypeList()) { + Assert.assertEquals(h.getGenomeLocation(),newLocation); + Assert.assertEquals(h.getBases().length,newLocation.size()); + } + } + + @DataProvider(name="trimmingData") + public Iterator trimmingData() { + final ActiveRegion activeRegion = new ActiveRegion(genomeLocParser.createGenomeLoc("chr1",1000,1100),genomeLocParser,25); + final int length = activeRegion.getExtendedLoc().size(); + final RandomDNA rnd = new RandomDNA(13); // keep it reproducible by fixing the seed to lucky 13. 
+ final ActiveRegionTestDataSet actd = new ActiveRegionTestDataSet(10,new String(rnd.nextBases(length)),new String[] { + "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]); + + final List haplotypes = actd.haplotypeList(); + for (final Haplotype h : haplotypes) + h.setGenomeLocation(activeRegion.getExtendedLoc()); + + final ReadThreadingGraph rtg = new ReadThreadingGraph(10); + for (final Haplotype h : haplotypes) + rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference()); + final SeqGraph seqGraph = rtg.convertToSequenceGraph(); + final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph); + ar.setThreadingGraph(rtg); + final Map result = + new HashMap<>(); + for (final Haplotype h : haplotypes) + result.put(h,ar); + return Collections.singleton(new Object[] {result,activeRegion}).iterator(); + + } + + + + + @DataProvider(name="assemblyResults") + public java.util.Iterator assemblyResults() { + final int size = THREE_KS_GRAPH_AND_HAPLOTYPES.length * (1 + TEN_KS_GRAPH_AND_HAPLOTYPES.length); + final Object[][] result = new Object[size][]; + + for (int i = 0; i < THREE_KS_GRAPH_AND_HAPLOTYPES.length; i++) { + final ReadThreadingGraph rtg = new ReadThreadingGraph((String) THREE_KS_GRAPH_AND_HAPLOTYPES[i][0]); + final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + final Object[] haplotypeStrings = (Object[]) THREE_KS_GRAPH_AND_HAPLOTYPES[i][1]; + final Haplotype[] haplotypes = new Haplotype[haplotypeStrings.length]; + for (int j = 0; j < haplotypeStrings.length; j++) { + haplotypes[j] = new Haplotype(((String)haplotypeStrings[j]).getBytes(),j == 0); + haplotypes[j].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,haplotypes[j].length() + 1)); + } + result[i] = new Object[] { Collections.singletonList(ar),Arrays.asList(Arrays.asList(haplotypes))}; + for (int j = 0; j < 
TEN_KS_GRAPH_AND_HAPLOTYPES.length; j++) { + final ReadThreadingGraph rtg10 = new ReadThreadingGraph((String) TEN_KS_GRAPH_AND_HAPLOTYPES[j][0]); + final AssemblyResult ar10 = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,rtg10.convertToSequenceGraph()); + ar10.setThreadingGraph(rtg10); + final Object[] haplotypeStrings10 = (Object[]) TEN_KS_GRAPH_AND_HAPLOTYPES[j][1]; + final Haplotype[] haplotype10 = new Haplotype[haplotypeStrings10.length]; + for (int k = 0; k < haplotypeStrings10.length; k++) { + haplotype10[k] = new Haplotype(((String)haplotypeStrings10[k]).getBytes(),false); + haplotype10[k].setGenomeLocation(genomeLocParser.createGenomeLoc("chr1", 1, haplotype10[k].length() + 1)); + } + + result[THREE_KS_GRAPH_AND_HAPLOTYPES.length + i * TEN_KS_GRAPH_AND_HAPLOTYPES.length + j] = new Object[] { Arrays.asList(ar,ar10), + Arrays.asList( Arrays.asList(haplotypes), Arrays.asList(haplotype10)) }; + } + } + return Arrays.asList(result).iterator(); + } + + + private static final Object[][] THREE_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { + {"[ks=3]{REF: ACT}",new Object[] {"ACT"}}, + {"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (3) -> A -> G -> (2) }" + + "{ (1) -> A -> G -> (2) }",new Object[] {"ACTTGA","ACTAGGA","ACTTAGGA"}}, + {"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}", new Object[] {"ACTCG"}} , + {"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + + "{A(1) -> T -> A(2) }", new Object[] {"ACTAGACGT","ACTATACGT"}} , + {"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" + + "{ ACT -> A -> T(2) -> C -> T -> A -> C -> G -> T -> A(1) -> T}", + new Object[] {"ACTATCAGTACGTAT","ACTATCTACGTAT"}} , + {"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T}", + new Object[] {"ACTATCAGTACGTAT"}}, + {"[ks=3]{REF: ACT -> A -> T(1) }" + + "{ ACT -> A -> T(1) }", new Object[] {"ACTAT"}}, + {"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ", new Object[] 
{"TTTACT","TTTAT"}} + }; + + private static final Object[][] TEN_KS_GRAPH_AND_HAPLOTYPES = new Object[][] { + {"[ks=10]{ACTAGTAAAT -> A -> T -> A -> A -> T -> A", new Object[] {"ACTAGTAAATATAATA"}}, + {"[ks=10]{ATAGTAATAA(1) -> A -> C -> T -> A(2) -> C}{ (1) -> C -> C -> C -> A(2) -> C}", + new Object[] {"ATAGTAATAAACTAC","ATAGTAATAACCCAC"}}, + + }; + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java new file mode 100644 index 000000000..cfdda7c4d --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/Civar.java @@ -0,0 +1,1135 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; + +import java.util.*; + +/** + * Compact Idosyncratic Variation Alignment Report. + *

+ * Allows specifying variation in a sequence. + *

+ */ +public class Civar { + + + protected List elements; + private String string; + private transient int minimumTemplateSize = -1; + private transient Boolean expands = null; + private transient int starCount = -1; + private Boolean hasEmbeddedCivars = null; + private Boolean hasOptionalVariation = null; + private Boolean hasVariation = null; + private Boolean allVariationIsOptional; + + + protected Civar(final List elements) { + this.elements = elements; + } + + public static Civar fromCharSequence(final CharSequence cs, final int from, final int to) { + return Parser.parse(cs, from, to); + } + + public static Civar fromCharSequence(final CharSequence cs) { + return fromCharSequence(cs, 0, cs.length()); + } + + @Override + public String toString() { + if (string == null) + analyzeCivar(); + return string; + } + + /** + * Correspond to the minimum template sequence size to which this civar can be applied to. + */ + public int minimumTemplateSequenceSize() { + if (minimumTemplateSize < 0) + analyzeCivar(); + return minimumTemplateSize; + } + + public boolean expands() { + if (expands == null) + analyzeCivar(); + return expands; + } + + public boolean hasEmbeddedCivars() { + if (hasEmbeddedCivars == null) + analyzeCivar(); + return hasEmbeddedCivars; + } + + public List unroll() { + if (!isUnrolled()) { + final List result = new LinkedList<>(); + unroll(0, new LinkedList(), "", result); + return result; + } else { + return Collections.singletonList(this); + } + } + + public Civar optionalizeAll() { + if (allElementsAreOptional()) { + return this; + } + final Element[] newElements = new Element[this.elements.size()]; + int next = 0; + for (final Element e : elements) { + final Element newElement = e.clone(); + if (newElement.operator() == Operator.EMBEDDED) { + newElement.embedded = newElement.embedded.optionalizeAll(); + } + newElement.makeOptional(); + newElements[next++] = newElement; + } + return new 
Civar(Collections.unmodifiableList(Arrays.asList(newElements))); + } + + private boolean allElementsAreOptional() { + if (allVariationIsOptional == null) + analyzeCivar(); + return allVariationIsOptional; + } + + + private void unroll(final int elementIndex, final LinkedList leftElements, final String leftString, final List dest) { + if (elementIndex == elements.size()) { + final Civar result = new Civar(Collections.unmodifiableList(new ArrayList<>(leftElements))); + dest.add(result); + return; + } + final Element currentElement = elements.get(elementIndex); + if (currentElement.operator() == Operator.EMBEDDED) { + List embeddedUnroll = currentElement.embedded.unroll(); + Civar novar = null; + for (final Civar c : embeddedUnroll) { + if (!c.hasVariation()) { + novar = c; + break; + } + } + if (novar == null && currentElement.isOptional()) { + + embeddedUnroll = new LinkedList<>(embeddedUnroll); + embeddedUnroll.add(0, currentElement.embedded.novarEquivalent()); + } + + for (final Civar embedded: embeddedUnroll) { + final Element embeddedElement = new Element(embedded,false); + leftElements.add(embeddedElement); + unroll(elementIndex + 1, leftElements, leftString + embeddedElement.toString(), dest); + leftElements.removeLast(); + } + } else if (currentElement.isOptional() && currentElement.isVariant()) { + leftElements.add(currentElement.matchEquivalent()); + unroll(elementIndex + 1, leftElements, leftString + currentElement.matchEquivalent().toString(), dest); + leftElements.removeLast(); + leftElements.add(currentElement.mandatoryEquivalent()); + unroll(elementIndex + 1, leftElements, leftString + currentElement.mandatoryEquivalent().toString(), dest); + leftElements.removeLast(); + } else { + leftElements.add(currentElement); + unroll(elementIndex + 1, leftElements, leftString + currentElement.toString(), dest); + leftElements.removeLast(); + } + } + + private Civar novarEquivalent() { + int mtss = minimumTemplateSequenceSize(); + int sc = starCount(); + if (mtss 
> 0) { + if (sc > 0) { + final Civar newEmbedded = new Civar(Collections.unmodifiableList(Arrays.asList( + new Element(Operator.MATCH,mtss,false,false),new Element(Operator.MATCH,sc,true,false)))); + //newEmbedded.string = newEmbedded.elements.get(0).toString() + newEmbedded.elements.get(1).toString(); + return newEmbedded; + } else { + return new Civar(Collections.unmodifiableList(Collections.singletonList(new Element(Operator.MATCH, mtss, false, false)))); + } + } else if (sc > 0) { + return new Civar(Collections.unmodifiableList(Collections.singletonList(new Element(Operator.MATCH, sc, true, false)))); + } else { + return new Civar((List) (List) Collections.emptyList()); + } + } + + public String applyTo(final CharSequence seq) { + return applyTo(seq,0,seq.length()); + } + + public List eventOffsets(final CharSequence seq, final int from, final int to) { + if (!isUnrolled()) + throw new UnsupportedOperationException("you cannot apply an unrolled Civar to a DNA sequence"); + final List result = new ArrayList<>(elements().size()); + final CharSequence sequence = seq; + + int sequenceLength = sequence.length(); + int minSeqLen = minimumTemplateSequenceSize(); + if (!expands() && sequenceLength != minSeqLen) + throw new IllegalArgumentException("the sequence provided does not match this Civar size " + sequence.length() + " != " + minSeqLen); + if (sequenceLength < minSeqLen) + throw new IllegalArgumentException("the sequence provided is too small for this Civar " + sequence.length() + " < " + minSeqLen); + int starCount = starCount(); + int paddingTotal = sequenceLength - minSeqLen; + int starPadding = starCount == 0 ? 0 : paddingTotal / starCount; + int excessPadding = starCount == 0 ? 
paddingTotal : paddingTotal % starCount; + int nextInSeq = 0; + int nextElement = 0; + + int outputLength = 0; + while (nextInSeq < sequenceLength && nextElement < elements().size()) { + final Element e = elements().get(nextElement++); + final int outputStart = outputLength; + final int sequenceStart = nextInSeq; + int size = e.expands() ? starPadding * e.size() : e.size(); + if (e.expands() && excessPadding != 0) { + size++; + excessPadding--; + } + switch (e.operator()) { + case EMBEDDED: + throw new IllegalStateException("supposed to be unrolled Civar"); + case DELETION: + nextInSeq += size; + break; + case INSERTION: + outputLength += size; + break; + default: + outputLength += size; + nextInSeq += size; + break; + } + final int outputEnd = outputLength; + final int sequenceEnd = nextInSeq; + if (outputEnd > from && outputStart < to) { + result.add(new ElementOffset(e,Math.max(outputStart - from,0), + Math.min(outputEnd - from, to -from), + sequenceStart + Math.max(from - outputStart,0), + sequenceEnd - Math.max(outputEnd - to,0))); + } + } + if (nextInSeq != sequenceLength) { + throw new IllegalStateException("probable bug: mismatched sequence and Civar application length " + nextInSeq + " != " + to); + } + return result; + } + + public String applyTo(final CharSequence seq, final int from, final int to) { + + final CharSequence sequence = seq.subSequence(from, to); + if (!isUnrolled()) + throw new UnsupportedOperationException("you cannot apply an unrolled Civar to a DNA sequence"); + + int referenceLength = sequence.length(); + int minSeqLen = minimumTemplateSequenceSize(); + if (!expands() && referenceLength != minSeqLen) + throw new IllegalArgumentException("the sequence provided does not match this Civar size " + sequence.length() + " != " + minSeqLen); + if (referenceLength < minSeqLen) + throw new IllegalArgumentException("the sequence provided is too small for this Civar " + sequence.length() + " < " + minSeqLen); + int starCount = starCount(); + int 
paddingTotal = referenceLength - minSeqLen; + int starPadding = starCount == 0 ? 0 : paddingTotal / starCount; + int excessPadding = starCount == 0 ? paddingTotal : paddingTotal % starCount; + int nextInSeq = 0; + int nextElement = 0; + + final StringBuffer sb = new StringBuffer(sequence.length() * 2); + while (nextInSeq < to && nextElement < elements.size()) { + final Element e = elements.get(nextElement++); + int size = e.expands() ? starPadding * e.size() : e.size(); + if (e.expands() && excessPadding != 0) { + size++; + excessPadding--; + } + switch (e.operator()) { + case EMBEDDED: + throw new IllegalStateException("supposed to be unrolled Civar"); + case MATCH: + sb.append(sequence.subSequence(nextInSeq, nextInSeq += size)); + break; + case DELETION: + nextInSeq += size; + break; + case INSERTION: + final CharSequence xmer = e.xmer().toString().toUpperCase(); + while (size >= xmer.length()) { + sb.append(xmer); + size -= xmer.length(); + } + sb.append(xmer.subSequence(0, size)); + break; + case TRANSITION: + transition(sequence, nextInSeq, sb, size); + nextInSeq += size; + break; + case TRANSVERSION: + transversion(sequence, nextInSeq, sb, size); + nextInSeq += size; + break; + case COMPLEMENT: + complement(sequence, nextInSeq, sb, size); + nextInSeq += size; + break; + } + } + if (nextInSeq != referenceLength) { + throw new IllegalStateException("probable bug: mismatched sequence and Civar application length " + nextInSeq + " != " + to); + } + return sb.toString(); + } + + + private static char transversion(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': + return 'C'; + case 'G': + return 'T'; + case 'C': + return 'A'; + case 'T': + return 'G'; + default: + return c; + } + } + + private static char complement(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': + return 'T'; + case 'G': + return 'C'; + case 'T': + return 'A'; + case 'C': + return 'G'; + default: + return c; + } + + } + + private static char transition(final 
char c) { + switch (Character.toUpperCase(c)) { + case 'A': + return 'G'; + case 'G': + return 'A'; + case 'T': + return 'C'; + case 'C': + return 'T'; + default: + return c; + } + + } + + private void transition(final CharSequence charSequence, final int from, final StringBuffer dest, final int length) { + for (int i = from; i < length + from; i++) { + dest.append(transition(charSequence.charAt(i))); + } + } + + private void transversion(final CharSequence cs, final int from, final StringBuffer dest, final int length) { + for (int i = from; i < length + from; i++) { + dest.append(transversion(cs.charAt(i))); + } + } + + private void complement(final CharSequence cs, final int from, final StringBuffer dest, final int length) { + for (int i = from; i < length + from; i++) { + dest.append(complement(cs.charAt(i))); + } + } + + private static List cle(final List original) { + if (original.size() == 0) + return original; + if (original.size() == 1) { + final Element e = original.get(0); + if (e.operator() == Operator.EMBEDDED) { + return original; + } else if (e.size() == 0) { + return Collections.emptyList(); + } else { + return original; + } + } else { + final ArrayList result = new ArrayList<>(original); + for (int i = 0; i < result.size(); ) { + final Element e = result.get(i); + if (e.operator() == Operator.EMBEDDED) { + i++; + } else if (e.size() == 0) { + result.remove(i); + } else if (i == result.size() - 1) { + i++; + } else { + final Element next = result.get(i + 1); + if (next.operator() != e.operator()) { + i++; + } else if (e.operator() != Operator.INSERTION) { + if (next.expands() == e.expands()) { + result.remove(i + 1); + result.set(i, new Element(e.operator(), e.size() + next.size(), e.expands(), false, null)); + } else if (e.expands()) { + int j; + for (j = i + 1; j < result.size(); j++) { + if (result.get(j).operator() == Operator.MATCH && result.get(j).expands()) { + break; + } + } + result.add(j, e); + result.remove(i); + } + } else { // INSERTION 
my be fussed if their sizes correspond to the xmers without expansions + if (!e.expands() && !next.expands() && e.xmer().length() == e.size() && next.xmer().length() == next.size()) { + result.remove(i + 1); + result.set(i, new Element(Operator.INSERTION, e.size() + next.size(), false, false, e.xmer().toString() + next.xmer().toString())); + } + } + } + } + return result; + } + } + + + protected void analyzeCivar() { + int minimumTemplateSize = 0; + boolean expands = false; + int starCount = 0; + boolean hasEmbeddedCivars = false; + boolean hasOptionalElements = false; + boolean allElementsAreOptional = true; + boolean hasVariation = false; + StringBuffer strBuffer = new StringBuffer(100); + for (final Element e : elements) { + strBuffer.append(e.toString()); + if (e.operator() == Operator.EMBEDDED) { + hasEmbeddedCivars = true; + if (e.embedded.hasVariation()) { + hasVariation = true; + allElementsAreOptional &= (e.optional || e.embedded.allElementsAreOptional()); + hasOptionalElements |= e.optional || e.embedded.hasOptionalVariationElements(); + } + minimumTemplateSize += e.embedded.minimumTemplateSequenceSize(); + if (e.embedded.expands()) { + expands = true; + starCount += e.embedded.starCount(); + } + } else { + if (e.isVariant()) { + hasVariation = true; + allElementsAreOptional &= e.optional; + hasOptionalElements |= e.optional; + } + if (e.expands()) { + starCount += e.size(); + expands = true; + continue; + } + if (e.operator() == Operator.INSERTION) + continue; + minimumTemplateSize += e.size(); + } + } + this.string = strBuffer.toString(); + this.hasVariation = hasVariation; + this.allVariationIsOptional = allElementsAreOptional; + this.hasOptionalVariation = hasOptionalElements; + this.hasEmbeddedCivars = hasEmbeddedCivars; + this.starCount = starCount; + this.expands = expands; + this.minimumTemplateSize = minimumTemplateSize; + } + + public boolean isUnrolled() { + return !hasOptionalVariationElements(); + } + + public boolean hasVariation() { + if 
(hasVariation == null) + analyzeCivar(); + return hasVariation; + } + + private boolean hasOptionalVariationElements() { + if (hasOptionalVariation == null) + analyzeCivar(); + return hasOptionalVariation; + } + + private int starCount() { + if (starCount == -1) + analyzeCivar(); + return starCount; + } + + public List elements() { + return elements; + } + + + public enum Operator { + + MATCH('='), INSERTION('I'), DELETION('D'), TRANSITION('T'), EMBEDDED('('), + + /** + * Transversion that is not to the complement nucleotide + */ + TRANSVERSION('V'), + + /** + * Transverison to the complement nucleotide A <-> T or C <-> G + */ + COMPLEMENT('C'), + + /** + * Marks + */ + START('^'), END('$'); + + public final char charValue; + + Operator(final char c) { + charValue = c; + } + + /** + * Does this operator represents a SNP. + * + * @return true is so, false otherwise. + */ + public boolean isSnp() { + switch (this) { + case TRANSITION: + case TRANSVERSION: + case COMPLEMENT: + return true; + default: + return false; + } + } + + /** + * Checks whether the operation requires an x-mer. 
+ * + * @return true if so, false otherwise + */ + public boolean requiresXmer() { + return this == INSERTION; + } + + public boolean acceptsXmer() { + return this == INSERTION; + } + + public static Operator fromChar(final char c) { + switch (c) { + case 'I': + return INSERTION; + case 'V': + return TRANSVERSION; + case 'T': + return TRANSITION; + case 'C': + return COMPLEMENT; + case 'M': + case '=': + return MATCH; + case '?': + return START; + case '$': + return END; + case 'D': + return DELETION; + default: + throw new IllegalArgumentException("the chacter " + c + " does not denote a valid Civar operator"); + } + } + } + + + public static class Element implements Cloneable { + + protected int size; + protected boolean expands; + protected Operator o; + + protected CharSequence xmer; + protected Civar embedded; + protected boolean optional; + + + public Element(final Civar c, boolean optional) { + this(Operator.EMBEDDED, 1, false, optional); + embedded = c; + } + + @Override + protected Element clone() { + try { + return (Element) super.clone(); + } catch (CloneNotSupportedException e) { + throw new RuntimeException("unexpected exception ", e); + } + } + + + /** + * Calculate the length provided the the length of the padding unit and how much + * exceed padding is still available. + * + * @param starPadding + * @param excessPaddingRemaining + * + * @return never {@code null}. + */ + public int size(final int starPadding, final int excessPaddingRemaining) { + if (!expands) + return size; + else + return size * (starPadding) + Math.min(excessPaddingRemaining,size); + } + + /** + * Returns the size of this element. + *

+ * If the element is non-expandable, this value is the exact size of the indicated operation in terms of bases + * (deleted, inserted, changed, etc.). + *

+ * If the element is expandable, this is relative size respect with other expandable elements. + * + * @return never negative, and typically greater than 0; 0 would be use for possible future symbolic operands. + */ + public int size() { + return size; + } + + /** + * Checks whether this is an expandable element. + * + * @return true if so, false otherwise. + */ + public boolean expands() { + return expands; + } + + public boolean isOptional() { + return optional; + } + + /** + * Returns the appendix sequence for those elements for which it applies. + * + * @return never null but perhaps an empty sequence. You must not attempt to modify the returned element. + */ + public CharSequence xmer() { + return xmer; + } + + + /** + * Returns the operator for this element. + * + * @return never null. + */ + public Operator operator() { + return o; + } + + protected Element(final Operator o, final int size, final boolean expands, final boolean optional) { + if (o == null) + throw new IllegalArgumentException("operator cannot be null"); + if (size < 0) + throw new IllegalArgumentException("element size cannot be negative"); + this.o = o; + this.optional = optional; + this.expands = expands; + this.size = size; + } + + /** + * Constructs an element given its properties. + * + * @param o the operator for this element. + * @param size the size of the element + * @param expands if the element is expandable. + * @param xmer the xmer for this element. 
+ */ + public Element(final Operator o, final int size, final boolean expands, final boolean optional, final String xmer) { + this(o,size,expands,optional); + if ((xmer == null || xmer.length() == 0) && o.requiresXmer()) + throw new IllegalArgumentException("operator " + o + " requires a x-mer"); + if ((xmer != null && !o.acceptsXmer())) + throw new IllegalArgumentException("operator " + o + " does not accept a x-mer"); + this.xmer = xmer; + } + + public String toString() { + if (operator() == Operator.EMBEDDED) { + return "(" + embedded.toString() + ")"; + } + final String sizeString = expands ? (size == 1 ? "*":"" + size) : ("" + size); + final String sizeAndOperator = sizeString + operator().charValue; + final String sizeAndOperatorAndXmer = o == Operator.INSERTION ? sizeAndOperator + xmer : sizeAndOperator; + return optional ? sizeAndOperatorAndXmer + "?" : sizeAndOperatorAndXmer; + } + + protected void makeOptional() { + if (o != Operator.MATCH) + optional = true; + } + + public boolean isVariant() { + switch (o) { + case MATCH: return false; + default: + return true; + } + } + + private Element matchEquivalent() { + switch (o) { + case MATCH: return this; + case INSERTION: return new Element(Operator.MATCH,0,false,false); + case EMBEDDED: + int mtss = embedded.minimumTemplateSequenceSize(); + int sc = embedded.starCount(); + if (mtss > 0) { + if (sc > 0) { + final Civar newEmbedded = new Civar(Collections.unmodifiableList(Arrays.asList( + new Element(Operator.MATCH,mtss,false,false),new Element(Operator.MATCH,sc,true,false)))); + //newEmbedded.string = newEmbedded.elements.get(0).toString() + newEmbedded.elements.get(1).toString(); + return new Element(newEmbedded,false); + } else { + return new Element(Operator.MATCH,mtss,false,false); + } + } else if (sc > 0) { + return new Element(Operator.MATCH,sc,true,false); + + } else { + return new Element(Operator.MATCH,0,false,false); + } + default: + return new Element(Operator.MATCH,size,expands,false); + } + } + 
+ public Element mandatoryEquivalent() { + final Element result = this.clone(); + result.optional = false; + return result; + } + + public int excessPaddingUsed(final int excessPadding) { + return expands ? Math.min(excessPadding,size) : 0; + } + } + + protected static class Parser { + + public static Civar parse(final CharSequence cs, final int from, final int to) { + if (cs == null) + throw new NullPointerException("the input char-sequence cannot be null"); + if (from < 0) + throw new IndexOutOfBoundsException("the from index cannot be negative"); + if (to < from) + throw new IllegalArgumentException("the to index cannot less than the from index"); + if (to > cs.length()) + throw new IllegalArgumentException("the to index cannot be greater than the end of the sequence"); + if (cs == null) + throw new IllegalArgumentException("cs cannot be null"); + + final String s = cs.subSequence(from, to).toString(); + final LinkedList tokens = tokenize(s, 0, s.length()); + final LinkedList elements = elementize(tokens, s); + final Civar result; + if (elements.size() == 0) { + result = new Civar((List) (List) Collections.emptyList()); + } else if (elements.size() == 1) { + result = new Civar(Collections.singletonList(elements.getFirst())); + } else { + result = new Civar(Collections.unmodifiableList(Arrays.asList(elements.toArray(new Element[elements.size()])))); + } + //result.string = s; + return result; + } + + @Requires("tokens != null") + private static LinkedList elementize(final LinkedList tokens, final CharSequence cs) { + LinkedList elements = new LinkedList<>(); + Stack stack = new Stack<>(); + while (!tokens.isEmpty()) { + Token nextToken = tokens.pollFirst(); + stack.push(nextToken); + switch (nextToken.type) { + case OPERATOR: + if (nextToken.asOperator() == Operator.INSERTION) break; + stack.push(Token.element(popOperation(stack, null,cs))); + break; + case XMER: + stack.push(Token.element(popOperation(stack, stack.pop().asXmer(),cs))); + break; + case 
CLOSE_BRACKET: + stack.push(Token.element(popEmbedded(stack, cs))); + break; + case QMARK: + stack.pop(); + final Token t; + stack.push(t = Token.element(popOperation(stack,null,cs))); + t.asElement().makeOptional(); + break; + default: + } + } + + for (int i = 0; i < stack.size(); i++) { + final Token t = stack.get(i); + if (t.type == TokenType.ELEMENT) { + elements.add(t.asElement()); + } else if (t.type == TokenType.STAR) { + elements.add(new Element(Operator.MATCH,1,true, false, null)); + } else if (t.type == TokenType.NUMBER) { + if (i < stack.size() - 1) { + if (stack.get(i+1).type == TokenType.STAR) { + elements.add(new Element(Operator.MATCH,t.asNumber(),true, false, null)); + i++; + } else { + elements.add(new Element(Operator.MATCH,t.asNumber(),false, false, null)); + } + } else { + elements.add(new Element(Operator.MATCH,t.asNumber(),false, false, null)); + } + } else { + throw new IllegalArgumentException("Invalid Civar string: " + cs); + } + } + return elements; + } + + private static Element popEmbedded(final Stack stack, final CharSequence cs) { + Token closeBracket = stack.pop();// remove close parentesis. 
+ LinkedList embeddedElements = new LinkedList<>(); + while (!stack.isEmpty()) { + final Token nextToken = stack.pop(); + if (nextToken.type == TokenType.OPEN_BRACKET) { + final Civar embeddedCivar = new Civar(Collections.unmodifiableList(embeddedElements)); + //embeddedCivar.string = cs.subSequence(nextToken.asNumber() + 1, closeBracket.asNumber()).toString(); + return new Element(embeddedCivar,false); + } else if (nextToken.type != TokenType.ELEMENT) { + throw new IllegalArgumentException("Civar format error"); + } else { + embeddedElements.add(0, nextToken.asElement()); + } + } + throw new IllegalArgumentException("Civar format error"); + } + + private static Element popOperation(final Stack stack, final String xmer, final CharSequence cs) { + if (stack.isEmpty()) { + throw new IllegalArgumentException("Invalid Civar string: " + cs); + } + Token operator = stack.pop(); + if (operator.type == TokenType.ELEMENT) { + return operator.asElement(); + } + if (operator.type != TokenType.OPERATOR) { + throw new IllegalArgumentException("Invalid Civar string:" + operator.type + " " + cs); + } + if (stack.isEmpty()) { + return new Element(operator.asOperator(), 1, false, false, xmer); + } else { + Token sizeOrStar = stack.pop(); + if (sizeOrStar.type == TokenType.STAR) { + if (stack.isEmpty()) { + return new Element(operator.asOperator(), 1, true, false, xmer); + } else if (stack.peek().type == TokenType.NUMBER) { + return new Element(operator.asOperator(), stack.pop().asNumber(), true, false, xmer); + } else { + return new Element(operator.asOperator(), 1, true, false, xmer); + } + } else if (sizeOrStar.type == TokenType.NUMBER) { + return new Element(operator.asOperator(), sizeOrStar.asNumber(), false, false, xmer); + } else { + stack.push(sizeOrStar); + return new Element(operator.asOperator(), 1, false, false, xmer); + } + } + } + + + private static LinkedList tokenize(final CharSequence cs, final int from, final int to) { + final LinkedList tokens = new 
LinkedList<>(); + int i = from; + while (i < to) { + char c = cs.charAt(i++); + // NUMBER tokens + if (Character.isDigit(c)) { + int num = 0; + do { + num = num * 10 + (c - '0'); + if (i == to) + break; + c = cs.charAt(i++); + } + while (Character.isDigit(c) || (i-- == 0)); // || (i-- == 0) is a trick to "pushback" the first non digit character. + tokens.add(Token.number(num)); + } else if (c == '*') { + tokens.add(Token.star()); + } else if (c == '(') { + tokens.add(Token.openBracket(i - 1)); + } else if (c == ')') { + tokens.add(Token.closeBracket(i - 1)); + } else if (c == '?') { + tokens.add(Token.qmark()); + } else if (c == '=') { + tokens.add(Token.operator(Operator.MATCH)); + } else if (Character.isLetter(c)) { + if (Character.isUpperCase(c)) { + tokens.add(Token.operator(Operator.fromChar(c))); + } else { + int start = i - 1; + do { + if (i == to) + break; + c = cs.charAt(i++); + } while (Character.isLowerCase(c) || (i-- == 0)); + tokens.add(Token.xmer(cs.subSequence(start, i))); + } + } else { + throw new IllegalArgumentException("the Civar string contains invalid characters starting with '" + c + '"'); + } + } + return tokens; + } + + + public Civar parse(final CharSequence cs) { + if (cs == null) + throw new IllegalArgumentException("cs cannot be null"); + return parse(cs, 0, cs.length()); + } + + + } + + /** + * Transforms a civar into the equivalent Cigar. + * @return never {@code null}. + */ + public Cigar toCigar(final int templateLength) { + + int minSeqLen = minimumTemplateSequenceSize(); + if (!expands() && templateLength != minSeqLen) + throw new IllegalArgumentException("the sequence provided does not match this Civar size " + templateLength + " != " + minSeqLen); + if (templateLength < minSeqLen) + throw new IllegalArgumentException("the sequence provided is too small for this Civar " + templateLength + " < " + minSeqLen); + int starCount = starCount(); + int paddingTotal = templateLength - minSeqLen; + int starPadding = starCount == 0 ? 
0 : paddingTotal / starCount; + int excessPadding = starCount == 0 ? paddingTotal : paddingTotal % starCount; + + // We first get the equivalent cigar elements for the elements in the Civar. + final List cigarElements = new LinkedList<>(); + + for (final Element e : this.elements()) { + final int size = e.size(starPadding,excessPadding); + excessPadding -= e.excessPaddingUsed(excessPadding); + + switch (e.operator()) { + case EMBEDDED: + cigarElements.addAll(e.embedded.toCigar(size).getCigarElements()); + break; + case MATCH: + case TRANSITION: + case COMPLEMENT: + case TRANSVERSION: + cigarElements.add(new CigarElement(size, CigarOperator.M)); + break; + case INSERTION: + cigarElements.add(new CigarElement(size,CigarOperator.I)); + break; + case DELETION: + cigarElements.add(new CigarElement(size,CigarOperator.D)); + break; + default: + } + } + + // No we look for consequitive elements with the same operator and we merge them. + final ListIterator it = cigarElements.listIterator(); + while (it.hasNext()) { + final CigarElement thisElement = it.next(); + if (!it.hasNext()) + continue; + final CigarElement nextElement = it.next(); + if (thisElement.getOperator() == nextElement.getOperator()) { + final int nextLength = nextElement.getLength(); + it.remove(); + it.previous(); + it.set(new CigarElement(thisElement.getLength() + nextLength, thisElement.getOperator())); + } else + it.previous(); + } + return new Cigar(cigarElements); + } + + + + protected static enum TokenType { + OPERATOR, XMER, NUMBER, STAR, OPEN_BRACKET, CLOSE_BRACKET, ELEMENT, START, END, QMARK; + } + + protected static class Token { + public final TokenType type; + public final Object content; + + @Requires("type != null && content != null") + protected Token(final TokenType type, final Object content) { + this.type = type; + this.content = content; + + } + + public String toString() { + switch (this.type) { + case STAR: return "*"; + case OPEN_BRACKET: return "("; + case CLOSE_BRACKET: return ")"; 
+ default: + return String.valueOf(content); + } + } + + + public Operator asOperator() { + return (Operator) content; + } + + public String asXmer() { + return ((CharSequence) content).toString(); + } + + public int asNumber() { + return ((Number) content).intValue(); + } + + public Element asElement() { + return ((Element) content); + } + + protected static Token xmer(final CharSequence cs) { + return new Token(TokenType.XMER, cs); + + } + + protected static Token operator(final Operator o) { + return new Token(TokenType.OPERATOR, o); + } + + protected static Token number(final int n) { + return new Token(TokenType.NUMBER, n); + } + + + + protected static Token star() { + return new Token(TokenType.STAR, null); + } + + protected static Token openBracket(int offset) { + return new Token(TokenType.OPEN_BRACKET, offset); + } + + protected static Token closeBracket(int offset) { + return new Token(TokenType.CLOSE_BRACKET, offset); + } + + protected static Token element(final Element e) { + return new Token(TokenType.ELEMENT, e); + } + + + public static Token qmark() { + return new Token(TokenType.QMARK, null); + } + } + + + public static class ElementOffset { + public final Element element; + public final int sequenceFrom; + public final int sequenceTo; + public final int templateFrom; + public final int templateTo; + + + @Requires("e != null && from >= 0 && to >= from && tFrom >= 0 && tTo >= tFrom") + protected ElementOffset(final Element e, final int from, final int to, final int tFrom, final int tTo) { + element = e; + sequenceFrom = from; + sequenceTo = to; + templateFrom = tFrom; + templateTo = tTo; + } + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java new file mode 100644 index 000000000..10f9a3803 --- /dev/null +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/CivarUnitTest.java @@ -0,0 +1,401 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * Created with IntelliJ IDEA. + * User: valentin + * Date: 8/7/13 + * Time: 5:58 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class CivarUnitTest extends BaseTest { + + + @Test(dataProvider="validCivarExamples") + public void testValidCivarInstanciation(final String civarString) { + + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertNotNull(civar); + } + + + @Test(dataProvider="expectedElementLengths") + public void testValidCivarElementLength(final String civarString, final int expected) { + + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.elements().size(), expected); + } + + + @Test(dataProvider="expectedElementSizes") + public void testValidCivarElementSizes(final String civarString, final int[] expected) { + + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.elements().size(),expected.length); + for (int i = 0; i < expected.length; i++) { + Assert.assertEquals(civar.elements().get(i).size(),expected[i]); + } + } + + @Test(dataProvider="expectedElementOperators") + public void testValidCivarElementOperators(final String civarString, final String expected) { + + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.elements().size(),expected.length()); + for (int i = 0; i < expected.length(); i++) { + Assert.assertEquals(civar.elements().get(i).operator().charValue,expected.charAt(i)); + } + } + + @Test(dataProvider="expectedMinimumSequenceLength") + public void testValidCivarMinimumSequenceLength(final String civarString, final int expected) { + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.minimumTemplateSequenceSize(),expected); + } + + @Test(dataProvider="expectedHasVariation") + public void testValidCivarHasVariation(final String civarString, final boolean expected) { + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.hasVariation(),expected); + } + + + @Test(dataProvider="invalidCivarExamples", expectedExceptions = {IllegalArgumentException.class}) + public void 
testInvalidInstanciation(final String civarString) { + + final Civar civar = Civar.fromCharSequence(civarString); + } + + @Test(dataProvider="unrolledTestDataIsUnrolledExamples") + public void testInUnrolled(final String civarString, final boolean expected) { + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.isUnrolled(),expected); + } + + @Test(dataProvider="unrolledTestDataUnrolledCivarExamples") + public void testValidCivarUnrolling(final String civarString, final String[] expected) { + Set expectedSet = new HashSet<>(); + expectedSet.addAll(Arrays.asList(expected)); + + final Civar civar = Civar.fromCharSequence(civarString); + java.util.List unrolledList = civar.unroll(); + Assert.assertEquals(unrolledList.size(),expected.length); + for (int i = 0; i < expected.length; i++) { + Assert.assertTrue(expectedSet.contains(unrolledList.get(i).toString()), + "Unrolled civar " + unrolledList.get(i).toString() + " not present in expected Set: " + + Arrays.toString(expected) + ". 
Unrolled set is: " + Arrays.toString(unrolledList.toArray())); + } + } + + @Test(dataProvider="applyToDataExamples") + public void testValidCivarUnrolling(final String civarString, final String before, final String expectedAfter) { + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.applyTo(before),expectedAfter); + } + + @Test(dataProvider="optionizeDataExamples") + public void testValidOptionizeAll(final String civarString, final String expected) { + final Civar civar = Civar.fromCharSequence(civarString); + Assert.assertEquals(civar.optionalizeAll().toString(),expected); + } + + @DataProvider(name="validCivarExamples") + public Iterator validCivarExamples() { + return new Iterator() { + + int i = 0; + + @Override + public boolean hasNext() { + return i < VALID_CIVAR_EXAMPLES.length; + } + + @Override + public Object[] next() { + return new Object[] { VALID_CIVAR_EXAMPLES[i++][0] }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name="expectedHasVariation") + public Iterator expectedHasVariation () { + return validCivarExamples(5); + } + + @DataProvider(name="expectedMinimumSequenceLength") + public Iterator expectedMinimumSequenceLength () { + return validCivarExamples(4); + } + + @DataProvider(name="expectedElementOperators") + public Iterator expectedElementOperators() { + return validCivarExamples(3); + } + + @DataProvider(name="expectedElementSizes") + public Iterator expectedElementSizes() { + return validCivarExamples(2); + } + + @DataProvider(name="expectedElementLengths") + public Iterator expectedElementLengths() { + return validCivarExamples(1); + } + + public Iterator validCivarExamples(final int field) { + return new Iterator() { + + int i = 0; + + @Override + public boolean hasNext() { + return i < VALID_CIVAR_EXAMPLES.length; + } + + @Override + public Object[] next() { + return new Object[] { VALID_CIVAR_EXAMPLES[i][0], 
VALID_CIVAR_EXAMPLES[i++][field] }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name="unrolledTestDataIsUnrolledExamples") + public Iterator unrolledTestDataIsUnrolledExamples() { + return unrolledTestDataExamples(1); + } + + @DataProvider(name="unrolledTestDataUnrolledCivarExamples") + public Iterator unrolledTestDataUnrolledCivarExamples() { + return unrolledTestDataExamples(2); + } + + public Iterator unrolledTestDataExamples(final int field) { + return new Iterator() { + + int i = 0; + + @Override + public boolean hasNext() { + return i < UNROLLED_TEST_DATA.length; + } + + @Override + public Object[] next() { + return new Object[] { UNROLLED_TEST_DATA[i][0], UNROLLED_TEST_DATA[i++][field] }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name="optionizeDataExamples") + public Iterator optionizeDataExamples() { + return optionizeDataExamples(1); + } + + public Iterator optionizeDataExamples(final int field) { + return new Iterator() { + + int i = 0; + + @Override + public boolean hasNext() { + return i < OPTIONIZED_TEST_DATA.length; + } + + @Override + public Object[] next() { + return new Object[] { OPTIONIZED_TEST_DATA[i][0], OPTIONIZED_TEST_DATA[i++][field] }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name="applyToDataExamples") + public Iterator applyToDataExamples() { + return new Iterator() { + + int i = 0; + + @Override + public boolean hasNext() { + return i < APPLY_TO_TEST_DATA.length; + } + + @Override + public Object[] next() { + return APPLY_TO_TEST_DATA[i++]; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name="invalidCivarExamples") + public Object[][] invalidCivarExamples() { + return INVALID_CIVAR_EXAMPLES; + } + + // columns : Civar string, number of 
elements. + private static final Object[][] INVALID_CIVAR_EXAMPLES = new Object[][] { + {"(100="}, + {"*=)"}, + {"10(=2T30="}, + {"2*=2T/3*="}, + {"3I(acc)"}, + {"a"}, + {")"}, + {"100&=1"}, + {"?100="}, + + }; + + + private static final Object[][] VALID_CIVAR_EXAMPLES = new Object[][] { + {"100=", 1, ints(100), "=", 100, false }, + {"*=", 1 , ints(1), "=", 0, false }, + {"10=2T30=", 3, ints(10,2,30), "=T=",42 , true}, + {"*=2T3*=", 3, ints(1,2,3), "=T=",2 , true}, + {"3Iacc",1 , ints(3), "I", 0, true}, + {"Ia",1, ints(1), "I", 0, true}, + {"10D",1, ints(10), "D", 10, true}, + {"*", 1, ints(1), "=", 0, false}, + {"*D", 1, ints(1), "D", 0, true}, + {"10(1D)10=",3, ints(10,1,10), "=(=", 21, true}, + {"1*",1, ints(1), "=", 0, false}, + {"1*2*",2, ints(1,2), "==", 0, false}, + {"*11",2, ints(1,11), "==", 11, false}, + {"100=1T100=", 3, ints(100,1,100), "=T=", 201, true}, + {"100=3Iacg101=", 3, ints(100,3,101), "=I=", 201, true}, + {"100=30Igctcggatgccttgcggggctccagagtcc101=", 3 , ints(100,30,101), "=I=", 201, true}, + {"99=3D99=", 3, ints(99,3,99), "=D=", 201, true}, + {"84=30D84=", 3, ints(84,30,84), "=D=", 198, true}, + {"91=1T9=3Iacg100=", 5, ints(91,1,9,3,100), "=T=I=", 201, true}, + {"71=1T29=3Iacg100=",5, ints(71,1,29,3,100), "=T=I=",201, true}, + {"75=1T8=1T8=1T8=1T8=1T75=", 11, ints(75,1,8,1,8,1,8,1,8,1,75), "=T=T=T=T=T=",187, true}, + {"75=1T?8=", 3, ints(75,1,8), "=T=", 84, true} + }; + + private static final Object[][] UNROLLED_TEST_DATA = new Object[][] { + { "10=1D10=", true, strs( "10=1D10=") }, + { "10=(1D)10=", true, strs( "10=(1D)10=") }, + { "10=1D?10=", false, strs("10=1=10=", "10=1D10=") }, + { "10=1D?10=3Iacg?10=", false , strs("10=1=10=0=10=","10=1=10=3Iacg10=", "10=1D10=0=10=", "10=1D10=3Iacg10=") }, + { "10=1D?10=" , false, strs("10=1D10=","10=1=10=") }, + { "100=1T?100=" , false, strs("100=1T100=","100=1=100=") }, + { "100=3Iacg?101=" , false, strs("100=3Iacg101=","100=0=101=") }, + { "100=30Igctcggatgccttgcggggctccagagtcc?101=", false 
,strs("100=30Igctcggatgccttgcggggctccagagtcc101=", "100=0=101=") }, + { "99=3D?99=", false , strs("99=3D99=","99=3=99=") }, + { "84=30D?84=", false, strs("84=30D84=", "84=30=84=")}, + { "91=1T?9=3Iacg?100=", false, strs("91=1T9=3Iacg100=", "91=1=9=3Iacg100=", "91=1=9=0=100=", "91=1T9=0=100=") }, + { "71=1T?29=3Iacg?100=", false , strs("71=1T29=3Iacg100=","71=1=29=3Iacg100=","71=1=29=0=100=", "71=1T29=0=100=") }, + // { "75=1T?8=1T?8=1T?8=1T?8=1T?75=", false, }, + { "75=1T?8=", false, strs("75=1T8=","75=1=8=") } + }; + + private static final Object[][] OPTIONIZED_TEST_DATA = new Object[][] { + { "10=1D10=", "10=1D?10=" }, + {"100=1T100=","100=1T?100=" }, + {"100=3Iacg101=", "100=3Iacg?101=" }, + {"100=30Igctcggatgccttgcggggctccagagtcc101=","100=30Igctcggatgccttgcggggctccagagtcc?101="}, + {"99=3D99=", "99=3D?99="}, + {"84=30D84=", "84=30D?84="}, + {"91=1T9=3Iacg100=", "91=1T?9=3Iacg?100="}, + {"71=1T29=3Iacg100=","71=1T?29=3Iacg?100="}, + {"75=1T8=1T8=1T8=1T8=1T75=", "75=1T?8=1T?8=1T?8=1T?8=1T?75="}, + {"75=1T?8=", "75=1T?8="} + }; + + private static final Object[][] APPLY_TO_TEST_DATA = new Object[][] { + {"3=1D3=", "ACTAACT", "ACTACT" }, + {"*=1C*=","ACTTACT", "ACTAACT" }, + {"4=3Iacg3=","ACTGACT","ACTGACGACT" }, + {"*=30Igctcggatgccttgcggggctccagagtcc*=","AA","AGCTCGGATGCCTTGCGGGGCTCCAGAGTCCA"}, + {"*=3D*=", "ACTTTAC","ACAC"}, + {"1=30D1=", "AGCTCGGATGCCTTGCGGGGCTCCAGAGTCCA","AA"}, + }; + + + private static int[] ints(final int ... iii) { + return iii; + } + + private static String[] strs(final String ... 
sss) { + return sss; + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java new file mode 100644 index 000000000..a98c4e03a --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java @@ -0,0 +1,131 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.sting.utils.pairhmm.FastLoglessPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; + +import java.util.Collections; +import java.util.Random; + +/** + * Created with IntelliJ IDEA. 
+ * User: valentin + * Date: 8/6/13 + * Time: 3:00 PM + * Caliper benchmark that times the graph-based and PairHMM read-likelihood engines on the same synthetic active-region data set. + */ +public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark { +// ./private/shell/googleCaliperCommand.csh org.broadinstitute.sting.gatk.walkers.haplotypecaller.HCLikelihoodCalculationEnginesBenchmark --saveResults build/benchmark/HCLikelihoodCalculationEnginesBenchmark + +// @Param({"10", "25"}) + @Param({"10"}) + protected int kmerSize; + + +// @Param({"100","250"}) + @Param({"100"}) + protected int readLength; + + @Param({"*1T*", "*3Iacg*","*30Igctcggatgccttgcggggctccagagtcc*", + "*3D*","*30D*","*1T3=3Iacg*","*1T*3Iacg*","*1T8=1T8=1T8=1T8=1T*","*1T*1T*1T*1T*1T*"}) +// @Param({"*1T*"}) + protected String variation; + + @Param({"10000"}) +// @Param({"100", "300", "1000"})// "3000", "10000"}) + protected int readCount; + +// @Param({"300","1000","3000"}) + @Param({"300"}) + protected int regionSize; + + // Invariants: + + protected final byte bq = 20; + + protected final byte iq = 35; + + protected final byte dq = 35; + + protected ActiveRegionTestDataSet dataSet; + + @Param({"true"}) + public boolean withErrors; + + @Param({"13"}) + public int randomSeed; + + public void setUp() { + dataSet = ActiveRegionTestDataSetUnitTest.createActiveRegionTestDataSet(kmerSize, readLength, variation, readCount, regionSize, bq, iq, dq); + final Random rnd = new Random(randomSeed); + if (withErrors) dataSet.introduceErrors(rnd); + } + + @SuppressWarnings("unused") + public void timeGraphBasedLikelihoods(final int reps) { + for (int i = 0; i < reps; i++) { + GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX); + rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonMap("anonymous", dataSet.readList())); + } + } + + @SuppressWarnings("unused") 
+ public void timeLoglessPairHMM(final int reps) { + for (int i = 0; i < reps; i++) { + final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, false, + PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); + engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonMap("anonymous", dataSet.readList())); + } + } + + + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index 1f7236c39..3907ffbd6 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -57,18 +57,18 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); 
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "12ed9d67139e7a94d67e9e6c06ac6e16"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "88c10027c21712b1fe475c06cadd503c"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); } @@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa } private void HCTestComplexGGA(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); } @@ -88,12 +88,12 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa 
@Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "b7a01525c00d02b3373513a668a43c6a"); + "b787be740423b950f8529ccc838fabdd"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "a2a42055b068334f415efb07d6bb9acd"); + "f74d68cbc1ecb66a7128258e111cd030"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index 8ab2c0779..97744f126 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -47,6 +47,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -57,18 +59,18 @@ import java.util.List; public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { @DataProvider(name = "MyDataProvider") public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000"; final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "2b54e4e948144030a829175bcd295e47"}); - tests.add(new 
Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "ba1bb72caa06c1962a202b2012c266cb"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a841d9e94fb832066a04f13bdc62b101"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "6cc95c47368a568fb9e1eb8578f96b0b"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2703f1c0c27b3c977689604b5f78b61f"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b54e36bbb4dc6c3b786349fa267d1f6c"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ce9c42e7e97a45a82315523dbd77fcf"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "c5a55196e10680a02c833a8a44733306"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "9b9923ef41bfc7346c905fdecf918f92"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "7cb1e431119df00ec243a6a115fa74b8"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "90e22230149e6c32d1115d0e2f03cab1"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "b39a4bc19a0acfbade22a011cd229262"}); return tests.toArray(new Object[][]{}); @@ -79,10 +81,51 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { */ @Test(dataProvider = "MyDataProvider") public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { - final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header", - b37KGReference, bam, 
intervals, mode); + final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d", + b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); executeTest(name, spec); } + + @Test + public void testERCRegionWithNoCalledHaplotypes() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + spec.disableShadowBCF(); + executeTest("testERCRegionWithNoCalledHaplotypes", spec); + } + + @Test() + public void testMissingGVCFIndexException() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testMissingGVCFIndexingStrategyException", spec); + } + + @Test() + public void testWrongParameterGVCFIndexException() { + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, 
HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER + 1); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testWrongParameterGVCFIndexException", spec); + } + + @Test() + public void testWrongTypeGVCFIndexException() { + // ensure non-optimal, if optimal changes + GATKVCFIndexType type = GATKVCFIndexType.DYNAMIC_SEEK; + if (HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE == GATKVCFIndexType.DYNAMIC_SEEK) + type = GATKVCFIndexType.DYNAMIC_SIZE; + + final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d", + b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001", type, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", 1, UserException.GVCFIndexException.class); + spec.disableShadowBCF(); + executeTest("testWrongTypeGVCFIndexException", spec); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 43095dcf3..dfbbd7084 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -47,43 +47,59 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import 
org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; - final static String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final static String NA12878_BAM = privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; final static String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; final static String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; final static String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + final static String NA12878_PCRFREE250_ADAPTER_TRIMMED = privateTestDir + "PCRFree.2x250.b37_decoy.NA12878.adapter_trimmed-10000000-11000000.bam"; final static String CEUTRIO_MT_TEST_BAM = privateTestDir + "CEUTrio.HiSeq.b37.MT.1_50.bam"; final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; private void HCTest(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final WalkerTestSpec spec = new WalkerTestSpec(base 
+ " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCaller: args=" + args, spec); } @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "baa5a2eedc8f06ce9f8f98411ee09f8a"); + HCTest(CEUTRIO_BAM, "", "c0b1b64c6005cd3640ffde5dbc10174b"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "f09e03d41238697b23f95716a12667cb"); + HCTest(NA12878_BAM, "", "439ce9024f04aad08eab1526d887e295"); + } + + @Test + public void testHaplotypeCallerGraphBasedSingleSample() { + HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "213df0bdaa78a695e9336128333e4407"); + } + + @Test + public void testHaplotypeCallerGraphBasedMultiSample() { + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "ceee711cac50b4bb66a084acb9264941"); } @Test(enabled = false) // can't annotate the rsID's yet @@ -94,7 +110,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "130d36448faeb7b8d4bce4be12dacd3a"); + "b09437f11db40abd49195110e50692c2"); } @Test @@ -103,14 +119,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private void HCTestIndelQualityScores(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); } @Test public void 
testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "7c20aa62633f4ce8ebf12950fbf05ec0"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "c57c463542304fb7b2576e531faca89e"); } private void HCTestNearbySmallIntervals(String bam, String args, String md5) { @@ -118,7 +134,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { if( containsDuplicateRecord(vcf, parser) ) { @@ -131,23 +147,23 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private boolean containsDuplicateRecord( final File vcf, final GenomeLocParser parser ) { - final List> VCs = new ArrayList>(); + final List> VCs = new ArrayList<>(); try { for( final VariantContext vc : GATKVCFUtils.readVCF(vcf).getSecond() ) { - VCs.add(new Pair(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); + VCs.add(new Pair<>(parser.createGenomeLoc(vc), new GenotypingEngine.Event(vc))); } } catch( IOException e ) { throw new IllegalStateException("Somehow the temporary VCF from the integration test could not be read."); } - final Set> VCsAsSet = new HashSet>(VCs); + final Set> VCsAsSet = new HashSet<>(VCs); return VCsAsSet.size() != VCs.size(); // 
The set will remove duplicate Events. } @Test public void testHaplotypeCallerNearbySmallIntervals() { - HCTestNearbySmallIntervals(NA12878_BAM, "", "0ddc56f0a0fbcfefda79aa20b2ecf603"); + HCTestNearbySmallIntervals(NA12878_BAM, "", "75820a4558a559b3e1636fdd1b776ea2"); } // This problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -156,14 +172,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { // any of the calls in that region because it is so messy. @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a")); + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { - final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); executeTest("HCTestStructuralIndels: ", spec); 
} @@ -176,6 +192,26 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { executeTest("HCTestDoesNotFailOnBadRefBase: ", spec); } + @Test + public void HCTestDanglingTailMergingForDeletions() throws IOException { + final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); + + // confirm that the call is the correct one + final VCFCodec codec = new VCFCodec(); + final FileInputStream s = new FileInputStream(outputVCF); + final LineIterator lineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(s)); + codec.readHeader(lineIterator); + final String line = lineIterator.next(); + Assert.assertFalse(line == null); + final VariantContext vc = codec.decode(line); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertTrue(vc.getReference().basesMatch("ATGTATG")); + Assert.assertTrue(vc.getAlternateAllele(0).basesMatch("A")); + } + + // -------------------------------------------------------------------------------------------------------------- // // testing reduced reads @@ -185,16 +221,16 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("5fe9310addf881bed4fde2354e59ce34")); + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 
1:67,225,396-67,288,518", 1, + Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e")); executeTest("HC calling on a ReducedRead BAM", spec); } @Test public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("26a9917f6707536636451266de0116c3")); + "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, + Arrays.asList("6a9222905c740b9208bf3c67478514eb")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } @@ -207,17 +243,65 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestDBSNPAnnotationWGS() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, - Arrays.asList("c5c63d03e1c4bbe32f06902acd4a10f9")); + "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("a43d6226a51eb525f0774f88e3778189")); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); } @Test public void HCTestDBSNPAnnotationWEx() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + "-T HaplotypeCaller --disableDithering 
--pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + " -L " + hg19Intervals + " -isr INTERSECTION", 1, - Arrays.asList("f0b2a96040429908cce17327442eec29")); + Arrays.asList("1352cbe1404aefc94eb8e044539a9882")); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); } + + @Test + public void HCTestDBSNPAnnotationWGSGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, + Arrays.asList("a6c4d5d2eece2bd2c51a81e34e80040f")); + executeTest("HC graph-based calling with dbSNP ID annotation on WGS intervals", spec); + } + + @Test + public void HCTestDBSNPAnnotationWExGraphBased() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 + + " -L " + hg19Intervals + " -isr INTERSECTION", 1, + Arrays.asList("69db1045b5445a4f90843f368bd62814")); + executeTest("HC graph-based calling with dbSNP ID annotation on WEx intervals", spec); + } + + @Test + public void HCTestGraphBasedPCRFreePositiveLogLkFix() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + hg19Reference + " --no_cmdline_in_header -I " + NA12878_PCRFREE250_ADAPTER_TRIMMED + " -o %s -L 20:10,000,000-11,000,000 " + , 1, + Arrays.asList("")); + executeTest("HC graph-based calling on PCR-free adapter-trimmed reads (positive log-likelihood fix)", spec); + } + + // 
-------------------------------------------------------------------------------------------------------------- + // + // test PCR indel model + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void HCTestAggressivePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("19c2992541ede7407192660fdc1fadbf")); + executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec); + } + + @Test + public void HCTestConservativePcrIndelModelWGS() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1, + Arrays.asList("f4ab037915db3a40ba26e9ee30d40e16")); + executeTest("HC calling with conservative indel error modeling on WGS intervals", spec); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java index 3b17725f9..21648b2b9 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerParallelIntegrationTest.java @@ -58,10 +58,10 @@ import java.util.List; public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @DataProvider(name = "NCTDataProvider") public Object[][] makeNCTDataProvider() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final int nct : 
Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, "e800f6bb3a820da5c6b29f0195480796"}); + tests.add(new Object[]{nct, "29cb04cca87f42b4762c34dfea5d15b7"}); } return tests.toArray(new Object[][]{}); @@ -70,7 +70,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest { @Test(dataProvider = "NCTDataProvider") public void testHCNCT(final int nct, final String md5) { WalkerTestSpec spec = new WalkerTestSpec( - "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, Arrays.asList(md5)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java index 2fda56665..9f13efc9c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngineUnitTest.java @@ -55,10 +55,8 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; @@ -87,15 +85,6 @@ public class 
LocalAssemblyEngineUnitTest extends BaseTest { header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); } - private enum Assembler {DEBRUIJN_ASSEMBLER, READ_THREADING_ASSEMBLER} - private LocalAssemblyEngine createAssembler(final Assembler type) { - switch ( type ) { - case DEBRUIJN_ASSEMBLER: return new DeBruijnAssembler(); - case READ_THREADING_ASSEMBLER: return new ReadThreadingAssembler(); - default: throw new IllegalStateException("Unexpected " + type); - } - } - @DataProvider(name = "AssembleIntervalsData") public Object[][] makeAssembleIntervalsData() { List tests = new ArrayList(); @@ -107,12 +96,10 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { final int stepSize = 200; final int nReadsToUse = 5; - for ( final Assembler assembler : Assembler.values() ) { - for ( int startI = start; startI < end; startI += stepSize) { - final int endI = startI + windowSize; - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI); - tests.add(new Object[]{assembler, refLoc, nReadsToUse}); - } + for ( int startI = start; startI < end; startI += stepSize) { + final int endI = startI + windowSize; + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI); + tests.add(new Object[]{new ReadThreadingAssembler(), refLoc, nReadsToUse}); } return tests.toArray(new Object[][]{}); @@ -130,13 +117,11 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { final int variantStepSize = 1; final int nReadsToUse = 5; - for ( final Assembler assembler : Assembler.values() ) { - for ( int startI = start; startI < end; startI += stepSize) { - final int endI = startI + windowSize; - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI); - for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) { - tests.add(new Object[]{assembler, refLoc, nReadsToUse, variantStart}); - } + for ( int startI = start; startI < end; 
startI += stepSize) { + final int endI = startI + windowSize; + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, startI, endI); + for ( int variantStart = windowSize / 2 - 10; variantStart < windowSize / 2 + 10; variantStart += variantStepSize ) { + tests.add(new Object[]{new ReadThreadingAssembler(), refLoc, nReadsToUse, variantStart}); } } @@ -144,7 +129,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { } @Test(dataProvider = "AssembleIntervalsData") - public void testAssembleRef(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse) { + public void testAssembleRef(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse) { final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); final List reads = new LinkedList(); @@ -163,7 +148,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { } @Test(dataProvider = "AssembleIntervalsWithVariantData") - public void testAssembleRefAndSNP(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { + public void testAssembleRefAndSNP(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); final Allele refBase = Allele.create(refBases[variantSite], true); final Allele altBase = Allele.create((byte)(refBase.getBases()[0] == 'A' ? 
'C' : 'A'), false); @@ -172,7 +157,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { } @Test(dataProvider = "AssembleIntervalsWithVariantData") - public void testAssembleRefAndDeletion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { + public void testAssembleRefAndDeletion(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); for ( int deletionLength = 1; deletionLength < 10; deletionLength++ ) { final Allele refBase = Allele.create(new String(refBases).substring(variantSite, variantSite + deletionLength + 1), true); @@ -183,7 +168,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { } @Test(dataProvider = "AssembleIntervalsWithVariantData") - public void testAssembleRefAndInsertion(final Assembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { + public void testAssembleRefAndInsertion(final ReadThreadingAssembler assembler, final GenomeLoc loc, final int nReadsToUse, final int variantSite) { final byte[] refBases = seq.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); for ( int insertionLength = 1; insertionLength < 10; insertionLength++ ) { final Allele refBase = Allele.create(refBases[variantSite], false); @@ -193,7 +178,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { } } - private void testAssemblyWithVariant(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) { + private void testAssemblyWithVariant(final ReadThreadingAssembler assembler, final byte[] refBases, final GenomeLoc loc, final int nReadsToUse, final VariantContext site) { final String preRef = new String(refBases).substring(0, site.getStart()); final String postRef = new String(refBases).substring(site.getEnd() + 1, 
refBases.length); final byte[] altBases = (preRef + site.getAlternateAllele(0).getBaseString() + postRef).getBytes(); @@ -217,7 +202,7 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { } - private List assemble(final Assembler assembler, final byte[] refBases, final GenomeLoc loc, final List reads) { + private List assemble(final ReadThreadingAssembler assembler, final byte[] refBases, final GenomeLoc loc, final List reads) { final Haplotype refHaplotype = new Haplotype(refBases, true); final Cigar c = new Cigar(); c.add(new CigarElement(refHaplotype.getBases().length, CigarOperator.M)); @@ -225,9 +210,9 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { final ActiveRegion activeRegion = new ActiveRegion(loc, null, true, genomeLocParser, 0); activeRegion.addAll(reads); - final LocalAssemblyEngine engine = createAssembler(assembler); // logger.warn("Assembling " + activeRegion + " with " + engine); - return engine.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.emptyList(), null); + final AssemblyResultSet assemblyResultSet = assembler.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.emptyList(), null); + return assemblyResultSet.getHaplotypeList(); } @DataProvider(name = "SimpleAssemblyTestData") @@ -239,30 +224,25 @@ public class LocalAssemblyEngineUnitTest extends BaseTest { final int windowSize = 200; final int end = start + windowSize; - final Map edgeExcludesByAssembler = new EnumMap<>(Assembler.class); - edgeExcludesByAssembler.put(Assembler.DEBRUIJN_ASSEMBLER, 26); - edgeExcludesByAssembler.put(Assembler.READ_THREADING_ASSEMBLER, 25); // TODO -- decrease to zero when the edge calling problem is fixed + final int excludeVariantsWithinXbp = 25; // TODO -- decrease to zero when the edge calling problem is fixed final String ref = new String(seq.getSubsequenceAt(contig, start, end).getBases()); final GenomeLoc refLoc = genomeLocParser.createGenomeLoc(contig, start, end); - for ( final 
Assembler assembler : Assembler.values() ) { - final int excludeVariantsWithXbp = edgeExcludesByAssembler.get(assembler); for ( int snpPos = 0; snpPos < windowSize; snpPos++) { - if ( snpPos > excludeVariantsWithXbp && (windowSize - snpPos) >= excludeVariantsWithXbp ) { + if ( snpPos > excludeVariantsWithinXbp && (windowSize - snpPos) >= excludeVariantsWithinXbp ) { final byte[] altBases = ref.getBytes(); altBases[snpPos] = altBases[snpPos] == 'A' ? (byte)'C' : (byte)'A'; final String alt = new String(altBases); - tests.add(new Object[]{"SNP at " + snpPos, assembler, refLoc, ref, alt}); + tests.add(new Object[]{"SNP at " + snpPos, new ReadThreadingAssembler(), refLoc, ref, alt}); } } - } return tests.toArray(new Object[][]{}); } @Test(dataProvider = "SimpleAssemblyTestData") - public void testSimpleAssembly(final String name, final Assembler assembler, final GenomeLoc loc, final String ref, final String alt) { + public void testSimpleAssembly(final String name, final ReadThreadingAssembler assembler, final GenomeLoc loc, final String ref, final String alt) { final byte[] refBases = ref.getBytes(); final byte[] altBases = alt.getBytes(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java similarity index 82% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java index 48c9d3c1a..16a3e9af2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java @@ -54,13 +54,36 @@ package 
org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.*; + /** - * Unit tests for LikelihoodCalculationEngine + * Unit tests for PairHMMLikelihoodCalculationEngine */ -public class LikelihoodCalculationEngineUnitTest extends BaseTest { +public class PairHMMLikelihoodCalculationEngineUnitTest extends BaseTest { + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + + @BeforeSuite + public void setup() { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + } @Test public void testNormalizeDiploidLikelihoodMatrixFromLog10() { @@ -76,7 +99,7 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { }; - Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix)); + Assert.assertTrue(compareDoubleArrays(PairHMMLikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix)); double[][] likelihoodMatrix2 = { {-90.2, 0, 0, 0}, @@ -90,10 +113,50 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { {-4.9, -15.4, -33.8, 0}, {-4.9, -15.4, -33.8, -997.9}, }; - 
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); + Assert.assertTrue(compareDoubleArrays(PairHMMLikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); + } + + @DataProvider(name = "PcrErrorModelTestProvider") + public Object[][] createPcrErrorModelTestData() { + List tests = new ArrayList(); + + for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) { + for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) { + tests.add(new Object[]{repeat, repeatLength}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PcrErrorModelTestProvider", enabled = true) + public void createPcrErrorModelTest(final String repeat, final int repeatLength) { + + final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte)0, false, + PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, + PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE); + + final String readString = Utils.dupString(repeat, repeatLength); + final byte[] insQuals = new byte[readString.length()]; + final byte[] delQuals = new byte[readString.length()]; + Arrays.fill(insQuals, (byte)PairHMMLikelihoodCalculationEngine.INITIAL_QSCORE); + Arrays.fill(delQuals, (byte)PairHMMLikelihoodCalculationEngine.INITIAL_QSCORE); + + engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals); + + final RepeatCovariate repeatCovariate = new RepeatLengthCovariate(); + repeatCovariate.initialize(PairHMMLikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, PairHMMLikelihoodCalculationEngine.MAX_REPEAT_LENGTH); + + for ( int i = 1; i < insQuals.length; i++ ) { + + final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond(); + final byte adjustedScore = 
PairHMMLikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0); + + Assert.assertEquals(insQuals[i-1], adjustedScore); + Assert.assertEquals(delQuals[i-1], adjustedScore); + } } - // BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests! /* private class BasicLikelihoodTestProvider extends TestDataProvider { public Double readLikelihoodForHaplotype1; @@ -147,7 +210,7 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { } final HashSet sampleSet = new HashSet(1); sampleSet.add("myTestSample"); - return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); + return PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java new file mode 100644 index 000000000..a6d2644cd --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java @@ -0,0 +1,375 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller; + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.sting.utils.pairhmm.FastLoglessPairHMM; +import org.broadinstitute.sting.utils.pairhmm.FlexibleHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; +import org.broadinstitute.sting.utils.sam.ClippedGATKSAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.Reporter; +import org.testng.SkipException; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * Created with IntelliJ IDEA. 
+ * User: valentin + * Date: 8/4/13 + * Time: 10:20 PM + * To change this template use File | Settings | File Templates. + */ +@Test(enabled=false) +public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegionTestDataSetUnitTest { + + +// private static FastHMM hmm = new MLLog10PairHMM((byte)10); // new FastLoglessPairHMM((byte)10); + + private static FlexibleHMM hmm = new FastLoglessPairHMM((byte)10); + + // for debugging purposes: + private static final boolean DUMP_LIKELIHOODS = false; + private PrintWriter likelihoodDumpWriter = null; + private File likelihoodDumpFile = null; + + + @BeforeClass + private void setUp() throws IOException { + if (DUMP_LIKELIHOODS) { + likelihoodDumpFile = File.createTempFile("rtlce-test", ".txt"); + Reporter.log("Dumping Likelihoods in file '" + likelihoodDumpFile + "'",true); + likelihoodDumpWriter = new PrintWriter(likelihoodDumpFile);//new FileWriter(f)); + } + } + + @AfterClass + private void tearDown() throws IOException { + if (DUMP_LIKELIHOODS) { + likelihoodDumpWriter.close(); + Reporter.log("Dumped Likelihoods in file '" + likelihoodDumpFile + "'", true); + } + } + + @Test(dataProvider="activeRegionTestDataSets",enabled=false) + public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + super.testActiveRegionsDataSet(as,kmerSize,readLength,variation,readCount,regionSize,bq,iq,dq); + } + /** How many missing read records are tolerated in the graph-based approach. For example, a read is missed + * if it does not map to the reference path with at least two kmers in non-overlapping positions.
This constant + * indicates the proportion of read records that we can miss with respect to all possible read records + */ + private static final double READ_SKIP_TOLERANCE = 0.01; + + //final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, false, + // PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3); + final PairHMMLikelihoodCalculationEngine fullPairHMM = new PairHMMLikelihoodCalculationEngine((byte)10, false, + PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, Double.NEGATIVE_INFINITY,true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); + + // When using likelihoods it should be around 0.05 since + // When using maximum-likelihoods it can be as low as 0.00001 + private static final double SIGNIFICANT_LnLK_RATIO_DIFF_FRACTION = hmm instanceof FastLoglessPairHMM ? 0.1 : 0.00001; + + + // Some cases are expected to show differences between PairHMM and GraphBased Flexible PairHMM. + // It is therefore difficult to test that they give similar results in a unit test.... + // This is left to, for example, integration tests like GraphBasedVsLoglessAccuracyIntegrationTest. + // The code herein is kept around for historical purposes, but disabled.
+ @Test(dataProvider="readLikekihoodRatioTestData",enabled=false) + public void testReadLikelihoodRatios(final ActiveRegionTestDataSet ds, final GATKSAMRecord read, final Allele a1, + final Allele a2, final PerReadAlleleLikelihoodMap loglessLks, + final PerReadAlleleLikelihoodMap graphLks, final List readEventOffsets, final List firstAlleleCivar, final List secondAlleleCivar ) { + + checkForLongEventsThatMightCauseFailures(readEventOffsets, firstAlleleCivar, secondAlleleCivar); + final Map logless = loglessLks.getLikelihoodReadMap().get(read); + final Map graph = graphLks.getLikelihoodReadMap().get(read); + final double loglessA1Lk = logless.get(a1); + final double loglessA2Lk = logless.get(a2); + if (graph == null) + throw new SkipException("no likelihoods produced for this read using the graph method: Lla1= " + loglessA1Lk + " Lla2= " + loglessA2Lk + "LlDiff=" + (loglessA2Lk - loglessA1Lk) ); + + final Double graphA1Lk = graph.get(a1); + final Double graphA2Lk = graph.get(a2); + if (graphA1Lk == null) + throw new SkipException("no likelihoods produced for this read in the first haplotype: Lla1= " + loglessA1Lk + " Lla2= " + loglessA2Lk + "LlDiff=" + (loglessA2Lk - loglessA1Lk) ); + if (graphA2Lk == null) + throw new SkipException("no likelihoods produced for this read in the second haplotype: Lla1= " + loglessA1Lk + " Lla2= " + loglessA2Lk + "LlDiff=" + (loglessA2Lk - loglessA1Lk) ); + + final double loglessDiff = loglessA1Lk - loglessA2Lk; + final double graphDiff = graphA1Lk - graphA2Lk; + final double epsilon = calculateEpsilon(graphDiff,loglessDiff); + dumpLikelihoods(read,loglessA1Lk,loglessA2Lk,graphA1Lk,graphA2Lk,read.getReadString() + " " + a1.getBaseString() + " " + a2.getBaseString()); + Assert.assertEquals(graphDiff,loglessDiff,epsilon,String.format("Delta(%f,%f) = %f > %f",graphDiff,loglessDiff,Math.abs(graphDiff - loglessDiff),epsilon)); + + } + + private double calculateEpsilon(final double graphDiff, final double loglessDiff) { + if (hmm instanceof 
FastLoglessPairHMM) + return Math.max(0.01,Math.max(Math.abs(loglessDiff),Math.abs(graphDiff)) * SIGNIFICANT_LnLK_RATIO_DIFF_FRACTION); + else + return SIGNIFICANT_LnLK_RATIO_DIFF_FRACTION; + } + + private static final double MIN_READ_ACROSS_SIZE_FOR_INDEL_EVENTS = 0.8; // 80 percent. + private static final double MIN_LARGE_INDEL = 4; + + private void checkForLongEventsThatMightCauseFailures(final List read, final List a1, final List a2) { + + int sequenceLength = Math.max(a1.get(a1.size() - 1).templateTo, a2.get(a2.size() - 1).templateTo) + 1; + + boolean tai1 = thereAreIndels(a1); + boolean tai2 = thereAreIndels(a2); + boolean tair = thereAreIndels(read); + boolean thereAreIndels = tai1 || tai2 || tair; + if (!thereAreIndels) return; + + final boolean[] inserts = new boolean[sequenceLength]; + final boolean[] deletions = new boolean[sequenceLength]; + final int[] range = new int[2]; + + int refStart = Integer.MAX_VALUE; + int refEnd = -1; + + for (final Civar.ElementOffset ce : read) { + if (refStart > ce.templateFrom) + refStart = ce.templateFrom; + if (refEnd < ce.templateTo) + refEnd = ce.templateTo; + switch (ce.element.operator()) { + case DELETION: + deletions[ce.templateFrom] = deletions[ce.templateTo] = true; + break; + case INSERTION: + inserts[ce.templateFrom] = inserts[ce.templateTo] = true; + break; + case MATCH: + break; + } + } + + range[0] = refStart; + range[1] = refEnd; + + checkForLongEventsThatMightCauseFailures_allele(refStart,refEnd,inserts,deletions,a1); + checkForLongEventsThatMightCauseFailures_allele(refStart,refEnd,inserts,deletions,a2); + } + + private void checkForLongEventsThatMightCauseFailures_allele(final int refStart, final int refEnd, final boolean[] inserts, final boolean[] deletions, final List a1) { + for (final Civar.ElementOffset ce : a1) { + if (ce.templateFrom <= refStart) continue; + if (ce.templateTo >= refEnd) continue; + int size; + switch (ce.element.operator()) { + case DELETION: + size = ce.templateTo -
ce.templateFrom; + if (deletions[ce.templateFrom] || deletions[ce.templateTo]) continue; + break; + case INSERTION: + size = ce.sequenceTo - ce.sequenceFrom; + if (inserts[ce.templateFrom] || inserts[ce.templateTo]) continue; + break; + default: + continue; + } + int minMargin = (int) Math.ceil(size * MIN_READ_ACROSS_SIZE_FOR_INDEL_EVENTS); + if (ce.templateFrom - refStart < minMargin) + throw new SkipException("Large Indel"); + if (refEnd - ce.templateTo < minMargin) + throw new SkipException("Large Indel"); + } + + } + + private boolean thereAreIndels(final List a1) { + for (final Civar.ElementOffset ce : a1) { + switch (ce.element.operator()) { + case DELETION: + if (ce.templateTo - ce.templateFrom >= MIN_LARGE_INDEL) return true; + break; + case INSERTION: + if (ce.sequenceTo - ce.sequenceFrom >= MIN_LARGE_INDEL) return true; + break; + } + } + return false; + } + + private void dumpLikelihoods(final GATKSAMRecord read, final Double loglessA1lk, final Double loglessA2lk, final Double a1lk, final Double a2lk, final String hapString) { + if (!DUMP_LIKELIHOODS) return; + likelihoodDumpWriter.println(Utils.join("\t","" + loglessA1lk,"" + loglessA2lk,"" + a1lk,"" + a2lk,read.getReadName(),read.getReadString(),hapString)); + likelihoodDumpWriter.flush(); + } + + @DataProvider(name="readLikekihoodRatioTestData") + public Iterator readLikelihoodRatioTestDataSets() { + final Iterator activeRegionTestDataSetIterator = super.activeRegionTestDataSets(); + return new java.util.Iterator() { + + public static final boolean INTRODUCE_READ_ERRORS = true; + + private List> allelePairs; + private Iterator> allelePairsIt; + private Iterator readIt; + private GATKSAMRecord read; + private Iterator> civarEventOffsetsIt; + private List civarEventOffsets; + private ActiveRegionTestDataSet dataSet; + private GraphBasedLikelihoodCalculationEngineInstance graphEngine; + private PerReadAlleleLikelihoodMap graphLks; + private PerReadAlleleLikelihoodMap loglessLks; + private Map 
civarByAllele; + private String reference; + + @Override + public boolean hasNext() { + return activeRegionTestDataSetIterator.hasNext() || (readIt != null && readIt.hasNext()) || (allelePairsIt != null && allelePairsIt.hasNext()); + } + + @Override + public Object[] next() { + if (allelePairsIt != null && allelePairsIt.hasNext()) { + final Pair allelePair = allelePairsIt.next(); + return new Object[] { dataSet, read, allelePair.getFirst(), allelePair.getSecond(), loglessLks, graphLks, civarEventOffsets, civarByAllele.get(allelePair.getFirst()).eventOffsets(reference,0,Integer.MAX_VALUE), civarByAllele.get(allelePair.getSecond()).eventOffsets(reference,0,Integer.MAX_VALUE)}; + } + if (readIt != null && readIt.hasNext()) { + allelePairsIt = allelePairs.iterator(); + final Pair allelePair = allelePairsIt.next(); + return new Object[] {dataSet, read = readIt.next(), allelePair.getFirst(), allelePair.getSecond(), loglessLks, graphLks, civarEventOffsets = civarEventOffsetsIt.next(), civarByAllele.get(allelePair.getFirst()).eventOffsets(reference,0,Integer.MAX_VALUE), civarByAllele.get(allelePair.getSecond()).eventOffsets(reference,0,Integer.MAX_VALUE) }; + } + final Object[] params = activeRegionTestDataSetIterator.next(); + dataSet = (ActiveRegionTestDataSet) params[0]; + if (INTRODUCE_READ_ERRORS) dataSet.introduceErrors(new Random(13)); + graphEngine = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(),hmm,Double.NEGATIVE_INFINITY, HeterogeneousKmerSizeResolution.COMBO_MAX); + graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonMap("anonymous",dataSet.readList())).get("anonymous"); + + // clip reads at the anchors. + final Map clippedReads = anchorClippedReads(graphEngine.getHaplotypeGraph(),dataSet.readList()); + final List clippedReadList = new ArrayList<>(dataSet.readList().size()); + + for (final GATKSAMRecord r : dataSet.readList()) { + clippedReadList.add(clippedReads.containsKey(r) ? 
clippedReads.get(r) : r); + } + + loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonMap("anonymous",clippedReadList)).get("anonymous"); + + // Change clipped by unclipped in the resulting likelihood map. + for (final GATKSAMRecord r : clippedReads.keySet()) { + loglessLks.getLikelihoodReadMap().put(r,loglessLks.getLikelihoodReadMap().remove(clippedReads.get(r))); + } + final List haplotypes = dataSet.haplotypeList(); + final Map alleleByHaplotype = new HashMap<>(haplotypes.size()); + final Map civarBySequence = new HashMap<>(haplotypes.size()); + final Map haplotypeBySequence = new HashMap<>(haplotypes.size()); + civarByAllele = new HashMap<>(haplotypes.size()); + final List unrolledCivars = dataSet.unrolledCivars(); + for (int i = 0; i < haplotypes.size(); i++) { + final Haplotype h = haplotypes.get(i); + haplotypeBySequence.put(h.getBaseString(),h); + civarBySequence.put(h.getBaseString(),unrolledCivars.get(i)); + } + for (final Allele a : loglessLks.getAllelesSet()) { + alleleByHaplotype.put(haplotypeBySequence.get(a.getBaseString()),a); + civarByAllele.put(a,civarBySequence.get(a.getBaseString())); + } + allelePairs = new ArrayList<>(haplotypes.size() * 2); + final Haplotype[] haplotypeArray = haplotypes.toArray(new Haplotype[haplotypes.size()]); + for (int i = 0; i < haplotypeArray.length; i++) + for (int j = i + 1; j < haplotypeArray.length; j++) + allelePairs.add(new Pair<>(alleleByHaplotype.get(haplotypeArray[i]),alleleByHaplotype.get(haplotypeArray[j]))); + allelePairsIt = allelePairs.iterator(); + readIt = dataSet.readList().iterator(); + final Pair allelePair = allelePairsIt.next(); + civarEventOffsetsIt = dataSet.readEventOffsetList().iterator(); + reference = dataSet.getReference(); + return new Object[] { dataSet , read = readIt.next(), allelePair.getFirst(), allelePair.getSecond(), loglessLks, graphLks, civarEventOffsets = civarEventOffsetsIt.next(), 
civarByAllele.get(allelePair.getFirst()).eventOffsets(reference,0,Integer.MAX_VALUE), civarByAllele.get(allelePair.getSecond()).eventOffsets(reference,0,Integer.MAX_VALUE)}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + + /** + * Returns the reads clipped at their anchors. + * + * @param reads target reads. + * @return never {@code null}. + */ + protected Map anchorClippedReads(final HaplotypeGraph haplotypeGraph, final List reads) { + final Map result = new HashMap<>(reads.size()); + for (final GATKSAMRecord r : reads) { + final ReadAnchoring anchoring = new ReadAnchoring(r,haplotypeGraph); + if (anchoring.isAnchoredSomewhere()) + continue; + final int start = anchoring.leftAnchorIndex; + final int end = anchoring.rightAnchorIndex + haplotypeGraph.getKmerSize(); + final GATKSAMRecord clipped = new ClippedGATKSAMRecord(r, start, end); + result.put(r, clipped); + } + return result; + } + + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 3a4ed7e59..d163c0497 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -47,7 +47,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.SAMFileHeader; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -99,7 +98,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { @DataProvider(name = "CalcNIndelInformativeReadsData") public Object[][] makeMyDataProvider() { - List tests = new ArrayList(); + 
List tests = new ArrayList<>(); { // very basic testing final String ref = "ACGT"; @@ -164,6 +163,26 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { } } + @Test + public void testCalcNIndelInformativeReducedReads() { + final String bases = "ACGGGTTTGGAC"; + final byte[] quals = Utils.dupBytes((byte)30, bases.length()); + final int count = 10; + final int[] counts = new int[bases.length()]; + for ( int i = 0; i < counts.length; i++ ) + counts[i] = count; + final int position = 100; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, position, counts.length, counts); + read.setReadString(bases); + read.setBaseQualities(quals); + read.setCigarString(bases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, position, position); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), 0); + final int actual = model.calcNIndelInformativeReads(pileup, 0, bases.getBytes(), 3); + Assert.assertEquals(actual, count); + } + @Test public void testClose() { model.close(); @@ -187,7 +206,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(prev.getAsPLs(), new int[]{0, 0, 0}); Assert.assertEquals(-10 * prev.getLog10GQ(GenotypeType.HOM_REF), 0.0); - for ( int i = 1; i < 10000; i++ ) { + for ( int i = 1; i <= ReferenceConfidenceModel.MAX_N_INDEL_INFORMATIVE_READS; i++ ) { final GenotypeLikelihoods current = model.getIndelPLs(i); final double prevGQ = -10 * prev.getLog10GQ(GenotypeType.HOM_REF); final double currGQ = -10 * current.getLog10GQ(GenotypeType.HOM_REF); @@ -379,7 +398,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(refModel.getEnd(), loc.getStart() + i); Assert.assertFalse(refModel.hasLog10PError()); Assert.assertEquals(refModel.getAlternateAlleles().size(), 1); - Assert.assertEquals(refModel.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + 
Assert.assertEquals(refModel.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); Assert.assertTrue(refModel.hasGenotype(sample)); final Genotype g = refModel.getGenotype(sample); @@ -388,7 +407,6 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { Assert.assertEquals(g.getDP(), expectedDP); Assert.assertTrue(g.hasGQ()); Assert.assertTrue(g.hasPL()); - Assert.assertTrue(g.hasExtendedAttribute(ReferenceConfidenceModel.INDEL_INFORMATIVE_DEPTH)); } final VariantContext vc = call == null ? refModel : call; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java index f9cbc6c73..5e91ad4f7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraphUnitTest.java @@ -49,7 +49,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -61,7 +60,7 @@ public class BaseGraphUnitTest extends BaseTest { @BeforeMethod public void setUp() throws Exception { - graph = new SeqGraph(); + graph = new SeqGraph(11); v1 = new SeqVertex("A"); v2 = new SeqVertex("C"); @@ -96,7 +95,7 @@ public class BaseGraphUnitTest extends BaseTest { } @Test - public void testRemoveSingletonOrphanVertices() throws Exception { + public void testRemoveSingletonOrphanVertices() throws Exception { // all vertices in graph are connected final List kept = new LinkedList(graph.vertexSet()); final SeqVertex rm1 = new SeqVertex("CAGT"); @@ -117,9 +116,55 @@ public class BaseGraphUnitTest extends BaseTest { 
Assert.assertFalse(graph.containsVertex(rm2)); } + @Test + public void testRemoveSingletonOrphanVerticesOnSingleRefNode() throws Exception { + final SeqGraph original = new SeqGraph(11); + original.addVertex(v1); + original.removeSingletonOrphanVertices(); + Assert.assertTrue(original.containsVertex(v1)); + Assert.assertEquals(original.vertexSet().size(), 1); + } + + @Test + public void testIsRefSourceAndSink() throws Exception { + + final SeqGraph g = new SeqGraph(11); + g.addVertex(v1); + Assert.assertTrue(g.isRefSource(v1)); + Assert.assertTrue(g.isRefSink(v1)); + Assert.assertTrue(g.isReferenceNode(v1)); + + g.addVertices(v2, v3, v4, v5); + g.addEdge(v1, v2); + g.addEdge(v2, v3); + final BaseEdge refEdge = new BaseEdge(true, 1); + g.addEdge(v3, v4, refEdge); + g.addEdge(v4, v5); + + Assert.assertFalse(g.isRefSource(v1)); + Assert.assertFalse(g.isRefSink(v1)); + Assert.assertFalse(g.isReferenceNode(v1)); + + Assert.assertFalse(g.isRefSource(v2)); + Assert.assertFalse(g.isRefSink(v2)); + Assert.assertFalse(g.isReferenceNode(v2)); + + Assert.assertTrue(g.isRefSource(v3)); + Assert.assertFalse(g.isRefSink(v3)); + Assert.assertTrue(g.isReferenceNode(v3)); + + Assert.assertFalse(g.isRefSource(v4)); + Assert.assertTrue(g.isRefSink(v4)); + Assert.assertTrue(g.isReferenceNode(v4)); + + Assert.assertFalse(g.isRefSource(v5)); + Assert.assertFalse(g.isRefSink(v5)); + Assert.assertFalse(g.isReferenceNode(v5)); + } + @Test public void testRemovePathsNotConnectedToRef() throws Exception { - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); SeqVertex src = new SeqVertex("A"); SeqVertex end = new SeqVertex("A"); @@ -171,7 +216,7 @@ public class BaseGraphUnitTest extends BaseTest { @Test public void testRemoveVerticesNotConnectedToRefRegardlessOfEdgeDirection() throws Exception { - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); SeqVertex src = new SeqVertex("A"); SeqVertex end = new SeqVertex("A"); @@ -230,8 
+275,8 @@ public class BaseGraphUnitTest extends BaseTest { public void testPrintEmptyGraph() throws Exception { final File tmp = File.createTempFile("tmp", "dot"); tmp.deleteOnExit(); - new SeqGraph().printGraph(tmp, 10); - new DeBruijnGraph().printGraph(tmp, 10); + new SeqGraph(11).printGraph(tmp, 10); + new TestGraph().printGraph(tmp, 10); } @Test @@ -248,71 +293,6 @@ public class BaseGraphUnitTest extends BaseTest { Assert.assertEquals(actualSet, expectedSet); } - @Test(enabled = true) - public void testPruneGraph() { - DeBruijnGraph graph = new DeBruijnGraph(); - DeBruijnGraph expectedGraph = new DeBruijnGraph(); - - DeBruijnVertex v = new DeBruijnVertex("ATGG"); - DeBruijnVertex v2 = new DeBruijnVertex("ATGGA"); - DeBruijnVertex v3 = new DeBruijnVertex("ATGGT"); - DeBruijnVertex v4 = new DeBruijnVertex("ATGGG"); - DeBruijnVertex v5 = new DeBruijnVertex("ATGGC"); - DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC"); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new BaseEdge(false, 1)); - graph.addEdge(v2, v3, new BaseEdge(false, 3)); - graph.addEdge(v3, v4, new BaseEdge(false, 5)); - graph.addEdge(v4, v5, new BaseEdge(false, 3)); - graph.addEdge(v5, v6, new BaseEdge(false, 2)); - - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3)); - expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5)); - expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3)); - - graph.pruneGraph(2); - - Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); - - graph = new DeBruijnGraph(); - expectedGraph = new DeBruijnGraph(); - - graph.addVertex(v); - graph.addVertex(v2); - graph.addVertex(v3); - graph.addVertex(v4); - graph.addVertex(v5); - graph.addVertex(v6); - graph.addEdge(v, v2, new BaseEdge(true, 1)); - graph.addEdge(v2, v3, new 
BaseEdge(false, 3)); - graph.addEdge(v3, v4, new BaseEdge(false, 5)); - graph.addEdge(v4, v5, new BaseEdge(false, 3)); - - expectedGraph.addVertex(v); - expectedGraph.addVertex(v2); - expectedGraph.addVertex(v3); - expectedGraph.addVertex(v4); - expectedGraph.addVertex(v5); - expectedGraph.addEdge(v, v2, new BaseEdge(true, 1)); - expectedGraph.addEdge(v2, v3, new BaseEdge(false, 3)); - expectedGraph.addEdge(v3, v4, new BaseEdge(false, 5)); - expectedGraph.addEdge(v4, v5, new BaseEdge(false, 3)); - - graph.pruneGraph(2); - - Assert.assertTrue(BaseGraph.graphEquals(graph, expectedGraph)); - } - @Test(enabled = true) public void testGetBases() { @@ -324,7 +304,7 @@ public class BaseGraphUnitTest extends BaseTest { vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize))); } - final String result = new String(new DeBruijnGraph().getBasesForPath(vertexes)); + final String result = new String(new TestGraph().getBasesForPath(vertexes)); Assert.assertEquals(result, testString.substring(kmerSize - 1)); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index e1398e119..63fd21d8f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -93,7 +93,7 @@ public class CommonSuffixMergerUnitTest extends BaseTest { for ( int nTops = 0; nTops < nMids; nTops++ ) { for ( int nTopConnections = 1; nTopConnections <= nMids; nTopConnections++ ) { int multi = 1; - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); final SeqVertex v = new SeqVertex("GGGG"); graph.addVertex(v); @@ -169,7 +169,7 @@ public class CommonSuffixMergerUnitTest extends BaseTest { @Test public 
void testDoesntMergeSourceNodes() { - final SeqGraph g = new SeqGraph(); + final SeqGraph g = new SeqGraph(11); final SeqVertex v1 = new SeqVertex("A"); final SeqVertex v2 = new SeqVertex("A"); final SeqVertex v3 = new SeqVertex("A"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java index 9703d76cb..cae39d26a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixSplitterUnitTest.java @@ -81,7 +81,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testSplitPrevHaveMultipleEdges() { - final SeqGraph original = new SeqGraph(); + final SeqGraph original = new SeqGraph(11); final SeqVertex v1 = new SeqVertex("A"); final SeqVertex v2 = new SeqVertex("A"); final SeqVertex v3 = new SeqVertex("A"); @@ -100,7 +100,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testSplitNoCycles() { - final SeqGraph original = new SeqGraph(); + final SeqGraph original = new SeqGraph(11); final SeqVertex v1 = new SeqVertex("A"); final SeqVertex v2 = new SeqVertex("AC"); final SeqVertex v3 = new SeqVertex("TC"); @@ -118,7 +118,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { @Test(timeOut = 10000, enabled = !DEBUG) public void testSplitComplexCycle() { - final SeqGraph original = new SeqGraph(); + final SeqGraph original = new SeqGraph(11); final SeqVertex r1 = new SeqVertex("ACTG"); final SeqVertex r2 = new SeqVertex("ATGC"); final SeqVertex cat1 = new SeqVertex("CAT"); @@ -142,7 +142,7 @@ public class CommonSuffixSplitterUnitTest extends BaseTest { @Test(timeOut = 10000) public void 
testSplitInfiniteCycleFailure() { - final SeqGraph original = new SeqGraph(); + final SeqGraph original = new SeqGraph(11); final SeqVertex v1 = new SeqVertex("GC"); final SeqVertex v2 = new SeqVertex("X"); final SeqVertex v3 = new SeqVertex("N"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java new file mode 100644 index 000000000..e756737a2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/HaplotypeGraphUnitTest.java @@ -0,0 +1,238 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.HaplotypeGraph; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.MultiDeBruijnVertex; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Tests for {@link HaplotypeGraph} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class HaplotypeGraphUnitTest extends BaseTest { + + @Test(dataProvider="buildByStringDataProvider") + public void testBuildByString(final String string, final int kmerSize, final int vertexCount, final int edgeCount) { + final HaplotypeGraph g = new HaplotypeGraph(string); + Assert.assertEquals(g.getKmerSize(),kmerSize,g.toString()); + Assert.assertEquals(g.vertexSet().size(),vertexCount,g.toString()); + Assert.assertEquals(g.edgeSet().size(),edgeCount,g.toString()); + } + + @Test(dataProvider="equalTestDataProvider") + public void testEquals(final HaplotypeGraph one, final HaplotypeGraph two, final boolean expected) { + Assert.assertEquals(one.equals(two),expected); + } + + @Test(dataProvider="equalTestDataProvider") + public void testEqualReciprocal(final HaplotypeGraph one, final HaplotypeGraph two, final boolean expected) { + Assert.assertEquals(two.equals(one),expected); + } + + + @Test(dataProvider="equalTestDataProvider") + public void testReflexibeEquals(final HaplotypeGraph one, final HaplotypeGraph two, + @SuppressWarnings("unused") final boolean expected) { + Assert.assertTrue(one.equals(one)); + Assert.assertTrue(two.equals(two)); + } + + @Test(dataProvider="mergingCommonChainsDataProvider") + public void testMergingCommonChains(final HaplotypeGraph actual, HaplotypeGraph expected) { + + + final Map beforeMap = new 
HashMap<>(actual.uniqueKmerMap()); + actual.mergeCommonChains(); + final Map afterMap = new HashMap<>(actual.uniqueKmerMap()); + final Map mergedMap = new HashMap<>(expected.uniqueKmerMap()); + + Assert.assertEquals(actual, expected,""+actual.vertexSet() + " EDGES " + actual.edgeSet()); + Assert.assertEquals(beforeMap.size(),afterMap.size()); + Assert.assertEquals(afterMap.size(),mergedMap.size()); + for (final Kmer k : beforeMap.keySet()) { + Assert.assertTrue(afterMap.containsKey(k)); + Assert.assertTrue(mergedMap.containsKey(k)); + final byte[] seq1 = beforeMap.get(k).getSequence(); + final byte[] seq2 = afterMap.get(k).getSequence(); + final byte[] seq3 = mergedMap.get(k).getSequence(); + Assert.assertEquals(seq1.length,seq2.length); + Assert.assertEquals(seq2.length,seq3.length); + for (int i = 0; i < seq3.length; i++) { + final byte bk = k.base(i); + final byte b1 = seq1[i]; + final byte b2 = seq2[i]; + final byte b3 = seq3[i]; + final byte theByte = b1 == 'N' || b2 == 'N' || b3 == 'N' ? 
(byte)'N' : b1; + if (theByte == 'N') continue; + Assert.assertEquals(b1,b2); + Assert.assertEquals(b2,b3); + Assert.assertEquals(bk,b1); + } + } + } + + + @DataProvider(name="mergingCommonChainsDataProvider") + public Iterator mergingCommonChainsDataProvider() { + final List list = new LinkedList<>(); + for (int i = 0; i < MERGING_COMMON_CHAINS_DATA.length; i += 2) { + final HaplotypeGraph before = new HaplotypeGraph(MERGING_COMMON_CHAINS_DATA[i]); + final HaplotypeGraph after = new HaplotypeGraph(MERGING_COMMON_CHAINS_DATA[i+1]); + list.add(new Object[] { before , after}); + } + return list.iterator(); + } + + @DataProvider(name="equalTestDataProvider") + public Iterator equalsTestDataProvider() { + final List result = new LinkedList<>(); + for (int i = 0; i < EQUAL_TEST_DATA.length; i += 3) { + final HaplotypeGraph g1 = new HaplotypeGraph(EQUAL_TEST_DATA[i]); + final HaplotypeGraph g2 = new HaplotypeGraph(EQUAL_TEST_DATA[i+1]); + final boolean outcome = Boolean.parseBoolean(EQUAL_TEST_DATA[i+2]); + result.add(new Object[] { g1, g2, outcome}); + } + return result.iterator(); + } + + @DataProvider(name="buildByStringDataProvider") + public Iterator buildByStringDataProvider() { + return Arrays.asList(BUILD_BY_STRING_TEST_DATA).iterator(); + } + + private static final Object[][] BUILD_BY_STRING_TEST_DATA = new Object[][] { + {"[ks=3]{REF: ACT}",3,1,0}, + {"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (3) -> A -> G -> (2) }" + + "{ (1) -> A -> G -> (2) }",3,8,9}, + {"[ks=3]{REF: ACT -> C(1) -> G}{ACT -> C(1) -> G}{ACT -> C(1) -> G}",3,5,4} , + {"[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + + "{A(1) -> T -> A(2) }",3,8,8} , + {"[ks=3]{REF: ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}" + + "{ ACT -> A -> T(2) -> C -> A -> G -> T -> A -> C -> G -> T -> A(1) -> T}",3,15,14} , + {"[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T}",3,13,12}, + {"[ks=3]{REF: ACT -> A -> T(1) }" + + "{ ACT -> A -> T(1) 
}",3,5,4}, + {"[ks=3]{REF: TTT -> A(1) -> C -> T(2)}{ A(1) -> T(2) } ",3,4,4} + }; + + private static final String[] EQUAL_TEST_DATA = new String[] { + "[ks=3]{REF: ACT}","[ks=3]{REF: ACT}", "true", + "[ks=3]{REF: TCA}","[ks=3]{REF: ACT}", "false", + "[ks=4]{REF: ACTG}","[ks=3]{REF: ACT}", "false", + "[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (3) -> A -> G -> (2) }" + + "{ (1) -> A -> G -> (2) }" + ,"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (1) -> A -> G -> (2) }" + + "{ (3) -> A -> G -> (2) }", "true", + "[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (3) -> A -> T -> (2) }" + + "{ (1) -> A -> G -> (2) }" + ,"[ks=3]{REF: ACT(3) -> T(1) -> G(2) -> A}" + + "{ (1) -> A -> G -> (2) }" + + "{ (3) -> A -> G -> (2) }", "true", + "[ks=3]{REF: ACT -> G -> C(2) }{ ACT -> T -> C(2) }","[ks=3]{REF: ACT -> T -> C(2) }{ ACT -> G -> C(2) }","false", + + }; + + private static final String[] MERGING_COMMON_CHAINS_DATA = new String[] { // pairs before and after. + "[ks=3]{REF: ACT -> A(1) -> G -> A -> G(2) -> T }" + + "{A(1) -> T -> A -> G(2) }", + "[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> G -> T }" + + "{A(1) -> T -> A(2) }", + + "[ks=3]{REF: ACT -> A(1) -> G -> A -> C -> G(2) -> T }" + + "{A(1) -> T -> A -> C -> G(2) }", + "[ks=3]{REF: ACT -> A(1) -> G -> A(2) -> C -> G -> T }" + + "{A(1) -> T -> A(2) }", + + "[ks=3]{REF: ACT -> A -> T(1) -> C -> A -> G -> T -> A -> C(2) -> G -> T -> A}" + + "{ T(1) -> A -> A -> G -> T -> A -> C(2) }", + "[ks=3]{REF: ACT -> A -> T(1) -> C -> A(2) -> G -> T -> A -> C -> G -> T -> A}" + + "{ T(1) -> A -> A(2) } ", + +// "[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A(1)}" + +// "{ ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A(1)}" , +// "[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A}" , + + "[ks=3]{REF: ACT -> A -> T(1) }" + + "{ AGT -> A -> T(1) }" , + "[ks=3]{REF: ACT -> A(1) -> T }" + + "{ AGT -> A(1) }" , + "[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G 
-> T -> A -> T}" , + "[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T}" , + "[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T(1)}" + "{ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T(1)}" , + "[ks=3]{REF: ACT -> A -> T -> C -> A -> G -> T -> A -> C -> G -> T -> A -> T}" , + + "[ks=3]{REF: TTT -> T -> T -> T -> T -> T -> T -> T -> T -> T -> T(1) -> T -> T -> T(2) -> T -> T}" + + "{ TTT -> T -> T -> T -> T -> T -> T -> T -> T -> T -> T(1) -> G -> T -> T -> T -> T -> T(2) -> T -> T}", + "[ks=3]{REF: TTT -> T -> T -> T -> T -> T -> T -> T -> T -> T -> T(1) -> T(2) -> T -> T -> T -> T}" + + "{ T(1) -> G -> T -> T -> T(2) }", + + "[ks=3]{REF: TTT -> T -> G(1) -> A -> C -> C -> T(2)}" + + "{ G(1) -> T -> C -> C -> T(2)}" + + "{ G(1) -> G -> C -> C -> T(2)}" + + "{ G(1) -> C -> T(2)} ", + "[ks=3]{REF: TTT -> T -> G(1) -> A -> C(2) -> C(3) -> T }" + + "{ G(1) -> T -> C(2) }" + + "{ G(1) -> G -> C(2) }" + + "{ G(1) -> C(3) }", + + "[ks=3]{REF: TTT -> T -> G(1) -> A -> C -> G}{ TTT -> T -> G(1) -> G -> C -> G}", + "[ks=3]{REF: TTT -> T -> G(1) -> A -> C -> G}{ G(1) -> G -> C -> G}", + + }; + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index d6709672a..fa7ad9a3d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -102,7 +102,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(dataProvider = "BasicPathFindingData", enabled = !DEBUG) public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle, final boolean allowCycles) { - SeqGraph graph = new SeqGraph(); + SeqGraph graph 
= new SeqGraph(11); final SeqVertex middleTop = new SeqVertex("GTAC"); final SeqVertex middleBottom = new SeqVertex("ACTG"); @@ -134,7 +134,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testPathFindingComplexCycle() { - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); final SeqVertex v1 = new SeqVertex("A"); final SeqVertex v2 = new SeqVertex("C"); @@ -154,7 +154,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testPathFindingCycleLastNode() { - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); final SeqVertex v1 = new SeqVertex("A"); final SeqVertex v2 = new SeqVertex("C"); @@ -276,7 +276,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(dataProvider = "TripleBubbleDataProvider", enabled = !DEBUG) public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { // Construct the assembly graph - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); final String preAltOption = "ATCGATCGATCGATCGATCG"; final String postAltOption = "CCCC"; final String preRef = "ATGG"; @@ -384,7 +384,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testIntraNodeInsertionDeletion() { // Construct the assembly graph - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); final SeqVertex top = new SeqVertex("T"); final SeqVertex bot = new SeqVertex("T"); final SeqVertex alt = new SeqVertex("AAACCCCC"); @@ -410,7 +410,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(enabled = !DEBUG) public void testHardSWPath() { // Construct the assembly graph - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); final SeqVertex top = new SeqVertex( "NNN" ); final SeqVertex bot = new SeqVertex( "NNN" ); final SeqVertex alt = new SeqVertex( 
"ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); @@ -472,7 +472,7 @@ public class KBestPathsUnitTest extends BaseTest { @Test(dataProvider = "SystematicRefAltSWTestData", enabled = !DEBUG) public void testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) { // Construct the assembly graph - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); final int padSize = 0; SeqVertex top = new SeqVertex(Utils.dupString("N", padSize)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java index 06d81499c..a4c0464a5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/LowWeightChainPrunerUnitTest.java @@ -51,7 +51,6 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; import java.util.*; public class LowWeightChainPrunerUnitTest extends BaseTest { @@ -71,7 +70,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { for ( final boolean isRef : Arrays.asList(true, false)) { { // just an isolated chain final int nExpected = edgeWeight < pruneFactor && ! isRef ? 3 : 0; - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3); graph.addEdges(new BaseEdge(isRef, edgeWeight), v1, v2, v3); tests.add(new Object[]{"combinatorial", graph, pruneFactor, nExpected > 0 ? 
Collections.emptySet() : graph.vertexSet()}); @@ -81,7 +80,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // connects to ref chain - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3); graph.addVertices(v4, v5); graph.addEdges(new BaseEdge(true, 1), v4, v5); @@ -90,7 +89,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // has bad cycle - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4); graph.addEdges(new BaseEdge(false, 1), v4, v1, v2, v3, v1); // note that we'll remove v4 because it's low weight @@ -98,7 +97,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // has good cycle - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(111); graph.addVertices(v1, v2, v3, v4); graph.addEdges(new BaseEdge(false, 3), v4, v1, v2, v3, v1); // note that we'll remove v4 because it's low weight @@ -106,7 +105,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // has branch - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4, v5, v6); graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v4, v6); graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v5, v6); @@ -114,7 +113,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // middle vertex above threshold => no one can be removed - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4, v5); graph.addEdges(new BaseEdge(false, 1), v1, v2); graph.addEdges(new BaseEdge(false, 3), v2, v3); @@ -123,7 +122,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // the branching node has value > pruneFactor - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4, v5, v6); graph.addEdges(new BaseEdge(false, 3), v1, v2); graph.addEdges(new BaseEdge(false, 3), v2, v3); 
@@ -133,7 +132,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // A single isolated chain with weights all below pruning should be pruned - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4, v5); graph.addEdges(new BaseEdge(false, 1), v1, v2, v3); graph.addEdges(new BaseEdge(false, 5), v4, v5); @@ -141,7 +140,7 @@ public class LowWeightChainPrunerUnitTest extends BaseTest { } { // A chain with weights all below pruning should be pruned, even if it connects to another good chain - SeqGraph graph = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4, v5, v6); graph.addEdges(new BaseEdge(false, 1), v1, v2, v3, v5); graph.addEdges(new BaseEdge(false, 5), v4, v5, v6); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java index fc40edc42..c1d822eec 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/MultiSampleEdgeUnitTest.java @@ -46,7 +46,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; -import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java index ee07bea33..da59b523a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/PathUnitTest.java @@ -57,7 +57,7 @@ public class PathUnitTest extends BaseTest { final String ref = "ATGGTGGCTCATACCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAACATCACCTGAGGCCAGGAGTTCAAAACCAGCCTGGCTAACATAGCAAAACCCCATCTCTAATGAAAATACAAAAATTAGCTGGGTGTGGTGGTGTCCGCCTGTAGTCCCAGCTACTCAGGAGACTAAGGCATGAGAATCACTTGAACCCAGGATGCAGAGGCTGTAGTGAGCCGAGATTGCACCACGGCTGCACTCCAGCCTGGGCAACAGAGCGAGACTCTGTCTCAAATAAAATAGCGTAACGTAACATAACATAACATAACATAACATAACATAACATAACATAACATAACATAACATAACACAACAACAAAATAAAATAACATAAATCATGTTGTTAGGAAAAAAATCAGTTATGCAGCTACATGCTATTTACAAGAGATATACCTTAAAATATAAGACACAGAGGCCGGGCGCGGTAGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGCGGATCATGAGGTCAGGAGATCGAGACCATCC"; final String hap = "ATGGTGGCTCATACCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAAGCGGATCATGAGGTCAGGAGATCGAGACCATCCT"; - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); final SeqVertex v = new SeqVertex(hap); graph.addVertex(v); final Path path = new Path(v, graph); @@ -70,7 +70,7 @@ public class PathUnitTest extends BaseTest { final String ref = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGCCAGGCTGGTCTTGAACTCCTGACCTCAGGTGATCCACTCGCCTCGGTCTCCCAAAGTGTTGGGATTACAGGCATGAACCACTGCACCTGGCCTAGTGTTTGGGAAAACTATACTAGGAAAAGAATAGTTGCTTTAAGTCATTCTTTGATTATTCTGAGAATTGGCATATAGCTGCCATTATAACCTACTTTTGCTAAATATAATAATAATAATCATTATTTTTATTTTTTGAGACAGGGTCTTGTTTTGTCACCCCGGCTGGAGTGAAGTGGCGCAATCTCGGCTCACTGCAACCTCCACCTCCGGGTGCAAGCAATTCTCCTGCCTCAGCCTCTTGAGTAGCTAGGATTACAGGCACAAGCCATCATGCCCAGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT"; final String hap = "CGGCTAATTTTTGTATTTTTAGTAGAGACAGGGTTTCACCATGTTGGTCAGGCTGGTCTTGAACTCCTGACCTCAGGT"; - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); final SeqVertex v = new SeqVertex(hap); graph.addVertex(v); final Path path = new Path(v, graph); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java new file mode 100644 index 000000000..2918501b2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RouteUnitTest.java @@ -0,0 +1,261 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.BaseTest; +import org.jgrapht.EdgeFactory; +import org.testng.Assert; +import org.testng.Reporter; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: valentin + * Date: 9/5/13 + * Time: 11:04 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class RouteUnitTest extends BaseTest { + + + @Test(dataProvider="isSuffixTestData") + public void testIsSuffix(final Route route, final Path path, final boolean expectedResult) { + Assert.assertEquals(route.isSuffix(path), expectedResult); + } + + @DataProvider(name="isSuffixTestData") + public Iterator isSuffixTestData() { + return TEST_DATA.iterator(); + } + + private static final int[] TEST_EDGE_PAIRS1 = new int[] { + 3 , 4, + 4 , 5, + 5, 7, + 7, 8, + 8, 9, + 4 , 6, + 6, 9, + 9, 11, + 11, 12, + }; + + + + private static final int[] TEST_EDGE_PAIRS = new int[] { + 1 , 2, + 2 , 3, + 3 , 4, + 4 , 5, + 5, 7, + 7, 8, + 8, 9, + 4 , 6, + 6, 9, + 9, 10, + 10, 11, + 11, 12, + 2, 5, + 5, 12, + + 3, 13, + 13, 14, + 14, 15 + }; + + public static final EdgeFactory TEST_GRAPH_EDGE_FACTORY = new EdgeFactory() { + @Override + public BaseEdge createEdge(final BaseVertex baseVertex, final BaseVertex baseVertex2) { + return new BaseEdge(false, 0); + } + }; + + + private static Map vertexByInteger = new HashMap<>(); + private static final BaseGraph TEST_GRAPH = new BaseGraph<>(1, TEST_GRAPH_EDGE_FACTORY); + private static final List TEST_DATA; + + + static { + for (int i = 0; i < TEST_EDGE_PAIRS.length; i += 2) { + final int sourceInteger = TEST_EDGE_PAIRS[i]; + final int targetInteger = TEST_EDGE_PAIRS[i + 1]; + final BaseVertex sourceVertex = resolveVertexByInteger(sourceInteger); + final BaseVertex targetVertex = resolveVertexByInteger(targetInteger); + TEST_GRAPH.addEdge(sourceVertex, targetVertex); + } + Assert.assertEquals(1,TEST_GRAPH.getSources().size()); + final Deque> pendingPaths = new LinkedList<>(); + final Deque> pendingRoutes = new LinkedList<>(); + final List> allPossiblePaths = new LinkedList<>(); + final List> allPossibleRoutes = new LinkedList<>(); + for (final BaseVertex vertex : TEST_GRAPH.vertexSet()) { + pendingPaths.add(new Path(vertex, TEST_GRAPH)); + pendingRoutes.add(new Route(vertex,TEST_GRAPH)); + } + while (!pendingPaths.isEmpty()) { // 
!pendingRoutes.isEmpty(); + final Path path = pendingPaths.remove(); + final Route route = pendingRoutes.remove(); + final BaseVertex lastVertex = path.getLastVertex(); + allPossiblePaths.add(path); + allPossibleRoutes.add(route); + + if (allPossiblePaths.size() % 100 == 0) + Reporter.log("" + allPossiblePaths.size(), true); + for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) + pendingPaths.add(new Path<>(path,edge)); + for (final BaseEdge edge : TEST_GRAPH.outgoingEdgesOf(lastVertex)) + pendingRoutes.add(new Route<>(route,edge)); + } + + final int numberOfPaths = allPossiblePaths.size(); + final boolean[][] isSuffix = buildIsSuffixMatrix(allPossiblePaths, numberOfPaths); + TEST_DATA = createTestData(allPossiblePaths,allPossibleRoutes,isSuffix); + } + + private static boolean[][] buildIsSuffixMatrix(final List> allPossiblePaths, final int numberOfPaths) { + final boolean[][] isSuffix = new boolean[numberOfPaths][numberOfPaths]; + final ListIterator> iIterator = allPossiblePaths.listIterator(); + for (int i = 0; i < numberOfPaths; i++) { + isSuffix[i][i] = true; + final ListIterator> jIterator = allPossiblePaths.listIterator(i + 1); + final Path iPath = iIterator.next(); + for (int j = i + 1; j < numberOfPaths; j++) { + final Path jPath = jIterator.next(); + if (iPath.getLastVertex() != jPath.getLastVertex()) { + isSuffix[i][j] = isSuffix[j][i] = false; + } else { + isSuffix[i][j] = isSuffix[j][i] = true; // let assume they are suffix of each other by default. 
+ final Path shortPath; + final Path longPath; + if (iPath.getEdges().size() <= jPath.getEdges().size()) { + shortPath = iPath; + longPath = jPath; + } else { + longPath = iPath; + shortPath = jPath; + } + final ListIterator longPathEdgesIterator = longPath.getEdges().listIterator(longPath.getEdges().size()); + final ListIterator shortPathEdgesIterator = shortPath.getEdges().listIterator(shortPath.getEdges().size()); + + while (shortPathEdgesIterator.hasPrevious()) { + final BaseEdge shortEdge = shortPathEdgesIterator.previous(); + final BaseEdge longEdge = longPathEdgesIterator.previous(); + if (shortEdge != longEdge) { + isSuffix[i][j] = isSuffix[j][i] = false; + break; + } + } + if (isSuffix[i][j]) { + if (longPathEdgesIterator.hasPrevious()) { + if (longPath == iPath) + isSuffix[j][i] = false; + else + isSuffix[i][j] = false; + } + } + } + + } + } + return isSuffix; + } + + private static List createTestData(final List> allPossiblePaths, final List> allPossibleRoutes, final boolean[][] isSuffix) { + final List result = new ArrayList<>(allPossiblePaths.size() * allPossiblePaths.size() * 2 ); + final Path[] allPaths = allPossiblePaths.toArray(new Path[allPossiblePaths.size()]); + final Route[] allRoutes = allPossibleRoutes.toArray(new Route[allPossibleRoutes.size()]); + final int numberOfPaths = allPaths.length; + for (int i = 0; i < numberOfPaths; i++) + for (int j = 0; j < numberOfPaths; j++) { + result.add(new Object[] { allRoutes[i], allPaths[j], isSuffix[i][j] }); + result.add(new Object[] { allRoutes[i], allRoutes[j], isSuffix[i][j] }); + result.add(new Object[] { allRoutes[i], inverseRebuild(allRoutes[j]), isSuffix[i][j]}); + } + + return result; + } + + private static Route inverseRebuild(final Route original) { + final ListIterator it = original.getEdges().listIterator(original.length()); + Route result = new Route<>(original.getLastVertex(),original.getGraph()); + while (it.hasPrevious()) { + result = new Route<>(it.previous(),result); + } + return 
result; + } + + private static BaseVertex resolveVertexByInteger(final int targetInteger) { + if (vertexByInteger.containsKey(targetInteger)) + return vertexByInteger.get(targetInteger); + else { + int value = targetInteger; + final StringBuffer stringBuffer = new StringBuffer(); + while (value > 0) { + int c = value % 4; + switch (c) { + case 0: stringBuffer.append('A'); break; + case 1: stringBuffer.append('C'); break; + case 2: stringBuffer.append('G'); break; + case 3: stringBuffer.append('T'); break; + } + value = value / 4; + } + if (stringBuffer.length() == 0) stringBuffer.append('A'); + final byte[] sequence = stringBuffer.reverse().toString().getBytes(); + final BaseVertex result = new BaseVertex(sequence); + vertexByInteger.put(targetInteger, result); + TEST_GRAPH.addVertex(result); + return result; + } + + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java index c72f426be..84b2ee449 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SeqGraphUnitTest.java @@ -72,7 +72,7 @@ public class SeqGraphUnitTest extends BaseTest { } public SeqGraph calcGraph() { - final DeBruijnGraph deBruijnGraph = new DeBruijnGraph(); + final TestGraph deBruijnGraph = new TestGraph(); final int kmersInSequence = sequence.length - KMER_LENGTH + 1; for (int i = 0; i < kmersInSequence - 1; i++) { // get the kmers @@ -113,6 +113,8 @@ public class SeqGraphUnitTest extends BaseTest { Assert.assertEquals(actualV.getSequence(), cfg.sequence); } + + @DataProvider(name = "IsDiamondData") public Object[][] makeIsDiamondData() throws Exception { List tests = new ArrayList(); @@ -120,7 +122,7 @@ public class SeqGraphUnitTest extends BaseTest { SeqGraph graph; 
SeqVertex pre1, pre2, top, middle1, middle2, middle3, bottom, tail1, tail2; - graph = new SeqGraph(); + graph = new SeqGraph(11); pre1 = new SeqVertex("ACT"); pre2 = new SeqVertex("AGT"); @@ -170,7 +172,7 @@ public class SeqGraphUnitTest extends BaseTest { // top connects to bottom directly as well { - final SeqGraph topConnectsToBottomToo = new SeqGraph(); + final SeqGraph topConnectsToBottomToo = new SeqGraph(11); final SeqVertex top2 = new SeqVertex("A"); final SeqVertex middle4 = new SeqVertex("C"); final SeqVertex bottom2 = new SeqVertex("G"); @@ -194,7 +196,7 @@ public class SeqGraphUnitTest extends BaseTest { public Object[][] makeMergingData() throws Exception { List tests = new ArrayList(); - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); SeqVertex pre1 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "CT"); SeqVertex pre2 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "GT"); @@ -216,7 +218,7 @@ public class SeqGraphUnitTest extends BaseTest { graph.addVertices(top); graph.addEdges(pre1, top); final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); - final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); expected.addVertex(pre1_top); tests.add(new Object[]{graph.clone(), expected.clone()}); } @@ -225,7 +227,7 @@ public class SeqGraphUnitTest extends BaseTest { { graph.addVertices(middle1); graph.addEdges(top, middle1); - final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); final SeqVertex pre1_top_middle1 = new SeqVertex(pre1.getSequenceString() + top.getSequenceString() + middle1.getSequenceString()); expected.addVertex(pre1_top_middle1); tests.add(new Object[]{graph.clone(), expected}); @@ -235,7 +237,7 @@ public class SeqGraphUnitTest extends BaseTest { { graph.addVertices(middle2); graph.addEdges(top, middle2); - 
final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); expected.addVertices(pre1_top, middle1, middle2); expected.addEdges(pre1_top, middle1); @@ -248,7 +250,7 @@ public class SeqGraphUnitTest extends BaseTest { graph.addVertices(bottom); graph.addEdges(middle1, bottom); graph.addEdges(middle2, bottom); - final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); final SeqVertex pre1_top = new SeqVertex(pre1.getSequenceString() + top.getSequenceString()); final SeqVertex newMiddle1 = new SeqVertex("G"); final SeqVertex newMiddle2 = new SeqVertex("T"); @@ -274,12 +276,12 @@ public class SeqGraphUnitTest extends BaseTest { } { // all the nodes -> lots of merging and motion of nodes - final SeqGraph all = new SeqGraph(); + final SeqGraph all = new SeqGraph(11); all.addVertices(pre1, pre2, top, middle1, middle2, bottom, tail1, tail2); all.addEdges(pre1, top, middle1, bottom, tail1); all.addEdges(pre2, top, middle2, bottom, tail2); - final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); SeqVertex newPre1 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "C"); SeqVertex newPre2 = new SeqVertex(Utils.dupString("A", SeqGraph.MIN_COMMON_SEQUENCE_TO_MERGE_SOURCE_SINK_VERTICES) + "G"); final SeqVertex newTop = new SeqVertex("TA"); @@ -294,7 +296,7 @@ public class SeqGraphUnitTest extends BaseTest { // test the case where we delete a middle node away because the common sequence is all of its sequence { - final SeqGraph graph2 = new SeqGraph(); + final SeqGraph graph2 = new SeqGraph(11); final SeqVertex mytop = new SeqVertex("A"); final SeqVertex mid1 = new SeqVertex("AC"); final SeqVertex mid2 = new SeqVertex("C"); @@ -303,7 +305,7 @@ public class SeqGraphUnitTest extends BaseTest { graph2.addEdges(mytop, mid1, bot); graph2.addEdges(mytop, 
mid2, bot); - final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); final SeqVertex newMid1 = new SeqVertex("A"); final SeqVertex newBottom = new SeqVertex("CG"); expected.addVertices(mytop, newMid1, newBottom); @@ -339,7 +341,7 @@ public class SeqGraphUnitTest extends BaseTest { // @Test(enabled = !DEBUG) public void testBubbleSameBasesWithRef() { - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); final SeqVertex top = new SeqVertex("A"); final SeqVertex mid1 = new SeqVertex("ACT"); final SeqVertex mid2 = new SeqVertex("ACT"); @@ -349,7 +351,7 @@ public class SeqGraphUnitTest extends BaseTest { graph.addEdge(top, mid1, new BaseEdge(true, 1)); graph.addEdge(mid1, bot, new BaseEdge(true, 1)); - final SeqGraph expected = new SeqGraph(); + final SeqGraph expected = new SeqGraph(11); expected.addVertex(new SeqVertex("AACTC")); final SeqGraph actual = ((SeqGraph)graph.clone()); actual.simplifyGraph(); @@ -360,8 +362,8 @@ public class SeqGraphUnitTest extends BaseTest { public Object[][] makeLinearZipData() throws Exception { List tests = new ArrayList(); - SeqGraph graph = new SeqGraph(); - SeqGraph expected = new SeqGraph(); + SeqGraph graph = new SeqGraph(11); + SeqGraph expected = new SeqGraph(11); // empty graph => empty graph tests.add(new Object[]{graph.clone(), expected.clone()}); @@ -377,7 +379,7 @@ public class SeqGraphUnitTest extends BaseTest { tests.add(new Object[]{graph.clone(), expected.clone()}); graph.addEdges(a1, c1); - expected = new SeqGraph(); + expected = new SeqGraph(11); expected.addVertices(ac1); tests.add(new Object[]{graph.clone(), expected.clone()}); @@ -385,25 +387,25 @@ public class SeqGraphUnitTest extends BaseTest { SeqVertex g1 = new SeqVertex("G"); graph.addVertices(g1); graph.addEdges(c1, g1); - expected = new SeqGraph(); + expected = new SeqGraph(11); expected.addVertex(new SeqVertex("ACG")); tests.add(new Object[]{graph.clone(), expected.clone()}); // adding something 
that isn't connected isn't a problem SeqVertex t1 = new SeqVertex("T"); graph.addVertices(t1); - expected = new SeqGraph(); + expected = new SeqGraph(11); expected.addVertices(new SeqVertex("ACG"), new SeqVertex("T")); tests.add(new Object[]{graph.clone(), expected.clone()}); // splitting chain with branch produces the correct zipped subgraphs final SeqVertex a2 = new SeqVertex("A"); final SeqVertex c2 = new SeqVertex("C"); - graph = new SeqGraph(); + graph = new SeqGraph(11); graph.addVertices(a1, c1, g1, t1, a2, c2); graph.addEdges(a1, c1, g1, t1, a2); graph.addEdges(g1, c2); - expected = new SeqGraph(); + expected = new SeqGraph(11); SeqVertex acg = new SeqVertex("ACG"); SeqVertex ta = new SeqVertex("TA"); expected.addVertices(acg, ta, c2); @@ -413,11 +415,11 @@ public class SeqGraphUnitTest extends BaseTest { // Can merge chains with loops in them { - graph = new SeqGraph(); + graph = new SeqGraph(11); graph.addVertices(a1, c1, g1); graph.addEdges(a1, c1, g1); graph.addEdges(a1, a1); - expected = new SeqGraph(); + expected = new SeqGraph(11); SeqVertex ac = new SeqVertex("AC"); SeqVertex cg = new SeqVertex("CG"); @@ -433,7 +435,7 @@ public class SeqGraphUnitTest extends BaseTest { graph.removeEdge(c1, c1); graph.addEdges(g1, g1); - expected = new SeqGraph(); + expected = new SeqGraph(11); expected.addVertices(ac, g1); expected.addEdges(ac, g1, g1); tests.add(new Object[]{graph.clone(), expected.clone()}); @@ -443,8 +445,8 @@ public class SeqGraphUnitTest extends BaseTest { { final List bases = Arrays.asList("A", "C", "G", "T", "TT", "GG", "CC", "AA"); for ( final int len : Arrays.asList(1, 2, 10, 100, 1000)) { - graph = new SeqGraph(); - expected = new SeqGraph(); + graph = new SeqGraph(11); + expected = new SeqGraph(11); SeqVertex last = null; String expectedBases = ""; for ( int i = 0; i < len; i++ ) { @@ -465,8 +467,8 @@ public class SeqGraphUnitTest extends BaseTest { int edgeWeight = 1; for ( final int nIncoming : Arrays.asList(0, 2, 5, 10) ) { for ( final 
int nOutgoing : Arrays.asList(0, 2, 5, 10) ) { - graph = new SeqGraph(); - expected = new SeqGraph(); + graph = new SeqGraph(11); + expected = new SeqGraph(11); graph.addVertices(a1, c1, g1); graph.addEdges(a1, c1, g1); @@ -530,7 +532,7 @@ public class SeqGraphUnitTest extends BaseTest { final SeqVertex v5 = new SeqVertex("CCTCCACCATCCTCCCCTCTAGGCTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATCCTCCCCTCTAGACTTCTCCTCCTCCTCCACCATC"); final SeqVertex v6 = new SeqVertex("CTCCCCT"); - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); graph.addVertices(v1, v2, v3, v4, v5, v6); graph.addEdges(v1, v3, v4, v6, v3); graph.addEdges(v2, v4); @@ -538,4 +540,5 @@ public class SeqGraphUnitTest extends BaseTest { graph.simplifyGraph(); } + } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index 5bc13f884..bb504b78c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -125,7 +125,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @Test(dataProvider = "PrefixSuffixData") public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); final List v = new ArrayList(); for ( final String s : strings ) { @@ -202,7 +202,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @Test(dataProvider = "CompleteCycleData") public void testSplitterCompleteCycle(final List strings, final boolean hasTop, final boolean hasBot) { - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph 
= new SeqGraph(11); int edgeWeight = 1; final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; @@ -276,7 +276,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @Test(dataProvider = "MeetsMinSequenceData") public void testSplitterCompleteCycle(final List mids, final int minSeqLength, final boolean prefixMeets, final boolean suffixMeets) { - final SeqGraph graph = new SeqGraph(); + final SeqGraph graph = new SeqGraph(11); final SeqVertex top = new SeqVertex("AAAAAAAA"); final SeqVertex bot = new SeqVertex("GGGGGGGG"); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index ed91cccb3..0d9c07251 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -74,6 +74,20 @@ public class ReadThreadingGraphUnitTest extends BaseTest { Assert.assertEquals(actual, expected); } + @Test + public void testSimpleHaplotypeRethreading() { + final ReadThreadingGraph assembler = new ReadThreadingGraph(11); + final String ref = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG"; + final String alt = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG"; + assembler.addSequence("anonymous", getBytes(ref), null, true); + assembler.addSequence("anonymous", getBytes(alt), null, false); + assembler.buildGraphIfNecessary(); + Assert.assertNotEquals(ref.length() - 11 + 1,assembler.vertexSet().size(),"the number of vertex in the graph is the same as if there was no alternative sequence"); + Assert.assertEquals(ref.length() - 11 + 1 + 11,assembler.vertexSet().size(),"the number of vertex in the graph is not the same as if there is an alternative sequence"); + 
MultiDeBruijnVertex startAlt = assembler.findKmer(new Kmer(alt.getBytes(),20,11)); + Assert.assertNotNull(startAlt); + } + @Test(enabled = ! DEBUG) public void testNonUniqueMiddle() { final ReadThreadingGraph assembler = new ReadThreadingGraph(3); @@ -212,8 +226,8 @@ public class ReadThreadingGraphUnitTest extends BaseTest { tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps - tests.add(new Object[]{"AAAAA", "C", "1M4D1M", true, -1}); // funky SW alignment - tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", true, 1}); // very little data + tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment + tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex @@ -253,7 +267,13 @@ public class ReadThreadingGraphUnitTest extends BaseTest { Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); // confirm that the SW alignment agrees with our expectations - final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink); + final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink, 0); + + if ( result == null ) { + Assert.assertFalse(cigarIsGood); + return; + } + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); // confirm that the goodness of the cigar agrees with our expectations diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java similarity index 59% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java index 95592241d..38c06c25f 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnAssemblerUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModelUnitTest.java @@ -44,75 +44,45 @@ * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. */ -package org.broadinstitute.sting.gatk.walkers.haplotypecaller; +package org.broadinstitute.sting.gatk.walkers.indels; -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 3/27/12 - */ -import net.sf.samtools.*; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.DeBruijnGraph; -import org.broadinstitute.sting.utils.haplotype.Haplotype; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; import java.util.*; -public class DeBruijnAssemblerUnitTest extends BaseTest { - private final static boolean DEBUG = false; +public 
class PairHMMIndelErrorModelUnitTest extends BaseTest { - @Test(enabled = !DEBUG) - public void testReferenceCycleGraph() { - String refCycle = "ATCGAGGAGAGCGCCCCGAGATATATATATATATATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATATATATATATGGGAGAGGGGATATATATATATCCCCCC"; - String noCycle = "ATCGAGGAGAGCGCCCCGAGATATTATTTGCGAGCGCGAGCGTTTTAAAAATTTTAGACGGAGAGATGGGAGAGGGGATATATAATATCCCCCC"; - final DeBruijnGraph g1 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(refCycle.getBytes(), true), Collections.emptyList()); - final DeBruijnGraph g2 = new DeBruijnAssembler().createGraphFromSequences(new ArrayList(), 10, new Haplotype(noCycle.getBytes(), true), Collections.emptyList()); + private SAMFileHeader header; - Assert.assertTrue(g1 == null, "Reference cycle graph should return null during creation."); - Assert.assertTrue(g2 != null, "Reference non-cycle graph should not return null during creation."); + @BeforeClass + public void setup() throws FileNotFoundException { + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); } - private static class MockBuilder extends DeBruijnGraphBuilder { - public final List addedPairs = new LinkedList(); + private static final int refWindowStart = 1000; + private static final int refWindowEnd = 1100; - private MockBuilder(final int kmerSize) { - super(new DeBruijnGraph(kmerSize)); - } - - @Override - public void addKmerPair(Kmer kmerPair, int multiplicity) { - logger.info("addKmerPair" + kmerPair); - addedPairs.add(kmerPair); - } - - @Override - public void flushKmersToGraph(boolean addRefEdges) { - // do nothing - } - } - - @DataProvider(name = "AddReadKmersToGraph") - public Object[][] makeAddReadKmersToGraphData() { + @DataProvider(name = "ClipUpstreamProvider") + public Object[][] ClipUpstreamTestData() { List tests = new ArrayList(); - // this functionality can be adapted to 
provide input data for whatever you might want in your data - final String bases = "ACGTAACCGGTTAAACCCGGGTTT"; - final int readLen = bases.length(); - final List allBadStarts = new ArrayList(readLen); - for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i); - - for ( final int kmerSize : Arrays.asList(3, 4, 5) ) { - for ( final int nBadQuals : Arrays.asList(0, 1, 2) ) { - for ( final List badStarts : Utils.makePermutations(allBadStarts, nBadQuals, false) ) { - tests.add(new Object[]{bases, kmerSize, badStarts}); + for ( final int readStart : Arrays.asList(900, 950, 990, 1000) ) { + for ( final int readLength : Arrays.asList(10, 50, 100) ) { + for ( final int delLength : Arrays.asList(0, 5, 10) ) { + tests.add(new Object[]{readStart, readLength, delLength}); } } } @@ -120,80 +90,44 @@ public class DeBruijnAssemblerUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "AddReadKmersToGraph", enabled = ! DEBUG) - public void testAddReadKmersToGraph(final String bases, final int kmerSize, final List badQualsSites) { - final int readLen = bases.length(); - final DeBruijnAssembler assembler = new DeBruijnAssembler(); - final MockBuilder builder = new MockBuilder(kmerSize); + @Test(dataProvider = "ClipUpstreamProvider", enabled = true) + public void clipUpstreamTest(final int readStart, final int readLength, final int delLength) { - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + if ( delLength == 0 ) + read.setCigarString(readLength + "M"); + else + read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); - final byte[] quals = Utils.dupBytes((byte)20, bases.length()); - for ( final int badSite : badQualsSites ) quals[badSite] = 0; - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 1, readLen); - 
read.setReadBases(bases.getBytes()); - read.setBaseQualities(quals); - - final Set expectedBases = new HashSet(); - final Set expectedStarts = new LinkedHashSet(); - for ( int i = 0; i < readLen; i++) { - boolean good = true; - for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing - good &= i + j < readLen && quals[i+j] >= assembler.getMinBaseQualityToUseInAssembly(); - } - if ( good ) { - expectedStarts.add(i); - expectedBases.add(bases.substring(i, i + kmerSize + 1)); - } - } - - assembler.addReadKmersToGraph(builder, Arrays.asList(read)); - Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size()); - for ( final Kmer addedKmer : builder.addedPairs ) { - Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases); - } + final boolean result = PairHMMIndelErrorModel.mustClipUpstream(read, refWindowStart); + Assert.assertEquals(result, read.getSoftStart() < refWindowStart && read.getSoftEnd() > refWindowStart); } - @DataProvider(name = "AddGGAKmersToGraph") - public Object[][] makeAddGGAKmersToGraphData() { + @DataProvider(name = "ClipDownstreamProvider") + public Object[][] ClipDownstreamTestData() { List tests = new ArrayList(); - // this functionality can be adapted to provide input data for whatever you might want in your data - final String bases = "ACGTAACCGGTTAAACCCGGGTTT"; - final int readLen = bases.length(); - final List allBadStarts = new ArrayList(readLen); - for ( int i = 0; i < readLen; i++ ) allBadStarts.add(i); - - for ( final int kmerSize : Arrays.asList(3, 4, 5) ) { - tests.add(new Object[]{bases, kmerSize}); + for ( final int readStart : Arrays.asList(1000, 1050, 1090, 1100) ) { + for ( final int readLength : Arrays.asList(10, 50, 100) ) { + for ( final int delLength : Arrays.asList(0, 5, 10) ) { + tests.add(new Object[]{readStart, readLength, delLength}); + } + } } return tests.toArray(new Object[][]{}); } - @Test(dataProvider = 
"AddGGAKmersToGraph", enabled = ! DEBUG) - public void testAddGGAKmersToGraph(final String bases, final int kmerSize) { - final int readLen = bases.length(); - final DeBruijnAssembler assembler = new DeBruijnAssembler(); - final MockBuilder builder = new MockBuilder(kmerSize); + @Test(dataProvider = "ClipDownstreamProvider", enabled = true) + public void clipDownstreamTest(final int readStart, final int readLength, final int delLength) { - final Set expectedBases = new HashSet(); - final Set expectedStarts = new LinkedHashSet(); - for ( int i = 0; i < readLen; i++) { - boolean good = true; - for ( int j = 0; j < kmerSize + 1; j++ ) { // +1 is for pairing - good &= i + j < readLen; - } - if ( good ) { - expectedStarts.add(i); - expectedBases.add(bases.substring(i, i + kmerSize + 1)); - } - } + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); + if ( delLength == 0 ) + read.setCigarString(readLength + "M"); + else + read.setCigarString((readLength / 2) + "M" + delLength + "D" + (readLength / 2) + "M"); - assembler.addGGAKmersToGraph(builder, Arrays.asList(new Haplotype(bases.getBytes()))); - Assert.assertEquals(builder.addedPairs.size(), expectedStarts.size()); - for ( final Kmer addedKmer : builder.addedPairs ) { - Assert.assertTrue(expectedBases.contains(new String(addedKmer.bases())), "Couldn't find kmer " + addedKmer + " among all expected kmers " + expectedBases); - } + final boolean result = PairHMMIndelErrorModel.mustClipDownstream(read, refWindowEnd); + Assert.assertEquals(result, read.getSoftStart() < refWindowEnd && read.getSoftStart() + readLength > refWindowEnd); } -} +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java index 6f14b111f..3ce0211c4 100644 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmissionIntegrationTest.java @@ -46,9 +46,16 @@ package org.broadinstitute.sting.gatk.walkers.phasing; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.Arrays; public class PhaseByTransmissionIntegrationTest extends WalkerTest { @@ -57,6 +64,7 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { private static String TNTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TN.vcf"; private static String TPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.TP.vcf"; private static String FPTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.FP.vcf"; + private static String MultiAllelicsTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.multiAllelics.vcf"; private static String SpecialTest = phaseByTransmissionTestDataRoot + "PhaseByTransmission.IntegrationTest.Special.vcf"; //Tests using PbT on all genotypes with default parameters @@ -200,4 +208,44 @@ public class PhaseByTransmissionIntegrationTest extends WalkerTest { executeTest("testFatherAlleleFirst", spec); } + @Test + public void testMultiAllelics() throws IOException { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T PhaseByTransmission", + "--no_cmdline_in_header", + "-R " + b37KGReference, + "--variant " + MultiAllelicsTest, + "-ped "+ goodFamilyFile, + "-L 1:10109-10315", + "-o %s" + ), + 1, + Arrays.asList("") // don't care about the md5, just testing that records 
aren't being dropped + ); + + final File outputVCF = executeTest("testMultiAllelics", spec).getFirst().get(0); + + final VCFCodec codec = new VCFCodec(); + final FileInputStream originalStream = new FileInputStream(MultiAllelicsTest); + final LineIterator originalLineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(originalStream)); + codec.readHeader(originalLineIterator); + int numOriginalRecords = 0; + while ( originalLineIterator.hasNext() ) { + originalLineIterator.next(); + numOriginalRecords++; + } + + final FileInputStream newStream = new FileInputStream(outputVCF); + final LineIterator newLineIterator = codec.makeSourceFromStream(new PositionalBufferedStream(newStream)); + codec.readHeader(newLineIterator); + int numNewRecords = 0; + while ( newLineIterator.hasNext() ) { + newLineIterator.next(); + numNewRecords++; + } + + Assert.assertTrue(numOriginalRecords > 0); + Assert.assertEquals(numNewRecords, numOriginalRecords); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d695f2d13..9b5290dee 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -48,8 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.annotations.Test; import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; import java.util.ArrayList; import java.util.Arrays; @@ -376,6 +376,21 @@ public class VariantEvalIntegrationTest extends WalkerTest { executeTestParallel("testEvalTrackWithoutGenotypes",spec); } + @Test + public void 
testEvalTrackWithoutGenotypesWithSampleFields() { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T VariantEval", + "-R " + b37KGReference, + "-eval " + variantEvalTestDataRoot + "noGenotypes.vcf", + "-o %s" + ), + 1, + Arrays.asList("")); //There is no md5 because we only care that this completes without an exception. + executeTest("testEvalTrackWithoutGenotypesWithSampleFields", spec); + + } + @Test public void testMultipleEvalTracksWithoutGenotypes() { String extraArgs = "-T VariantEval -R " + b37KGReference + diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java new file mode 100644 index 000000000..120e4bea2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibrationUnitTest.java @@ -0,0 +1,68 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import junit.framework.Assert; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.testng.annotations.Test; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 11/29/13 + */ + +public class ApplyRecalibrationUnitTest extends BaseTest { + @Test + public final void testGenerateFilterString() { + final ApplyRecalibration ar = new ApplyRecalibration(); + ar.VQSLOD_CUTOFF = 0.0; + Assert.assertTrue(ar.generateFilterString(5.0).equals(VCFConstants.PASSES_FILTERS_v4)); + Assert.assertTrue(ar.generateFilterString(-5.0).equals(ApplyRecalibration.LOW_VQSLOD_FILTER_NAME)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java new file mode 100644 index 000000000..754fe30a2 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManagerUnitTest.java @@ -0,0 +1,145 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantrecalibration; + +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.BaseTest; +import org.junit.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 7/25/13 + */ + +public class VariantDataManagerUnitTest extends BaseTest { + + @Test + public final void testCalculateSortOrder() { + final double passingQual = 400.0; + final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + + VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); + + final List theData = new ArrayList<>(); + final VariantDatum datum1 = new VariantDatum(); + datum1.atTrainingSite = true; + datum1.failingSTDThreshold = false; + datum1.originalQual = passingQual; + datum1.annotations = new double[]{0.0,-10.0,10.0}; + datum1.isNull = new boolean[]{false, false, false}; + theData.add(datum1); + + final VariantDatum datum2 = new VariantDatum(); + datum2.atTrainingSite = true; + datum2.failingSTDThreshold = false; + datum2.originalQual = passingQual; + datum2.annotations = new double[]{0.0,-9.0,15.0}; + datum2.isNull = new boolean[]{false, false, false}; + theData.add(datum2); + + final VariantDatum datum3 = new VariantDatum(); + datum3.atTrainingSite = false; + datum3.failingSTDThreshold = false; + datum3.originalQual = passingQual; + datum3.annotations = new double[]{0.0,1.0,999.0}; + datum3.isNull = new boolean[]{false, false, false}; + theData.add(datum3); + + final VariantDatum datum4 = new VariantDatum(); + datum4.atTrainingSite = false; + datum4.failingSTDThreshold = false; + datum4.originalQual = passingQual; + datum4.annotations = new double[]{0.015,2.0,1001.11}; + datum4.isNull = new boolean[]{false, false, false}; + theData.add(datum4); + + vdm.setData(theData); + + final double[] meanVector = new double[3]; + for( int iii = 0; iii < 
meanVector.length; iii++ ) { + meanVector[iii] = vdm.mean(iii, true); + } + final List order = vdm.calculateSortOrder(meanVector); + Assert.assertArrayEquals(new int[]{2,1,0}, ArrayUtils.toPrimitive(order.toArray(new Integer[order.size()]))); + } + + @Test + public final void testDownSamplingTrainingData() { + final int MAX_NUM_TRAINING_DATA = 5000; + final double passingQual = 400.0; + final VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); + VRAC.MAX_NUM_TRAINING_DATA = MAX_NUM_TRAINING_DATA; + + VariantDataManager vdm = new VariantDataManager(new ArrayList(), VRAC); + final List theData = new ArrayList<>(); + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 10; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = true; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + for( int iii = 0; iii < MAX_NUM_TRAINING_DATA * 2; iii++) { + final VariantDatum datum = new VariantDatum(); + datum.atTrainingSite = false; + datum.failingSTDThreshold = false; + datum.originalQual = passingQual; + theData.add(datum); + } + + vdm.setData(theData); + final List trainingData = vdm.getTrainingData(); + + Assert.assertTrue( trainingData.size() == MAX_NUM_TRAINING_DATA ); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index e7a3f23a4..f3e57b48a 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -47,10 +47,17 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.WalkerTest; 
+import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; +import java.util.List; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { private static class VRTest { @@ -72,9 +79,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", - "4d08c8eee61dd1bdea8c5765f34e41f0", // tranches - "ca7de32b6143cce58aa4bc59b311feb7", // recal file - "cc7f413ba50b3d12f11f95aaa31e67d1"); // cut VCF + "6f029dc7d16e63e19c006613cd0a5cff", // tranches + "73c7897441622c9b37376eb4f071c560", // recal file + "11a28df79b92229bd317ac49a3ed0fa1"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -95,8 +102,6 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -L 20:1,000,000-40,000,000" + " --no_cmdline_in_header" + " -an QD -an HaplotypeScore -an HRun" + - " -percentBad 0.07" + - " --minNumBadVariants 0" + " --trustAllPolymorphic" + // for speed " -recalFile %s" + " -tranchesFile %s", @@ -121,9 +126,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", - "6a1eef4d02857dbb117a15420b5c0ce9", // tranches - "db9faaee11ee5427a81ddee328245f8c", // recal file - "42e0fcd8e048a5f6abc41a4d1c3e97a5"); // cut VCF + "3ad7f55fb3b072f373cbce0b32b66df4", // tranches + "e747c08131d58d9a4800720f6ca80e0c", // recal file + "e5808af3af0f2611ba5a3d172ab2557b"); // cut VCF @DataProvider(name = "VRBCFTest") public Object[][] createVRBCFTest() { @@ -173,15 +178,15 @@ 
public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { VRTest indelUnfiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . - "b7589cd098dc153ec64c02dcff2838e4", // tranches - "5a9ba210a3c68109289a71039a04509d", // recal file - "d816bd43c844069d65711a7975707437"); // cut VCF + "9a331328370889168a7aa3a625f73620", // tranches + "2cbbd146d68c40200b782e0226f71976", // recal file + "64dd98a5ab80cf5fd9a36eb66b38268e"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS - "b7589cd098dc153ec64c02dcff2838e4", // tranches - "5a9ba210a3c68109289a71039a04509d", // recal file - "6bcb344511c727c28523825f73c7daee"); // cut VCF + "9a331328370889168a7aa3a625f73620", // tranches + "2cbbd146d68c40200b782e0226f71976", // recal file + "c0ec662001e829f5779a9d13b1d77d80"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -200,9 +205,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -L 20:1,000,000-40,000,000" + " --no_cmdline_in_header" + " -an QD -an ReadPosRankSum -an HaplotypeScore" + - " -percentBad 0.08" + " -mode INDEL -mG 3" + - " --minNumBadVariants 0" + " --trustAllPolymorphic" + // for speed " -recalFile %s" + " -tranchesFile %s", @@ -224,7 +227,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -recalFile " + getMd5DB().getMD5FilePath(params.recalMD5, null), Arrays.asList(params.cutVCFMD5)); spec.disableShadowBCF(); // has to be disabled because the input VCF is missing LowQual annotation - executeTest("testApplyRecalibrationIndel-"+params.inVCF, spec); + executeTest("testApplyRecalibrationIndel-" + params.inVCF, spec); } @Test @@ -239,8 +242,32 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + 
privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("20c23643a78c5b95abd1526fdab8960d")); + Arrays.asList("03a0ed00af6aac76d39e569f90594a02")); executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); } + + @Test(enabled = true) + public void testApplyRecalibrationSnpAndIndelTogetherExcludeFiltered() throws Exception { + final String base = "-R " + b37KGReference + + " -T ApplyRecalibration" + + " -L 20:1000100-1000500" + + " -mode BOTH" + + " --excludeFiltered -ts_filter_level 90.0" + + " --no_cmdline_in_header" + + " -input " + privateTestDir + "VQSR.mixedTest.input" + + " -o %s" + + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal"; + + final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File VCF = executeTest("testApplyRecalibrationSnpAndIndelTogether", spec).first.get(0); + + for( final VariantContext VC : GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { + if( VC != null ) { + Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file + } + } + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 917cbd542..2eeb9221e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -67,7 +67,11 @@ public class CombineVariantsIntegrationTest extends WalkerTest { // TODO TODO TODO TODO TODO TODO TODO TODO // private static String baseTestString(String args) { - return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + 
b36KGReference + args; + return baseTestString(args, b36KGReference); + } + + private static String baseTestString(String args, String ref) { + return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args; //return "-T CombineVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -U LENIENT_VCF_PROCESSING -R " + b36KGReference + args; } @@ -181,6 +185,19 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } + @Test public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -multipleAllelesMergeType MIX_TYPES" + + " --excludeNonVariants -combineAnnotations -setKey null" + + " -L 20:10,000,000-10,001,000", b37KGReference), + 1, + Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); + cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); + } + @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java index 6d38940bc..f369ad210 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java @@ -46,16 +46,20 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import 
org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.variant.vcf.*; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.broadinstitute.variant.vcf.VCFHeader; +import org.broadinstitute.variant.vcf.VCFHeaderLine; +import org.broadinstitute.variant.vcf.VCFUtils; import org.testng.Assert; - import org.testng.annotations.Test; import java.io.StringBufferInputStream; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; /** * test out pieces of the combine variants code @@ -154,7 +158,8 @@ public class CombineVariantsUnitTest { private VCFHeader createHeader(String headerStr) { VCFCodec codec = new VCFCodec(); - VCFHeader head = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(headerStr)))); + VCFHeader head = null; + head = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(headerStr)))); return head; } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java index bd9ff4f80..50c896450 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ConcordanceMetricsUnitTest.java @@ -46,25 +46,20 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import net.sf.picard.reference.ReferenceSequenceFile; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.BaseUtils; import 
org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeBuilder; -import org.broadinstitute.variant.variantcontext.GenotypeType; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFCodec; import org.broadinstitute.variant.vcf.VCFHeader; -import org.testng.annotations.Test; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.PositionalBufferedStream; import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; @@ -72,7 +67,6 @@ import java.io.StringBufferInputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import net.sf.picard.reference.ReferenceSequenceFile; public class ConcordanceMetricsUnitTest extends BaseTest { @@ -139,8 +133,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VariantContext eval = data.getFirst(); VariantContext truth = data.getSecond(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new 
StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); metrics.update(eval,truth); Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); @@ -189,8 +183,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VariantContext eval = data.getFirst(); VariantContext truth = data.getSecond(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); metrics.update(eval,truth); Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); @@ -209,8 +203,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { eval = data.getSecond(); truth = data.getFirst(); codec = new VCFCodec(); - evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + compHeader = 
(VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); metrics = new ConcordanceMetrics(evalHeader,compHeader); metrics.update(eval,truth); Assert.assertEquals(eval.getGenotype("test1_sample2").getType().ordinal(), 2); @@ -264,8 +258,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VariantContext eval = data.getFirst(); VariantContext truth = data.getSecond(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); metrics.update(eval,truth); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample1").getnMismatchingAlt(),1); @@ -317,8 +311,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VariantContext eval = data.getFirst(); VariantContext truth = data.getSecond(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = 
(VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); metrics.update(eval,truth); Assert.assertEquals(metrics.getGenotypeConcordance("test1_sample2").getnMismatchingAlt(),0); @@ -366,8 +360,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VariantContext eval = data.getFirst(); VariantContext truth = data.getSecond(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); metrics.update(eval,truth); Assert.assertTrue(eval.getGenotype("test1_sample2").getType().equals(GenotypeType.UNAVAILABLE)); @@ -520,8 +514,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { List> data = getData6(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + VCFHeader compHeader = 
(VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); for ( Pair contextPair : data ) { @@ -554,8 +548,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { VariantContext eval = data.getFirst(); VariantContext truth = data.getSecond(); VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_1_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); int[][] table = metrics.getOverallGenotypeConcordance().getTable(); // set up the table @@ -588,9 +582,9 @@ public class ConcordanceMetricsUnitTest extends BaseTest { @Test(enabled=true) public void testRobustness() { VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_1)))); - VCFHeader disjointCompHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_2)))); - VCFHeader overlapCompHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_3)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new 
StringBufferInputStream(TEST_3_HEADER_1)))); + VCFHeader disjointCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_2)))); + VCFHeader overlapCompHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_3_HEADER_3)))); ConcordanceMetrics disjointMetrics = new ConcordanceMetrics(evalHeader,disjointCompHeader); ConcordanceMetrics overlapMetrics = new ConcordanceMetrics(evalHeader,overlapCompHeader); @@ -720,8 +714,8 @@ public class ConcordanceMetricsUnitTest extends BaseTest { @Test(enabled = true) public void testSites() { VCFCodec codec = new VCFCodec(); - VCFHeader evalHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); - VCFHeader compHeader = (VCFHeader)codec.readHeader(new AsciiLineReader(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + VCFHeader evalHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); + VCFHeader compHeader = (VCFHeader)codec.readActualHeader(codec.makeSourceFromStream(new PositionalBufferedStream(new StringBufferInputStream(TEST_2_HEADER)))); ConcordanceMetrics metrics = new ConcordanceMetrics(evalHeader,compHeader); List> data = getData7(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java new file mode 100644 index 000000000..0eca18c46 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/PosteriorLikelihoodsUtilsUnitTest.java @@ -0,0 +1,391 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE 
AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +/** + * Created by IntelliJ IDEA. + * User: ebanks + * Date: 12/8/13 + */ + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class PosteriorLikelihoodsUtilsUnitTest extends BaseTest { + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + + @BeforeSuite + public void setup() { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + } + + private String arraysEq(int[] a, int[] b) { + if ( a.length != b.length ) { + return String.format("NEQ: %s | %s",Arrays.toString(a),Arrays.toString(b)); + } + for ( int idx = 0; idx < a.length; idx++) { + if ( a[idx] - b[idx] > 1 || b[idx] - a[idx] > 1) { + return String.format("NEQ: %s | %s",Arrays.toString(a),Arrays.toString(b)); + } + } + + return ""; + } + + private 
int[] _mleparse(List s) { + int[] mle = new int[s.size()]; + for ( int idx = 0; idx < mle.length; idx ++) { + mle[idx] = s.get(idx); + } + + return mle; + } + + private Genotype makeGwithPLs(String sample, Allele a1, Allele a2, double[] pls) { + Genotype gt = new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); + if ( pls != null && pls.length > 0 ) { + Assert.assertNotNull(gt.getPL()); + Assert.assertTrue(gt.getPL().length > 0); + for ( int i : gt.getPL() ) { + Assert.assertTrue(i >= 0); + } + Assert.assertNotEquals(Arrays.toString(gt.getPL()),"[0]"); + } + return gt; + } + + private Genotype makeG(String sample, Allele a1, Allele a2) { + return GenotypeBuilder.create(sample, Arrays.asList(a1, a2)); + } + + private Genotype makeG(String sample, Allele a1, Allele a2, int... pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); + } + + private VariantContext makeVC(String source, List alleles, Genotype... genotypes) { + int start = 10; + int stop = start; // alleles.contains(ATC) ? 
start + 3 : start; + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(Arrays.asList(genotypes)).filters(null).make(); + } + + @Test + private void testCalculatePosteriorNoExternalData() { + VariantContext test1 = makeVC("1",Arrays.asList(Aref,T), makeG("s1",Aref,T,20,0,10), + makeG("s2",T,T,60,40,0), + makeG("s3",Aref,Aref,0,30,90)); + test1 = new VariantContextBuilder(test1).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,3).make(); + VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(test1, new ArrayList(), 0, 0.001, true, false, false); + Genotype test1exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.20686, -0.03073215, -1.20686}); + Assert.assertTrue(test1exp1.hasPL()); + Genotype test1exp2 = makeGwithPLs("s2",T,T,new double[]{-6.000066, -3.823938, -6.557894e-05}); + Genotype test1exp3 = makeGwithPLs("s3",Aref,Aref,new double[]{-0.0006510083, -2.824524, -9.000651}); + Assert.assertEquals("java.util.ArrayList",test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY).getClass().getCanonicalName()); + Assert.assertEquals(arraysEq(test1exp1.getPL(), _mleparse((List)test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List)test1result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List)test1result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + + // AA AB BB AC BC CC + // AA AC CC AT CT TT + VariantContext test2 = makeVC("2",Arrays.asList(Aref,C,T), + makeG("s1",Aref,T,30,10,60,0,15,90), + makeG("s2",Aref,C,40,0,10,30,40,80), + makeG("s3",Aref,Aref,0,5,8,15,20,40), + makeG("s4",C,T,80,40,12,20,0,10)); + test2 = new VariantContextBuilder(test2).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,new ArrayList(Arrays.asList(2,2))).make(); + VariantContext test2result = 
PosteriorLikelihoodsUtils.calculatePosteriorGLs(test2,new ArrayList(),5,0.001,true,false,false); + Genotype test2exp1 = makeGwithPLs("s1",Aref,T,new double[]{-2.647372, -1.045139, -6.823193, -0.04513873, -2.198182, -9.823193}); + Genotype test2exp2 = makeGwithPLs("s2",Aref,C,new double[]{-3.609957, -0.007723248, -1.785778, -3.007723, -4.660767, -8.785778}); + Genotype test2exp3 = makeGwithPLs("s3",Aref,Aref,new double[] {-0.06094877, -0.9587151, -2.03677,-1.958715, -3.111759, -5.23677}); + Genotype test2exp4 = makeGwithPLs("s4",C,T,new double[]{-7.016534, -3.4143, -1.392355, -1.4143, -0.06734388, -1.192355}); + Assert.assertEquals(arraysEq(test2exp1.getPL(),(int[]) _mleparse((List)test2result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp2.getPL(),(int[]) _mleparse((List)test2result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp3.getPL(),(int[]) _mleparse((List)test2result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test2exp4.getPL(),(int[]) _mleparse((List)test2result.getGenotype(3).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + } + + @Test + private void testCalculatePosteriorSamplePlusExternal() { + VariantContext testOverlappingBase = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,40,20,0), + makeG("s2",Aref,T,18,0,24), + makeG("s3",Aref,T,22,0,12)); + List supplTest1 = new ArrayList<>(3); + supplTest1.add(new VariantContextBuilder(makeVC("2",Arrays.asList(Aref,T))).attribute(VCFConstants.MLE_ALLELE_COUNT_KEY,2).attribute(VCFConstants.ALLELE_NUMBER_KEY,10).make()); + supplTest1.add(new VariantContextBuilder(makeVC("3",Arrays.asList(Aref,T))).attribute(VCFConstants.ALLELE_COUNT_KEY,4).attribute(VCFConstants.ALLELE_NUMBER_KEY,22).make()); + supplTest1.add(makeVC("4",Arrays.asList(Aref,T), + makeG("s_1",T,T), + makeG("s_2",Aref,T))); + 
VariantContext test1result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testOverlappingBase,supplTest1,0,0.001,true,false,false); + // the counts here are ref=30, alt=14 + Genotype test1exp1 = makeGwithPLs("t1",T,T,new double[]{-3.370985, -1.415172, -0.01721766}); + Genotype test1exp2 = makeGwithPLs("t2",Aref,T,new double[]{-1.763792, -0.007978791, -3.010024}); + Genotype test1exp3 = makeGwithPLs("t3",Aref,T,new double[]{-2.165587, -0.009773643, -1.811819}); + Assert.assertEquals(arraysEq(test1exp1.getPL(),_mleparse((List) test1result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp2.getPL(),_mleparse((List) test1result.getGenotype(1).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + Assert.assertEquals(arraysEq(test1exp3.getPL(),_mleparse((List) test1result.getGenotype(2).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + + VariantContext testNonOverlapping = makeVC("1", Arrays.asList(Aref,T), makeG("s1",T,T,3,1,0)); + List other = Arrays.asList(makeVC("2",Arrays.asList(Aref,C),makeG("s2",C,C,10,2,0))); + VariantContext test2result = PosteriorLikelihoodsUtils.calculatePosteriorGLs(testNonOverlapping,other,0,0.001,true,false,false); + Genotype test2exp1 = makeGwithPLs("SGV",T,T,new double[]{-4.078345, -3.276502, -0.0002661066}); + Assert.assertEquals(arraysEq(test2exp1.getPL(),_mleparse((List) test2result.getGenotype(0).getAnyAttribute(VCFConstants.GENOTYPE_POSTERIORS_KEY))), ""); + } + + private double[] pl2gl(int[] pl) { + double[] gl = new double[pl.length]; + for ( int idx = 0; idx < gl.length; idx++ ) { + gl[idx] = pl[idx]/(-10.0); + } + + return MathUtils.normalizeFromLog10(gl,true); + } + + @Test + private void testCalculatePosterior() { + int[][] likelihood_PLs = new int[][]{ + new int[]{3,0,3}, + new int[]{99,0,99}, + new int[]{50,20,0}, + new int[]{10,0,50}, + new int[]{80,60,0}, + new int[]{0,42,44}}; + + int[] altCounts = new int[]{10,40,90}; + int[] 
altAlleleNum = new int[]{100,500,1000}; + + double[] expected_post_10_100 = new double[] { + 9.250326e-03, 3.020208e-01, 6.887289e-01, + 7.693433e-12, 1.000000e+00, 5.728111e-10, + 1.340156e-07, 2.192982e-03, 9.978069e-01, + 6.073718e-03, 9.938811e-01, 4.522159e-05, + 1.343101e-10, 2.197802e-07, 9.999998e-01, + 9.960193e-01, 1.028366e-03, 2.952290e-03 + }; + + double[] expected_post_10_500 = new double[] { + 4.226647e-04, 7.513277e-02, 9.244446e-01, + 1.413080e-12, 1.000000e+00, 3.090662e-09, + 4.570232e-09, 4.071661e-04, 9.995928e-01, + 1.120916e-03, 9.986339e-01, 2.451646e-04, + 4.572093e-12, 4.073320e-08, 1.000000e+00, + 9.151689e-01, 5.144399e-03, 7.968675e-02 + }; + + double[] expected_post_10_1000 = new double[] { + 1.077685e-04, 3.870477e-02, 9.611875e-01, + 6.994030e-13, 1.000000e+00, 6.237975e-09, + 1.120976e-09, 2.017756e-04, 9.997982e-01, + 5.549722e-04, 9.989500e-01, 4.949797e-04, + 1.121202e-12, 2.018163e-08, 1.000000e+00, + 7.318346e-01, 8.311615e-03, 2.598538e-01 + }; + + double[] expected_post_40_100 = new double[] { + 1.102354e-01, 6.437516e-01, 2.460131e-01, + 4.301328e-11, 1.000000e+00, 9.599306e-11, + 4.422850e-06, 1.294493e-02, 9.870507e-01, + 3.303763e-02, 9.669550e-01, 7.373032e-06, + 4.480868e-09, 1.311474e-06, 9.999987e-01, + 9.997266e-01, 1.846199e-04, 8.882157e-05 + }; + + double[] expected_post_40_500 = new double[] { + 5.711785e-03, 2.557266e-01, 7.385617e-01, + 5.610428e-12, 1.000000e+00, 7.254558e-10, + 7.720262e-08, 1.732352e-03, 9.982676e-01, + 4.436495e-03, 9.955061e-01, 5.736604e-05, + 7.733659e-11, 1.735358e-07, 9.999998e-01, + 9.934793e-01, 1.406575e-03, 5.114153e-03 + }; + + double[] expected_post_40_1000 = new double[] { + 1.522132e-03, 1.422229e-01, 8.562549e-01, + 2.688330e-12, 1.000000e+00, 1.512284e-09, + 1.776184e-08, 8.317737e-04, 9.991682e-01, + 2.130611e-03, 9.977495e-01, 1.198547e-04, + 1.777662e-11, 8.324661e-08, 9.999999e-01, + 9.752770e-01, 2.881677e-03, 2.184131e-02 + }; + + double[] expected_post_90_100 = new 
double[] { + 6.887289e-01, 3.020208e-01, 9.250326e-03, + 5.728111e-10, 1.000000e+00, 7.693433e-12, + 6.394346e-04, 1.405351e-01, 8.588255e-01, + 3.127146e-01, 6.872849e-01, 4.200075e-07, + 7.445327e-07, 1.636336e-05, 9.999829e-01, + 9.999856e-01, 1.386699e-05, 5.346906e-07 + }; + + double[] expected_post_90_500 = new double[] { + 2.528165e-02, 4.545461e-01, 5.201723e-01, + 1.397100e-11, 1.000000e+00, 2.874546e-10, + 4.839050e-07, 4.360463e-03, 9.956391e-01, + 1.097551e-02, 9.890019e-01, 2.258221e-05, + 4.860244e-10, 4.379560e-07, 9.999996e-01, + 9.986143e-01, 5.677671e-04, 8.179741e-04 + }; + + double[] expected_post_90_1000 = new double[] { + 7.035938e-03, 2.807708e-01, 7.121932e-01, + 6.294627e-12, 1.000000e+00, 6.371561e-10, + 9.859771e-08, 1.971954e-03, 9.980279e-01, + 4.974874e-03, 9.949748e-01, 5.035678e-05, + 9.879252e-11, 1.975850e-07, 9.999998e-01, + 9.947362e-01, 1.255272e-03, 4.008518e-03 + }; + + double[][] expectations = new double[][] { + expected_post_10_100, + expected_post_10_500, + expected_post_10_1000, + expected_post_40_100, + expected_post_40_500, + expected_post_40_1000, + expected_post_90_100, + expected_post_90_500, + expected_post_90_1000 + }; + + int testIndex = 0; + for ( int altCount : altCounts ) { + for ( int numAlt : altAlleleNum ) { + double[] knownCounts = new double[2]; + knownCounts[0] = altCount; + knownCounts[1] = numAlt-altCount; + int expected_index = 0; + for ( int gl_index = 0; gl_index < likelihood_PLs.length; gl_index++ ) { + double[] post = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(likelihood_PLs[gl_index]), knownCounts, 2); + for ( int i = 0; i < post.length; i++ ) { + double expected = expectations[testIndex][expected_index++]; + double observed = Math.pow(10.0,post[i]); + double err = Math.abs( (expected-observed)/expected ); + Assert.assertTrue(err < 1e-4, String.format("Counts: %s | Expected: %e | Observed: %e | pre %s | prior %s | post %s", + Arrays.toString(knownCounts), expected,observed, 
Arrays.toString(pl2gl(likelihood_PLs[gl_index])), + Arrays.toString(PosteriorLikelihoodsUtils.getDirichletPrior(knownCounts,2)),Arrays.toString(post))); + } + } + testIndex++; + } + } + } + + private boolean arraysApproxEqual(double[] a, double[] b, double tol) { + if ( a.length != b.length ) { + return false; + } + + for ( int idx = 0; idx < a.length; idx++ ) { + if ( Math.abs(a[idx]-b[idx]) > tol ) { + return false; + } + } + + return true; + } + + private String errMsgArray(double[] a, double[] b) { + return String.format("Expected %s, Observed %s", Arrays.toString(a), Arrays.toString(b)); + } + + @Test + private void testPosteriorMultiAllelic() { + // AA AB BB AC BC CC AD BD CD DD + int[] PL_one = new int[] {40,20,30,0,15,25}; + int[] PL_two = new int[] {0,20,10,99,99,99}; + int[] PL_three = new int[] {50,40,0,30,30,10,20,40,80,50}; + int[] PL_four = new int[] {99,90,85,10,5,30,40,20,40,30,0,12,20,14,5}; + int[] PL_five = new int[] {60,20,30,0,40,10,8,12,18,22,40,12,80,60,20}; + double[] counts_one = new double[]{100.001,40.001,2.001}; + double[] counts_two = new double[]{2504.001,16.001,218.001}; + double[] counts_three = new double[]{10000.001,500.001,25.001,0.001}; + double[] counts_four = new double[]{4140.001,812.001,32.001,104.001,12.001}; + double[] counts_five = new double[]{80.001,40.001,8970.001,200.001,1922.001}; + + double expected_one[] = new double[] { -2.684035, -0.7852596, -2.4735, -0.08608339, -1.984017, -4.409852 }; + double expected_two[] = new double[] { -5.736189e-05, -3.893688, -5.362878, -10.65938, -12.85386, -12.0186}; + double expected_three[] = new double[] {-2.403234, -2.403276, -0.004467802, -2.70429, -4.005319, -3.59033, -6.102247, -9.403276, -14.70429, -13.40284}; + double expected_four[] = new double[] {-7.828677, -7.335196, -7.843136, -0.7395892, -0.947033, -5.139092, -3.227715, + -1.935159, -5.339552, -4.124552, -0.1655353, -2.072979, -4.277372, -3.165498, -3.469589 }; + double expected_five[] = new double[] { -9.170334, 
-5.175724, -6.767055, -0.8250021, -5.126027, -0.07628661, -3.276762, + -3.977787, -2.227065, -4.57769, -5.494041, -2.995066, -7.444344, -7.096104, -2.414187}; + + double[] post1 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_one),counts_one,2); + double[] post2 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_two),counts_two,2); + double[] post3 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_three),counts_three,2); + double[] post4 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_four),counts_four,2); + double[] post5 = PosteriorLikelihoodsUtils.calculatePosteriorGLs(pl2gl(PL_five),counts_five,2); + + double[] expecPrior5 = new double[] {-4.2878195, -4.2932090, -4.8845400, -1.9424874, -2.2435120, -0.1937719, -3.5942477, + -3.8952723, -1.5445506, -3.4951749, -2.6115263, -2.9125508, -0.5618292, -2.2135895, + -1.5316722}; + + Assert.assertTrue(arraysApproxEqual(expecPrior5, PosteriorLikelihoodsUtils.getDirichletPrior(counts_five,2),1e-5),errMsgArray(expecPrior5,PosteriorLikelihoodsUtils.getDirichletPrior(counts_five,2))); + + Assert.assertTrue(arraysApproxEqual(expected_one,post1,1e-6),errMsgArray(expected_one,post1)); + Assert.assertTrue(arraysApproxEqual(expected_two,post2,1e-5),errMsgArray(expected_two,post2)); + Assert.assertTrue(arraysApproxEqual(expected_three,post3,1e-5),errMsgArray(expected_three,post3)); + Assert.assertTrue(arraysApproxEqual(expected_four,post4,1e-5),errMsgArray(expected_four,post4)); + Assert.assertTrue(arraysApproxEqual(expected_five,post5,1e-5),errMsgArray(expected_five,post5)); + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 4b1483cb6..884b46692 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -316,7 +316,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea") + Arrays.asList("69862fb97e8e895fe65c7abb14b03cee") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index 0f698d03d..164645171 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -87,7 +87,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), 3, - Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95") + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","daf853d2fcbfc77daa1f9ae190be24f4","02f1c462ebc8576e399d0e94f729fd95") ); executeTest(testName, spec); @@ -177,7 +177,7 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), 3, - Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5") + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","3bfb01c17935e3d194d266755b446e82","7887d2e0bf605dbcd0688c552cdb99d5") ); 
executeTest(testName, spec); diff --git a/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java b/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java new file mode 100644 index 000000000..88f5910f7 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/RandomDNA.java @@ -0,0 +1,127 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils; + +import com.sun.istack.internal.NotNull; + +import java.util.Random; + +/** + * Random DNA sequence generator. + * + *

+ * Returned bases are always in upper case and one of the four valid nucleotides 'A', 'C', 'G' and 'T'. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class RandomDNA { + + private Random random; + + /** + * Constructs a new random DNA generator. + * + *

+ * The seed is the default one, which depends on system properties and the current time as + * described in the {@link Random} documentation. + *

+ */ + public RandomDNA() { + random = new Random(); + } + + /** + * Constructs a new random DNA generator providing a seed. + * + * @param seed the random number generator seed. + */ + public RandomDNA(final long seed) { + random = new Random(seed); + } + + /** + * Updates the content of a byte array with a random base sequence. + * + *

+ * The whole array will be filled with new base values. + *

+ * + * @param destination the array to update. + * + * @throws NullPointerException if {@code destination} is {@code null}. + */ + public void nextBases(final byte[] destination) { + random.nextBytes(destination); + for (int i = 0; i < destination.length; i++) { + final int ord = destination[i] & 0x03; + switch (ord) { + case 0: destination[i] = 'A'; break; + case 1: destination[i] = 'C'; break; + case 2: destination[i] = 'G'; break; + case 3: destination[i] = 'T'; break; + default: throw new IllegalStateException("this cannot be happening!!!"); + } + } + } + + /** + * Returns a random DNA sequence of bases. + * @param size the length of the sequence. + * + * @throws IllegalArgumentException if {@code size} is negative. + * @return never {@code null}. + */ + @NotNull + public byte[] nextBases(final int size) { + if (size < 0) throw new IllegalArgumentException("the size cannot be negative"); + final byte[] result = new byte[size]; + nextBases(result); + return result; + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java new file mode 100644 index 000000000..ae9ead827 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/collections/CountSetUnitTest.java @@ -0,0 +1,249 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.utils.collections; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; + +/** + * Unit tests for {@link CountSet} + */ +public class CountSetUnitTest extends BaseTest { + + @Test(dataProvider="capacities") + public void testSize(final int capacity) { + final CountSet empty = new CountSet(capacity); + Assert.assertEquals(empty.size(), 0); + CountSet nonEmpty = new CountSet(capacity); + for (int i = 0; i < capacity * 3; i++) { + nonEmpty.add(i); + Assert.assertEquals(nonEmpty.size(),i + 1); + } + } + + @Test + public void testSingleValueAdd() { + final int CAPACITY = 10; + final CountSet subject = new CountSet(CAPACITY); + final HashSet reasuranceSet = new HashSet<>(CAPACITY); + final int REPEATS = 1000; + final Random rnd = new Random(13); + for (int i = 0; i < REPEATS; i++) { + int newInt = rnd.nextInt(500); + boolean expectedResult = reasuranceSet.add(newInt); + boolean result = subject.add(newInt); + Assert.assertEquals(result,expectedResult); + Assert.assertEquals(subject.size(),reasuranceSet.size()); + } + for (final int j : 
reasuranceSet) + Assert.assertTrue(subject.contains(j)); + for (int j = 0; j < 501; j++) + Assert.assertEquals(subject.contains(j),reasuranceSet.contains(j)); + } + + @Test + public void testToIntArray() { + final CountSet subject = new CountSet(10); + subject.addAll(1,4,7); + final int[] intArray = subject.toIntArray(); + Assert.assertEquals(intArray.length,3); + Assert.assertEquals(intArray[0],1); + Assert.assertEquals(intArray[1],4); + Assert.assertEquals(intArray[2],7); + } + + @Test + public void testCopyTo() { + final CountSet subject = new CountSet(10); + subject.addAll(1,4,7); + final int[] intArray = new int[3]; + subject.copyTo(intArray); + Assert.assertEquals(intArray[0],1); + Assert.assertEquals(intArray[1],4); + Assert.assertEquals(intArray[2],7); + } + + @Test + public void testSetToSingleValue() { + final CountSet subject = new CountSet(10); + subject.setTo(-31); + Assert.assertEquals(subject.size(),1); + Assert.assertEquals(subject.min(),-31); + Assert.assertEquals(subject.max(),-31); + Assert.assertTrue(subject.contains(-31)); + Assert.assertFalse(subject.contains(-21)); + } + + /** + * Checks that {@code setTo(int[])} loads 1000 pseudo-random values (fixed seed 13): the resulting size, minimum and maximum must match the input array. + */ + @Test + public void testSetToArrayOfValues() { + final int CAPACITY = 10; + final CountSet subject = new CountSet(CAPACITY); + final int REPEATS = 1000; + final Random rnd = new Random(13); + final int[] values = new int[REPEATS]; + for (int i = 0; i < REPEATS; i++) { + int newInt = rnd.nextInt(Integer.MAX_VALUE) * (rnd.nextBoolean() ? 
-1 : 1); + values[i] = newInt; + } + subject.setTo(values); + Arrays.sort(values); + Assert.assertEquals(subject.size(),REPEATS); + Assert.assertEquals(subject.min(),values[0]); + Assert.assertEquals(subject.max(),values[REPEATS - 1]); + } + + @Test + public void testMinMax() { + final int CAPACITY = 10; + final CountSet subject = new CountSet(CAPACITY); + final int REPEATS = 1000; + final Random rnd = new Random(13); + final int[] values = new int[REPEATS]; + for (int i = 0; i < REPEATS; i++) { + int newInt = rnd.nextInt(Integer.MAX_VALUE) * (rnd.nextBoolean() ? -1 : 1); + values[i] = newInt; + } + subject.addAll(values); + Arrays.sort(values); + Assert.assertEquals(subject.min(),values[0]); + Assert.assertEquals(subject.max(),values[REPEATS - 1]); + } + + @Test + public void testIncrease() { + final int CAPACITY = 10; + final CountSet subject = new CountSet(CAPACITY); + final HashSet reasuranceSet = new HashSet<>(CAPACITY); + final int REPEATS = 1000; + final Random rnd = new Random(13); + final int[] values = new int[REPEATS]; + final Integer[] valueWrappers = new Integer[REPEATS]; + for (int i = 0; i < REPEATS; i++) { + int newInt = rnd.nextInt(500); + values[i] = newInt; + valueWrappers[i] = newInt; + } + + subject.incAll(3); + + for (final int j : reasuranceSet) + Assert.assertTrue(subject.contains(j+3)); + for (int j = 0; j < 501; j++) + Assert.assertEquals(subject.contains(j+3),reasuranceSet.contains(j)); + + } + + @Test + public void testArrayValueAdd() { + final int CAPACITY = 10; + final CountSet subject = new CountSet(CAPACITY); + final HashSet reasuranceSet = new HashSet<>(CAPACITY); + final int REPEATS = 1000; + final Random rnd = new Random(13); + final int[] values = new int[REPEATS]; + final Integer[] valueWrappers = new Integer[REPEATS]; + for (int i = 0; i < REPEATS; i++) { + int newInt = rnd.nextInt(500); + values[i] = newInt; + valueWrappers[i] = newInt; + } + + boolean expectedResult = reasuranceSet.addAll(Arrays.asList(valueWrappers)); + 
boolean result = subject.addAll(values); + Assert.assertEquals(result,expectedResult); + Assert.assertEquals(subject.size(),reasuranceSet.size()); + + for (final int j : reasuranceSet) + Assert.assertTrue(subject.contains(j)); + for (int j = 0; j < 501; j++) + Assert.assertEquals(subject.contains(j),reasuranceSet.contains(j)); + + } + + @Test + public void testAddRange() { + final CountSet subject = new CountSet(10); + subject.addRange(10,21); + Assert.assertEquals(subject.size(),12); + for (int i = 10; i < 22; i++) + Assert.assertTrue(subject.contains(i)); + for (int i = -1; i < 10; i++) + Assert.assertFalse(subject.contains(i)); + for (int i = 22; i < 31; i++) + Assert.assertFalse(subject.contains(i)); + } + + @DataProvider(name="capacities") + public Iterator capacities() { + final int MIN = 0; + final int MAX = 255; + return new Iterator() { + private int current = MIN; + + + @Override + public boolean hasNext() { + return current < MAX; + } + + @Override + public Object[] next() { + return new Object[] { Integer.valueOf(current++) }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java index ffbc3c43f..5c14c490e 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/GVCFWriterUnitTest.java @@ -49,7 +49,6 @@ package org.broadinstitute.sting.utils.gvcf; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; import 
org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; @@ -91,7 +90,7 @@ public class GVCFWriterUnitTest extends BaseTest { private List standardPartition = Arrays.asList(1, 10, 20); private Allele REF = Allele.create("N", true); private Allele ALT = Allele.create("A"); - private List ALLELES = Arrays.asList(REF, ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + private List ALLELES = Arrays.asList(REF, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); private final String SAMPLE_NAME = "XXYYZZ"; @BeforeMethod @@ -130,6 +129,16 @@ public class GVCFWriterUnitTest extends BaseTest { return vcb.genotypes(gb.make()).make(); } + private VariantContext makeHomRefAlt(final String contig, final int start, final int GQ) { + final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); + final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, REF)); + gb.GQ(GQ); + gb.DP(10); + gb.AD(new int[]{1, 2}); + gb.PL(new int[]{0, 10, 100}); + return vcb.genotypes(gb.make()).make(); + } + private VariantContext makeNonRef(final String contig, final int start, final int GQ) { final VariantContextBuilder vcb = new VariantContextBuilder("test", contig, start, start, Arrays.asList(REF, ALT)); final GenotypeBuilder gb = new GenotypeBuilder(SAMPLE_NAME, Arrays.asList(REF, ALT)); @@ -223,10 +232,10 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getStart(), start); Assert.assertEquals(vc.getEnd(), stop); if ( nonRef ) { - Assert.assertNotEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertNotEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); } else { Assert.assertEquals(vc.getNAlleles(), 2); - Assert.assertEquals(vc.getAlternateAllele(0), ReferenceConfidenceModel.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(vc.getAlternateAllele(0), GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); 
Assert.assertEquals(vc.getAttributeAsInt(GVCFWriter.BLOCK_SIZE_INFO_FIELD, -1), stop - start + 1); Assert.assertEquals(vc.getAttributeAsInt(VCFConstants.END_KEY, -1), stop); Assert.assertTrue(vc.hasGenotypes()); @@ -234,8 +243,9 @@ public class GVCFWriterUnitTest extends BaseTest { Assert.assertEquals(vc.getGenotypes().size(), 1); final Genotype g = vc.getGenotype(SAMPLE_NAME); Assert.assertEquals(g.hasAD(), false); - Assert.assertEquals(g.hasLikelihoods(), false); - Assert.assertEquals(g.hasPL(), false); + Assert.assertEquals(g.hasLikelihoods(), true); + Assert.assertEquals(g.hasPL(), true); + Assert.assertEquals(g.getPL().length == 3, true); Assert.assertEquals(g.hasDP(), true); Assert.assertEquals(g.hasGQ(), true); } @@ -305,9 +315,28 @@ public class GVCFWriterUnitTest extends BaseTest { assertGoodVC(mockWriter.emitted.get(2), "20", 6, 7, false); } + @Test + public void testHomRefAlt() { + final GVCFWriter writer = new GVCFWriter(mockWriter, standardPartition); + + writer.add(makeHomRef("20", 1, 0)); + writer.add(makeHomRef("20", 2, 0)); + writer.add(makeHomRefAlt("20", 3, 0)); + writer.add(makeHomRef("20", 4, 0)); + writer.add(makeHomRef("20", 5, 0)); + writer.add(makeHomRef("20", 6, 0)); + writer.add(makeHomRef("20", 7, 0)); + writer.close(); + Assert.assertEquals(mockWriter.emitted.size(), 3); + assertGoodVC(mockWriter.emitted.get(0), "20", 1, 2, false); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("END")); + Assert.assertFalse(mockWriter.emitted.get(1).hasAttribute("BLOCK_SIZE")); + assertGoodVC(mockWriter.emitted.get(2), "20", 4, 7, false); + } + @DataProvider(name = "BandPartitionData") public Object[][] makeBandPartitionData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); tests.add(new Object[]{null, false}); tests.add(new Object[]{Collections.emptyList(), false}); diff --git a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java index 239aa93b5..ec4797f3d 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/gvcf/HomRefBlockUnitTest.java @@ -85,32 +85,34 @@ public class HomRefBlockUnitTest extends BaseTest { @Test public void testMinMedian() { + //TODO - might be better to make this test use a data provider? final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); int pos = vc.getStart(); - band.add(pos++, gb.DP(10).GQ(11).make()); + band.add(pos++, gb.DP(10).GQ(11).PL(new int[]{0,11,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 10, 11, 11); - band.add(pos++, gb.DP(11).GQ(10).make()); + band.add(pos++, gb.DP(11).GQ(10).PL(new int[]{0,10,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 11, 10, 11); - band.add(pos++, gb.DP(12).GQ(12).make()); + band.add(pos++, gb.DP(12).GQ(12).PL(new int[]{0,12,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 11, 10, 11); - band.add(pos++, gb.DP(13).GQ(15).make()); + band.add(pos++, gb.DP(13).GQ(15).PL(new int[]{0,15,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(14).GQ(16).make()); + band.add(pos++, gb.DP(14).GQ(16).PL(new int[]{0,16,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(15).GQ(17).make()); + band.add(pos++, gb.DP(15).GQ(17).PL(new int[]{0,17,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); - band.add(pos++, gb.DP(16).GQ(18).make()); + band.add(pos++, gb.DP(16).GQ(18).PL(new int[]{0,18,100}).make()); Assert.assertEquals(band.getStop(), pos - 1); assertValues(band, 10, 13, 10, 15); Assert.assertEquals(band.getSize(), pos - vc.getStart()); + Assert.assertTrue(Arrays.equals(band.getMinPLs(), new int[]{0,10,100})); } @Test @@ 
-118,7 +120,7 @@ public class HomRefBlockUnitTest extends BaseTest { final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); - band.add(vc.getStart(), gb.DP(1000).GQ(1000).make()); + band.add(vc.getStart(), gb.DP(1000).GQ(1000).PL(new int[]{0,10,100}).make()); assertValues(band, 1000, 1000, 99, 99); } @@ -127,7 +129,7 @@ public class HomRefBlockUnitTest extends BaseTest { final HomRefBlock band = new HomRefBlock(vc, 10, 20); final GenotypeBuilder gb = new GenotypeBuilder("NA12878"); - band.add(vc.getStart() + 10, gb.DP(10).GQ(11).make()); + band.add(vc.getStart() + 10, gb.DP(10).GQ(11).PL(new int[]{0,10,100}).make()); } private void assertValues(final HomRefBlock band, final int minDP, final int medianDP, final int minGQ, final int medianGQ) { @@ -140,7 +142,7 @@ public class HomRefBlockUnitTest extends BaseTest { @DataProvider(name = "ContiguousData") public Object[][] makeContiguousData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final String chrMod : Arrays.asList("", ".mismatch") ) { for ( final int offset : Arrays.asList(-10, -1, 0, 1, 10) ) { diff --git a/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java new file mode 100644 index 000000000..6d8fdd6e9 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java @@ -0,0 +1,82 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * User: btaylor + * Date: 8/1/13 + * Time: 11:09 AM + */ +public class HaplotypeSizeAndBaseComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + // desired ordering is by size first, subordered by lexicographic relationship between bases; every permutation of the input must sort back into it + final List<String> rawStrings = Arrays.asList("A", "C", "AC", "CC", "CT", "AAT", "ACT", "GAT", "ACGT"); + final List<String> lexStrings = new ArrayList<>(rawStrings); + + for ( final List<String> seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { + final List<Haplotype> haps = new ArrayList<>(seqs.size()); + for ( final String seq : seqs ) { + haps.add(new Haplotype(seq.getBytes(), false)); + } + + Collections.sort(haps, new HaplotypeSizeAndBaseComparator()); + for ( int i = 0; i < lexStrings.size(); i++ ) + Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); + } + } +} \ No newline at end of file diff --git 
a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java index f9a4985b0..337f23afe 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -67,7 +67,7 @@ public class NanoSchedulerIntegrationTest extends WalkerTest { for ( final int nct : Arrays.asList(1, 2) ) { // tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); //// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); - tests.add(new Object[]{ "BOTH", "aad3a398273ec795e363268997247bd8", nt, nct }); + tests.add(new Object[]{ "BOTH", "a80925b58735828158491f77ae64998b", nt, nct }); } return tests.toArray(new Object[][]{}); diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java new file mode 100644 index 000000000..84b995749 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java @@ -0,0 +1,588 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResult; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.AssemblyResultSet; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Civar; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** +* Mock-up active region data used in testing. 
+* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +public class ActiveRegionTestDataSet { + + private final byte[] referenceBytes; + protected String reference; + protected String[] haplotypeCigars; + protected List haplotypeStrings; + protected String[] readCigars; + protected byte[] bq; + protected byte[] dq; + protected byte[] iq; + protected int kmerSize; + private List haplotypeList; + private List readList; + private AssemblyResultSet assemblyResultSet; + private Map readBySequence; + private String stringRepresentation; + private List> readEventOffsetList; + private GenomeLocParser genomeLocParser; + + /** Create a new active region data test set */ + public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, + final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { + this.reference = reference; + this.referenceBytes = reference.getBytes(); + this.haplotypeCigars = haplotypes; + this.readCigars = readCigars; + this.bq = bq; + this.dq = dq; + this.iq = iq; + this.kmerSize = kmerSize; + this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); + } + + public String getReference() { + return reference; + } + + public String toString() { + if (stringRepresentation == null) + return super.toString(); + else return stringRepresentation; + } + + public AssemblyResultSet assemblyResultSet() { + if (assemblyResultSet == null) { + final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); + rtg.addSequence("anonymous", this.getReference().getBytes(), null, true); + for (final String haplotype : this.haplotypesStrings()) { + rtg.addSequence("anonymous", haplotype.getBytes(), null, false); + } + rtg.buildGraphIfNecessary(); + if (rtg.hasCycles()) + throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". 
Don't use this size for the benchmark or change the reference"); + + List haplotypeList = this.haplotypeList(); + + assemblyResultSet = new AssemblyResultSet(); + final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? + AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + + for (final Haplotype h : haplotypeList) + assemblyResultSet.add(h, ar); + } + return assemblyResultSet; + } + + public List haplotypesStrings() { + if (haplotypeStrings != null) { + return haplotypeStrings; + } + final List result = new ArrayList<>(haplotypeCigars.length); + String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllCombinations(cigar.substring(6),reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(applyCigar(reference, cigar,0,true)); + } else { + result.add(cigar); + } + } + haplotypeStrings = result; + return result; + } + + private List expandAllCombinations(final String cigarString, final String reference) { + final Civar civar = Civar.fromCharSequence(cigarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + result.add(c.applyTo(reference)); + } + return result; + } + + private List expandAllHaplotypeCombinations(final String civarString, final String reference) { + final Civar civar = Civar.fromCharSequence(civarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + final String baseString = c.applyTo(reference); + final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); + 
haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + try { + haplotype.setCigar(c.toCigar(reference.length())); + } catch (final RuntimeException ex) { + c.applyTo(reference); + c.toCigar(reference.length()); + throw new RuntimeException("" + c + " " + ex.getMessage(),ex); + } + result.add(haplotype); + } + return result; + } + + + public List haplotypeList() { + if (haplotypeList == null) { + + final List result = new ArrayList<>(haplotypeCigars.length); + final String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(cigarToHaplotype(reference, cigar, 0, true)); + } else { + final Haplotype h = new Haplotype(cigar.getBytes()); + h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + result.add(h); + } + } + haplotypeList = result; + } + return haplotypeList; + } + + + protected SAMSequenceDictionary artificialSAMSequenceDictionary() { + return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); + } + + protected SAMFileHeader artificialSAMFileHeader() { + return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); + } + + public List readList() { + if (readList == null) { + final SAMFileHeader header = artificialSAMFileHeader(); + readList = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + int count = 0; + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); + } else { + sequence = descr; + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } + count = readList.size(); + } + } + return readList; + } + + public List> readEventOffsetList() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + if (readEventOffsetList == null) { + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + final List unrolledCivars = civar.optionalizeAll().unroll(); + + readEventOffsetList = new ArrayList<>(readCigars.length); + int count = 0; + for (final String descr : readCigars) { + if (descr.matches("^\\d+:\\d+:.+$")) { + throw new UnsupportedOperationException(); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); + } else { + throw new UnsupportedOperationException(); + } + count = readEventOffsetList.size(); + } + readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); + } + return readEventOffsetList; + } + + + + + @SuppressWarnings("unused") + public String cigarToSequence(final 
String cigar) { + String reference = this.reference; + return applyCigar(reference, cigar,0,true); + } + + @SuppressWarnings("unused") + public GATKSAMRecord readFromString(final String readSequence) { + if (readBySequence == null) { + final List readList = readList(); + readBySequence = new HashMap<>(readList.size()); + for (final GATKSAMRecord r : readList) + readBySequence.put(r.getReadString(),r); + } + return readBySequence.get(readSequence); + } + + public List unrolledCivars() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + return civar.optionalizeAll().unroll(); + } + + public void introduceErrors(final Random rnd) { + final List reads = readList(); + final ArrayList result = new ArrayList<>(reads.size()); + for (final GATKSAMRecord read : reads) { + result.add(new MyGATKSAMRecord(read,rnd)); + } + readList = result; + } + + private class MyGATKSAMRecord extends GATKSAMRecord { + protected MyGATKSAMRecord(final GATKSAMRecord r) { + super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), + (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), + r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), + new byte[0]); + this.setReadBases(r.getReadBases()); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + } + + ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); + + public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { + super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), + (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), + r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), + new byte[0]); + final byte[] bases = new byte[r.getReadBases().length]; 
+ + final byte[] readBases = r.getReadBases(); + final byte[] bq = r.getBaseQualities(); + final byte[] iq = r.getBaseInsertionQualities(); + final byte[] dq = r.getBaseDeletionQualities(); + int refOffset = r.getAlignmentStart() - 1; + int readOffset = 0; + for (int i = 0; i < r.getReadBases().length;) { + double p = rnd.nextDouble(); + double iqp = QualityUtils.qualToErrorProb(iq[i]); + if (p < iqp) { // insertion + final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); + final int refStart = rnd.nextInt(reference.length() - length); + System.arraycopy(referenceBytes,refStart,bases,i,length); + i += length; + continue; + } + p -= iqp; + double dqp = QualityUtils.qualToErrorProb(dq[i]); + if (p < dqp) { + final int length = generateIndelLength(rnd); + refOffset += length; + refOffset = refOffset % referenceBytes.length; + readOffset += length; + continue; + } + p -= dqp; + double bqp = QualityUtils.qualToErrorProb(bq[i]); + byte b = readOffset < readBases.length ? 
readBases[readOffset] : referenceBytes[refOffset]; + byte nb; + if (p < bqp) { + switch (b) { + case 'A': nb = 'C'; break; + case 'T': nb = 'A'; break; + case 'C': nb = 'G'; break; + case 'G': nb = 'B'; break; + default: nb = 'A'; + } + } else + nb = b; + + bases[i++] = nb; + refOffset++; + refOffset = refOffset % referenceBytes.length; + readOffset++; + } + this.setReadBases(bases); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + + + } + + private int generateIndelLength(final Random rnd) { + final int length; + try { + length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); + } catch (Exception e) { + throw new RuntimeException(e); + } + return length; + } + + @Override + public byte[] getBaseDeletionQualities() { + return Arrays.copyOf(dq,getReadLength()); + } + + @Override + public byte[] getBaseInsertionQualities() { + return Arrays.copyOf(iq,getReadLength()); + } + + @Override + public int getMappingQuality() { + return 100; + } + + @Override + public int hashCode() { + return getReadName().hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o instanceof GATKSAMRecord) { + return getReadName().equals(((GATKSAMRecord)o).getReadName()); + } else { + return false; + } + } + + public String toString() { + return super.toString() + " " + this.getReadString(); + } + } + + + public List readStrings() { + final List result = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + result.add(sequence); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { // "*:<readCount>:<readLength>" descriptor, same pattern readList() tests + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + result.addAll(generateReads(haplotypes, readCount, readLength)); + } else { + sequence = descr; + result.add(sequence); + } + } + return result; + } + + private List<String> generateReads(final List<String> haplotypes, final int readCount, final int readLength) { // readCount substrings of at most readLength bases, cycling through the haplotypes + final List<String> result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); // same start-offset rule as generateSamRecords; never negative + result.add(h.substring(offset,Math.min(h.length(),offset + readLength))); // clamp the end when the haplotype is shorter than readLength + } + return result; + } + + private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { + int id = idStart; + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + byte[] bases = h.substring(offset,to).getBytes(); + byte[] quals = Arrays.copyOf(bq,to - offset); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); + result.add(new MyGATKSAMRecord(samRecord)); + } + return result; + } + + + private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { + + final List> result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % unrolledCivars.size(); + final Civar c = unrolledCivars.get(hi); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 
0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + result.add(c.eventOffsets(reference,offset,to)); + } + return result; + } + + private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); + + + private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { + final String sequence = applyCigar(reference,cigar,offset,global); + final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); + haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); + return haplotype; + } + + private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { + final Matcher pm = cigarPattern.matcher(cigar); + StringBuffer sb = new StringBuffer(); + int index = offset; + while (pm.find()) { + int length = Integer.valueOf(pm.group(1)); + char operator = pm.group(2).charAt(0); + switch (operator) { + case '=' : + try { + sb.append(reference.substring(index, index + length)); + } catch (Exception e) { + throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); + } + index += length; break; + case 'D' : + index += length; break; + case 'I' : + String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); + sb.append(insert); break; + case 'V' : + sb.append(transversionV(reference.charAt(index))); index++; break; + case 'W' : + sb.append(transversionW(reference.charAt(index))); index++; break; + case 'T' : + sb.append(transition(reference.charAt(index))); index++; break; + default: + throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); + } + } + if (global && index != reference.length()) { + throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + 
reference.length() + ") on cigar " + cigar); + } else if (index > reference.length()) { + throw new RuntimeException(" index beyond end "); + } + return sb.toString(); + } + + protected int kmerSize() { + return kmerSize; + } + + private char transversionV(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'C'; + case 'G': return 'T'; + case 'C': return 'A'; + case 'T': return 'G'; + default: + return c; + } + + } + + private char transversionW(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'T'; + case 'G': return 'C'; + case 'T': return 'A'; + case 'C': return 'G'; + default: + return c; + } + + } + + private char transition(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'G'; + case 'G': return 'A'; + case 'T': return 'C'; + case 'C': return 'T'; + default: + return c; + } + + } +} diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java new file mode 100644 index 000000000..14ab552e5 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/CnyPairHMMUnitTest.java @@ -0,0 +1,92 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class CnyPairHMMUnitTest extends BaseTest { + + @Test(enabled = true) + public void testResultQueue() { + final double[] row1 = new double[] { 4.5, 53.1, 6.4 }; + final double[] row2 = new double[] { 1.0, 5.9, 6.9, 6.1, 19.8 }; + final double[] row3 = new double[] { 10.4, 9.101, 89.5, 9.8}; + final double[] row4 = new double[] { 7.3, 1.4, 5.67, 56.32 }; + CnyPairHMM.ResultQueue queue = new CnyPairHMM.ResultQueue(); + + // Test inter-mixed push/pop operations produce the correct output + queue.push(row1); + queue.push(row2); + + for (double aRow1 : row1) Assert.assertEquals(aRow1, queue.pop()); + + for (int i=0; i<2; i++) + Assert.assertEquals(row2[i], queue.pop()); + + queue.push(row3); + + for (int i=2; i reads = as.readList(); + final List haplotypes = as.haplotypeList(); + PairHMMReadyHaplotypes haplotypeCollection = new PairHMMReadyHaplotypes(haplotypes.size()); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); + Map basesToPos = new HashMap<>(sortedHaplotypes.size()); + int nextIdx = 0; + + for (final Haplotype h : sortedHaplotypes) { + 
final byte[] bases = h.getBases(); + haplotypeCollection.add(bases); + basesToPos.put(bases,nextIdx++); + } + for (GATKSAMRecord read : reads) { + final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; + final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; + unsorted.loadRead(read); + sorted.loadRead(read); + final Map unsortedResults = new HashMap<>(haplotypes.size()); + for (int i = 0; i < sortedHaplotypes.size(); i++) { + final Haplotype h = sortedHaplotypes.get(i); + final byte[] haplotypeBases = h.getBases().clone(); + unsorted.loadHaplotypeBases(haplotypeBases); + double lk = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + unsortedLikelihoods[i] = lk; + } + sorted.calculateLocalLikelihoods(0, read.getReadLength(), haplotypeCollection); + for (final PairHMMReadyHaplotypes.Entry entry : haplotypeCollection) { + final byte[] bases = entry.getBases(); + final double lk = entry.getLikelihood(); + final int haplotypePos = basesToPos.get(bases); + sortedLikelihoods[haplotypePos] = lk; + } + for (int i = 0; i < unsortedLikelihoods.length; i++) + Assert.assertEquals(unsortedLikelihoods[i],sortedLikelihoods[i],0.00000001,Arrays.toString(unsortedLikelihoods) + Arrays.toString(sortedLikelihoods)); + } + } + + @Test(enabled=true,dataProvider="activeRegionTestDataSets") + public void testSortedVsUnsorted(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + final List reads = as.readList(); + final List haplotypes = as.haplotypeList(); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); + + byte[] lastHaplotypeBases = null; + for (GATKSAMRecord read : reads) { + final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; + final double[] sortedLikelihoods = new 
double[sortedHaplotypes.size()]; + unsorted.loadRead(read); + sorted.loadRead(read); + for (int i = 0; i < sortedHaplotypes.size(); i++) { + final Haplotype h = sortedHaplotypes.get(i); + final byte[] haplotypeBases = h.getBases().clone(); + final byte[] haplotypeBases2 = haplotypeBases.clone(); + int commonPrefixEnd = 0; + + + if (lastHaplotypeBases != null) { + final int prefixEndLimit = Math.min(lastHaplotypeBases.length,haplotypeBases.length); + for (commonPrefixEnd = 0; commonPrefixEnd < prefixEndLimit; commonPrefixEnd++) + if (lastHaplotypeBases[commonPrefixEnd] != haplotypeBases[commonPrefixEnd]) + break; + } + + unsorted.loadHaplotypeBases(haplotypeBases); + sorted.changeHaplotypeSuffix(commonPrefixEnd, haplotypeBases, commonPrefixEnd, haplotypeBases.length); + Assert.assertTrue(Arrays.equals(haplotypeBases2, unsorted.getHaplotypeBases())); + Assert.assertTrue(Arrays.equals(haplotypeBases2, sorted.getHaplotypeBases())); + unsortedLikelihoods[i] = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + sortedLikelihoods[i] = sorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + Assert.assertTrue(Arrays.equals(haplotypeBases2,unsorted.getHaplotypeBases())); + Assert.assertTrue(Arrays.equals(haplotypeBases2,sorted.getHaplotypeBases())); + Assert.assertEquals((double)unsortedLikelihoods[i], (double) sortedLikelihoods[i],0.00000001); + lastHaplotypeBases = haplotypeBases; + } + } + } + + public static final Comparator HAPLOTYPE_COMPARATOR = new Comparator() { + + @Override + public int compare(final Haplotype o1, final Haplotype o2) { + if (o1 == o2) + return 0; + final byte[] bases1 = o1.getBases(); + final byte[] bases2 = o2.getBases(); + final int ilimit = Math.min(bases1.length,bases2.length); + for (int i = 0; i < ilimit; i++) { + final int cmp = Byte.compare(bases1[i],bases2[i]); + if (cmp != 0) return cmp; + } + if (bases1.length == bases2.length) return 0; + return 
(bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. + } + }; + + + + +} diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java similarity index 68% rename from protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java index 3d8137ecf..151097aad 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMEmpiricalBenchmark.java @@ -46,117 +46,70 @@ package org.broadinstitute.sting.utils.pairhmm; -import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.text.XReadLines; +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; -import java.io.*; -import java.util.LinkedHashMap; +import java.io.File; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.zip.GZIPInputStream; +import java.lang.Math; /** - * Useful single class carrying test data for PairHMMs (for use in benchmarking and unit tests) - * - * User: depristo - * Date: 5/12/13 - * Time: 3:52 PM - * To change this template use File | Settings | File Templates. 
+ * Caliper microbenchmark for empirical test data for PairHMM */ -public class PairHMMTestData { - public final String ref; - private final String read; - public final byte[] baseQuals, insQuals, delQuals, gcp; - public final double log10l; +public class PairHMMEmpiricalBenchmark extends SimpleBenchmark { + @Param ({"array_logless", "logless"}) + String algorithm; - PairHMMTestData(String ref, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l) { - this.ref = ref; - this.read = read; - this.baseQuals = baseQuals; - this.insQuals = insQuals; - this.delQuals = delQuals; - this.gcp = gcp; - this.log10l = log10l; - } + @Param({"likelihoods_NA12878_HiSeqWGS_chr20_1mb.txt"}) + String likelihoodsFile; - PairHMMTestData(String ref, String read, final byte qual) { - this.ref = ref; - this.read = read; - this.baseQuals = this.insQuals = this.delQuals = Utils.dupBytes(qual, read.length()); - this.gcp = Utils.dupBytes((byte)10, read.length()); - this.log10l = -1; - } + @Param({"1000","10000","70000"}) + int records; + + PairHMM hmm =null; + + List empiricalData = new LinkedList<>(); + List workingData = new LinkedList<>(); - public double runHMM(final PairHMM hmm) { - hmm.initialize(getRead().length(), ref.length()); - return hmm.computeReadLikelihoodGivenHaplotypeLog10(ref.getBytes(), getRead().getBytes(), - baseQuals, insQuals, delQuals, gcp, true); - } @Override - public String toString() { - return "Info{" + - "ref='" + ref + '\'' + - ", read='" + getRead() + '\'' + - ", log10l=" + log10l + - '}'; + protected void setUp() throws Exception { + empiricalData = PairHMMTestData.readLikelihoodsInOrder(new File(likelihoodsFile)); + records = Math.min(records, empiricalData.size()); + workingData = empiricalData.subList(0,records); + + int maxReadLength = PairHMMTestData.calcMaxReadLen(workingData); + int maxHaplotypeLength = PairHMMTestData.calcMaxHaplotypeLen(workingData); + + hmm = getHmm(); + 
hmm.initialize(maxReadLength,maxHaplotypeLength); } - public static void runHMMs(final PairHMM hmm, final List data, final boolean runSingly) { - if ( runSingly ) { - for ( final PairHMMTestData datum : data ) - datum.runHMM(hmm); - } else { - // running in batch mode - final PairHMMTestData first = data.get(0); - int maxHaplotypeLen = calcMaxHaplotypeLen(data); - hmm.initialize(first.getRead().length(), maxHaplotypeLen); - for ( final PairHMMTestData datum : data ) { - hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), datum.getRead().getBytes(), - datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, false); + private PairHMM getHmm() { + switch (algorithm) { + case "logless": return new LoglessPairHMM(); + case "array_logless": return new ArrayLoglessPairHMM(); + default: throw new IllegalStateException("Unexpected algorithm " + algorithm); + } + } + + public double timeHMM(int rep){ + double result = 0; + for (int i = 0; i < rep; i++) { + for (final PairHMMTestData datum : workingData){ + result += hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), + datum.getRead().getBytes(), + datum.baseQuals, + datum.insQuals, + datum.delQuals, + datum.gcp, + datum.newRead, + datum.nextRef.getBytes()); } } - } - - public static int calcMaxHaplotypeLen(final List data) { - int maxHaplotypeLen = 0; - for ( final PairHMMTestData datum : data ) - maxHaplotypeLen = Math.max(maxHaplotypeLen, datum.ref.length()); - return maxHaplotypeLen; - } - - public static Map> readLikelihoods(final File file) throws IOException { - final Map> results = new LinkedHashMap<>(); - - InputStream in = new FileInputStream(file); - if ( file.getName().endsWith(".gz") ) { - in = new GZIPInputStream(in); - } - - for ( final String line : new XReadLines(in) ) { - final String[] parts = line.split(" "); - final PairHMMTestData info = new PairHMMTestData( - parts[0], parts[1], - SAMUtils.fastqToPhred(parts[2]), - SAMUtils.fastqToPhred(parts[3]), - 
SAMUtils.fastqToPhred(parts[4]), - SAMUtils.fastqToPhred(parts[5]), - Double.parseDouble(parts[6])); - - if ( ! results.containsKey(info.read) ) { - results.put(info.read, new LinkedList()); - } - final List byHap = results.get(info.read); - byHap.add(info); - } - - return results; - } - - public String getRead() { - return read; + return result; } } diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java new file mode 100644 index 000000000..9706c0e9d --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMSyntheticBenchmark.java @@ -0,0 +1,127 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/
+
+package org.broadinstitute.sting.utils.pairhmm;
+
+import com.google.caliper.Param;
+import com.google.caliper.SimpleBenchmark;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.TextCigarCodec;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+
+import java.util.*;
+
+/**
+ * Caliper microbenchmark for synthetic test data for PairHMM
+ *
+ * The benchmark is parameterized by the HMM implementation to run ({@link #algorithm}) and by the
+ * lengths of the synthetic reference ({@link #refLength}) and read ({@link #readLength}).
+ */
+public class PairHMMSyntheticBenchmark extends SimpleBenchmark {
+    /** Name of the PairHMM implementation to benchmark; see {@link #getHmm()} for the accepted values */
+    @Param ({"array_logless", "logless"})
+//    @Param({"logless", "array_logless"})
+//    @Param({"logless", "banded_w5_mle10", "banded_w5_mle20"})
+//    @Param({"logless", "banded_w10_mle20", "banded_w5_mle20", "banded_w5_mle10"})
+    String algorithm;
+
+    /** Length, in bases, of the synthetic reference (haplotype) sequence */
+//    @Param({"40", "100", "200", "300", "500"})
+    @Param({"40", "300"})
+//    @Param({"300"})
+    int refLength;
+
+    /** Length, in bases, of the synthetic read sequence */
+//    @Param({"200"})
+    @Param({"40", "101", "200"})
+//    @Param({"40", "100", "200", "300", "500"})
+    int readLength;
+
+    private PairHMM hmm;
+    private PairHMMTestData testData;
+
+
+    /**
+     * Creates the HMM selected by {@link #algorithm} and the synthetic test data it will be run on.
+     *
+     * The reference, the next reference and the read all come from {@link #generateSeq(int)}, which is
+     * deterministic, so the reference and next reference are identical strings.  Every base gets quality 30.
+     */
+    @Override
+    protected void setUp() throws Exception {
+        hmm = getHmm();
+        final String ref = generateSeq(refLength);
+        final String nextRef = generateSeq(refLength);
+        final String read = generateSeq(readLength);
+        testData = new PairHMMTestData(ref, nextRef, read, (byte)30);
+        System.out.println(testData.toString());
+    }
+
+    /**
+     * Maps the {@link #algorithm} parameter onto a fresh PairHMM instance.
+     *
+     * @return a new HMM of the requested kind
+     * @throws IllegalStateException if the algorithm name is not recognized
+     */
+    private PairHMM getHmm() {
+        switch (algorithm) {
+            case "logless": return new LoglessPairHMM();
+            case "array_logless": return new ArrayLoglessPairHMM();
+//            case "banded_w10_mle20": return new BandedLoglessPairHMM(10, 1e-20);
+//            case "banded_w5_mle20": return new BandedLoglessPairHMM(5, 1e-20);
+//            case "banded_w5_mle10": return new BandedLoglessPairHMM(5, 1e-10);
+            default: throw new IllegalStateException("Unexpected algorithm " + algorithm);
+        }
+    }
+
+    /**
+     * Builds a deterministic sequence of exactly len bases.
+     *
+     * The sequence cycles through A, C, G, T with homopolymer runs that grow by one base after each full
+     * cycle (A C G T AA CC GG TT AAA ...), and is truncated to the requested length.
+     *
+     * @param len the number of bases wanted
+     * @return the first len bases of the pattern described above
+     */
+    private String generateSeq(final int len) {
+        final List<String> root = Arrays.asList("A", "C", "G", "T");
+
+        // accumulate in a builder: appending to a String in this loop copies the whole sequence every pass
+        final StringBuilder seq = new StringBuilder();
+        for ( int i = 0; true; i++ ) {
+            final String base = root.get(i % root.size());
+            final int copies = i / root.size() + 1;
+            seq.append(Utils.dupString(base, copies));
+            if ( seq.length() >= len )
+                return seq.substring(0, len);
+        }
+    }
+
+    /**
+     * Timed section: runs the HMM on the synthetic test data rep times.
+     *
+     * @param rep number of repetitions (presumably supplied by the Caliper runner -- confirm against the Caliper version in use)
+     */
+    public void timePairHMM(int rep) {
+        for ( int i = 0; i < rep; i++ ) {
+            testData.runHMM(hmm);
+        }
+//        if ( hmm instanceof BandedLoglessPairHMM ) {
+//            final BandedLoglessPairHMM banded = (BandedLoglessPairHMM)hmm;
+//            System.out.printf("Banded n cells possible  : %d%n", banded.nCellsOverall);
+//            System.out.printf("Banded n cells evaluated : %d%n", banded.nCellsEvaluated);
+//        }
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java new file mode 100644 index 000000000..e6c8e1e61 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMTestData.java @@ -0,0 +1,256 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1.
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/
+
+package org.broadinstitute.sting.utils.pairhmm;
+
+import net.sf.samtools.SAMUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.text.XReadLines;
+
+import java.io.*;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Useful single class carrying test data for PairHMMs (for use in benchmarking and unit tests)
+ *
+ * Each instance describes one read / haplotype evaluation: the haplotype bases (ref), the haplotype that
+ * will be evaluated next (nextRef, null when there is none), the read bases with their qualities, the
+ * log10 likelihood recorded for the pair, and whether the read differs from the one evaluated before it.
+ *
+ * User: depristo
+ * Date: 5/12/13
+ * Time: 3:52 PM
+ */
+public class PairHMMTestData {
+    public final String ref;
+    /** Bases of the haplotype evaluated after this one, or null if this is the last entry */
+    public final String nextRef;
+    private final String read;
+    public final byte[] baseQuals, insQuals, delQuals, gcp;
+    public final double log10l;
+    public final boolean newRead;
+
+    /**
+     * Create test data with explicit per-base qualities.
+     *
+     * @param ref       haplotype bases
+     * @param nextRef   bases of the haplotype that will be evaluated next, or null
+     * @param read      read bases
+     * @param baseQuals base qualities of the read
+     * @param insQuals  insertion qualities of the read
+     * @param delQuals  deletion qualities of the read
+     * @param gcp       gap continuation penalties of the read
+     * @param log10l    log10 likelihood recorded for this read / haplotype pair
+     * @param newRead   true if the read differs from the previously evaluated one
+     */
+    PairHMMTestData(String ref, String nextRef, String read, byte[] baseQuals, byte[] insQuals, byte[] delQuals, byte[] gcp, double log10l, boolean newRead) {
+        this.ref = ref;
+        this.nextRef = nextRef;
+        this.read = read;
+        this.baseQuals = baseQuals;
+        this.insQuals = insQuals;
+        this.delQuals = delQuals;
+        this.gcp = gcp;
+        this.log10l = log10l;
+        this.newRead = newRead;
+    }
+
+    /**
+     * Create synthetic test data in which base, insertion and deletion qualities all equal qual, the gap
+     * continuation penalty is 10, log10l is -1 and newRead is true.
+     *
+     * @param ref     haplotype bases
+     * @param nextRef bases of the haplotype that will be evaluated next, or null
+     * @param read    read bases
+     * @param qual    quality used for every base / insertion / deletion quality (one shared array)
+     */
+    PairHMMTestData(String ref, String nextRef, String read, final byte qual) {
+        this.ref = ref;
+        this.nextRef = nextRef;
+        this.read = read;
+        this.baseQuals = this.insQuals = this.delQuals = Utils.dupBytes(qual, read.length());
+        this.gcp = Utils.dupBytes((byte)10, read.length());
+        this.log10l = -1;
+        this.newRead = true;
+    }
+
+    /**
+     * Initialize hmm for exactly this read / haplotype pair and evaluate it without caching
+     * (recacheReadValues = true, no next haplotype).
+     *
+     * @param hmm the HMM to run
+     * @return the value returned by the HMM for this pair
+     */
+    public double runHMM(final PairHMM hmm) {
+        hmm.initialize(getRead().length(), ref.length());
+        return hmm.computeReadLikelihoodGivenHaplotypeLog10(ref.getBytes(), getRead().getBytes(),
+                baseQuals, insQuals, delQuals, gcp, true, null);
+    }
+
+    @Override
+    public String toString() {
+        // string-valued fields are wrapped in single quotes; numeric / boolean fields are not
+        return "Info{" +
+                "ref='" + ref + '\'' +
+                ", nextRef='" + nextRef + '\'' +
+                ", read='" + getRead() + '\'' +
+                ", log10l=" + log10l +
+                ", newRead=" + newRead +
+                '}';
+    }
+
+    /**
+     * Run hmm over all of data and sum the results.
+     *
+     * @param hmm       the HMM to run
+     * @param data      the read / haplotype pairs to evaluate, in order
+     * @param runSingly if true each datum is run through {@link #runHMM(PairHMM)} (re-initializing the HMM and
+     *                  never caching); if false the HMM is initialized once for the longest read and haplotype
+     *                  in data and each datum is evaluated using its own newRead / nextRef values
+     * @return the sum of the values returned by the HMM (0 for empty data)
+     */
+    public static double runHMMs(final PairHMM hmm, final List<PairHMMTestData> data, final boolean runSingly) {
+        double result = 0;
+        if ( runSingly ) {
+            for ( final PairHMMTestData datum : data )
+                result += datum.runHMM(hmm);
+        } else if ( ! data.isEmpty() ) {
+            // running in batch mode: size the HMM for the largest read and haplotype, since lists produced by
+            // readLikelihoodsInOrder() mix reads of different lengths
+            hmm.initialize(calcMaxReadLen(data), calcMaxHaplotypeLen(data));
+            for ( final PairHMMTestData datum : data ) {
+                // the last entry of a likelihoods file has no next haplotype
+                final byte[] nextRefBases = datum.nextRef == null ? null : datum.nextRef.getBytes();
+                result += hmm.computeReadLikelihoodGivenHaplotypeLog10(datum.ref.getBytes(), datum.getRead().getBytes(),
+                        datum.baseQuals, datum.insQuals, datum.delQuals, datum.gcp, datum.newRead, nextRefBases);
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @param data test data to scan
+     * @return the length of the longest haplotype (ref) in data, or 0 if data is empty
+     */
+    public static int calcMaxHaplotypeLen(final List<PairHMMTestData> data) {
+        int maxHaplotypeLen = 0;
+        for ( final PairHMMTestData datum : data )
+            maxHaplotypeLen = Math.max(maxHaplotypeLen, datum.ref.length());
+        return maxHaplotypeLen;
+    }
+
+    /**
+     * @param data test data to scan
+     * @return the length of the longest read in data, or 0 if data is empty
+     */
+    public static int calcMaxReadLen(final List<PairHMMTestData> data) {
+        int maxReadLen = 0;
+        for ( final PairHMMTestData datum : data )
+            maxReadLen = Math.max(maxReadLen, datum.getRead().length());
+        return maxReadLen;
+    }
+
+    /**
+     * Read a likelihoods file, grouping the entries by read.
+     *
+     * Each line holds seven space-separated fields: haplotype bases, read bases, FASTQ-encoded base,
+     * insertion, deletion and gap-continuation qualities, and the log10 likelihood.  Files whose name ends
+     * in .gz are decompressed on the fly.
+     *
+     * @param file the likelihoods file
+     * @return map from read bases to the entries for that read, in order of first appearance; newRead is
+     *         true only for the first entry of each read; empty if the file has no lines
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static Map<String, List<PairHMMTestData>> readLikelihoods(final File file) throws IOException {
+        final Map<String, List<PairHMMTestData>> results = new LinkedHashMap<>();
+
+        try ( final InputStream in = openLikelihoodsFile(file) ) {
+            String[] thisEntry = null;
+            for ( final String line : new XReadLines(in) ) {
+                // peek at the next entry (to get the haplotype bases)
+                final String[] nextEntry = line.split(" ");
+                // process the current entry
+                if ( thisEntry != null )
+                    addByRead(results, thisEntry, nextEntry[0]);
+                // update the current entry
+                thisEntry = nextEntry;
+            }
+            // process the final entry, which has no next haplotype (nothing to do for an empty file)
+            if ( thisEntry != null )
+                addByRead(results, thisEntry, null);
+        }
+
+        return results;
+    }
+
+    /**
+     * Simplified likelihoods file reader that returns a list instead of a map
+     *
+     * readLikelihoods() method was reordering inputs, with the result that caching would be more efficient
+     * This method simply returns a list of read/haplotype pairs in their original order, providing a more realistic caching scenario
+     *
+     * @param file the likelihoods file, in the format described for readLikelihoods()
+     * @return the entries in file order; newRead is true whenever the read differs from the previous
+     *         entry's read; empty if the file has no lines
+     * @throws IOException if the file cannot be opened or read
+     */
+    public static List<PairHMMTestData> readLikelihoodsInOrder(final File file) throws IOException {
+        final List<PairHMMTestData> results = new LinkedList<>();
+
+        try ( final InputStream in = openLikelihoodsFile(file) ) {
+            String previousRead = null;
+            String[] thisEntry = null;
+            for ( final String line : new XReadLines(in) ) {
+                // peek at the next entry (to get the haplotype bases)
+                final String[] nextEntry = line.split(" ");
+                // process the current entry
+                if ( thisEntry != null ) {
+                    results.add(parseEntry(thisEntry, nextEntry[0], ! thisEntry[1].equals(previousRead)));
+                    previousRead = thisEntry[1];
+                }
+                // update the current entry
+                thisEntry = nextEntry;
+            }
+            // process the final entry, which has no next haplotype (nothing to do for an empty file)
+            if ( thisEntry != null )
+                results.add(parseEntry(thisEntry, null, ! thisEntry[1].equals(previousRead)));
+        }
+
+        return results;
+    }
+
+    /** Opens file for reading, transparently decompressing it when its name ends in .gz */
+    private static InputStream openLikelihoodsFile(final File file) throws IOException {
+        final InputStream raw = new FileInputStream(file);
+        if ( ! file.getName().endsWith(".gz") )
+            return raw;
+        try {
+            return new GZIPInputStream(raw);
+        } catch ( final IOException e ) {
+            // do not leak the file handle when the gzip header cannot be read
+            raw.close();
+            throw e;
+        }
+    }
+
+    /** Builds the test data for one split line of a likelihoods file */
+    private static PairHMMTestData parseEntry(final String[] entry, final String nextRef, final boolean newRead) {
+        return new PairHMMTestData(
+                entry[0], nextRef, entry[1],
+                SAMUtils.fastqToPhred(entry[2]),
+                SAMUtils.fastqToPhred(entry[3]),
+                SAMUtils.fastqToPhred(entry[4]),
+                SAMUtils.fastqToPhred(entry[5]),
+                Double.parseDouble(entry[6]),
+                newRead);
+    }
+
+    /** Parses entry and appends it to the list kept for its read, creating that list on first sight of the read */
+    private static void addByRead(final Map<String, List<PairHMMTestData>> results, final String[] entry, final String nextRef) {
+        final PairHMMTestData info = parseEntry(entry, nextRef, ! results.containsKey(entry[1]));
+        if ( ! results.containsKey(info.read) ) {
+            results.put(info.read, new LinkedList<PairHMMTestData>());
+        }
+        results.get(info.read).add(info);
+    }
+
+    public String getRead() {
+        return read;
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java index 2499183a6..4a224e0be 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java @@ -56,6 +56,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -68,12 +69,23 @@ public class PairHMMUnitTest extends BaseTest { private final static boolean ALLOW_READS_LONGER_THAN_HAPLOTYPE = true; private final static boolean DEBUG = false; final static boolean EXTENSIVE_TESTING = true; - final PairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation - final PairHMM originalHMM = new Log10PairHMM(false); // the reference implementation - final PairHMM loglessHMM = new LoglessPairHMM(); + final N2MemoryPairHMM exactHMM = new Log10PairHMM(true); // the log truth implementation + final N2MemoryPairHMM originalHMM = new Log10PairHMM(false); // the reference implementation + final N2MemoryPairHMM loglessHMM = new LoglessPairHMM(); + final PairHMM arrayHMM = new ArrayLoglessPairHMM(); + final N2MemoryPairHMM fastloglessHMM = new
FastLoglessPairHMM((byte) 10); - private List getHMMs() { - return Arrays.asList(exactHMM, originalHMM, loglessHMM); + @BeforeClass + public void initialize() { + exactHMM.doNotUseTristateCorrection(); + originalHMM.doNotUseTristateCorrection(); + loglessHMM.doNotUseTristateCorrection(); + arrayHMM.doNotUseTristateCorrection(); + fastloglessHMM.doNotUseTristateCorrection(); + } + + private List getHMMs() { + return Arrays.asList(exactHMM, originalHMM, loglessHMM, fastloglessHMM); } // -------------------------------------------------------------------------------- @@ -83,8 +95,8 @@ public class PairHMMUnitTest extends BaseTest { // -------------------------------------------------------------------------------- private class BasicLikelihoodTestProvider { - final String ref, read; - final byte[] refBasesWithContext, readBasesWithContext; + final String ref, nextRef, read; + final byte[] refBasesWithContext, nextRefBasesWithContext, readBasesWithContext; final int baseQual, insQual, delQual, gcp; final int expectedQual; final boolean left, right; @@ -92,28 +104,30 @@ public class PairHMMUnitTest extends BaseTest { final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { - this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); + public BasicLikelihoodTestProvider(final String ref, final String nextRef, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) { + this(ref, nextRef, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); } - public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean 
left, final boolean right) { + public BasicLikelihoodTestProvider(final String ref, final String nextRef, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) { this.baseQual = baseQual; this.delQual = delQual; this.insQual = insQual; this.gcp = gcp; this.read = read; this.ref = ref; + this.nextRef = nextRef; this.expectedQual = expectedQual; this.left = left; this.right = right; refBasesWithContext = asBytes(ref, left, right); + nextRefBasesWithContext = asBytes(nextRef, left, right); readBasesWithContext = asBytes(read, false, false); } @Override public String toString() { - return String.format("ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); + return String.format("ref=%s nextRef=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, nextRef, read, baseQual, insQual, delQual, gcp, left, right, expectedQual); } public double expectedLogL() { @@ -121,7 +135,7 @@ public class PairHMMUnitTest extends BaseTest { } public double getTolerance(final PairHMM hmm) { - if ( hmm instanceof LoglessPairHMM) + if ( hmm instanceof LoglessPairHMM || hmm instanceof ArrayLoglessPairHMM) return toleranceFromExact(); if ( hmm instanceof Log10PairHMM ) { return ((Log10PairHMM)hmm).isDoingExactLog10Calculations() ? 
toleranceFromExact() : toleranceFromReference(); @@ -146,11 +160,14 @@ public class PairHMMUnitTest extends BaseTest { return pairHMM.computeReadLikelihoodGivenHaplotypeLog10( refBasesWithContext, readBasesWithContext, qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel), - qualAsBytes(gcp, false, anchorIndel), true); + qualAsBytes(gcp, false, anchorIndel), true, nextRefBasesWithContext); } private byte[] asBytes(final String bases, final boolean left, final boolean right) { - return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); + if(bases == null) + return null; + else + return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); } private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) { @@ -196,7 +213,8 @@ public class PairHMMUnitTest extends BaseTest { final String ref = new String(new byte[]{refBase}); final String read = new String(new byte[]{readBase}); final int expected = refBase == readBase ? 0 : baseQual; - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)}); + // runBasicLikelihoodTests uses calcLogL(), which runs HMM with recacheReads=true. Since we will not cache, should pass null in place of a nextRef + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp)}); } } @@ -212,10 +230,11 @@ public class PairHMMUnitTest extends BaseTest { final String ref = insertionP ? small : big; final String read = insertionP ? 
big : small; - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp)}); - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false)}); - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true)}); - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true)}); + // runBasicLikelihoodTests uses calcLogL(), which runs HMM with recacheReads=true. Since we will not cache, should pass null in place of a nextRef + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp, true, false)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp, false, true)}); + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, expected, gcp, true, true)}); } } } @@ -253,7 +272,8 @@ public class PairHMMUnitTest extends BaseTest { for ( final boolean leftFlank : Arrays.asList(true, false) ) for ( final boolean rightFlank : Arrays.asList(true, false) ) - tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank)}); + // runOptimizedLikelihoodTests uses calcLogL(), which runs HMM with recacheReads=true. 
Since we will not cache, should pass null in place of a nextRef + tests.add(new Object[]{new BasicLikelihoodTestProvider(ref, null, read, baseQual, indelQual, indelQual, -0, gcp, leftFlank, rightFlank)}); } } } @@ -294,6 +314,7 @@ public class PairHMMUnitTest extends BaseTest { } } + @Test(enabled = !DEBUG) public void testMismatchInEveryPositionInTheReadWithCenteredHaplotype() { final byte[] haplotype1 = "TTCTCTTCTGTTGTGGCTGGTT".getBytes(); @@ -315,7 +336,7 @@ public class PairHMMUnitTest extends BaseTest { final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); - final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false); + final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, true, null); final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual)); Assert.assertEquals(res1, expected, 1e-2); } @@ -343,7 +364,7 @@ public class PairHMMUnitTest extends BaseTest { final byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); // change single base at position k to C. If it's a C, change to T mread[k] = ( mread[k] == (byte)'C' ? 
(byte)'T' : (byte)'C'); - final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, false); + final double res1 = loglessHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype1, mread, quals, gop, gop, gcp, true , null); final double expected = Math.log10(1.0/haplotype1.length * Math.pow(QualityUtils.qualToProb(matchQual), mread.length-1) * QualityUtils.qualToErrorProb(mismatchQual)); Assert.assertEquals(res1, expected, 1e-2); } @@ -374,11 +395,12 @@ public class PairHMMUnitTest extends BaseTest { final byte delQual = 37; final byte gcp = 10; hmm.initialize(readBases.length, refBases.length); + // running HMM with no haplotype caching. Should therefore pass null in place of nextRef bases final double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); Assert.assertTrue(d <= 0.0, "Likelihoods should be <= 0 but got "+ d); } @@ -391,11 +413,12 @@ public class PairHMMUnitTest extends BaseTest { final byte delQual = 100; final byte gcp = 100; hmm.initialize(readBases.length, refBases.length); + // running HMM with no haplotype caching. 
Should therefore pass null in place of nextRef bases double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); double expected = 0; final double initialCondition = ((double) Math.abs(refBases.length-readBases.length+1))/refBases.length; if (readBases.length < refBases.length) { @@ -437,11 +460,12 @@ public class PairHMMUnitTest extends BaseTest { final byte delQual = 40; final byte gcp = 10; hmm.initialize(readBases.length, refBases.length); + // running HMM with no haplotype caching. Should therefore pass null in place of nextRef bases hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); } @Test(enabled = !DEBUG) @@ -452,20 +476,27 @@ public class PairHMMUnitTest extends BaseTest { final byte insQual = 40; final byte delQual = 40; final byte gcp = 10; - + // running HMMs with no haplotype caching. 
Should therefore pass null in place of nextRef bases exactHMM.initialize(readBases.length, refBases.length); exactHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); loglessHMM.initialize(readBases.length, refBases.length); loglessHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, Utils.dupBytes(baseQual, readBases.length), Utils.dupBytes(insQual, readBases.length), Utils.dupBytes(delQual, readBases.length), - Utils.dupBytes(gcp, readBases.length), true); + Utils.dupBytes(gcp, readBases.length), true, null); + + arrayHMM.initialize(readBases.length, refBases.length); + arrayHMM.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, + Utils.dupBytes(baseQual, readBases.length), + Utils.dupBytes(insQual, readBases.length), + Utils.dupBytes(delQual, readBases.length), + Utils.dupBytes(gcp, readBases.length), true, null); } @DataProvider(name = "JustHMMProvider") @@ -490,7 +521,8 @@ public class PairHMMUnitTest extends BaseTest { final byte[] gcp = Utils.dupBytes((byte) 10, delQual.length); hmm.initialize(readBases.length + 100, refBases.length + 100); for ( int nExtraMaxSize = 0; nExtraMaxSize < 100; nExtraMaxSize++ ) { - hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true); + // running HMM with no haplotype caching. 
Should therefore pass null in place of nextRef bases + hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, quals, insQual, delQual, gcp, true, null); } } @@ -498,10 +530,12 @@ public class PairHMMUnitTest extends BaseTest { public Object[][] makeHaplotypeIndexingProvider() { List tests = new ArrayList(); + // First difference (root2, root3) is the base position immediately following first difference (root1, root2) final String root1 = "ACGTGTCAAACCGGGTT"; - final String root2 = "ACGTGTCACACTGGGTT"; // differs in two locations + final String root2 = "ACGTGTCACACTGGGTT"; // differs in two locations from root1 + final String root3 = "ACGTGTCACTCCGCGTT"; // differs in two locations from root2 - final String read1 = "ACGTGTCACACTGGATT"; // 1 diff from 2, 2 diff from root1 + final String read1 = "ACGTGTCACACTGGATT"; // 1 diff from 2, 2 diff from root1, 2 diff from root3 final String read2 = root1; // same as root1 final String read3 = root2; // same as root2 final String read4 = "ACGTGTCACACTGGATTCGAT"; @@ -513,7 +547,7 @@ public class PairHMMUnitTest extends BaseTest { // int readLength = read.length(); { for ( int readLength = 10; readLength < read.length(); readLength++ ) { final String myRead = read.substring(0, readLength); - tests.add(new Object[]{hmm, root1, root2, myRead}); + tests.add(new Object[]{hmm, root1, root2, root3, myRead}); } } } @@ -522,7 +556,7 @@ public class PairHMMUnitTest extends BaseTest { } @Test(enabled = !DEBUG, dataProvider = "HaplotypeIndexingProvider") - void testHaplotypeIndexing(final PairHMM hmm, final String root1, final String root2, final String read) { + void testHaplotypeIndexing(final PairHMM hmm, final String root1, final String root2, final String root3, final String read) { final double TOLERANCE = 1e-9; final String prefix = "AACCGGTTTTTGGGCCCAAACGTACGTACAGTTGGTCAACATCGATCAGGTTCCGGAGTAC"; @@ -536,24 +570,30 @@ public class PairHMMUnitTest extends BaseTest { final String myPrefix = prefix.substring(prefixStart, 
prefix.length()); final String hap1 = myPrefix + root1; final String hap2 = myPrefix + root2; + final String hap3 = myPrefix + root3; final int hapStart = PairHMM.findFirstPositionWhereHaplotypesDiffer(hap1.getBytes(), hap2.getBytes()); - final double actual1 = testHaplotypeIndexingCalc(hmm, hap1, read, 0, true); - final double actual2 = testHaplotypeIndexingCalc(hmm, hap2, read, hapStart, false); - final double expected2 = testHaplotypeIndexingCalc(hmm, hap2, read, 0, true); - Assert.assertEquals(actual2, expected2, TOLERANCE, "Caching calculation failed for read " + read + " against haplotype with prefix '" + myPrefix + // Run the HMM on the first haplotype, peaking ahead the second, to set up caching + // Then run on the second haplotype in both cached and uncached mode, and verify that results are the same + // When evaluating actual2, it is important that we both apply old caching from hap1 and set up new caching for hap3, to ensure read/write operations do not cause conflicts + final double actual1 = testHaplotypeIndexingCalc(hmm, hap1, hap2, read, 0, true); + final double actual2 = testHaplotypeIndexingCalc(hmm, hap2, hap3, read, hapStart, false); + final double expected2 = testHaplotypeIndexingCalc(hmm, hap2, null, read, 0, true); + Assert.assertEquals(actual2, expected2, TOLERANCE, "HMM " + hmm.getClass() + " Caching calculation failed for read " + read + " against haplotype with prefix '" + myPrefix + "' expected " + expected2 + " but got " + actual2 + " with hapStart of " + hapStart); } } - private double testHaplotypeIndexingCalc(final PairHMM hmm, final String hap, final String read, final int hapStart, final boolean recache) { + private double testHaplotypeIndexingCalc(final PairHMM hmm, final String hap, final String nextHap, final String read, final int hapStart, final boolean recache) { final byte[] readBases = read.getBytes(); + // if not peaking ahead to capture info for a future cache run, the next haplotype will be null, and this should be 
passed to HMM + final byte[] nextHapBases = nextHap == null ? null : nextHap.getBytes(); final byte[] baseQuals = Utils.dupBytes((byte)30, readBases.length); final byte[] insQuals = Utils.dupBytes((byte)45, readBases.length); final byte[] delQuals = Utils.dupBytes((byte)40, readBases.length); final byte[] gcp = Utils.dupBytes((byte)10, readBases.length); - double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache); + double d = hmm.computeReadLikelihoodGivenHaplotypeLog10(hap.getBytes(), readBases, baseQuals, insQuals, delQuals, gcp, recache, nextHapBases); Assert.assertTrue(MathUtils.goodLog10Probability(d), "Likelihoods = " + d + " was bad for read " + read + " and ref " + hap + " with hapStart " + hapStart); return d; } @@ -568,7 +608,6 @@ public class PairHMMUnitTest extends BaseTest { for ( final boolean oneIsDiff : Arrays.asList(true, false) ) { final byte[] hap1 = Utils.dupBytes((byte)'A', haplotypeSize1); final byte[] hap2 = Utils.dupBytes((byte)'A', haplotypeSize2); - final int expected = oneIsDiff ? 
makeDiff(hap1, differingSite, minLength) : makeDiff(hap2, differingSite, minLength); @@ -592,8 +631,17 @@ public class PairHMMUnitTest extends BaseTest { public Object[][] makeUninitializedHMMs() { List tests = new ArrayList(); - tests.add(new Object[]{new LoglessPairHMM()}); - tests.add(new Object[]{new Log10PairHMM(true)}); + final LoglessPairHMM myLoglessPairHMM = new LoglessPairHMM(); + myLoglessPairHMM.doNotUseTristateCorrection(); + tests.add(new Object[]{myLoglessPairHMM}); + + final ArrayLoglessPairHMM myArrayLoglessPairHMM = new ArrayLoglessPairHMM(); + myArrayLoglessPairHMM.doNotUseTristateCorrection(); + tests.add(new Object[]{myArrayLoglessPairHMM}); + + final Log10PairHMM myLog10PairHMM = new Log10PairHMM(true); + myLog10PairHMM.doNotUseTristateCorrection(); + tests.add(new Object[]{myLog10PairHMM}); return tests.toArray(new Object[][]{}); } @@ -606,7 +654,7 @@ public class PairHMMUnitTest extends BaseTest { // didn't call initialize => should exception out double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, true); + baseQuals, baseQuals, baseQuals, baseQuals, true, null); } @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @@ -617,7 +665,7 @@ public class PairHMMUnitTest extends BaseTest { hmm.initialize(3, 3); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, true); + baseQuals, baseQuals, baseQuals, baseQuals, true, null); } @Test(enabled = true, expectedExceptions = IllegalArgumentException.class, dataProvider = "JustHMMProvider") @@ -628,6 +676,6 @@ public class PairHMMUnitTest extends BaseTest { hmm.initialize(2, 3); double d = hmm.computeReadLikelihoodGivenHaplotypeLog10( refBases, readBases, - baseQuals, baseQuals, baseQuals, baseQuals, true); + baseQuals, baseQuals, baseQuals, baseQuals, true, null); } } \ No newline at end of file diff --git 
a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R index 2bc0a2fa5..6679781ee 100644 --- a/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R +++ b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R @@ -57,7 +57,7 @@ plotJobsGantt <- function(gatkReport, sortOverall, title, includeText) { p <- p + xlim(0, maxRelTime * 1.3) p <- p + xlab(paste("Start time, relative to first job", RUNTIME_UNITS)) p <- p + ylab("Job number") - p <- p + ggtitle(title) + p <- p + opts(title = title) print(p) } diff --git a/public/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/java/src/org/broadinstitute/sting/commandline/Argument.java index fa7ca9cc3..96731584b 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Argument.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Argument.java @@ -86,4 +86,40 @@ public @interface Argument { * @return Non-empty regexp for validation, blank otherwise. */ String validation() default ""; + + /** + * Hard lower bound on the allowed value for the annotated argument -- generates an exception if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Hard lower bound on the allowed value for the annotated argument, or Double.NEGATIVE_INFINITY + * if there is none. + */ + double minValue() default Double.NEGATIVE_INFINITY; + + /** + * Hard upper bound on the allowed value for the annotated argument -- generates an exception if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Hard upper bound on the allowed value for the annotated argument, or Double.POSITIVE_INFINITY + * if there is none. 
+ */ + double maxValue() default Double.POSITIVE_INFINITY; + + /** + * Soft lower bound on the allowed value for the annotated argument -- generates a warning if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Soft lower bound on the allowed value for the annotated argument, or Double.NEGATIVE_INFINITY + * if there is none. + */ + double minRecommendedValue() default Double.NEGATIVE_INFINITY; + + /** + * Soft upper bound on the allowed value for the annotated argument -- generates a warning if violated. + * Enforced only for numeric types whose values are explicitly specified on the command line. + * + * @return Soft upper bound on the allowed value for the annotated argument, or Double.POSITIVE_INFINITY + * if there is none. + */ + double maxRecommendedValue() default Double.POSITIVE_INFINITY; } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java index efacde231..12bf548d5 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java @@ -159,6 +159,22 @@ public class ArgumentSource { return field.isAnnotationPresent(Advanced.class); } + /** + * Is the given argument an output. + * @return True if so. False otherwise. + */ + public boolean isOutput() { + return field.isAnnotationPresent(Output.class); + } + + /** + * Is the given argument an input. + * @return True if so. False otherwise. + */ + public boolean isInput() { + return field.isAnnotationPresent(Input.class); + } + /** * Is this command-line argument dependent on some primitive argument types? * @return True if this command-line argument depends on other arguments; false otherwise. 
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index cf11bb61c..f00bd0ad6 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -246,7 +246,7 @@ public abstract class CommandLineProgram { } } catch (ArgumentException e) { - clp.parser.printHelp(clp.getApplicationDetails()); + //clp.parser.printHelp(clp.getApplicationDetails()); // Rethrow the exception to exit with an error. throw e; } @@ -370,8 +370,8 @@ public abstract class CommandLineProgram { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); errorPrintf("%n"); - errorPrintf("Please check the documentation guide to see if this is a known problem%n"); - errorPrintf("If not, please post the error, with stack trace, to the GATK forum%n"); + errorPrintf("This might be a bug. 
Please check the documentation guide to see if this is a known problem.%n"); + errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); printDocumentationReference(); if ( msg == null ) // some exceptions don't have detailed messages msg = "Code exception (see stack trace for error itself)"; @@ -387,12 +387,17 @@ public abstract class CommandLineProgram { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A USER ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("The invalid arguments or inputs must be corrected before the GATK can proceed%n"); - errorPrintf("Please do not post this error to the GATK forum%n"); errorPrintf("%n"); - errorPrintf("See the documentation (rerun with -h) for this tool to view allowable command-line arguments.%n"); + errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + errorPrintf("If the problem is an invalid argument, please check the online documentation guide%n"); + errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); + errorPrintf("%n"); printDocumentationReference(); errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); + errorPrintf("%n"); errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); errorPrintf("------------------------------------------------------------------------------------------%n"); System.exit(1); @@ -404,14 +409,17 @@ public abstract class CommandLineProgram { errorPrintf("------------------------------------------------------------------------------------------%n"); errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("The invalid inputs 
must be corrected before the GATK can proceed%n"); - errorPrintf("Please do not post this error to the GATK forum until you have followed the instructions below%n"); errorPrintf("%n"); - errorPrintf("Please make sure that your BAM file is well-formed by running Picard's validator on it%n"); - errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); - errorPrintf("Also, please ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); + errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); printDocumentationReference(); errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); + errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); + errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); + errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); + errorPrintf("%n"); errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); errorPrintf("------------------------------------------------------------------------------------------%n"); System.exit(1); diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java index 5e863f4f7..aca20d5a1 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.help.HelpFormatter; import java.io.File; import java.io.IOException; +import java.lang.annotation.Annotation; import java.lang.reflect.Field; import 
java.util.*; @@ -46,6 +47,7 @@ import java.util.*; * A parser for Sting command-line arguments. */ public class ParsingEngine { + /** * The loaded argument sources along with their back definitions. */ @@ -376,6 +378,19 @@ public class ParsingEngine { * @param object Object into which to add arguments. */ public void loadArgumentsIntoObject( Object object ) { + loadArgumentsIntoObject(object, true); + } + + /** + * Loads a set of matched command-line arguments into the given object. + * @param object Object into which to add arguments. + * @param enforceArgumentRanges If true, check that the argument value is within the range specified + * in the corresponding Argument annotation by min/max value attributes. This + * check is only performed for numeric types, and only when a min and/or + * max value is actually defined in the annotation. It is also only performed + * for values actually specified on the command line, and not for default values. + */ + public void loadArgumentsIntoObject( Object object, boolean enforceArgumentRanges ) { List argumentSources = extractArgumentSources(object.getClass()); List dependentArguments = new ArrayList(); @@ -389,13 +404,13 @@ public class ParsingEngine { dependentArguments.add(argumentSource); continue; } - loadValueIntoObject( argumentSource, object, argumentMatches.findMatches(this,argumentSource) ); + loadValueIntoObject(argumentSource, object, argumentMatches.findMatches(this,argumentSource), enforceArgumentRanges); } for(ArgumentSource dependentArgument: dependentArguments) { MultiplexArgumentTypeDescriptor dependentDescriptor = dependentArgument.createDependentTypeDescriptor(this,object); ArgumentSource dependentSource = dependentArgument.copyWithCustomTypeDescriptor(dependentDescriptor); - loadValueIntoObject(dependentSource,object,argumentMatches.findMatches(this,dependentSource)); + loadValueIntoObject(dependentSource,object,argumentMatches.findMatches(this,dependentSource), enforceArgumentRanges); } } @@ -447,8 
+462,13 @@ public class ParsingEngine { * @param argumentMatches Argument matches to load into the object. * @param source Argument source to load into the object. * @param instance Object into which to inject the value. The target might be in a container within the instance. + * @param enforceArgumentRanges If true, check that the argument value is within the range specified + * in the corresponding Argument annotation by min/max value attributes. This + * check is only performed for numeric types, and only when a min and/or + * max value is actually defined in the annotation. It is also only performed + * for values actually specified on the command line, and not for default values. */ - private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches ) { + private void loadValueIntoObject( ArgumentSource source, Object instance, ArgumentMatches argumentMatches, boolean enforceArgumentRanges ) { // Nothing to load if( argumentMatches.size() == 0 && ! source.createsTypeDefault() ) return; @@ -461,12 +481,78 @@ public class ParsingEngine { throw new ReviewedStingException("Internal command-line parser error: unable to find a home for argument matches " + argumentMatches); for( Object target: targets ) { - Object value = (argumentMatches.size() != 0) ? source.parse(this,argumentMatches) : source.createTypeDefault(this); + Object value; + boolean usedTypeDefault = false; + if ( argumentMatches.size() != 0 ) { + value = source.parse(this,argumentMatches); + } + else { + value = source.createTypeDefault(this); + usedTypeDefault = true; + } + + // Only check argument ranges if a check was requested AND we used a value from the command line rather + // than the type default + if ( enforceArgumentRanges && ! 
usedTypeDefault ) { + checkArgumentRange(source, value); + } JVMUtils.setFieldValue(source.field,target,value); } } + /** + * Check the provided value against any range constraints specified in the Argument annotation + * for the corresponding field. Throw an exception if hard limits are violated, or emit a warning + * if soft limits are violated. + * + * Only checks numeric types (int, double, etc.) + * Only checks fields with an actual @Argument annotation + * Only checks manually-specified constraints (there are no default constraints). + * + * @param argumentSource The source field for the command-line argument + * @param argumentValue The value we're considering putting in that source field + */ + private void checkArgumentRange( final ArgumentSource argumentSource, final Object argumentValue ) { + // Only validate numeric types + if ( ! (argumentValue instanceof Number) ) { + return; + } + final double argumentDoubleValue = ((Number)argumentValue).doubleValue(); + + // Only validate fields with an @Argument annotation + final Annotation argumentAnnotation = argumentSource.field.getAnnotation(Argument.class); + if ( argumentAnnotation == null ) { + return; + } + + final double minValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minValue"); + final double maxValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxValue"); + final double minRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "minRecommendedValue"); + final double maxRecommendedValue = (Double)CommandLineUtils.getValue(argumentAnnotation, "maxRecommendedValue"); + final String argumentName = (String)CommandLineUtils.getValue(argumentAnnotation, "fullName"); + + // Check hard limits first, if specified + if ( minValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minValue ) { + throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, minValue, "minimum"); + } + + if ( maxValue != Double.POSITIVE_INFINITY && 
argumentDoubleValue > maxValue ) { + throw new ArgumentValueOutOfRangeException(argumentName, argumentDoubleValue, maxValue, "maximum"); + } + + // Then check soft limits, if specified + if ( minRecommendedValue != Double.NEGATIVE_INFINITY && argumentDoubleValue < minRecommendedValue ) { + logger.warn(String.format("WARNING: argument --%s has value %.2f, but minimum recommended value is %.2f", + argumentName, argumentDoubleValue, minRecommendedValue)); + } + + if ( maxRecommendedValue != Double.POSITIVE_INFINITY && argumentDoubleValue > maxRecommendedValue ) { + logger.warn(String.format("WARNING: argument --%s has value %.2f, but maximum recommended value is %.2f", + argumentName, argumentDoubleValue, maxRecommendedValue)); + } + } + public Collection getRodBindings() { return Collections.unmodifiableCollection(rodBindings); } @@ -654,6 +740,13 @@ class InvalidArgumentValueException extends ArgumentException { } } +class ArgumentValueOutOfRangeException extends ArgumentException { + public ArgumentValueOutOfRangeException( final String argumentName, final double argumentActualValue, + final double argumentBoundaryValue, final String argumentBoundaryType ) { + super(String.format("Argument --%s has value %.2f, but %s allowed value is %.2f", + argumentName, argumentActualValue, argumentBoundaryType, argumentBoundaryValue)); + } +} /** * An exception for values that can't be mated with any argument. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index c4f1a286d..27b030060 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -62,9 +62,11 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; import org.broadinstitute.sting.utils.recalibration.BQSRArgumentSet; +import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import java.io.File; +import java.io.FileNotFoundException; import java.util.*; import java.util.concurrent.TimeUnit; @@ -530,8 +532,8 @@ public class GenomeAnalysisEngine { } if ( duplicateSamFiles.size() > 0 ) { - throw new ArgumentException("The following BAM files appear multiple times in the list of input files: " + - duplicateSamFiles + " BAM files may be specified at most once."); + throw new UserException("The following BAM files appear multiple times in the list of input files: " + + duplicateSamFiles + " BAM files may be specified at most once."); } } @@ -854,6 +856,10 @@ public class GenomeAnalysisEngine { final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; + final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? + loadSampleRenameMap(argCollection.sampleRenameMappingFile) : + null; + return new SAMDataSource( samReaderIDs, threadAllocation, @@ -869,9 +875,63 @@ public class GenomeAnalysisEngine { includeReadsWithDeletionAtLoci(), argCollection.defaultBaseQualities, removeProgramRecords, - keepReadsInLIBS); + keepReadsInLIBS, + sampleRenameMap); } + /** + * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory + * HashMap. 
This file must consist of lines with two whitespace-separated fields: + * + * absolute_path_to_bam_file new_sample_name + * + * The engine will verify that each bam file contains reads from only one sample when the on-the-fly sample + * renaming feature is being used. + * + * @param sampleRenameMapFile sample rename map file from which to load data + * @return a HashMap containing the contents of the map file, with the keys being the bam file paths and + * the values being the new sample names. + */ + protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { + logger.info("Renaming samples from BAM files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); + + final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); + + try { + for ( final String line : new XReadLines(sampleRenameMapFile) ) { + final String[] tokens = line.split("\\s+"); + + if ( tokens.length != 2 ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", + tokens.length, line)); + } + + final File bamFile = new File(tokens[0]); + final String newSampleName = tokens[1]; + + if ( ! bamFile.isAbsolute() ) { + throw new UserException.MalformedFile(sampleRenameMapFile, "Bam file path not absolute at line: " + line); + } + + final SAMReaderID bamID = new SAMReaderID(bamFile, new Tags()); + + if ( sampleRenameMap.containsKey(bamID) ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Bam file %s appears more than once", bamFile.getAbsolutePath())); + } + + sampleRenameMap.put(bamID, newSampleName); + } + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); + } + + return sampleRenameMap; + } + + /** * Opens a reference sequence file paired with an index. 
Only public for testing purposes * diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b38f0fc0b..08f892f97 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -35,6 +35,8 @@ import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import java.io.File; import java.util.ArrayList; @@ -119,21 +121,28 @@ public class GATKArgumentCollection { // Downsampling Arguments // // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false) + /** + * Reads will be selected randomly to be removed from the pile based on the method described here. + */ + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus", required = false) public DownsampleType downsamplingType = null; @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; + /** + * For locus-based traversals (LocusWalkers and ActiveRegionWalkers), downsample_to_coverage controls the + * maximum depth of coverage at each locus. 
For read-based traversals (ReadWalkers), it controls the + * maximum number of reads sharing the same alignment start position. For ReadWalkers you will typically need to use + * much lower dcov values than you would with LocusWalkers to see an effect. Note that this downsampling option does + * not produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of the + * to-coverage downsampler is to maintain an even representation of reads from all alignment start positions when + * removing excess coverage. For a truly unbiased random sampling of reads, use -dfrac instead. Also note + * that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling + * algorithm will under some circumstances retain slightly more or less coverage than requested. + */ @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Coverage [integer] to downsample to. For locus-based traversals (eg., LocusWalkers and ActiveRegionWalkers)," + - "this controls the maximum depth of coverage at each locus. For non-locus-based traversals (eg., ReadWalkers), " + - "this controls the maximum number of reads sharing the same alignment start position. Note that this downsampling " + - "option does NOT produce an unbiased random sampling from all available reads at each locus: instead, the primary goal of " + - "the to-coverage downsampler is to maintain an even representation of reads from all alignment start positions " + - "when removing excess coverage. For a true across-the-board unbiased random sampling of reads, use -dfrac instead. 
" + - "Also note that the coverage target is an approximate goal that is not guaranteed to be met exactly: the downsampling " + - "algorithm will under some circumstances retain slightly more coverage than requested.", + doc = "Coverage [integer] to downsample to per locus (for locus walkers) or per alignment start position (for read walkers)", required = false) public Integer downsampleCoverage = null; @@ -281,6 +290,15 @@ public class GATKArgumentCollection { @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false) public boolean keepProgramRecords = false; + @Advanced + @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", + doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file. This option requires that " + + "each BAM file listed in the mapping file have only a single sample specified in its header (though there " + + "may be multiple read groups for that sample). Each line of the mapping file must contain the absolute path " + + "to a BAM file, followed by whitespace, followed by the new sample name for that BAM file.", + required = false) + public File sampleRenameMappingFile = null; + @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; @@ -323,9 +341,9 @@ public class GATKArgumentCollection { public int numberOfIOThreads = 0; /** - * Enable GATK to monitor its own threading efficiency, at a itsy-bitsy tiny + * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for - * debugging purposes. + * debugging purposes. 
Note that this argument is not compatible with -nt, it only works with -nct. */ @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false) public Boolean monitorThreadEfficiency = false; @@ -438,5 +456,28 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // VCF/BCF index parameters + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Specify the Tribble indexing strategy to use for VCFs. + * + * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter + * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter + * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + */ + + @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="which type of IndexCreator to use for VCF/BCF indices",required=false) + @Advanced + public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + + @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator",required=false) + @Advanced + public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java 
b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index adb668ff9..2f03edb68 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -246,6 +246,14 @@ public class BAMScheduler implements Iterator { */ private PeekableIterator bamScheduleIterator = null; + /** + * Clean up underlying BAMSchedule file handles. + */ + public void close() { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + } + /** * Get the next overlapping tree of bins associated with the given BAM file. * @param currentLocus The actual locus for which to check overlap. diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 048ce17f5..b476945ce 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -62,6 +62,9 @@ public class IntervalSharder implements Iterator { wrappedIterator = new PeekableIterator(scheduler); this.parser = parser; } + public void close() { + wrappedIterator.close(); + } public boolean hasNext() { return wrappedIterator.hasNext(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index a36667ec4..9dc9734a5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -31,6 +31,7 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; +import org.broadinstitute.sting.commandline.Tags; 
import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -47,8 +48,10 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; +import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; +import java.io.FileNotFoundException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; @@ -131,6 +134,11 @@ public class SAMDataSource { */ private final Map originalToMergedReadGroupMappings = new HashMap(); + /** + * Mapping from bam file ID to new sample name. Used only when doing on-the-fly sample renaming. + */ + private Map sampleRenameMap = null; + /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(SAMDataSource.class); @@ -202,7 +210,8 @@ public class SAMDataSource { includeReadsWithDeletionAtLoci, (byte) -1, false, - false); + false, + null); } /** @@ -219,6 +228,8 @@ public class SAMDataSource { * bases will be seen in the pileups, and the deletions will be skipped silently. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? + * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. + * Will be null if we're not doing sample renaming. 
*/ public SAMDataSource( Collection samFiles, @@ -235,7 +246,9 @@ public class SAMDataSource { boolean includeReadsWithDeletionAtLoci, byte defaultBaseQualities, boolean removeProgramRecords, - final boolean keepReadsInLIBS) { + final boolean keepReadsInLIBS, + final Map sampleRenameMap) { + this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; @@ -261,6 +274,8 @@ public class SAMDataSource { ReadShard.setReadBufferSize(100000); } + this.sampleRenameMap = sampleRenameMap; + resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); @@ -337,6 +352,14 @@ public class SAMDataSource { resourcePool.releaseReaders(readers); } + public void close() { + SAMReaders readers = resourcePool.getAvailableReaders(); + for(SAMReaderID readerID: readerIDs) { + SAMFileReader reader = readers.getReader(readerID); + reader.close(); + } + } + /** * Returns Reads data structure containing information about the reads data sources placed in this pool as well as * information about how they are downsampled, sorted, and filtered @@ -825,8 +848,31 @@ public class SAMDataSource { if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); Collection headers = new LinkedList(); - for(SAMFileReader reader: readers.values()) - headers.add(reader.getFileHeader()); + + // Examine the bam headers, perform any requested sample renaming on them, and add + // them to the list of headers to pass to the Picard SamFileHeaderMerger: + for ( final Map.Entry readerEntry : readers.entrySet() ) { + final SAMReaderID readerID = readerEntry.getKey(); + final SAMFileReader reader = readerEntry.getValue(); + final SAMFileHeader header = reader.getFileHeader(); + + // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, + // or the user's sample rename map file didn't contain an entry for this bam file: + final String 
remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID) : null; + + // If we've been asked to rename the sample for this bam file, do so now. We'll check to + // make sure this bam only contains reads from one sample before proceeding. + // + // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of + // the existing read group attributes (including sample name) when merging + // headers, regardless of whether there are read group collisions or not. + if ( remappedSampleName != null ) { + remapSampleName(readerID, header, remappedSampleName); + } + + headers.add(header); + } + headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); // update all read groups to GATKSAMRecordReadGroups @@ -837,6 +883,43 @@ public class SAMDataSource { headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); } + /** + * Changes the sample name in the read groups for the provided bam file header to match the + * remappedSampleName. Blows up with a UserException if the header contains more than one + * sample name. + * + * @param readerID ID for the bam file from which the provided header came from + * @param header The bam file header. Will be modified by this call. + * @param remappedSampleName New sample name to replace the existing sample attribute in the + * read groups for the header. 
+ */ + private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { + String firstEncounteredSample = null; + + for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { + final String thisReadGroupSample = readGroup.getSample(); + + if ( thisReadGroupSample == null ) { + throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + + "bam file contains a read group (id: %s) with a null sample attribute", + readerID.getSamFilePath(), readGroup.getId())); + } + else if ( firstEncounteredSample == null ) { + firstEncounteredSample = thisReadGroupSample; + } + else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { + throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + + "however this bam file contains reads from more than one sample " + + "(encountered samples %s and %s in the bam header). The GATK requires that " + + "all bams for which on-the-fly sample renaming is requested " + + "contain reads from only a single sample per bam.", + readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); + } + + readGroup.setSample(remappedSampleName); + } + } + final private void printReaderPerformance(final int nExecutedTotal, final int nExecutedInTick, final int totalNumberOfFiles, diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java index 7efab5fb0..72c037707 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderID.java @@ -98,7 +98,7 @@ public class SAMReaderID implements Comparable { if(!(other instanceof SAMReaderID)) return false; SAMReaderID otherID = (SAMReaderID)other; - return this.samFile.equals(otherID.samFile); + return 
this.getSamFilePath().equals(otherID.getSamFilePath()); } /** @@ -107,7 +107,7 @@ public class SAMReaderID implements Comparable { */ @Override public int hashCode() { - return samFile.hashCode(); + return samFile.getAbsolutePath().hashCode(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java index ff0fa1127..37f1bcfac 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ShardBalancer.java @@ -43,4 +43,7 @@ public abstract class ShardBalancer implements Iterable { this.filePointers = new PeekableIterator(filePointers); this.parser = parser; } + public void close() { + this.filePointers.close(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java new file mode 100644 index 000000000..39bcb96e1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/LibraryReadFilter.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.filters; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.gatk.filters.ReadFilter; + +/** + * Only use reads from the specified library + * + * @author kcibul + * @since Aug 15, 2012 + * + */ + +public class LibraryReadFilter extends ReadFilter { + @Argument(fullName = "library", shortName = "library", doc="The name of the library to keep, filtering out all others", required=true) + private String LIBRARY_TO_KEEP = null; + + public boolean filterOut( final SAMRecord read ) { + final SAMReadGroupRecord readGroup = read.getReadGroup(); + return ( readGroup == null || readGroup.getLibrary() == null || !readGroup.getLibrary().equals( LIBRARY_TO_KEEP ) ); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java index 41ab59845..e576666e1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignMappingQualityFilter.java @@ -32,10 +32,23 @@ import org.broadinstitute.sting.commandline.Argument; * A read filter (transformer) that sets all reads mapping quality to a given value. * *

- * If a BAM file contains erroneous or missing mapping qualities, this 'filter' will set - * all your mapping qualities to a given value. Default being 60. + * If a BAM file contains erroneous or missing mapping qualities (MAPQ), this read transformer will set all your + * mapping qualities to a given value (see arguments list for default value). *

* + *

See also

+ * + *

ReassignOneMappingQualityFilter: reassigns a single MAPQ value, as opposed to all those found in the BAM file.

+ * + *

Caveats

+ * + *

Note that due to the order of operations involved in applying filters, it is possible that other read filters + * (determined either at command-line or internally by the tool you are using) will be applied to your data before + * this read transformation can be applied. If one of those other filters acts on the read mapping quality (MAPQ), + * then you may not obtain the expected results. Unfortunately it is currently not possible to change the order of + * operations from command line. To avoid the problem, we recommend applying this filter separately from any other + * analysis, using PrintReads.

+ * * *

Input

*

@@ -50,9 +63,9 @@ import org.broadinstitute.sting.commandline.Argument; * *

Examples

*
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -rf ReassignMappingQuality
+ *  java -jar GenomeAnalysisTK.jar \
+ *      -T PrintReads \
+ *      -rf ReassignMappingQuality \
  *      -DMQ 35
  *  
* diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java index f31313a86..232b7ed3d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReassignOneMappingQualityFilter.java @@ -32,7 +32,7 @@ import org.broadinstitute.sting.commandline.Argument; * A read filter (transformer) that changes a given read mapping quality to a different value. * *

- * This 'filter' will change a certain read mapping quality to a different value without affecting reads that + * This read transformer will change a certain read mapping quality to a different value without affecting reads that * have other mapping qualities. This is intended primarily for users of RNA-Seq data handling programs such * as TopHat, which use MAPQ = 255 to designate uniquely aligned reads. According to convention, 255 normally * designates "unknown" quality, and most GATK tools automatically ignore such reads. By reassigning a different @@ -46,7 +46,6 @@ import org.broadinstitute.sting.commandline.Argument; * that have no assigned mapping qualities. *

* - * *

Input

*

* BAM file(s) @@ -60,8 +59,8 @@ import org.broadinstitute.sting.commandline.Argument; * *

Examples

*
- *    java
- *      -jar GenomeAnalysisTK.jar
+ *    java -jar GenomeAnalysisTK.jar
+ *      -T PrintReads
  *      -rf ReassignOneMappingQuality
  *      -RMQF 255
  *      -RMQT 60
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java
index 35aba8114..231f46f10 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java
@@ -62,7 +62,7 @@ public class StorageFactory {
      * @param  Type of the stream to create.
      * @return Storage object with a facade of type T.
      */
-        public static  Storage createStorage( Stub stub, File file ) {
+     public static  Storage createStorage( Stub stub, File file ) {
         Storage storage;
 
         if(stub instanceof OutputStreamStub) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java
index 80841bae7..de203e59f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java
@@ -28,17 +28,18 @@ package org.broadinstitute.sting.gatk.io.storage;
 import net.sf.samtools.util.BlockCompressedOutputStream;
 import org.apache.log4j.Logger;
 import org.broad.tribble.AbstractFeatureReader;
+import org.broad.tribble.Feature;
 import org.broad.tribble.FeatureCodec;
 import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub;
 import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
-import org.broadinstitute.variant.bcf2.BCF2Utils;
-import org.broadinstitute.variant.vcf.VCFHeader;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.variant.bcf2.BCF2Utils;
 import org.broadinstitute.variant.variantcontext.VariantContext;
 import org.broadinstitute.variant.variantcontext.writer.Options;
 import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
 import org.broadinstitute.variant.variantcontext.writer.VariantContextWriterFactory;
+import org.broadinstitute.variant.vcf.VCFHeader;
 
 import java.io.*;
 import java.util.Arrays;
@@ -132,14 +133,21 @@ public class VariantContextWriterStorage implements Storage options = stub.getWriterOptions(indexOnTheFly);
-        VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), options);
+        VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options);
 
         // if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both
         // TODO -- remove me when argument generateShadowBCF is removed
         if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) {
             final File bcfFile = BCF2Utils.shadowBCF(file);
             if ( bcfFile != null ) {
-                VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, stub.getMasterSequenceDictionary(), options);
+                FileOutputStream bcfStream;
+                try {
+                    bcfStream = new FileOutputStream(bcfFile);
+                } catch (FileNotFoundException e) {
+                    throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e);
+                }
+
+                VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options);
                 writer = new TestWriter(writer, bcfWriter);
             }
         }
@@ -205,12 +213,11 @@ public class VariantContextWriterStorage implements Storage codec = fd.getCodec();
-            final AbstractFeatureReader source =
-                    AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false);
-            
-            for ( final VariantContext vc : source.iterator() ) {
-                target.writer.add(vc);
+            final FeatureCodec codec = fd.getCodec();
+            final AbstractFeatureReader source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false);
+
+            for ( final Feature vc : source.iterator() ) {
+                target.writer.add((VariantContext) vc);
             }
 
             source.close();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
index 3e3d6de41..9ad388adf 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
@@ -26,6 +26,7 @@
 package org.broadinstitute.sting.gatk.io.stubs;
 
 import net.sf.samtools.SAMSequenceDictionary;
+import org.broad.tribble.index.IndexCreator;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.io.OutputTracker;
 import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
@@ -70,6 +71,17 @@ public class VariantContextWriterStub implements Stub, Var
      */
     private final PrintStream genotypeStream;
 
+    /**
+     * A hack: push the argument sources into the VCF header so that the VCF header
+     * can rebuild the command-line arguments.
+     */
+    private final Collection argumentSources;
+
+    /**
+     * Which IndexCreator to use
+     */
+    private final IndexCreator indexCreator;
+
     /**
      * The cached VCF header (initialized to null)
      */
@@ -80,12 +92,6 @@ public class VariantContextWriterStub implements Stub, Var
      */
     private boolean isCompressed = false;
 
-    /**
-     * A hack: push the argument sources into the VCF header so that the VCF header
-     * can rebuild the command-line arguments.
-     */
-    private final Collection argumentSources;
-
     /**
      * Should the header be written out?  A hidden argument.
      */
@@ -118,6 +124,7 @@ public class VariantContextWriterStub implements Stub, Var
         this.engine = engine;
         this.genotypeFile = genotypeFile;
         this.genotypeStream = null;
+        this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile);
         this.argumentSources = argumentSources;
     }
 
@@ -132,6 +139,7 @@ public class VariantContextWriterStub implements Stub, Var
         this.engine = engine;
         this.genotypeFile = null;
         this.genotypeStream = new PrintStream(genotypeStream);
+        this.indexCreator = null;
         this.argumentSources = argumentSources;
     }
 
@@ -175,6 +183,10 @@ public class VariantContextWriterStub implements Stub, Var
         this.forceBCF = forceBCF;
     }
 
+    public IndexCreator getIndexCreator() {
+        return indexCreator;
+    }
+
     /**
      * Gets the master sequence dictionary from the engine associated with this stub
      * @link GenomeAnalysisEngine.getMasterSequenceDictionary
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index 67d72189c..e8c8896f6 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -71,10 +71,10 @@ import java.util.zip.GZIPOutputStream;
  * @since 2010
  */
 public class GATKRunReport {
-    protected static final String REPORT_BUCKET_NAME = "GATK_Run_Reports";
-    protected static final String TEST_REPORT_BUCKET_NAME = "GATK_Run_Reports_Test";
-    protected final static String AWS_ACCESS_KEY_MD5 = "43433e5488d60788042ed5de3dcf9b0a";
-    protected final static String AWS_SECRET_KEY_MD5 = "0aa28b227ecacbdc9d2d5e8d82b10d32";
+    protected static final String REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports";
+    protected static final String TEST_REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports.test";
+    protected final static String AWS_ACCESS_KEY_MD5 = "34d4a26eb2062b3f06e833b28f9a38c6";
+    protected final static String AWS_SECRET_KEY_MD5 = "83f2332eec99ef1d7425d5dc5d4b514a";
 
     private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss");
 
@@ -525,7 +525,7 @@ public class GATKRunReport {
                         }
                         break;
                     case NORMAL:
-                        // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
+                        // IAM GATK user credentials -- only right is to PutObject into broad.gsa.gatk.run.reports bucket
                         final S3Service s3Service = initializeAWSService(getAWSUploadAccessKey(), getAWSUploadSecretKey());
 
                         // Create an S3Object based on a file, with Content-Length set automatically and
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key b/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key
index 5b3e0c2ec..28f2cd0b8 100644
Binary files a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key and b/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_access.key differ
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key b/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key
index 9f1224a54..5c289a838 100644
Binary files a/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key and b/public/java/src/org/broadinstitute/sting/gatk/phonehome/resources/GATK_AWS_secret.key differ
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java
index 68e751521..60b6f4683 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java
@@ -225,10 +225,20 @@ public class FeatureManager  {
         docs.append(String.format(format, nameHeader, featureHeader, docHeader));
         for ( final FeatureDescriptor descriptor : featureDescriptors ) {
             if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) {
-                String oneDoc = String.format(format,
-                        descriptor.getName(),
-                        descriptor.getSimpleFeatureName(),
-                        GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass()));
+                final String DocURL = GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass());
+                final String oneDoc;
+                if ( DocURL.contains("_sting_") ) {
+                    oneDoc = String.format(format,
+                            descriptor.getName(),
+                            descriptor.getSimpleFeatureName(),
+                            DocURL);
+                } else {
+                    oneDoc = String.format(format,
+                            descriptor.getName(),
+                            descriptor.getSimpleFeatureName(),
+                            "(this is an external codec and is not documented within GATK)");
+                }
+
                 docs.append(oneDoc);
             }
         }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java
index e9ccebf34..5e7c3ec86 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportGatherer.java
@@ -42,23 +42,21 @@ public class GATKReportGatherer extends Gatherer {
         try {
             o = new PrintStream(output);
         } catch (FileNotFoundException e) {
-            throw new UserException("File to be output by CoverageByRG Gather function was not found");
+            throw new UserException(String.format("File %s to be output by GATKReportGatherer function was not found", output));
         }
 
         GATKReport current = new GATKReport();
         boolean isFirst = true;
         for (File input : inputs) {
-
-            // If the table is empty
             if (isFirst) {
                 current = new GATKReport(input);
                 isFirst = false;
             } else {
-                GATKReport toAdd = new GATKReport(input);
-                current.concat(toAdd);
+                current.concat(new GATKReport(input));
             }
         }
 
         current.print(o);
+        o.close();
     }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
index b85365366..f8628bb78 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java
@@ -86,7 +86,7 @@ public final class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList();
+    private final LinkedList workQueue = new LinkedList<>();
 
     private TAROrderedReadCache myReads = null;
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
index 962f81d0d..f60b7de54 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java
@@ -83,6 +83,7 @@ public abstract class ActiveRegionWalker extends Walker> activeRegionBindings = null;
 
@@ -179,4 +180,6 @@ public abstract class ActiveRegionWalker extends WalkerThe allele balance is the fraction of ref bases over ref + alt bases.

+ * + *

Caveats

+ *

Note that this annotation will only work properly for biallelic samples that are called as heterozygous.

*/ public class AlleleBalance extends InfoFieldAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 608257b54..f5930078f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -51,7 +51,7 @@ import java.util.List; *

The allele balance is the fraction of ref bases over ref + alt bases.

* *

Caveats

- *

Note that this annotation will only work properly for biallelic het-called samples.

+ *

Note that this annotation will only work properly for biallelic samples that are called as heterozygous.

*

This is an experimental annotation. As such, it is unsupported; we do not make any guarantees that it will work properly, and you use it at your own risk.

*/ public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java new file mode 100644 index 000000000..12e923b8f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtil.java @@ -0,0 +1,154 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.EffectType; + +import java.util.*; +/** + * Created with IntelliJ IDEA. + * User: farjoun + * Date: 6/5/13 + * Time: 12:06 PM + * To change this template use File | Settings | File Templates. 
+ */ + +/* This class holds a tree representation of the annotations used in snpEff, and provides a mechanism for telling if a +given annotation is a descendant of another. +The idea is to be able to stratify effects by large branches and not only the specific +snpEff annotation that a variant might have. For example if we want to know whether a variant is in CDS +but if it's marked SYNONYMOUS_CODING or NON_SYNONYMOUS_CODING (or many other options) still imply that it's in the CDS. + +The hierarchy was determined by Yossi Farjoun with input from Pablo (SNPEFF) and Tim Fennel. +*/ + + +public class SnpEffUtil { + + // A map holding, for every child, its parent. + // A node that isn't a key node is a root node. + static private final Map snpEffectGraph = new HashMap<>(); + + //A map from each value of EffectType to a set of its ancestors + static private final Map> snpEffectAncestorSet = new HashMap<>(); + + static { + + + //INTERGENIC + snpEffectGraph.put(EffectType.UPSTREAM,EffectType.INTERGENIC); + snpEffectGraph.put(EffectType.DOWNSTREAM,EffectType.INTERGENIC); + snpEffectGraph.put(EffectType.INTERGENIC_CONSERVED,EffectType.INTERGENIC); + + //INTRON + snpEffectGraph.put(EffectType.INTRON_CONSERVED,EffectType.INTRON); + snpEffectGraph.put(EffectType.SPLICE_SITE_ACCEPTOR,EffectType.INTRON); + snpEffectGraph.put(EffectType.SPLICE_SITE_DONOR,EffectType.INTRON); + + //CDS + snpEffectGraph.put(EffectType.EXON_DELETED,EffectType.CDS); + snpEffectGraph.put(EffectType.SYNONYMOUS_CODING,EffectType.CDS); + snpEffectGraph.put(EffectType.NON_SYNONYMOUS_CODING,EffectType.CDS); + + //SYNONYMOUS_CODING + snpEffectGraph.put(EffectType.SYNONYMOUS_STOP,EffectType.SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.SYNONYMOUS_START,EffectType.SYNONYMOUS_CODING); + + //NON_SYNONYMOUS_CODING + snpEffectGraph.put(EffectType.START_LOST,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.STOP_GAINED,EffectType.NON_SYNONYMOUS_CODING); + 
snpEffectGraph.put(EffectType.STOP_LOST,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.CODON_CHANGE,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.CODON_INSERTION,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.CODON_DELETION,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.CODON_CHANGE_PLUS_CODON_DELETION,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.CODON_CHANGE_PLUS_CODON_INSERTION,EffectType.NON_SYNONYMOUS_CODING); + snpEffectGraph.put(EffectType.FRAME_SHIFT,EffectType.NON_SYNONYMOUS_CODING); + + //UTRs + snpEffectGraph.put(EffectType.UTR_5_DELETED,EffectType.UTR_5_PRIME); + snpEffectGraph.put(EffectType.UTR_3_DELETED,EffectType.UTR_3_PRIME); + snpEffectGraph.put(EffectType.START_GAINED,EffectType.UTR_5_PRIME); + + //EXON + snpEffectGraph.put(EffectType.UTR_5_PRIME,EffectType.EXON); + snpEffectGraph.put(EffectType.UTR_3_PRIME,EffectType.EXON); + snpEffectGraph.put(EffectType.CDS,EffectType.EXON); + + + //TRANSCRIPT + snpEffectGraph.put(EffectType.INTRON,EffectType.TRANSCRIPT); + snpEffectGraph.put(EffectType.EXON,EffectType.TRANSCRIPT); + + //GENE + snpEffectGraph.put(EffectType.TRANSCRIPT,EffectType.GENE); + snpEffectGraph.put(EffectType.REGULATION,EffectType.GENE); + + //CHROMOSOME + snpEffectGraph.put(EffectType.GENE,EffectType.CHROMOSOME); + snpEffectGraph.put(EffectType.INTERGENIC,EffectType.CHROMOSOME); + } + + //A helper function that gets the parent set of the set of children + private static Set getParentSet(final Set children){ + final Set parents=new HashSet<>(); + for(EffectType child:children){ + final EffectType parent = snpEffectGraph.get(child); + if(parent!=null) parents.add(parent); + } + return parents; + } + + //builds the total set of ancestors of a given node + private static Set getAncestorSet(final EffectType child, final boolean isSelfIncluded){ + + final Set ancestors=new HashSet<>(); + if(isSelfIncluded) ancestors.add(child); + + Set 
untraversedNodes=Collections.singleton(child); + + while(!untraversedNodes.isEmpty()){ + final Set putativeParents = getParentSet(untraversedNodes); //get immediate parents of unexamined set + putativeParents.removeAll(ancestors); //remove all known parents, remaining with previously unknown parents + ancestors.addAll(putativeParents); // add these parents to growing list of ancestors + untraversedNodes=putativeParents; //still need to traverse parents of these nodes + } + return ancestors; + } + + //returns true if the child effect is a subType of the parentEffect (including itself) + public static boolean isSubTypeOf(final SnpEff.EffectType childEffect, final SnpEff.EffectType parentEffect){ + + Set ancestorSet=snpEffectAncestorSet.get(childEffect); + + if(ancestorSet==null) { //lazy population of map. + ancestorSet = new HashSet<>(); + ancestorSet.addAll(getAncestorSet(childEffect, true)); //"true" so that a type is considered a subtype of itself + snpEffectAncestorSet.put(childEffect, ancestorSet); + } + return ancestorSet.contains(parentEffect); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 10ba4ca17..f2f808cad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; @@ -83,6 +84,7 @@ import java.util.*; @Requires(value={}) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=250) @By(DataSource.REFERENCE) public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @@ -132,21 +134,21 @@ public class VariantAnnotator extends RodWalker implements Ann * See the -list argument to view available annotations. */ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList(); + protected List annotationsToUse = new ArrayList<>(); /** * Note that this argument has higher priority than the -A or -G arguments, * so annotations will be excluded even if they are explicitly included with the other options. */ @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(); + protected List annotationsToExclude = new ArrayList<>(); /** * If specified, all available annotations in the group will be applied. See the VariantAnnotator -list argument to view available groups. * Keep in mind that RODRequiringAnnotations are not intended to be used as a group, because they require specific ROD inputs. */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected List annotationGroupsToUse = new ArrayList(); + protected List annotationGroupsToUse = new ArrayList<>(); /** * This option enables you to add annotations from one VCF to another. 
@@ -193,8 +195,8 @@ public class VariantAnnotator extends RodWalker implements Ann } // get the list of all sample names from the variant VCF input rod, if applicable - List rodName = Arrays.asList(variantCollection.variants.getName()); - Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); + final List rodName = Arrays.asList(variantCollection.variants.getName()); + final Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName); if ( USE_ALL_ANNOTATIONS ) engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit()); @@ -204,23 +206,23 @@ public class VariantAnnotator extends RodWalker implements Ann // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones - Set hInfo = new HashSet(); + final Set hInfo = new HashSet<>(); hInfo.addAll(engine.getVCFAnnotationDescriptions()); - for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } // for the expressions, pull the info header line from the header of the resource rod - for ( VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { + for ( final VariantAnnotatorEngine.VAExpression expression : engine.getRequestedExpressions() ) { // special case the ID field if ( expression.fieldName.equals("ID") ) { hInfo.add(new VCFInfoHeaderLine(expression.fullName, 1, VCFHeaderLineType.String, "ID field transferred from external VCF resource")); continue; } VCFInfoHeaderLine targetHeaderLine = null; - for ( VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(expression.binding.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), 
Arrays.asList(expression.binding.getName())) ) { if ( line instanceof VCFInfoHeaderLine ) { - VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; + final VCFInfoHeaderLine infoline = (VCFInfoHeaderLine)line; if ( infoline.getID().equals(expression.fieldName) ) { targetHeaderLine = infoline; break; @@ -285,7 +287,7 @@ public class VariantAnnotator extends RodWalker implements Ann Map stratifiedContexts; if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) { stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup()); - annotatedVCs = new ArrayList(VCs.size()); + annotatedVCs = new ArrayList<>(VCs.size()); for ( VariantContext vc : VCs ) annotatedVCs.add(engine.annotateContext(tracker, ref, stratifiedContexts, vc)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 078a36dd9..25e683c2f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -58,15 +58,15 @@ public class VariantAnnotatorEngine { public RodBinding binding; public VAExpression(String fullExpression, List> bindings) { - int indexOfDot = fullExpression.lastIndexOf("."); + final int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); fullName = fullExpression; fieldName = fullExpression.substring(indexOfDot+1); - String bindingName = fullExpression.substring(0, indexOfDot); - for ( RodBinding rod : bindings ) { + final String bindingName = fullExpression.substring(0, indexOfDot); + for ( final RodBinding rod : bindings ) { if ( rod.getName().equals(bindingName) ) { binding = rod; break; @@ -96,7 +96,7 @@ public class VariantAnnotatorEngine { // 
select specific expressions to use public void initializeExpressions(Set expressionsToUse) { // set up the expressions - for ( String expression : expressionsToUse ) + for ( final String expression : expressionsToUse ) requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } @@ -113,15 +113,15 @@ public class VariantAnnotatorEngine { if ( annotationsToExclude.size() == 0 ) return; - List tempRequestedInfoAnnotations = new ArrayList(requestedInfoAnnotations.size()); - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) { + final List tempRequestedInfoAnnotations = new ArrayList<>(requestedInfoAnnotations.size()); + for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) { if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) tempRequestedInfoAnnotations.add(annotation); } requestedInfoAnnotations = tempRequestedInfoAnnotations; - List tempRequestedGenotypeAnnotations = new ArrayList(requestedGenotypeAnnotations.size()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + final List tempRequestedGenotypeAnnotations = new ArrayList<>(requestedGenotypeAnnotations.size()); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) tempRequestedGenotypeAnnotations.add(annotation); } @@ -143,24 +143,24 @@ public class VariantAnnotatorEngine { variantOverlapAnnotator = new VariantOverlapAnnotator(dbSNPBinding, overlapBindings, engine.getGenomeLocParser()); } - public void invokeAnnotationInitializationMethods( Set headerLines ) { - for ( VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { + public void invokeAnnotationInitializationMethods( final Set headerLines ) { + for ( final VariantAnnotatorAnnotation annotation : requestedInfoAnnotations ) { annotation.initialize(walker, toolkit, headerLines); } - for ( VariantAnnotatorAnnotation annotation : 
requestedGenotypeAnnotations ) { + for ( final VariantAnnotatorAnnotation annotation : requestedGenotypeAnnotations ) { annotation.initialize(walker, toolkit, headerLines); } } public Set getVCFAnnotationDescriptions() { - Set descriptions = new HashSet(); + final Set descriptions = new HashSet<>(); - for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) + for ( final InfoFieldAnnotation annotation : requestedInfoAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) descriptions.addAll(annotation.getDescriptions()); - for ( String db : variantOverlapAnnotator.getOverlapNames() ) { + for ( final String db : variantOverlapAnnotator.getOverlapNames() ) { if ( VCFStandardHeaderLines.getInfoLine(db, false) != null ) descriptions.add(VCFStandardHeaderLines.getInfoLine(db)); else @@ -170,10 +170,10 @@ public class VariantAnnotatorEngine { return descriptions; } - public VariantContext annotateContext(final RefMetaDataTracker tracker, - final ReferenceContext ref, - final Map stratifiedContexts, - VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { return annotateContext(tracker, ref, stratifiedContexts, vc, null); } @@ -182,20 +182,20 @@ public class VariantAnnotatorEngine { final Map stratifiedContexts, final VariantContext vc, final Map perReadAlleleLikelihoodMap) { - Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); + final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // annotate expressions where available annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = 
annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); + for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + final Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); if ( annotationsFromCurrentType != null ) infoAnnotations.putAll(annotationsFromCurrentType); } // generate a new annotated VC - VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process final VariantContext annotated = builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); @@ -210,11 +210,11 @@ public class VariantAnnotatorEngine { final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { + for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { if ( !(annotationType instanceof ActiveRegionBasedAnnotation) ) continue; - Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); + final Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); } @@ -244,12 +244,12 @@ public class VariantAnnotatorEngine { } private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { - for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, loc); + for ( final VAExpression expression : requestedExpressions ) { + final Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) 
continue; - VariantContext vc = VCs.iterator().next(); + final VariantContext vc = VCs.iterator().next(); // special-case the ID field if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java index 6af6723f2..e30965925 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java @@ -84,7 +84,8 @@ import java.io.PrintStream; *

*

Examples

*
- *     -T CallableLociWalker \
+ *  java -jar GenomeAnalysisTK.jar \
+ *     -T CallableLoci \
  *     -I my.bam \
  *     -summary my.summary \
  *     -o my.bed
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
index c4ef4d23b..ca3255097 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java
@@ -102,7 +102,7 @@ import java.util.*;
  * 
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T Coverage \
+ *   -T DepthOfCoverage \
  *   -o file_name_base \
  *   -I input_bams.list
  *   [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
index 86676ca54..42e3ae0c0 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java
@@ -44,10 +44,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import java.io.PrintStream;
 
 /**
- * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine)
+ * Compute the read error rate per position
  *
- * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read
- * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS.
+ * 

This tool computes the read error rate per position in sequence reads. It does this in the original 5'->3' + * orientation that the read had coming off the machine. It then emits a GATKReport containing readgroup, cycle, + * mismatches, counts, qual, and error rate for each read group in the input BAMs.

* *

Input

*

@@ -56,9 +57,9 @@ import java.io.PrintStream; * *

Output

*

- * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. + * A GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. * - * For example, running this tool on the NA12878 data sets: + * For example, running this tool on the NA12878 data sets yields the following table: * *

  *      ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
@@ -82,16 +83,20 @@ import java.io.PrintStream;
  *      
*

* - *

Examples

+ *

Example

*
  *    java
  *      -jar GenomeAnalysisTK.jar
  *      -T ErrorRatePerCycle
- *      -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
- *      -R bundle/current/b37/human_g1k_v37.fasta
- *      -o example.gatkreport.txt
+ *      -R human_g1k_v37.fasta
+ *      -I my_sequence_reads.bam
+ *      -o error_rates.gatkreport.txt
  *  
* + *

Caveat

+ * + *

Note that when it is run on paired-end sequence data, this tool only uses the first read in a pair.

+ * * @author Kiran Garimella, Mark DePristo */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java index a269a94bc..796c817ff 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -38,7 +38,10 @@ import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.TreeMap; /** * Outputs the read lengths of all the reads in a file. @@ -77,51 +80,101 @@ public class ReadLengthDistribution extends ReadWalker { @Output public PrintStream out; - private GATKReport report; + //A map from RG to its column number (its index in an int[] array) + private Map readGroupsLocation; + //Each line in the table is a read length and each column it the number of reads of a specific RG with that length. Thus a table is a map between read lengths to array of values (one for each RG). + private Map table; + private List readGroups; public void initialize() { - final List readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + readGroups = getToolkit().getSAMFileHeader().getReadGroups(); + readGroupsLocation = new HashMap<>(); + table = new TreeMap<>(); + int readGroupsNum = 0; - report = new GATKReport(); - report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroups.isEmpty() ? 
1 : readGroups.size())); - GATKReportTable table = report.getTable("ReadLengthDistribution"); - - table.addColumn("readLength"); - - if (readGroups.isEmpty()) - table.addColumn("SINGLE_SAMPLE"); - else - for (SAMReadGroupRecord rg : readGroups) - table.addColumn(rg.getSample()); - } - - public boolean filter(ReferenceContext ref, GATKSAMRecord read) { - return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag()); + if (!readGroups.isEmpty()){ + for (SAMReadGroupRecord rg : readGroups){ + readGroupsLocation.put(rg,readGroupsNum); + readGroupsNum++; + } + } } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) { - GATKReportTable table = report.getTable("ReadLengthDistribution"); + public Integer map(final ReferenceContext referenceContext,final GATKSAMRecord samRecord,final RefMetaDataTracker RefMetaDataTracker) { - int length = Math.abs(samRecord.getReadLength()); - String sample = samRecord.getReadGroup().getSample(); + final int length = Math.abs(samRecord.getReadLength()); + final SAMReadGroupRecord rg = samRecord.getReadGroup(); - table.increment(length, sample); + increment(table,length, rg); return null; } + final private void increment(final Map table,final int length,final SAMReadGroupRecord rg){ + if(readGroupsLocation.isEmpty()){ + if(table.containsKey(length)) + table.get(length)[0]++; + else{ + final int[] newLength = {1}; + table.put(length,newLength); + } + } + else{ + final int rgLocation = readGroupsLocation.get(rg); + if(table.containsKey(length)) + table.get(length)[rgLocation]++; + else{ + table.put(length,new int[readGroupsLocation.size()]); + table.get(length)[rgLocation]++; + } + } + } + @Override public Integer reduceInit() { return null; } @Override - public Integer reduce(Integer integer, Integer integer1) { + public Integer reduce(final Integer integer,final Integer integer1) { return null; } - public void onTraversalDone(Integer 
sum) { + public void onTraversalDone(final Integer sum) { + final GATKReport report = createGATKReport(); report.print(out); } + + final private GATKReport createGATKReport(){ + final GATKReport report = new GATKReport(); + report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroupsLocation.isEmpty() ? 1 : readGroupsLocation.size())); + final GATKReportTable tableReport = report.getTable("ReadLengthDistribution"); + + tableReport.addColumn("readLength"); + + if (readGroupsLocation.isEmpty()){ + tableReport.addColumn("SINGLE_SAMPLE"); + int rowIndex = 0; + for (Integer length : table.keySet()){ + tableReport.set(rowIndex,0,length); + tableReport.set(rowIndex,1,table.get(length)[0]); + rowIndex++; + } + } + else{ + for (SAMReadGroupRecord rg : readGroups) + tableReport.addColumn(rg.getSample()); + int rowIndex = 0; + for (Integer length : table.keySet()){ + tableReport.set(rowIndex,0,length); + for (int i=0; i < readGroupsLocation.size(); i++) + tableReport.set(rowIndex,i+1,table.get(length)[i]); + rowIndex++; + } + + } + + return report; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java index 83d4d81d0..5ef1f0cf0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltration.java @@ -89,6 +89,8 @@ public class VariantFiltration extends RodWalker { /** * Any variant which overlaps entries from the provided mask rod will be filtered. If the user wants logic to be reversed, * i.e. filter variants that do not overlap with provided mask, then argument -filterNotInMask can be used. + * Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used + * (e.g. 
if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). */ @Input(fullName="mask", shortName="mask", doc="Input ROD mask", required=false) public RodBinding mask; @@ -138,6 +140,13 @@ public class VariantFiltration extends RodWalker { @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false) protected Integer MASK_EXTEND = 0; + + /** + * When using the -mask argument, the maskName will be annotated in the variant record. + * Note that when using the -filterNotInMask argument to reverse the masking logic, + * it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used + * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). + */ @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false) protected String MASK_NAME = "Mask"; @@ -145,6 +154,8 @@ public class VariantFiltration extends RodWalker { * By default, if the -mask argument is used, any variant falling in a mask will be filtered. * If this argument is used, logic is reversed, and variants falling outside a given mask will be filtered. * Use case is, for example, if we have an interval list or BED file with "good" sites. + * Note that it is up to the user to adapt the name of the mask to make it clear that the reverse logic was used + * (e.g. if masking against Hapmap, use -maskName=hapmap for the normal masking and -maskName=not_hapmap for the reverse masking). 
*/ @Argument(fullName="filterNotInMask", shortName="filterNotInMask", doc="Filter records NOT in given input mask.", required=false) protected boolean filterRecordsNotInMask = false; @@ -219,7 +230,7 @@ public class VariantFiltration extends RodWalker { filterExps = VariantContextUtils.initializeMatchExps(FILTER_NAMES, FILTER_EXPS); genotypeFilterExps = VariantContextUtils.initializeMatchExps(GENOTYPE_FILTER_NAMES, GENOTYPE_FILTER_EXPS); - VariantContextUtils.engine.setSilent(true); + VariantContextUtils.engine.get().setSilent(true); initializeVcfWriter(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index 65f82efe4..7c2d19d30 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -92,7 +92,7 @@ public class CountRODs extends RodWalker> rods = Collections.emptyList(); @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java index 879022299..78029eb85 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/readutils/ClipReads.java @@ -57,36 +57,34 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences. + * Read clipping based on quality, position or sequence matching * + *

This tool provides simple, powerful read clipping capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.

* - *

- * It allows the user to clip bases in reads with poor quality scores, that match particular - * sequences, or that were generated by particular machine cycles. + *

There are three options for clipping (quality, position and sequence), which can be used alone or in combination. You can also specify a clipping representation, which determines exactly how ClipReads applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at least one of the three clipping options; specifying a clipping representation alone is not sufficient. If you do not specify a clipping option, the program will run but it will not do anything to your reads.

* *
*
Quality score based clipping
*
* Clip bases from the read in clipper from - *
argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
- * to the end of the read. This is blatantly stolen from BWA. + *
argmax_x{ \sum_{i = x + 1}^l (qTrimmingThreshold - qual_i) }
+ * to the end of the read. This is copied from BWA. * * Walk through the read from the end (in machine cycle order) to the beginning, calculating the * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the * clipping index in the read (from the end). - *
+ *
*
Cycle based clipping
*
Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12. - *
+ *
*
Sequence matching
*
Clips bases from the read that exactly match one of a number of base sequences. This employs an exact match algorithm, * filtering only bases whose sequence exactly matches SEQ.
*
* - *

* *

Input

*

@@ -99,7 +97,7 @@ import java.util.regex.Pattern; * operation applied to each read. *

*

- *

Summary output

+ *

Summary output (console)

*
  *     Number of examined reads              13
  *     Number of clipped reads               13
@@ -113,16 +111,29 @@ import java.util.regex.Pattern;
  *     
*

* - *

- *

Example clipping

- * Suppose we are given this read: + *

Example

+ *
+ *   java -jar GenomeAnalysisTK.jar \
+ *     -T ClipReads \
+ *     -R reference.fasta \
+ *     -I original.bam \
+ *     -o clipped.bam \
+ *     -XF seqsToClip.fasta \
+ *     -X CCCCC \
+ *     -CT "1-5,11-15" \
+ *     -QT 10
+ * 
+ *

The command line shown above will apply all three options in combination. See the detailed examples below to see how the choice of clipping representation affects the output.

+ * + *

Detailed clipping examples

+ *

Suppose we are given this read:

*
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
* - * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get: + *

If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:

* *
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
@@ -130,26 +141,20 @@ import java.util.regex.Pattern;
  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
* - * Whereas with -CR WRITE_Q0S: + *

Whereas with -QT 10 -CR WRITE_Q0S:

*
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
  *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
* - * Or -CR SOFTCLIP_BASES: + *

Or -QT 10 -CR SOFTCLIP_BASES:

*
  *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
  *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
  *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
  *     
- *

* - *

Examples

- *
- *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
- *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
- * 
* @author Mark DePristo * @since 2010 @@ -158,10 +163,9 @@ import java.util.regex.Pattern; @Requires({DataSource.READS}) public class ClipReads extends ReadWalker { /** - * If provided, ClipReads will write summary statistics about the clipping operations applied - * to the reads to this file. + * If provided, ClipReads will write summary statistics about the clipping operations applied to the reads in this file. */ - @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false, defaultToStdout = false) + @Output(fullName = "outputStatistics", shortName = "os", doc = "File to output statistics", required = false, defaultToStdout = false) PrintStream out = null; /** @@ -305,7 +309,7 @@ public class ClipReads extends ReadWalker * + *

Caveat

+ * + *

Some stratifications and evaluators are incompatible with each other due to their respective memory requirements, such as AlleleCount and VariantSummary, or Sample and VariantSummary. + * If you specify such a combination, the program will output an error message and ask you to disable one of these options. + * We do not currently provide an exhaustive list of incompatible combinations, so we recommend trying out combinations that you are interested in on a dummy command line, to rapidly ascertain whether it will work or not.

+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-50, stop=50)) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 33a5a9fc9..63c34586e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -197,6 +197,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval { break; case MIXED: break; + case UNAVAILABLE: + break; default: throw new ReviewedStingException("BUG: Unexpected genotype type: " + g); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java new file mode 100644 index 000000000..f393da6ad --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/SnpEffPositionModifier.java @@ -0,0 +1,86 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.EffectType; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.InfoFieldKey; +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEffUtil; +import org.broadinstitute.variant.variantcontext.VariantContext; + +import java.util.ArrayList; +import java.util.List; + +/** + * Stratifies variants as genes or coding regions, according to the effect modifier, as indicated by snpEff. + * The 'gene' category includes category 'coding region', and additionally includes introns. 'Coding regions' + * includes transcripts and, implicitly, UTRs. 
+ */ +public class SnpEffPositionModifier extends VariantStratifier { + + public enum PositionModifier { + GENE, // EXON + CODING_REGION, // CDS + SPLICE_SITE, // not a straight translation -- see getRelevantStates + STOP_GAINED, // STOP_GAINED + STOP_LOST // STOP_LOST + } + + @Override + public void initialize() { + for (final PositionModifier type : PositionModifier.values()) states.add(type.name()); + } + + @Override + public List getRelevantStates( + final ReferenceContext ref, + final RefMetaDataTracker tracker, + final VariantContext comp, + final String compName, + final VariantContext eval, + final String evalName, + final String sampleName) + { + final List relevantStates = new ArrayList(); + if (eval != null && eval.isVariant() && eval.hasAttribute(InfoFieldKey.EFFECT_KEY.getKeyName())) { + final SnpEff.EffectType effectType = SnpEff.EffectType.valueOf( + eval.getAttribute(InfoFieldKey.EFFECT_KEY.getKeyName()).toString()); + + if (SnpEffUtil.isSubTypeOf(effectType, EffectType.EXON)) relevantStates.add(PositionModifier.GENE.name()); + if (SnpEffUtil.isSubTypeOf(effectType, EffectType.CDS)) relevantStates.add(PositionModifier.CODING_REGION.name()); + if (SnpEffUtil.isSubTypeOf(effectType, EffectType.STOP_GAINED)) relevantStates.add(PositionModifier.STOP_GAINED.name()); + if (SnpEffUtil.isSubTypeOf(effectType, EffectType.STOP_LOST)) relevantStates.add(PositionModifier.STOP_LOST.name()); + + if (SnpEffUtil.isSubTypeOf(effectType, EffectType.SPLICE_SITE_ACCEPTOR) || + SnpEffUtil.isSubTypeOf(effectType, EffectType.SPLICE_SITE_DONOR)) + relevantStates.add(PositionModifier.SPLICE_SITE.name()); + } + + return relevantStates; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 45dbc937d..1362b109e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -164,6 +164,9 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false) public boolean minimalVCF = false; + @Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the combining procedure", required=false) + public boolean EXCLUDE_NON_VARIANTS = false; + /** * Set to 'null' if you don't want the set field emitted. */ @@ -171,7 +174,7 @@ public class CombineVariants extends RodWalker implements Tree public String SET_KEY = "set"; /** - * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.. + * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime. 
*/ @Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false) public boolean ASSUME_IDENTICAL_SAMPLES = false; @@ -188,6 +191,9 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; + @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false) + public boolean COMBINE_ANNOTATIONS = false; + private List priority = null; /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ @@ -229,8 +235,6 @@ public class CombineVariants extends RodWalker implements Tree vcfWriter.writeHeader(vcfHeader); } - - private void validateAnnotateUnionArguments() { Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); @@ -238,7 +242,7 @@ public class CombineVariants extends RodWalker implements Tree throw new UserException.MissingArgument("rod_priority_list", "Priority string must be provided if you want to prioritize genotypes"); if ( PRIORITY_STRING != null){ - priority = new ArrayList(Arrays.asList(PRIORITY_STRING.split(","))); + priority = new ArrayList<>(Arrays.asList(PRIORITY_STRING.split(","))); if ( rodNames.size() != priority.size() ) throw new UserException.BadArgumentValue("rod_priority_list", "The priority list must contain exactly one rod binding per ROD provided to the GATK: rodNames=" + rodNames + " priority=" + priority); @@ -252,13 +256,16 @@ public class CombineVariants extends RodWalker implements Tree if ( tracker == null ) // 
RodWalkers can make funky map calls return 0; - Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); + final Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); + Collection potentialRefVCs = tracker.getValues(variants); + potentialRefVCs.removeAll(vcs); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); + potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs); } if ( ASSUME_IDENTICAL_SAMPLES ) { @@ -270,7 +277,7 @@ public class CombineVariants extends RodWalker implements Tree } int numFilteredRecords = 0; - for (VariantContext vc : vcs) { + for (final VariantContext vc : vcs) { if (vc.filtersWereApplied() && vc.isFiltered()) numFilteredRecords++; } @@ -278,16 +285,16 @@ public class CombineVariants extends RodWalker implements Tree if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - List mergedVCs = new ArrayList(); + final List mergedVCs = new ArrayList<>(); if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.BY_TYPE) { - Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); + final Map> VCsByType = GATKVariantContextUtils.separateVariantContextsByType(vcs); // TODO -- clean this up in a refactoring // merge NO_VARIATION into another type of variant (based on the ordering in VariantContext.Type) if ( VCsByType.containsKey(VariantContext.Type.NO_VARIATION) && VCsByType.size() > 1 ) { final List refs = VCsByType.remove(VariantContext.Type.NO_VARIATION); - for ( VariantContext.Type type : VariantContext.Type.values() ) { + for ( final VariantContext.Type type : VariantContext.Type.values() ) { if ( VCsByType.containsKey(type) ) { VCsByType.get(type).addAll(refs); break; @@ -296,23 +303,27 @@ public class 
CombineVariants extends RodWalker implements Tree } // iterate over the types so that it's deterministic - for (VariantContext.Type type : VariantContext.Type.values()) { - if (VCsByType.containsKey(type)) - mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + for (final VariantContext.Type type : VariantContext.Type.values()) { + // make sure that it is a variant or in case it is not, that we want to include the sites with no variants + if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { + if (VCsByType.containsKey(type)) { + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs, + priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + } + } } } else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs, priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); } else { logger.warn("Ignoring all records at site " + ref.getLocus()); } - for ( VariantContext mergedVC : mergedVCs ) { + for ( final VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; @@ -320,9 +331,12 @@ public class CombineVariants extends RodWalker implements Tree final VariantContextBuilder builder = new VariantContextBuilder(mergedVC); // re-compute chromosome counts VariantContextUtils.calculateChromosomeCounts(builder, false); + if ( 
minimalVCF ) GATKVariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(builder.make()); + final VariantContext vc = builder.make(); + if( !EXCLUDE_NON_VARIANTS || vc.isPolymorphicInSamples() ) + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index e61cda765..2b18eda20 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -46,6 +46,13 @@ import java.util.*; /** * Filters a lifted-over VCF file for ref bases that have been changed. + * + * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. + * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. + * + * FilteredLiftedVariants is intended to be the second of two processing steps for the liftover process. The first step is to run LiftoverVariants on your VCF file. + * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. 
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) @@ -70,32 +77,49 @@ public class FilterLiftedVariants extends RodWalker { writer.writeHeader(vcfHeader); } - private void filterAndWrite(byte[] ref, VariantContext vc) { + /** + * Determines whether records should be filtered; if not, writes them to the output + * + * @param ref the reference context + * @param vc the VariantContext to process + * @return true if the record is not filtered, false otherwise + */ + protected boolean filterOrWrite(final byte[] ref, final VariantContext vc) { + if ( ref == null ) throw new IllegalArgumentException("Cannot filter based on a null reference array"); + if ( vc == null ) throw new IllegalArgumentException("Cannot filter a null Variant Context"); totalLocs++; - boolean failed = false; - byte[] recordRef = vc.getReference().getBases(); - for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { - if ( recordRef[i] != ref[i] ) { - failed = true; - break; + boolean filter = false; + final byte[] recordRef = vc.getReference().getBases(); + + // this can happen for records that get placed at the ends of chromosomes + if ( recordRef.length > ref.length ) { + filter = true; + } else { + for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { + if ( recordRef[i] != ref[i] ) { + filter = true; + break; + } } } - if ( failed ) + if ( filter ) failedLocs++; else writer.add(vc); + + return !filter; } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { if ( tracker == null ) return 0; - Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - for ( VariantContext vc : VCs ) - filterAndWrite(ref.getBases(), vc); + final Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); + for ( final VariantContext vc : VCs ) + filterOrWrite(ref.getBases(), vc); 
return 0; } @@ -107,4 +131,4 @@ public class FilterLiftedVariants extends RodWalker { public void onTraversalDone(Integer result) { System.out.println("Filtered " + failedLocs + " records out of " + totalLocs + " total records."); } -} \ No newline at end of file +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java index da8b20c66..724578a09 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/GenotypeConcordance.java @@ -69,12 +69,15 @@ import java.util.*; * Genotype Concordance writes a GATK report to the specified file (via -o) , consisting of multiple tables of counts * and proportions. These tables may be optionally moltenized via the -moltenize argument. That is, the standard table * + *
  *  Sample   NO_CALL_HOM_REF  NO_CALL_HET  NO_CALL_HOM_VAR   (...)
  *  NA12878       0.003        0.001            0.000        (...)
  *  NA12891       0.005        0.000            0.000        (...)
+ *  
* * would instead be displayed * + *
  *  NA12878  NO_CALL_HOM_REF   0.003
  *  NA12878  NO_CALL_HET       0.001
  *  NA12878  NO_CALL_HOM_VAR   0.000
@@ -82,6 +85,7 @@ import java.util.*;
  *  NA12891  NO_CALL_HET       0.000
  *  NA12891  NO_CALL_HOM_VAR   0.000
  *  (...)
+ *  
* * * These tables are constructed on a per-sample basis, and include counts of eval vs comp genotype states, and the @@ -92,8 +96,10 @@ import java.util.*; * counts for EVAL_SUBSET_TRUTH and EVAL_SUPERSET_TRUTH will be generated. * * For example, in the following situation + *
  *    eval:  ref - A   alt - C
  *    comp:  ref - A   alt - C,T
+ *  
* then the site is tabulated as EVAL_SUBSET_TRUTH. Were the situation reversed, it would be EVAL_SUPERSET_TRUTH. * However, in the case where eval has both C and T alternate alleles, both must be observed in the genotypes * (that is, there must be at least one of (0/1,1/1) and at least one of (0/2,1/2,2/2) in the genotype field). If @@ -106,11 +112,15 @@ import java.util.*; * (if no record exists in the comp VCF). * * That is, in the situation + *
  *   eval:  ref - A   alt - C   genotypes - 0/0  0/0  0/0 ... 0/0
  *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
+ *  
* is equivalent to + *
  *   eval:  ref - A   alt - .   genotypes - 0/0  0/0  0/0 ... 0/0
  *   comp:  ref - A   alt - C   ...         0/0  0/0  ...
+ *  
* * When a record is present in the comp VCF the *genotypes* for the monomorphic site will still be used to evaluate * per-sample genotype concordance counts. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 0e38869c6..478a2a351 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.writer.Options; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -56,9 +57,14 @@ import java.util.*; /** * Lifts a VCF file over from one build to another. * - * Important note: the resulting VCF is not guaranteed to be valid according to the official specification. The file could - * possibly be mis-sorted and the header may not be complete. LiftoverVariants is intended to be the first of two processing steps - * for the liftover; the second step, FilterLiftedVariants, will produce a valid well-behaved VCF file. + * "Lifting over" variants means adjusting variant calls from one reference to another. Specifically, the process adjusts the position of the call to match the corresponding position on the target reference. + * For example, if you have variants called from reads aligned to the hg19 reference, and you want to compare them to calls made based on the b37 reference, you need to liftover one of the callsets to the other reference. 
+ * + * LiftoverVariants is intended to be the first of two processing steps for the liftover process. + * The second step is to run FilterLiftedVariants on the output of LiftoverVariants. This will produce valid well-behaved VCF files, where you'll see that the contig names in the header have all been correctly replaced. + * + * To be clear, the VCF resulting from the LiftoverVariants run is not guaranteed to be valid according to the official specification. The file could + * possibly be mis-sorted and the header may not be complete. That is why you need to run FilterLiftedVariants on it. */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) public class LiftoverVariants extends RodWalker { @@ -113,7 +119,7 @@ public class LiftoverVariants extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(metaData, samples); - writer = VariantContextWriterFactory.create(file, getMasterSequenceDictionary(), VariantContextWriterFactory.NO_OPTIONS); + writer = VariantContextWriterFactory.create(file, getMasterSequenceDictionary(), EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER)); writer.writeHeader(vcfHeader); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 1f2b6d09b..e688b7f17 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -299,7 +299,7 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(doc="indel size select",required=false,fullName="maxIndelSize") private int maxIndelSize = Integer.MAX_VALUE; - @Argument(doc="Allow a samples other than those in the VCF to be specified on the command line. 
These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") + @Argument(doc="Allow samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; @@ -657,6 +657,7 @@ public class SelectVariants extends RodWalker implements TreeR * Helper method to subset a VC record, modifying some metadata stored in the INFO field (i.e. AN, AC, AF). * * @param vc the VariantContext record to subset + * @param excludeNonVariants should we exclude sites that have AC=0 for any alternate alleles? * @return the subsetted VariantContext */ private VariantContext subsetRecord(final VariantContext vc, final boolean excludeNonVariants) { @@ -665,14 +666,10 @@ public class SelectVariants extends RodWalker implements TreeR final VariantContext sub = vc.subContextFromSamples(samples, excludeNonVariants); // strip out the alternate alleles that aren't being used - VariantContextBuilder builder = new VariantContextBuilder(sub); + final VariantContextBuilder builder = new VariantContextBuilder(sub); - GenotypesContext newGC = sub.getGenotypes(); - - // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate) - final boolean lostAllelesInSelection = vc.getAlleles().size() != sub.getAlleles().size(); - if ( lostAllelesInSelection ) - newGC = GATKVariantContextUtils.stripPLsAndAD(sub.getGenotypes()); + // if there are fewer alternate alleles now in the selected VC, we need to fix the PL and AD values + GenotypesContext newGC = GATKVariantContextUtils.updatePLsAndAD(sub, vc); // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags if ( vc.getNSamples() != sub.getNSamples() ) { @@ -682,11 +679,11 @@ public class SelectVariants extends 
RodWalker implements TreeR // Remove a fraction of the genotypes if needed if ( fractionGenotypes > 0 ){ - ArrayList genotypes = new ArrayList(); + final ArrayList genotypes = new ArrayList<>(); for ( Genotype genotype : newGC ) { //Set genotype to no call if it falls in the fraction. if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + final List alleles = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); genotypes.add(new GenotypeBuilder(genotype).alleles(alleles).noGQ().make()); } else{ @@ -698,7 +695,7 @@ public class SelectVariants extends RodWalker implements TreeR builder.genotypes(newGC); - addAnnotations(builder, sub, lostAllelesInSelection); + addAnnotations(builder, sub, vc.getAlleles().size() != sub.getAlleles().size()); return builder.make(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index c414b443e..55b4c0029 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -81,8 +81,9 @@ public class VariantsToBinaryPed extends RodWalker { * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. *

* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the - * command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the - * alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data. + * command line has "-md [name].fam", the fam file will be subset and reordered to match the sample content and ordering + * of the VCF. However, if a metadata file of the alternate format is passed by "-md [name].txt", the walker will + * construct a formatted .fam file from the data. *

*/ @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + @@ -170,6 +171,13 @@ public class VariantsToBinaryPed extends RodWalker { if ( ! sampleMetaValues.containsKey(sample) ) { throw new UserException("No metadata provided for sample "+sample); } + Map mVals = sampleMetaValues.get(sample); + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; + String pheno = mVals.containsKey("phenotype") ? mVals.get("phenotype") : "-1"; + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { // only need to instantiate the files and buffers if in individual major. @@ -469,7 +477,6 @@ public class VariantsToBinaryPed extends RodWalker { values.put("sex",sex); values.put("phenotype",pheno); metaValues.put(sid,values); - outFam.printf("%s%n",line); } } else { for ( String line : new XReadLines(metaDataFile) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index f1f93f1f5..3fcabdf5b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -124,7 +124,7 @@ public class VariantsToTable extends RodWalker { /** * -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL). - * Note this argument accepts any number of inputs. So -F GQ -F PL is allowed. + * Note this argument accepts any number of inputs. So -GF GQ -GF PL is allowed. 
*/ @Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false) public List genotypeFieldsToTake = new ArrayList(); @@ -448,10 +448,6 @@ public class VariantsToTable extends RodWalker { getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } }); - getters.put("GQ", new Getter() { public String get(VariantContext vc) { - if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF"); - return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError()); - }}); } private static Object splitAltAlleles(VariantContext vc) { diff --git a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java index b59786d15..8e5078f1f 100644 --- a/public/java/src/org/broadinstitute/sting/tools/CatVariants.java +++ b/public/java/src/org/broadinstitute/sting/tools/CatVariants.java @@ -31,12 +31,15 @@ import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Level; import org.broad.tribble.AbstractFeatureReader; import org.broad.tribble.FeatureReader; +import org.broad.tribble.index.IndexCreator; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.CommandLineProgram; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; +import 
org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.variant.bcf2.BCF2Codec; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.vcf.VCFCodec; @@ -123,6 +126,12 @@ public class CatVariants extends CommandLineProgram { @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if he input files are already sorted (based on the position of the variants", required = false) private Boolean assumeSorted = false; + @Argument(fullName = "variant_index_type", doc = "which type of IndexCreator to use for VCF/BCF indices", required = false) + private GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + + @Argument(fullName = "variant_index_parameter", doc = "the parameter (bin width or features per bin) to pass to the VCF/BCF IndexCreator", required = false) + private Integer variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; + /* * print usage information */ @@ -204,7 +213,8 @@ public class CatVariants extends CommandLineProgram { FileOutputStream outputStream = new FileOutputStream(outputFile); EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); - final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), options); + final IndexCreator idxCreator = GATKVCFUtils.getIndexCreator(variant_index_type, variant_index_parameter, outputFile); + final VariantContextWriter outputWriter = VariantContextWriterFactory.create(outputFile, outputStream, ref.getSequenceDictionary(), idxCreator, options); boolean firstFile = true; int count =0; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 46f8f2a84..1bbf481b1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -138,6 +138,48 @@ public class 
BaseUtils { return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2); } + /** + * Checks whether to bases are the same in fact ignore ambiguous 'N' bases. + * + * @param base1 first base to compare. + * @param base2 second base to compare. + * @return true if {@code base1 == base2} or either is an 'N', false otherwise. + */ + static public boolean basesAreEqualIgnoreAmbiguous(final byte base1, final byte base2) { + if (base1 == base2) return true; + else if (base1 == 'n' || base1 == 'N' || base2 == 'N' || base2 == 'n') return true; + else return false; + } + + /** + * Compare to base arrays ranges checking whether they contain the same bases. + * + *

+ * By default two arrays have equal bases, i.e. {@code length == 0} results in {@code true}. + *

+ * + * @param bases1 first base array to compare. + * @param offset1 position of the first base in bases1 to compare. + * @param bases2 second base array to compare. + * @param offset2 position of the first base in bases2 to compare. + * @param length number of bases to compare. + * + * @throws NullPointerException if {@code bases1} or {@code bases2} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if: + *
    + *
  • {@code offset1} is not within the range [0,{@code bases1.length}) or
  • + *
  • {@code offset2} is not within the range [0,{@code bases2.length}) or
  • + *
  • {@code offset1 + length} is not within the range [0,{@code bases1.length}] or
  • + *
  • {@code offset2 + length} is not within the range [0,{@code bases2.length}]
  • + *
+ * @return + */ + static public boolean basesAreEqualIgnoreAmbiguous(final byte[] bases1, final int offset1, final byte[] bases2, final int offset2, final int length) { + for (int i = 0; i < length; i++) + if (!basesAreEqualIgnoreAmbiguous(bases1[offset1 + i],bases2[offset2 + i])) return false; + return true; + } + static public boolean extendedBasesAreEqual(byte base1, byte base2) { return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); } diff --git a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java index 78c32ed02..9823e524a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java +++ b/public/java/src/org/broadinstitute/sting/utils/DeprecatedToolChecks.java @@ -46,6 +46,7 @@ public class DeprecatedToolChecks { deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)"); deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)"); deprecatedGATKWalkers.put("CountBestAlignments", "2.2 (no replacement)"); + deprecatedGATKWalkers.put("SomaticIndelDetector", "2.0 (replaced by the standalone tool Indelocator; see Cancer Tools documentation)"); } // Mapping from walker name to major version number where the walker first disappeared and optional replacement options diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 3af71eabb..82c9fe751 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -27,10 +27,11 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.apache.commons.math.distribution.ExponentialDistributionImpl; import 
org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import javax.annotation.Nullable; import java.math.BigDecimal; import java.util.*; @@ -425,7 +426,6 @@ public class MathUtils { * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a * utility function. */ - @Nullable static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { return null; @@ -845,17 +845,29 @@ public class MathUtils { } /** - * Compute the median element of the array of integers + * Compute the median element of the list of integers * @param array a list of integers * @return the median element */ - public static int median(final List array) { + public static > T median(final List array) { + /* TODO -- from Valentin + the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). + + But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). [Sources: R and wikipedia] + + My suggestion for a solution is then: + + unify median and medianDoubles to public static T median(Collection) + check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. + relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) + In addition, the current median implementation sorts the whole input list witch is O(n log n). 
However find out the ith element (thus calculate the median) can be done in O(n) + */ if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); final int size = array.size(); if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); else if ( size == 1 ) return array.get(0); else { - final ArrayList sorted = new ArrayList<>(array); + final ArrayList sorted = new ArrayList<>(array); Collections.sort(sorted); return sorted.get(size / 2); } @@ -966,6 +978,16 @@ public class MathUtils { return count; } + public static int countOccurrences(final boolean element, final boolean[] array) { + int count = 0; + for (final boolean b : array) { + if (element == b) + count++; + } + + return count; + } + /** * Returns n random indices drawn with replacement from the range 0..(k-1) @@ -1405,7 +1427,7 @@ public class MathUtils { * @return */ public static List log10LinearRange(final int start, final int stop, final double eps) { - final LinkedList values = new LinkedList(); + final LinkedList values = new LinkedList<>(); final double log10range = Math.log10(stop - start); if ( start == 0 ) @@ -1460,4 +1482,37 @@ public class MathUtils { return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); } + /** + * Return the likelihood of observing the counts of categories having sampled a population + * whose categorial frequencies are distributed according to a Dirichlet distribution + * @param dirichletParams - params of the prior dirichlet distribution + * @param dirichletSum - the sum of those parameters + * @param counts - the counts of observation in each category + * @param countSum - the sum of counts (number of trials) + * @return - associated likelihood + */ + public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, + final int[] counts, final int countSum) { + if ( dirichletParams.length != counts.length ) { + throw new IllegalStateException("The number of dirichlet 
parameters must match the number of categories"); + } + // todo -- lots of lnGammas here. At some point we can safely switch to x * ( ln(x) - 1) + double likelihood = log10MultinomialCoefficient(countSum,counts); + likelihood += log10Gamma(dirichletSum); + likelihood -= log10Gamma(dirichletSum+countSum); + for ( int idx = 0; idx < counts.length; idx++ ) { + likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); + likelihood -= log10Gamma(dirichletParams[idx]); + } + + return likelihood; + } + + public static double dirichletMultinomial(double[] params, int[] counts) { + return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); + } + + public static ExponentialDistribution exponentialDistribution( final double mean ) { + return new ExponentialDistributionImpl(mean); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index fe782bc31..c0d1df09d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -63,6 +63,7 @@ public class QualityUtils { private static double qualToErrorProbCache[] = new double[256]; private static double qualToProbLog10Cache[] = new double[256]; + static { for (int i = 0; i < 256; i++) { qualToErrorProbCache[i] = qualToErrorProb((double) i); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 75bd6a3d1..9657bc403 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMProgramRecord; import org.apache.log4j.Logger; @@ -745,7 +744,7 
@@ public class Utils { /** * @see #calcMD5(byte[]) */ - public static String calcMD5(final String s) throws NoSuchAlgorithmException { + public static String calcMD5(final String s) { return calcMD5(s.getBytes()); } @@ -754,17 +753,21 @@ public class Utils { * * @param bytes the bytes to calculate the md5 of * @return the md5 of bytes, as a 32-character long string - * @throws NoSuchAlgorithmException */ @Ensures({"result != null", "result.length() == 32"}) - public static String calcMD5(final byte[] bytes) throws NoSuchAlgorithmException { + public static String calcMD5(final byte[] bytes) { if ( bytes == null ) throw new IllegalArgumentException("bytes cannot be null"); - final byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytes); - final BigInteger bigInt = new BigInteger(1, thedigest); + try { + final byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytes); + final BigInteger bigInt = new BigInteger(1, thedigest); - String md5String = bigInt.toString(16); - while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 - return md5String; + String md5String = bigInt.toString(16); + while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 + return md5String; + } + catch ( NoSuchAlgorithmException e ) { + throw new IllegalStateException("MD5 digest algorithm not present"); + } } /** @@ -835,4 +838,18 @@ public class Utils { // don't perform array copies if we need to copy everything anyways return ( trimFromFront == 0 && trimFromBack == 0 ) ? 
seq : Arrays.copyOfRange(seq, trimFromFront, seq.length - trimFromBack); } + + /** + * Simple wrapper for sticking elements of a int[] array into a List + * @param ar - the array whose elements should be listified + * @return - a List where each element has the same value as the corresponding index in @ar + */ + public static List listFromPrimitives(final int[] ar) { + final ArrayList lst = new ArrayList<>(ar.length); + for ( final int d : ar ) { + lst.add(d); + } + + return lst; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index bb74619a7..8f6af0158 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -32,9 +32,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.HasGenomeLocation; -import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; @@ -109,6 +107,12 @@ public class ActiveRegion implements HasGenomeLocation { */ private GenomeLoc spanIncludingReads; + + /** + * Indicates whether the active region has been finalized + */ + private boolean hasBeenFinalized; + /** * Create a new ActiveRegion containing no reads * @@ -205,7 +209,7 @@ public class ActiveRegion implements HasGenomeLocation { * @return a non-null array of bytes holding the reference bases in referenceReader */ @Ensures("result != null") - private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final 
GenomeLoc genomeLoc ) { if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); @@ -451,4 +455,12 @@ public class ActiveRegion implements HasGenomeLocation { return new ActiveRegion( subActive, Collections.emptyList(), isActive, genomeLocParser, requiredExtension ); } + + public void setFinalized(final boolean value) { + hasBeenFinalized = value; + } + + public boolean isFinalized() { + return hasBeenFinalized; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java index 6ab3c9a16..4a6becf6f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/beagle/BeagleCodec.java @@ -51,9 +51,8 @@ package org.broadinstitute.sting.utils.codecs.beagle; import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.Feature; import org.broad.tribble.exception.CodecLineParsingException; -import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.LineIterator; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -131,8 +130,8 @@ public class BeagleCodec extends AsciiFeatureCodec implements Ref this.genomeLocParser = genomeLocParser; } - public Object readHeader(LineReader reader) - { + @Override + public Object readActualHeader(LineIterator reader) { int[] lineCounter = new int[1]; try { header = readHeader(reader, lineCounter); @@ -181,14 +180,14 @@ public class BeagleCodec extends AsciiFeatureCodec implements Ref return header; } - private 
static String[] readHeader(final LineReader source, int[] lineCounter) throws IOException { + private static String[] readHeader(final LineIterator source, int[] lineCounter) throws IOException { String[] header = null; int numLines = 0; //find the 1st line that's non-empty and not a comment - String line; - while( (line = source.readLine()) != null ) { + while(source.hasNext()) { + final String line = source.next(); numLines++; if ( line.trim().isEmpty() ) { continue; diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 38ff35c3f..b2b31b572 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -28,9 +28,7 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; -import org.broad.tribble.readers.AsciiLineReader; -import org.broad.tribble.readers.LineReader; -import org.broad.tribble.readers.PositionalBufferedStream; +import org.broad.tribble.readers.LineIterator; import java.io.IOException; import java.util.Arrays; @@ -112,18 +110,16 @@ public class RawHapMapCodec extends AsciiFeatureCodec { headerLine); } - public Object readHeader(LineReader reader) { - try { - headerLine = reader.readLine(); - } catch (IOException e) { - throw new IllegalArgumentException("Unable to read a line from the line reader"); - } + @Override + public Object readActualHeader(final LineIterator lineIterator) { + this.headerLine = lineIterator.next(); return headerLine; } @Override - public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { - final AsciiLineReader br = new AsciiLineReader(stream); - return new FeatureCodecHeader(readHeader(br), br.getPosition()); + 
public FeatureCodecHeader readHeader(final LineIterator lineIterator) throws IOException { + final String header = (String) readActualHeader(lineIterator); + // TODO: This approach may cause issues with files formatted with \r\n-style line-endings. + return new FeatureCodecHeader(header, header.length() + 1); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java index 82ee76a81..3c2329b8f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.codecs.refseq; import org.broad.tribble.AsciiFeatureCodec; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; +import org.broad.tribble.readers.LineIterator; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -93,7 +94,8 @@ public class RefSeqCodec extends AsciiFeatureCodec implements Ref } @Override - public Feature decodeLoc(String line) { + public Feature decodeLoc(final LineIterator lineIterator) { + final String line = lineIterator.next(); if (line.startsWith("#")) return null; String fields[] = line.split("\t"); if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); @@ -160,4 +162,10 @@ public class RefSeqCodec extends AsciiFeatureCodec implements Ref feature.setExon_frames(exon_frames); return feature; } + + @Override + public Object readActualHeader(LineIterator lineIterator) { + // No header for this format + return null; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java 
b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java index 37e44bf8c..34705c4c9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -26,8 +26,8 @@ package org.broadinstitute.sting.utils.codecs.sampileup; import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.Feature; import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.readers.LineIterator; import org.broad.tribble.util.ParsingUtils; import java.util.ArrayList; @@ -163,6 +163,12 @@ public class SAMPileupCodec extends AsciiFeatureCodec { return feature; } + @Override + public Object readActualHeader(LineIterator lineIterator) { + // No header for this format + return null; + } + private void parseIndels(String genotype,SAMPileupFeature feature) { String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java index 69127b289..cfc5d2c2d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/samread/SAMReadCodec.java @@ -29,8 +29,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.TextCigarCodec; import net.sf.samtools.util.StringUtil; import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.Feature; import org.broad.tribble.exception.CodecLineParsingException; +import org.broad.tribble.readers.LineIterator; import org.broad.tribble.util.ParsingUtils; /** @@ -114,4 +114,10 @@ public class SAMReadCodec extends AsciiFeatureCodec { bases, qualities); } + + @Override + public Object readActualHeader(LineIterator lineIterator) { + // No header for this format + return null; + } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index 9e66056f2..f69001d9d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -26,14 +26,14 @@ package org.broadinstitute.sting.utils.codecs.table; import org.broad.tribble.AsciiFeatureCodec; -import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.LineIterator; import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; -import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; /** * Reads tab deliminated tabular text files @@ -97,30 +97,29 @@ public class TableCodec extends AsciiFeatureCodec implements Refer String[] split = line.split(delimiterRegex); if (split.length < 1) throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); - return new TableFeature(genomeLocParser.parseGenomeLoc(split[0]),Arrays.asList(split),header); + return new TableFeature(genomeLocParser.parseGenomeLoc(split[0]),Arrays.asList(split), header); } @Override - public Object readHeader(LineReader reader) { - String line = ""; - try { - boolean isFirst = true; - while ((line = reader.readLine()) != null) { - if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) { - throw new UserException.MalformedFile("TableCodec file does not have a header"); - } - isFirst &= line.startsWith(commentDelimiter); - if (line.startsWith(headerDelimiter)) { - if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines. 
The second is = " + line); - String spl[] = line.split(delimiterRegex); - for (String s : spl) header.add(s); - return header; - } else if (!line.startsWith(commentDelimiter)) { - break; - } + public Object readActualHeader(final LineIterator reader) { + boolean isFirst = true; + while (reader.hasNext()) { + final String line = reader.peek(); // Peek to avoid reading non-header data + if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) { + throw new UserException.MalformedFile("TableCodec file does not have a header"); + } + isFirst &= line.startsWith(commentDelimiter); + if (line.startsWith(headerDelimiter)) { + reader.next(); // "Commit" the peek + if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines. The second is = " + line); + final String spl[] = line.split(delimiterRegex); + Collections.addAll(header, spl); + return header; + } else if (line.startsWith(commentDelimiter)) { + reader.next(); // "Commit" the peek + } else { + break; } - } catch (IOException e) { - throw new UserException.MalformedFile("unable to parse header from TableCodec file",e); } return header; } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 6126116c2..40a730029 100644 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.variant.GATKVCFIndexType; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; @@ -455,6 +456,14 @@ public class UserException extends 
ReviewedStingException { } } + public static class GVCFIndexException extends UserException { + public GVCFIndexException (GATKVCFIndexType indexType, int indexParameter) { + super(String.format("GVCF output requires a specific indexing strategy. Please re-run including the arguments " + + "-variant_index_type %s -variant_index_parameter %d.", + indexType, indexParameter)); + } + } + /** * A special exception that happens only in the case where * the filesystem, by design or configuration, is completely unable diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 5d882ba8c..e88065c4a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.utils.fragments; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import net.sf.picard.util.QualityUtil; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -60,6 +61,11 @@ import java.util.*; * Time: 10:09 PM */ public final class FragmentUtils { + + public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; + public final static int DEFAULT_PCR_ERROR_QUAL = QualityUtil.getPhredScoreFromErrorProbability(DEFAULT_PCR_ERROR_RATE); + public final static int HALF_OF_DEFAULT_PCR_ERROR_QUAL = DEFAULT_PCR_ERROR_QUAL / 2; + protected final static byte MIN_QUAL_BAD_OVERLAP = 16; private FragmentUtils() {} // private constructor @@ -189,6 +195,70 @@ public final class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } + public static void adjustQualsOfOverlappingPairedFragments( final List overlappingPair ) { + if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but 
expecting exactly 2."); } + + final GATKSAMRecord firstRead = overlappingPair.get(0); + final GATKSAMRecord secondRead = overlappingPair.get(1); + + if ( secondRead.getSoftStart() < firstRead.getSoftStart() ) { + adjustQualsOfOverlappingPairedFragments(secondRead, firstRead); + } else { + adjustQualsOfOverlappingPairedFragments(firstRead, secondRead); + } + } + + /** + * Merge two overlapping reads from the same fragment into a single super read, if possible + * + * firstRead and secondRead must be part of the same fragment (though this isn't checked). Looks + * at the bases and alignment, and tries its best to create a meaningful synthetic single super read + * that represents the entire sequenced fragment. + * + * Assumes that firstRead starts before secondRead (according to their soft clipped starts) + * + * @param clippedFirstRead the left most read + * @param clippedSecondRead the right most read + * + * @return a strandless merged read of first and second, or null if the algorithm cannot create a meaningful one + */ + public static void adjustQualsOfOverlappingPairedFragments(final GATKSAMRecord clippedFirstRead, final GATKSAMRecord clippedSecondRead) { + if ( clippedFirstRead == null ) throw new IllegalArgumentException("clippedFirstRead cannot be null"); + if ( clippedSecondRead == null ) throw new IllegalArgumentException("clippedSecondRead cannot be null"); + if ( ! 
clippedFirstRead.getReadName().equals(clippedSecondRead.getReadName()) ) throw new IllegalArgumentException("attempting to merge two reads with different names " + clippedFirstRead + " and " + clippedSecondRead); + + // don't adjust fragments that do not overlap + if ( clippedFirstRead.getAlignmentEnd() < clippedSecondRead.getAlignmentStart() || clippedFirstRead.getReferenceIndex() != clippedSecondRead.getReferenceIndex() ) + return; + + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(clippedFirstRead, clippedSecondRead.getAlignmentStart()); + final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); + final int numOverlappingBases = Math.min(clippedFirstRead.getReadLength() - firstReadStop, clippedSecondRead.getReadLength()); + + final byte[] firstReadBases = clippedFirstRead.getReadBases(); + final byte[] firstReadQuals = clippedFirstRead.getBaseQualities(); + final byte[] secondReadBases = clippedSecondRead.getReadBases(); + final byte[] secondReadQuals = clippedSecondRead.getBaseQualities(); + + for ( int i = 0; i < numOverlappingBases; i++ ) { + final int firstReadIndex = firstReadStop + i; + final byte firstReadBase = firstReadBases[firstReadIndex]; + final byte secondReadBase = secondReadBases[i]; + + if ( firstReadBase == secondReadBase ) { + firstReadQuals[firstReadIndex] = (byte) Math.min(firstReadQuals[firstReadIndex], HALF_OF_DEFAULT_PCR_ERROR_QUAL); + secondReadQuals[i] = (byte) Math.min(secondReadQuals[i], HALF_OF_DEFAULT_PCR_ERROR_QUAL); + } else { + // TODO -- use the proper statistical treatment of the quals from DiploidSNPGenotypeLikelihoods.java + firstReadQuals[firstReadIndex] = 0; + secondReadQuals[i] = 0; + } + } + + clippedFirstRead.setBaseQualities(firstReadQuals); + clippedSecondRead.setBaseQualities(secondReadQuals); + } + public static List mergeOverlappingPairedFragments( final List overlappingPair ) { if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping 
pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java index 03a2b8077..9b12a58d5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/MostLikelyAllele.java @@ -65,7 +65,7 @@ public final class MostLikelyAllele { * @param log10LikelihoodOfMostLikely the log10 likelihood of the most likely allele * @param log10LikelihoodOfSecondBest the log10 likelihood of the next most likely allele (should be NEGATIVE_INFINITY if none is available) */ - public MostLikelyAllele(Allele mostLikely, Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { + public MostLikelyAllele(final Allele mostLikely, final Allele secondMostLikely, double log10LikelihoodOfMostLikely, double log10LikelihoodOfSecondBest) { if ( mostLikely == null ) throw new IllegalArgumentException("mostLikely allele cannot be null"); if ( log10LikelihoodOfMostLikely != Double.NEGATIVE_INFINITY && ! 
MathUtils.goodLog10Probability(log10LikelihoodOfMostLikely) ) throw new IllegalArgumentException("log10LikelihoodOfMostLikely must be either -Infinity or a good log10 prob but got " + log10LikelihoodOfMostLikely); diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index 70be85f54..b7cd03919 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -110,7 +110,7 @@ public class PerReadAlleleLikelihoodMap { * @return a map from each allele to a list of reads that 'support' the allele */ protected Map> getAlleleStratifiedReadMap() { - final Map> alleleReadMap = new HashMap>(alleles.size()); + final Map> alleleReadMap = new HashMap<>(alleles.size()); for ( final Allele allele : alleles ) alleleReadMap.put(allele, new ArrayList()); @@ -152,7 +152,7 @@ public class PerReadAlleleLikelihoodMap { /** * Does the current map contain the key associated with a particular SAM record in pileup? 
* @param p Pileup element - * @return + * @return true if the map contains pileup element, else false */ public boolean containsPileupElement(final PileupElement p) { return likelihoodReadMap.containsKey(p.getRead()); @@ -176,9 +176,9 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.keySet(); } - public Collection> getLikelihoodMapValues() { - return likelihoodReadMap.values(); - } +// public Collection> getLikelihoodMapValues() { +// return likelihoodReadMap.values(); +// } public int getNumberOfStoredElements() { return likelihoodReadMap.size(); @@ -191,6 +191,21 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.get(p.getRead()); } + + /** + * Get the log10 likelihood associated with an individual read/allele + * + * @param read the read whose likelihood we want + * @param allele the allele whose likelihood we want + * @return the log10 likelihood that this read matches this allele + */ + public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ + if (!allelesSet.contains(allele) || !likelihoodReadMap.containsKey(read)) + return 0.0; + + return likelihoodReadMap.get(read).get(allele); + } + /** * Get the most likely alleles estimated across all reads in this object * @@ -290,18 +305,16 @@ public class PerReadAlleleLikelihoodMap { /** * Debug method to dump contents of object into string for display */ - @Override public String toString() { - StringBuilder sb = new StringBuilder(); + final StringBuilder sb = new StringBuilder(); sb.append("Alelles in map:"); - for (Allele a:alleles) { + for (final Allele a:alleles) { sb.append(a.getDisplayString()+","); - } sb.append("\n"); - for (Map.Entry > el : getLikelihoodReadMap().entrySet() ) { - for (Map.Entry eli : el.getValue().entrySet()) { + for (final Map.Entry > el : getLikelihoodReadMap().entrySet() ) { + for (final Map.Entry eli : el.getValue().entrySet()) { sb.append("Read "+el.getKey().getReadName()+". 
Allele:"+eli.getKey().getDisplayString()+" has likelihood="+Double.toString(eli.getValue())+"\n"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java index 1f932b222..a381e1b2f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/Haplotype.java @@ -38,10 +38,13 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; +import java.util.Comparator; import java.util.LinkedHashMap; import java.util.List; public class Haplotype extends Allele { + + private GenomeLoc genomeLocation = null; private EventMap eventMap = null; private Cigar cigar; @@ -214,7 +217,7 @@ public class Haplotype extends Allele { public void setCigar( final Cigar cigar ) { this.cigar = AlignmentUtils.consolidateCigar(cigar); if ( this.cigar.getReadLength() != length() ) - throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength()); + throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength() + " " + this.cigar); } @Requires({"refInsertLocation >= 0"}) @@ -311,4 +314,30 @@ public class Haplotype extends Allele { public void setScore(double score) { this.score = this.isReference() ? Double.MAX_VALUE : score; } + + /** + * Comparator used to sort haplotypes, alphanumerically. + * + *

+ * If one haplotype is the prefix of the other, the longer one comes first. + *

+ */ + public static final Comparator ALPHANUMERICAL_COMPARATOR = new Comparator() { + + @Override + public int compare(final Haplotype o1, final Haplotype o2) { + if (o1 == o2) + return 0; + final byte[] bases1 = o1.getBases(); + final byte[] bases2 = o2.getBases(); + final int iLimit = Math.min(bases1.length, bases2.length); + for (int i = 0; i < iLimit; i++) { + final int cmp = Byte.compare(bases1[i], bases2[i]); + if (cmp != 0) return cmp; + } + if (bases1.length == bases2.length) return 0; + return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. + } + }; + } diff --git a/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java new file mode 100644 index 000000000..9981ce495 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/haplotype/HaplotypeSizeAndBaseComparator.java @@ -0,0 +1,47 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.haplotype; + +import java.util.Comparator; + +/** + * Compares two haplotypes first by their lengths and then by lexicographic order of their bases. + * + * User: btaylor + * Date: 8/1/13 + * Time: 11:09 AM + */ +public class HaplotypeSizeAndBaseComparator implements Comparator { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + if (hap1.getBases().length < hap2.getBases().length) + return -1; + else if (hap1.getBases().length > hap2.getBases().length) + return 1; + else + return hap1.getBaseString().compareTo(hap2.getBaseString()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java index 02c269495..893a8349b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GenericDocumentationHandler.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -9,10 +9,10 @@ * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: -* +* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. 
-* +* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND @@ -37,6 +37,8 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.collections.Pair; @@ -192,11 +194,26 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { @Requires("argumentSource != null") @Ensures("result != null") private String docKindOfArg(ArgumentSource argumentSource) { - if (argumentSource.isRequired()) return "required"; - else if (argumentSource.isAdvanced()) return "advanced"; + if (argumentSource.isRequired()) { + if (argumentSource.isInput()) return "required_in"; + else if (argumentSource.isOutput()) return "required_out"; + else if (argumentSource.isFlag()) return "required_flag"; + else return "required_param"; + } + else if (argumentSource.isAdvanced()) { + if (argumentSource.isInput()) return "advanced_in"; + else if (argumentSource.isOutput()) return "advanced_out"; + else if (argumentSource.isFlag()) return "advanced_flag"; + else return "advanced_param"; + } else if (argumentSource.isHidden()) return "hidden"; - else if (argumentSource.isDeprecated()) return "depreciated"; - else return "optional"; + else if (argumentSource.isDeprecated()) return "deprecated"; + else { + if (argumentSource.isInput()) return "optional_in"; + else if (argumentSource.isOutput()) return "optional_out"; + else if (argumentSource.isFlag()) return "optional_flag"; + 
else return "optional_param"; + } } /** @@ -236,11 +253,20 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { private Map>> createArgumentMap() { Map>> args = new HashMap>>(); args.put("all", new ArrayList>()); - args.put("required", new ArrayList>()); - args.put("optional", new ArrayList>()); - args.put("advanced", new ArrayList>()); + args.put("required_in", new ArrayList>()); + args.put("required_out", new ArrayList>()); + args.put("required_param", new ArrayList>()); + args.put("required_flag", new ArrayList>()); + args.put("optional_in", new ArrayList>()); + args.put("optional_out", new ArrayList>()); + args.put("optional_param", new ArrayList>()); + args.put("optional_flag", new ArrayList>()); + args.put("advanced_in", new ArrayList>()); + args.put("advanced_out", new ArrayList>()); + args.put("advanced_param", new ArrayList>()); + args.put("advanced_flag", new ArrayList>()); args.put("hidden", new ArrayList>()); - args.put("depreciated", new ArrayList>()); + args.put("deprecated", new ArrayList>()); return args; } @@ -295,6 +321,8 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { // Get annotation info (what type of annotation, standard etc.) 
final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); root.put("annotinfo", StringUtils.join(annotInfo, ", ")); + // Get annotation field (whether it goes in INFO or FORMAT) + root.put("annotfield", getAnnotField(myClass)); // Get walker type if applicable root.put("walkertype", getWalkerType(myClass)); // Get partition type if applicable @@ -316,6 +344,7 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { // put empty items to avoid blowups root.put("parallel", new HashSet()); root.put("annotinfo", ""); + root.put("annotfield", ""); root.put("walkertype", ""); root.put("partitiontype", ""); root.put("readfilters", new HashSet>()); @@ -359,6 +388,27 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { return getParallelism(mySuperClass, parallelOptions); } + /** + * Utility function that looks up whether the annotation goes in INFO or FORMAT field. + * + * @param myClass the class to query for the interfaces + * @return a String specifying the annotation field + */ + private final String getAnnotField(Class myClass) { + // + // Look up superclasses recursively until we find either + // GenotypeAnnotation or InfoFieldAnnotation + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass == InfoFieldAnnotation.class) { + return "INFO (variant-level)"; + } else if (mySuperClass == GenotypeAnnotation.class) { + return "FORMAT (sample genotype-level)"; + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getAnnotField(mySuperClass); + } + /** * Utility function that determines the annotation type for an instance of class c. 
* @@ -817,7 +867,7 @@ public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { // general attributes List attributes = new ArrayList(); if (def.required) attributes.add("required"); - if (source.isDeprecated()) attributes.add("depreciated"); + if (source.isDeprecated()) attributes.add("deprecated"); if (attributes.size() > 0) root.put("attributes", Utils.join(", ", attributes)); diff --git a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java index 6f588ac0e..2926bdc63 100644 --- a/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/utils/locusiterator/LocusIteratorByState.java @@ -29,7 +29,8 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecordIterator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -154,7 +155,7 @@ public final class LocusIteratorByState extends LocusIterator { * @param reader a non-null reader * @param it an iterator from reader that has the reads we want to use to create ReadBackPileups */ - public LocusIteratorByState(final SAMFileReader reader, final SAMRecordIterator it) { + public LocusIteratorByState(final SAMFileReader reader, final CloseableIterator it) { this(new GATKSAMIterator(it), new LIBSDownsamplingInfo(false, 0), true, diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java new file mode 100644 index 000000000..6468753d2 --- /dev/null +++ 
b/public/java/src/org/broadinstitute/sting/utils/pairhmm/BatchPairHMM.java @@ -0,0 +1,41 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.util.List; + +public interface BatchPairHMM { + public void batchAdd(final List haplotypes, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP); + + public double[] batchGetResult(); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java index ddc1a4559..b83a15d6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/Log10PairHMM.java @@ -1,27 +1,27 @@ /* - * Copyright (c) 2012 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ package org.broadinstitute.sting.utils.pairhmm; @@ -32,24 +32,29 @@ import org.broadinstitute.sting.utils.QualityUtils; import java.util.Arrays; +import static java.lang.Math.log10; + /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * * User: rpoplin, carneiro * Date: 3/1/12 */ -public final class Log10PairHMM extends N2MemoryPairHMM { +public class Log10PairHMM extends N2MemoryPairHMM { /** * Should we use exact log10 calculation (true), or an approximation (false)? 
*/ private final boolean doExactLog10; - private static final int matchToMatch = 0; - private static final int indelToMatch = 1; - private static final int matchToInsertion = 2; - private static final int insertionToInsertion = 3; - private static final int matchToDeletion = 4; - private static final int deletionToDeletion = 5; + protected static final int matchToMatch = 0; + protected static final int indelToMatch = 1; + protected static final int matchToInsertion = 2; + protected static final int insertionToInsertion = 3; + protected static final int matchToDeletion = 4; + protected static final int deletionToDeletion = 5; + + // we divide e by 3 because the observed base could have come from any of the non-observed alleles + protected final static double log10_3 = log10(3.0); /** * Create an uninitialized PairHMM @@ -80,9 +85,6 @@ public final class Log10PairHMM extends N2MemoryPairHMM { Arrays.fill(insertionMatrix[iii], Double.NEGATIVE_INFINITY); Arrays.fill(deletionMatrix[iii], Double.NEGATIVE_INFINITY); } - - transition = new double[paddedMaxReadLength][6]; - prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** @@ -96,19 +98,17 @@ public final class Log10PairHMM extends N2MemoryPairHMM { final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final int nextHapStartIndex) { - if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { - // set the initial value (free deletions in the beginning) for the first row in the deletion matrix - final double initialValue = Math.log10(1.0 / haplotypeBases.length); - for( int j = 0; j < paddedHaplotypeLength; j++ ) { - deletionMatrix[0][j] = initialValue; - } - } if ( ! 
constantsAreInitialized || recacheReadValues ) initializeProbabilities(insertionGOP, deletionGOP, overallGCP); initializePriors(haplotypeBases, readBases, readQuals, hapStartIndex); + if (previousHaplotypeBases == null || previousHaplotypeBases.length != haplotypeBases.length) { + // set the initial value (free deletions in the beginning) for the first row in the deletion matrix + initializeMatrixValues(haplotypeBases); + } for (int i = 1; i < paddedReadLength; i++) { // +1 here is because hapStartIndex is 0-based, but our matrices are 1 based @@ -120,14 +120,27 @@ public final class Log10PairHMM extends N2MemoryPairHMM { // final probability is the log10 sum of the last element in the Match and Insertion state arrays // this way we ignore all paths that ended in deletions! (huge) // but we have to sum all the paths ending in the M and I matrices, because they're no longer extended. + double finalSumProbabilities = finalLikelihoodCalculation(); + + return finalSumProbabilities; + } + + protected void initializeMatrixValues(final byte[] haplotypeBases) { + final double initialValue = Math.log10(1.0 / haplotypeBases.length); + for( int j = 0; j < paddedHaplotypeLength; j++ ) { + deletionMatrix[0][j] = initialValue; + } + } + + protected double finalLikelihoodCalculation() { final int endI = paddedReadLength - 1; double finalSumProbabilities = myLog10SumLog10(new double[]{matchMatrix[endI][1], insertionMatrix[endI][1]}); for (int j = 2; j < paddedHaplotypeLength; j++) finalSumProbabilities = myLog10SumLog10(new double[]{finalSumProbabilities, matchMatrix[endI][j], insertionMatrix[endI][j]}); - return finalSumProbabilities; } + /** * Initializes the matrix that holds all the constants related to the editing * distance between the read and the haplotype. 
@@ -148,7 +161,7 @@ public final class Log10PairHMM extends N2MemoryPairHMM { for (int j = startIndex; j < haplotypeBases.length; j++) { final byte y = haplotypeBases[j]; prior[i+1][j+1] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? - QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + QualityUtils.qualToProbLog10(qual) : (QualityUtils.qualToErrorProbLog10(qual) - (doNotUseTristateCorrection ? 0.0 : log10_3)) ); } } } @@ -166,7 +179,7 @@ public final class Log10PairHMM extends N2MemoryPairHMM { "overallGCP != null" }) @Ensures("constantsAreInitialized") - private void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { + protected void initializeProbabilities(final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP) { for (int i = 0; i < insertionGOP.length; i++) { final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE); transition[i+1][matchToMatch] = QualityUtils.qualToProbLog10((byte) qualIndexGOP); @@ -196,7 +209,7 @@ public final class Log10PairHMM extends N2MemoryPairHMM { * @return the log10 of the sum of the probabilities */ @Requires("values != null") - private double myLog10SumLog10(final double[] values) { + protected double myLog10SumLog10(final double[] values) { return doExactLog10 ? 
MathUtils.log10sumLog10(values) : MathUtils.approximateLog10SumLog10(values); } @@ -211,7 +224,7 @@ public final class Log10PairHMM extends N2MemoryPairHMM { * @param prior the likelihood editing distance matrix for the read x haplotype * @param transition an array with the six transition relevant to this location */ - private void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { + protected void updateCell( final int indI, final int indJ, final double prior, final double[] transition) { matchMatrix[indI][indJ] = prior + myLog10SumLog10(new double[]{matchMatrix[indI - 1][indJ - 1] + transition[matchToMatch], diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java index a091a0716..18cb9054b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/N2MemoryPairHMM.java @@ -26,10 +26,6 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; /** * Superclass for PairHMM that want to use a full read x haplotype matrix for their match, insertion, and deletion matrix @@ -44,6 +40,13 @@ abstract class N2MemoryPairHMM extends PairHMM { protected double[][] insertionMatrix = null; protected double[][] deletionMatrix = null; + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + + public void doNotUseTristateCorrection() { + doNotUseTristateCorrection = true; + } + /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths * @@ -58,6 +61,9 @@ abstract class N2MemoryPairHMM extends PairHMM { matchMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; insertionMatrix = new 
double[paddedMaxReadLength][paddedMaxHaplotypeLength]; deletionMatrix = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; + + transition = new double[paddedMaxReadLength][6]; + prior = new double[paddedMaxReadLength][paddedMaxHaplotypeLength]; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java index 85ac97f95..ff883c5ae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -28,9 +28,14 @@ package org.broadinstitute.sting.utils.pairhmm; import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; import java.util.Arrays; - +import java.util.List; +import java.util.Map; /** * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. * @@ -43,6 +48,7 @@ public abstract class PairHMM { protected boolean constantsAreInitialized = false; protected byte[] previousHaplotypeBases; + protected int hapStartIndex; public enum HMM_IMPLEMENTATION { /* Very slow implementation which uses very accurate log10 sum functions. 
Only meant to be used as a reference test implementation */ @@ -51,12 +57,18 @@ public abstract class PairHMM { ORIGINAL, /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ LOGLESS_CACHING, + /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ + ARRAY_LOGLESS } protected int maxHaplotypeLength, maxReadLength; protected int paddedMaxReadLength, paddedMaxHaplotypeLength; protected int paddedReadLength, paddedHaplotypeLength; - private boolean initialized = false; + protected boolean initialized = false; + + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } /** * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths @@ -83,6 +95,79 @@ public abstract class PairHMM { initialized = true; } + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Map haplotypeMap) { + int listMaxHaplotypeLength = 0; + for( final Allele a: haplotypeMap.keySet() ) { + final Haplotype h = haplotypeMap.get(a); + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base substitution, insertion, and deletion probabilities. 
+ * + * @param reads the list of reads + * @param alleleHaplotypeMap the map from each allele to its haplotype + * @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read. + * @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of + * said read coming from the said haplotype under the provided error model + */ + public PerReadAlleleLikelihoodMap computeLikelihoods(final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap) { + + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(reads); + final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); } + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + for(GATKSAMRecord read : reads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = GCPArrayMap.get(read); + + // peek at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + byte[] currentHaplotypeBases = null; + boolean isFirstHaplotype = true; + Allele currentAllele = null; + double log10l; + for (final Allele allele : alleleHaplotypeMap.keySet()){ + final Haplotype haplotype = alleleHaplotypeMap.get(allele); + final byte[] nextHaplotypeBases = haplotype.getBases(); + if (currentHaplotypeBases != null) { + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases); +
likelihoodMap.add(read, currentAllele, log10l); + } + // update the current haplotype + currentHaplotypeBases = nextHaplotypeBases; + currentAllele = allele; + } + // process the final haplotype + if (currentHaplotypeBases != null) { + + // there is no next haplotype, so pass null for nextHaplotypeBases. + log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null); + likelihoodMap.add(read, currentAllele, log10l); + } + } + return likelihoodMap; + } + /** * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion * probabilities. @@ -95,7 +180,7 @@ public abstract class PairHMM { * * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length - * @param readQuals the phred-scaled per base substitition quality scores of read. Must be the same length as readBases + * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases * @param overallGCP the phred-scaled gap continuation penalties scores of read. 
Must be the same length as readBases @@ -103,13 +188,15 @@ public abstract class PairHMM { * parameters are the same, and only the haplotype bases are changing underneath us * @return the log10 probability of read coming from the haplotype under the provided error model */ - public final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final boolean recacheReadValues ) { + final boolean recacheReadValues, + final byte[] nextHaploytpeBases) { + if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); @@ -123,9 +210,13 @@ public abstract class PairHMM { paddedReadLength = readBases.length + 1; paddedHaplotypeLength = haplotypeBases.length + 1; - final int hapStartIndex = (previousHaplotypeBases == null || haplotypeBases.length != previousHaplotypeBases.length || recacheReadValues) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, previousHaplotypeBases); + hapStartIndex = (recacheReadValues) ? 
0 : hapStartIndex; - double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues); + // Pre-compute the difference between the current haplotype and the next one to be run + // Looking ahead is necessary for the ArrayLoglessPairHMM implementation + final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); + + double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); if ( ! MathUtils.goodLog10Probability(result) ) throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f", Arrays.toString(haplotypeBases), Arrays.toString(readBases), result)); @@ -134,6 +225,10 @@ public abstract class PairHMM { // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. previousHaplotypeBases = haplotypeBases; + // For the next iteration, the hapStartIndex for the next haplotype becomes the index for the current haplotype + // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart + hapStartIndex = (nextHapStartIndex < hapStartIndex) ?
0: nextHapStartIndex; + + return result; } @@ -149,7 +244,8 @@ public abstract class PairHMM { final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final boolean recacheReadValues ); + final boolean recacheReadValues, + final int nextHapStartIndex); /** * Compute the first position at which two haplotypes differ diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java new file mode 100644 index 000000000..125fee36c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMMReadyHaplotypes.java @@ -0,0 +1,182 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import java.util.*; + +/** + * Collection of haplotypes sorted in a convenient way to be run efficiently by the PairHMM.
+ * + * TODO not yet in use but likely to be used as part of making graph-based likelihood run faster. + * TODO this could be extended to the classical PairHMM implementation, simplifying the PairHMM API. + */ +public class PairHMMReadyHaplotypes implements Iterable { + + + public class Entry { + + private final byte[] bases; + + private double likelihood = Double.NaN; + + protected Entry(final byte[] bases) { + this.bases = bases; + } + + protected byte[] getBases() { + return bases; + } + + public void setLikelihood(final double lk) { + likelihood = lk; + } + + public double getLikelihood() { + return likelihood; + } + + } + + private Map> commonPrefixLength; + + private SortedSet entries; + + private int capacity; + + private final Comparator comparator = new Comparator() { + @Override + public int compare(final Entry o1, final Entry o2) { + final byte[] b1 = o1.bases; + final byte[] b2 = o2.bases; + Map b1map = commonPrefixLength.get(o1); + if (b1map == null) + commonPrefixLength.put(o1, b1map = new HashMap<>(capacity)); + Map b2map = commonPrefixLength.get(o2); + if (b2map == null) + commonPrefixLength.put(o2, b2map = new HashMap<>(capacity)); + final Integer previousI = b1map.get(o2) == null ? null : b1map.get(o2); + int i; + int result; + final int iLimit = Math.min(b1.length,b2.length); + if (previousI == null) { + for (i = 0; i < iLimit; i++) + if (b1[i] != b2[i]) + break; + b1map.put(o2,i); + b2map.put(o1,i); + } else + i = previousI; + + if (i < iLimit) + result = Byte.compare(b1[i],b2[i]); + else if (b1.length == b2.length) + result = 0; + else + result = b1.length < b2.length ?
-1 : 1; + return result; + } + }; + + public PairHMMReadyHaplotypes(final int capacity) { + commonPrefixLength = new HashMap<>(capacity); + entries = new TreeSet<>(comparator); + } + + public void add(final byte[] bases) { + final Entry entry = new Entry(bases); + entries.add(entry); + } + + public int size() { + return entries.size(); + } + + @Override + public Iterator iterator() { + return new Iterator(); + } + + public class Iterator implements java.util.Iterator { + + private java.util.Iterator actualIterator; + private Entry previousEntry; + private Entry currentEntry; + private int startIndex; + private int cmp; + + private Iterator() { + actualIterator = entries.iterator(); + } + + public boolean hasNext() { + return actualIterator.hasNext(); + } + + public Entry next() { + previousEntry = currentEntry; + final Entry result = currentEntry = actualIterator.next(); + startIndex = -1; + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + public byte[] bases() { + if (currentEntry == null) + throw new NoSuchElementException(); + return currentEntry.bases; + } + + public int startIndex() { + if (startIndex >= 0) + return startIndex; + else if (previousEntry == null) + return startIndex = 0; + else { + // The comparator will make sure the common-prefix-length is updated. + // The result is stored in a field so that we avoid dead code elimination. + // Perhaps I am a bit paranoid, but it does no harm to be safe.
+ cmp = comparator.compare(previousEntry,currentEntry); + return startIndex = commonPrefixLength.get(previousEntry).get(currentEntry); + } + } + + @Override + public String toString() { + return super.toString() + " cmp = " + cmp; + } + + public void setLikelihood(final double likelihood) { + if (currentEntry == null) + throw new NoSuchElementException(); + currentEntry.setLikelihood(likelihood); + } + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index f4c673e61..8a034dde0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -46,7 +46,7 @@ import java.util.List; * Time: 8:54:05 AM */ public class PileupElement implements Comparable { - private final static LinkedList EMPTY_LINKED_LIST = new LinkedList(); + private final static LinkedList EMPTY_LINKED_LIST = new LinkedList<>(); private final static EnumSet ON_GENOME_OPERATORS = EnumSet.of(CigarOperator.M, CigarOperator.EQ, CigarOperator.X, CigarOperator.D); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index e1865ba3c..059c41d64 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -270,7 +270,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * Get an array of the mapping qualities * @return */ - public byte[] getMappingQuals(); + public int[] getMappingQuals(); /** * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. 
diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 65c47c23b..455a6aa12 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -969,11 +969,11 @@ public class ReadBackedPileupImpl implements ReadBackedPileup { * @return */ @Override - public byte[] getMappingQuals() { - byte[] v = new byte[getNumberOfElements()]; + public int[] getMappingQuals() { + final int[] v = new int[getNumberOfElements()]; int pos = 0; - for (PileupElement pile : pileupElementTracker) { - v[pos++] = (byte) pile.getRead().getMappingQuality(); + for ( final PileupElement pile : pileupElementTracker ) { + v[pos++] = pile.getRead().getMappingQuality(); } return v; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c39245730..93718b04d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -139,7 +139,7 @@ public class GATKSAMRecord extends BAMRecord { } public static GATKSAMRecord createRandomRead(int length) { - List cigarElements = new LinkedList(); + List cigarElements = new LinkedList<>(); cigarElements.add(new CigarElement(length, CigarOperator.M)); Cigar cigar = new Cigar(cigarElements); return ArtificialSAMUtils.createArtificialRead(cigar); @@ -536,10 +536,7 @@ public class GATKSAMRecord extends BAMRecord { * @return True if an attribute has been set for this key. 
*/ public boolean containsTemporaryAttribute(Object key) { - if(temporaryAttributes != null) { - return temporaryAttributes.containsKey(key); - } - return false; + return temporaryAttributes != null && temporaryAttributes.containsKey(key); } /** @@ -556,7 +553,7 @@ public class GATKSAMRecord extends BAMRecord { */ public Object setTemporaryAttribute(Object key, Object value) { if(temporaryAttributes == null) { - temporaryAttributes = new HashMap(); + temporaryAttributes = new HashMap<>(); } return temporaryAttributes.put(key, value); } @@ -750,6 +747,46 @@ public class GATKSAMRecord extends BAMRecord { return emptyRead; } + /** + * Creates a new GATKSAMRecord with the source read's header, read group and mate + * information, but with the following fields set to user-supplied values: + * - Read Bases + * - Base Qualities + * - Base Insertion Qualities + * - Base Deletion Qualities + * + * Cigar string is empty (not-null) + * + * Use this method if you want to create a new GATKSAMRecord based on + * another GATKSAMRecord, but with modified bases and qualities + * + * @param read a read to copy the header from + * @param readBases an array containing the new bases you wish to use in place of the originals + * @param baseQualities an array containing the new base qualities you wish to use in place of the originals + * @param baseInsertionQualities an array containing the new base insertion qualities + * @param baseDeletionQualities an array containing the new base deletion qualities + * @return a read with modified bases and qualities, safe for the GATK + */ + public static GATKSAMRecord createQualityModifiedRead(final GATKSAMRecord read, + final byte[] readBases, + final byte[] baseQualities, + final byte[] baseInsertionQualities, + final byte[] baseDeletionQualities) { + if ( baseQualities.length != readBases.length || baseInsertionQualities.length != readBases.length || baseDeletionQualities.length != readBases.length ) + throw new IllegalArgumentException("Read bases
and read quality arrays aren't the same size: Bases:" + readBases.length + + " vs Base Q's:" + baseQualities.length + + " vs Insert Q's:" + baseInsertionQualities.length + + " vs Delete Q's:" + baseDeletionQualities.length); + + final GATKSAMRecord processedRead = GATKSAMRecord.emptyRead(read); + processedRead.setReadBases(readBases); + processedRead.setBaseQualities(baseQualities, EventType.BASE_SUBSTITUTION); + processedRead.setBaseQualities(baseInsertionQualities, EventType.BASE_INSERTION); + processedRead.setBaseQualities(baseDeletionQualities, EventType.BASE_DELETION); + + return processedRead; + } + /** * Shallow copy of everything, except for the attribute list and the temporary attributes. * A new list of the attributes is created for both, but the attributes themselves are copied by reference. @@ -762,7 +799,7 @@ public class GATKSAMRecord extends BAMRecord { public Object clone() throws CloneNotSupportedException { final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); if (temporaryAttributes != null) { - clone.temporaryAttributes = new HashMap(); + clone.temporaryAttributes = new HashMap<>(); for (Object attribute : temporaryAttributes.keySet()) clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index ab866013f..39f227840 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -152,8 +152,8 @@ public class ReadUtils { public static SAMFileWriter createSAMFileWriterWithCompression(SAMFileHeader header, boolean presorted, String file, int compression) { validateCompressionLevel(compression); if (file.endsWith(".bam")) - return new SAMFileWriterFactory().makeBAMWriter(header, presorted, new File(file), compression); - return new SAMFileWriterFactory().makeSAMOrBAMWriter(header, 
presorted, new File(file)); + return new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, presorted, new File(file), compression); + return new SAMFileWriterFactory().setCreateIndex(true).makeSAMOrBAMWriter(header, presorted, new File(file)); } public static int validateCompressionLevel(final int requestedCompressionLevel) { diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java index 1abf9f836..e730870c6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SWPairwiseAlignment.java @@ -69,10 +69,20 @@ public class SWPairwiseAlignment implements SmithWaterman { * Add softclips for the overhangs */ SOFTCLIP, + /* * Treat the overhangs as proper insertions/deletions */ INDEL, + + /* + * Treat the overhangs as proper insertions/deletions for leading (but not trailing) overhangs. + * This is useful e.g. when we want to merge dangling tails in an assembly graph: because we don't + * expect the dangling tail to reach the end of the reference path we are okay ignoring trailing + * deletions - but leading indels are still very much relevant. 
+ */ + LEADING_INDEL, + /* * Just ignore the overhangs */ @@ -125,10 +135,11 @@ public class SWPairwiseAlignment implements SmithWaterman { * * @param seq1 the first sequence we want to align * @param seq2 the second sequence we want to align + * @param parameters the SW parameters to use * @param strategy the overhang strategy to use */ - public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final OVERHANG_STRATEGY strategy) { - this(SWParameterSet.ORIGINAL_DEFAULT.parameters); + public SWPairwiseAlignment(final byte[] seq1, final byte[] seq2, final SWParameterSet parameters, final OVERHANG_STRATEGY strategy) { + this(parameters.parameters); overhang_strategy = strategy; align(seq1, seq2); } @@ -226,7 +237,7 @@ public class SWPairwiseAlignment implements SmithWaterman { final int[] gap_size_h = new int[n+1]; // we need to initialize the SW matrix with gap penalties if we want to keep track of indels at the edges of alignments - if ( overhang_strategy == OVERHANG_STRATEGY.INDEL ) { + if ( overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL ) { // initialize the first row sw[1] = parameters.w_open; double currentValue = parameters.w_open; @@ -371,7 +382,7 @@ public class SWPairwiseAlignment implements SmithWaterman { p1 = refLength; p2 = altLength; } else { - // look for largest score. we use >= combined with the traversal direction + // look for the largest score on the rightmost column. 
we use >= combined with the traversal direction // to ensure that if two scores are equal, the one closer to diagonal gets picked for ( int i = 1, data_offset = altLength+1+altLength ; i < refLength+1 ; i++, data_offset += (altLength+1) ) { // data_offset is the offset of [i][m] @@ -380,18 +391,21 @@ public class SWPairwiseAlignment implements SmithWaterman { } } - for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) { - // data_offset is the offset of [n][j] - if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) { - p1 = refLength; - p2 = j ; - maxscore = sw[data_offset]; - segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment + // now look for a larger score on the bottom-most row + if ( overhang_strategy != OVERHANG_STRATEGY.LEADING_INDEL ) { + for ( int j = 1, data_offset = refLength*(altLength+1)+1 ; j < altLength+1 ; j++, data_offset++ ) { + // data_offset is the offset of [n][j] + if ( sw[data_offset] > maxscore || sw[data_offset] == maxscore && Math.abs(refLength-j) < Math.abs(p1 - p2)) { + p1 = refLength; + p2 = j ; + maxscore = sw[data_offset]; + segment_length = altLength - j ; // end of sequence 2 is overhanging; we will just record it as 'M' segment + } } } } - List lce = new ArrayList(5); + final List lce = new ArrayList(5); if ( segment_length > 0 && overhang_strategy == OVERHANG_STRATEGY.SOFTCLIP ) { lce.add(makeElement(State.CLIP, segment_length)); @@ -452,7 +466,7 @@ public class SWPairwiseAlignment implements SmithWaterman { } else if ( overhang_strategy == OVERHANG_STRATEGY.IGNORE ) { lce.add(makeElement(state, segment_length + p2)); alignment_offset = p1 - p2; - } else { // overhang_strategy == OVERHANG_STRATEGY.INDEL + } else { // overhang_strategy == OVERHANG_STRATEGY.INDEL || overhang_strategy == OVERHANG_STRATEGY.LEADING_INDEL // take care of the actual alignment 
lce.add(makeElement(state, segment_length)); diff --git a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java index 3a8afca8c..4cf39d6be 100644 --- a/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java +++ b/public/java/src/org/broadinstitute/sting/utils/smithwaterman/SmithWaterman.java @@ -30,7 +30,7 @@ import net.sf.samtools.Cigar; /** * Generic interface for SmithWaterman calculations * - * This interface allows clients to use a generic SmithWaterman variable, without propogating the specific + * This interface allows clients to use a generic SmithWaterman variable, without propagating the specific * implementation of SmithWaterman throughout their code: * * SmithWaterman sw = new SpecificSmithWatermanImplementation(ref, read, params) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java new file mode 100644 index 000000000..3f00d9fe5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFIndexType.java @@ -0,0 +1,39 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.utils.variant; + +import org.broadinstitute.sting.commandline.EnumerationArgumentDefault; + +/** + * Choose the Tribble indexing strategy + */ +public enum GATKVCFIndexType { + @EnumerationArgumentDefault + DYNAMIC_SEEK, // use DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME) + DYNAMIC_SIZE, // use DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SIZE) + LINEAR, // use LinearIndexCreator() + INTERVAL // use IntervalIndexCreator() +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java index 09db585a6..5a160566e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVCFUtils.java @@ -28,6 +28,12 @@ package org.broadinstitute.sting.utils.variant; import org.broad.tribble.Feature; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.index.DynamicIndexCreator; +import org.broad.tribble.index.IndexCreator; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.index.interval.IntervalIndexCreator; +import org.broad.tribble.index.linear.LinearIndexCreator; +import org.broad.tribble.readers.LineIterator; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import 
org.broadinstitute.sting.gatk.CommandLineGATK; @@ -42,6 +48,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.util.*; + /** * A set of GATK-specific static utility methods for common operations on VCF files/records. */ @@ -54,6 +61,9 @@ public class GATKVCFUtils { public final static String GATK_COMMAND_LINE_KEY = "GATKCommandLine"; + public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type. + public final static Integer DEFAULT_INDEX_PARAMETER = -1; // the default DYNAMIC_SEEK does not use a parameter + /** * Gets the appropriately formatted header for a VCF file describing this GATK run * @@ -74,7 +84,7 @@ public class GATKVCFUtils { attributes.put("Date", date.toString()); attributes.put("Epoch", Long.toString(date.getTime())); attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray())); - return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes, Collections.emptyList()); + return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes); } public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { @@ -174,34 +184,50 @@ public class GATKVCFUtils { return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); } + /** + * Create and return an IndexCreator + * @param type + * @param parameter + * @param outFile + * @return + */ + public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile) { + IndexCreator idxCreator; + switch (type) { + case DYNAMIC_SEEK: idxCreator = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); break; + case DYNAMIC_SIZE: idxCreator = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SIZE); break; + case LINEAR: idxCreator = new LinearIndexCreator(); break; + case INTERVAL: 
idxCreator = new IntervalIndexCreator(); break; + default: throw new IllegalArgumentException("Unknown IndexCreator type: " + type); + } + + idxCreator.initialize(outFile, parameter); + return idxCreator; + } + /** * Utility class to read all of the VC records from a file * - * @param source + * @param file * @param codec * @return * @throws IOException */ - public final static Pair readAllVCs( final File source, final FeatureCodec codec ) throws IOException { + public final static Pair> readAllVCs( final File file, final FeatureCodec codec) throws IOException { // read in the features - PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); - FeatureCodecHeader header = codec.readHeader(pbs); - pbs.close(); - - pbs = new PositionalBufferedStream(new FileInputStream(source)); - pbs.skip(header.getHeaderEnd()); - + SOURCE source = codec.makeSourceFromStream(new FileInputStream(file)); + FeatureCodecHeader header = codec.readHeader(source); final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - return new Pair(vcfHeader, new VCIterable(pbs, codec, vcfHeader)); + return new Pair<>(vcfHeader, new VCIterable<>(source, codec, vcfHeader)); } - public static class VCIterable implements Iterable, Iterator { - final PositionalBufferedStream pbs; - final FeatureCodec codec; + public static class VCIterable implements Iterable, Iterator { + final SOURCE source; + final FeatureCodec codec; final VCFHeader header; - private VCIterable(final PositionalBufferedStream pbs, final FeatureCodec codec, final VCFHeader header) { - this.pbs = pbs; + private VCIterable(final SOURCE source, final FeatureCodec codec, final VCFHeader header) { + this.source = source; this.codec = codec; this.header = header; } @@ -213,17 +239,13 @@ public class GATKVCFUtils { @Override public boolean hasNext() { - try { - return ! pbs.isDone(); - } catch ( IOException e ) { - throw new RuntimeException(e); - } + return ! 
codec.isDone(source); } @Override public VariantContext next() { try { - final VariantContext vc = codec.decode(pbs); + final VariantContext vc = codec.decode(source); return vc == null ? null : vc.fullyDecode(header, false); } catch ( IOException e ) { throw new RuntimeException(e); @@ -249,20 +271,19 @@ public class GATKVCFUtils { final List vcs = new ArrayList(); final VCFCodec codec = new VCFCodec(); PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); - FeatureCodecHeader header = codec.readHeader(pbs); - pbs.close(); + final LineIterator vcfSource = codec.makeSourceFromStream(pbs); + try { + final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(vcfSource); - pbs = new PositionalBufferedStream(new FileInputStream(source)); - pbs.skip(header.getHeaderEnd()); + while (vcfSource.hasNext()) { + final VariantContext vc = codec.decode(vcfSource); + if ( vc != null ) + vcs.add(vc); + } - final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - - while ( ! pbs.isDone() ) { - final VariantContext vc = codec.decode(pbs); - if ( vc != null ) - vcs.add(vc); + return new Pair>(vcfHeader, vcs); + } finally { + codec.close(vcfSource); } - - return new Pair>(vcfHeader, vcs); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 3bc5da82f..1ae34e268 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -45,7 +45,11 @@ public class GATKVariantContextUtils { public static final int DEFAULT_PLOIDY = 2; public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. 
- protected static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + public final static String MERGE_FILTER_PREFIX = "filterIn"; public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; @@ -108,7 +112,7 @@ public class GATKVariantContextUtils { int averageLengthNum = 0; int averageLengthDenom = 0; int refLength = vc.getReference().length(); - for ( Allele a : vc.getAlternateAlleles() ) { + for ( final Allele a : vc.getAlternateAlleles() ) { int numAllele = vc.getCalledChrCount(a); int alleleSize; if ( a.length() == refLength ) { @@ -182,8 +186,8 @@ public class GATKVariantContextUtils { */ public static VariantContext reverseComplement(VariantContext vc) { // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - for ( Allele originalAllele : vc.getAlleles() ) { + HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); + for ( final Allele originalAllele : vc.getAlleles() ) { Allele newAllele; if ( originalAllele.isNoCall() ) newAllele = originalAllele; @@ -195,8 +199,8 @@ public class GATKVariantContextUtils { // create new Genotype objects GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getAlleles() ) { + List newAlleles = new ArrayList<>(); + for ( final Allele allele : genotype.getAlleles() ) { Allele newAllele = alleleMap.get(allele); if ( newAllele == null ) newAllele = Allele.NO_CALL; @@ -267,7 +271,7 @@ 
public class GATKVariantContextUtils { final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList(); + final ArrayList lengths = new ArrayList<>(); for ( final Allele allele : vc.getAlternateAlleles() ) { Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); @@ -317,7 +321,7 @@ public class GATKVariantContextUtils { repetitionCount[0] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; repetitionCount[1] = findNumberofRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - return new Pair(repetitionCount, repeatUnit); + return new Pair<>(repetitionCount, repeatUnit); } @@ -449,7 +453,12 @@ public class GATKVariantContextUtils { * rather than the undetermined behavior when using the PLs to assign, which could result * in hom-var or hom-ref for each, depending on the exact PL values. 
*/ - BEST_MATCH_TO_ORIGINAL + BEST_MATCH_TO_ORIGINAL, + + /** + * do not even bother changing the GTs + */ + DO_NOT_ASSIGN_GENOTYPES } /** @@ -457,8 +466,8 @@ public class GATKVariantContextUtils { * * @param vc variant context with genotype likelihoods * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes true if we should update the genotypes based on the (subsetted) PLs - * @return genotypes + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext */ public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, final List allelesToUse, @@ -466,50 +475,109 @@ public class GATKVariantContextUtils { if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); - // optimization: if no input genotypes, just exit - if (oldGTs.isEmpty()) return newGTs; - - // samples - final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + if (vc.getGenotypes().isEmpty()) return GenotypesContext.create(); // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); - final int numNewAltAlleles = allelesToUse.size() - 1; + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); - // which PLs should be carried forward? 
- ArrayList likelihoodIndexesToUse = null; + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); + } + + /** + * Figure out which likelihood indexes to use for a selected down set of alleles + * + * @param originalVC the original VariantContext + * @param allelesToUse the subset of alleles to use + * @return a list of PL indexes to use or null if none + */ + private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( numNewAltAlleles != numOriginalAltAlleles ) { - likelihoodIndexesToUse = new ArrayList<>(30); + if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) + return null; - final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) - altAlleleIndexToUse[i] = true; - } + return getLikelihoodIndexes(originalVC, alleleIndexesToUse); + } - // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(1 + numOriginalAltAlleles, DEFAULT_PLOIDY); - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) - likelihoodIndexesToUse.add(PLindex); - } + /** + * Get the actual likelihoods indexes to use given the corresponding allele indexes + * + * @param originalVC the original VariantContext + * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) + * @return a non-null List + */ + private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + + final List result = new ArrayList<>(30); + + // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); + + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) + result.add(PLindex); } + return result; + } + + /** + * Given an original VariantContext and a list of alleles from that VC to keep, + * returns a bitset representing which allele indexes should be kept + * + * @param originalVC the original VC + * @param allelesToKeep the list of alleles to keep + * @return non-null bitset + */ + private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { + final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; + final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; + + // the reference Allele is definitely still used + alleleIndexesToKeep[0] = true; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) + alleleIndexesToKeep[i+1] = true; + } + + return alleleIndexesToKeep; + } + + /** + * Create the new GenotypesContext with the subsetted PLs + * + * @param originalGs the original GenotypesContext + * @param vc the original VariantContext + * @param allelesToUse the actual alleles to use with the new Genotypes + * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, + final VariantContext vc, + final List 
allelesToUse, + final List likelihoodIndexesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // make sure we are seeing the expected number of likelihoods per sample + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + // create the new genotypes - for ( int k = 0; k < oldGTs.size(); k++ ) { - final Genotype g = oldGTs.get(sampleIndices.get(k)); + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); final GenotypeBuilder gb = new GenotypeBuilder(g); // create the new likelihoods array from the alleles we are allowed to use @@ -528,7 +596,7 @@ public class GATKVariantContextUtils { } else { newLikelihoods = new double[likelihoodIndexesToUse.size()]; int newIndex = 0; - for ( int oldIndex : likelihoodIndexesToUse ) + for ( final int oldIndex : likelihoodIndexesToUse ) newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; // might need to re-normalize @@ -561,18 +629,21 @@ public class GATKVariantContextUtils { * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null * @param allelesToUse the alleles we are using for our subsetting */ - protected static void updateGenotypeAfterSubsetting(final List originalGT, - final GenotypeBuilder gb, - final GenotypeAssignmentMethod assignmentMethod, - final double[] newLikelihoods, - final List allelesToUse) { - gb.noAD(); + public static void updateGenotypeAfterSubsetting(final List originalGT, + final GenotypeBuilder gb, + final GenotypeAssignmentMethod assignmentMethod, + final double[] newLikelihoods, + final List allelesToUse) { switch ( assignmentMethod ) { + case DO_NOT_ASSIGN_GENOTYPES: + break; case SET_TO_NO_CALL: 
gb.alleles(NO_CALL_ALLELES); + gb.noAD(); gb.noGQ(); break; case USE_PLS_TO_ASSIGN: + gb.noAD(); if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { // if there is no mass on the (new) likelihoods, then just no-call the sample gb.alleles(NO_CALL_ALLELES); @@ -593,6 +664,7 @@ public class GATKVariantContextUtils { } gb.noGQ(); gb.noPL(); + gb.noAD(); gb.alleles(best); break; } @@ -618,7 +690,7 @@ public class GATKVariantContextUtils { if (oldGTs.isEmpty()) return oldGTs; // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(); + final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); final Allele ref = vc.getReference(); final List diploidRefAlleles = Arrays.asList(ref, ref); @@ -718,6 +790,7 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, @@ -728,9 +801,10 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { + final boolean mergeInfoWithMaxAC, + final boolean combineAnnotations ) { int originalNumOfVCs = priorityListOfVCs == null ? 
0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); + return simpleMerge(unsortedVCs, Collections.emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations); } /** @@ -738,11 +812,12 @@ public class GATKVariantContextUtils { * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with * the sample name. * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. * * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ * * @param unsortedVCs collection of unsorted VCs + * @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes @@ -751,9 +826,11 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. 
* @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, + final Collection potentialRefVCs, final List priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, @@ -762,7 +839,8 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { + final boolean mergeInfoWithMaxAC, + final boolean combineAnnotations ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -775,12 +853,16 @@ public class GATKVariantContextUtils { final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary - final List VCs = new ArrayList(); + List VCs = new ArrayList<>(); for (final VariantContext vc : preFilteredVCs) { if ( ! 
filteredAreUncalled || vc.isNotFiltered() ) VCs.add(vc); } + + // cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible + VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs); + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -789,17 +871,18 @@ public class GATKVariantContextUtils { final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - final Set alleles = new LinkedHashSet(); - final Set filters = new HashSet(); - final Map attributes = new LinkedHashMap(); - final Set inconsistentAttributes = new HashSet(); - final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id + final Set alleles = new LinkedHashSet<>(); + final Set filters = new HashSet<>(); + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id VariantContext longestVC = first; int depth = 0; int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap(); + final Map attributesWithMaxAC = new LinkedHashMap<>(); + final Map> annotationMap = new LinkedHashMap<>(); double log10PError = CommonInfo.NO_LOG10_PERROR; boolean anyVCHadFiltersApplied = false; VariantContext vcWithMaxAC = null; @@ -811,7 +894,6 @@ public class GATKVariantContextUtils { boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( final VariantContext vc : VCs ) { if ( longestVC.getStart() != vc.getStart() ) throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + 
vc.toString()); @@ -846,10 +928,10 @@ public class GATKVariantContextUtils { if ( vc.hasID() ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a , separator + // lets see if the string contains a "," separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (String alleleCount : alleleCountArray) { + final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (final String alleleCount : alleleCountArray) { final int ac = Integer.valueOf(alleleCount.trim()); if (ac > maxAC) { maxAC = ac; @@ -866,21 +948,40 @@ public class GATKVariantContextUtils { } for (final Map.Entry p : vc.getAttributes().entrySet()) { - String key = p.getKey(); - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + final String key = p.getKey(); + final Object value = p.getValue(); + boolean badAnnotation = false; + if ( combineAnnotations ) { // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. 
+ if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (NumberFormatException e) { + badAnnotation = true; + } + } + if ( ! combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - if ( alreadyFound && ! boundValue.equals(p.getValue()) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - //System.out.printf("Inconsistent INFO values: %s => %s and %s%n", key, boundValue, p.getValue()); - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - //if ( vc != first ) System.out.printf("Adding key %s => %s%n", p.getKey(), p.getValue()); - attributes.put(key, p.getValue()); + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); + } } } } @@ -906,6 +1007,12 @@ public class GATKVariantContextUtils { // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! 
p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } } // if at least one record was unfiltered and we want a union, clear all of the filters @@ -922,7 +1029,7 @@ public class GATKVariantContextUtils { else if ( variantSources.isEmpty() ) // everyone was reference setValue = MERGE_REF_IN_ALL; else { - final LinkedHashSet s = new LinkedHashSet(); + final LinkedHashSet s = new LinkedHashSet<>(); for ( final VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); @@ -950,7 +1057,12 @@ public class GATKVariantContextUtils { if ( anyVCHadFiltersApplied ) { builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); } - builder.attributes(new TreeMap(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + if( combineAnnotations ) { + // unfortunately some attributes are just too dangerous to try to combine together + builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); + builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + } // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); @@ -958,6 +1070,68 @@ public class GATKVariantContextUtils { return merged; } + private static final Comparable combineAnnotationValues( final List array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible + * @param VCs the list of VCs in which to fill in symbolic alleles + * @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles + * @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate 
allele + */ + protected static final List fillInNonRefSymbolicAlleles( final List VCs, final Collection potentialRefVCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); } + if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); } + + final List VCsToReturn = new ArrayList<>(VCs.size()); + boolean containsNonRefSymbolicAllele = false; + VariantContext nonRefVC = null; + for( final VariantContext vc : VCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + containsNonRefSymbolicAllele = true; + } else if ( nonRefVC == null ) { + nonRefVC = vc; + } + if( nonRefVC != null && containsNonRefSymbolicAllele == true ) { + break; // break out so that we don't run over the whole list unnecessarily + } + } + for( final VariantContext vc : potentialRefVCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + containsNonRefSymbolicAllele = true; + VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below + } + } + + if( !containsNonRefSymbolicAllele ) { + return VCs; + } + + for( final VariantContext vc : VCs ) { + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site + if( nonRefVC != null ) { + final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size()); + int depth = 0; + for( final String sample : vc.getSampleNames() ) { + final Genotype gt = vc.getGenotype(sample); + final ArrayList refAlleles = new ArrayList<>(2); + refAlleles.add(nonRefVC.getReference()); + refAlleles.add(nonRefVC.getReference()); + final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now + depth += ( gt.hasDP() ? 
gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here + genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make()); + } + VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make()); + } + } else { + VCsToReturn.add(vc); + } + } + + return VCsToReturn; + } + private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); @@ -977,7 +1151,7 @@ public class GATKVariantContextUtils { } public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { - GenotypesContext newGs = GenotypesContext.create(genotypes.size()); + final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for ( final Genotype g : genotypes ) { newGs.add(removePLsAndAD(g)); @@ -986,11 +1160,113 @@ public class GATKVariantContextUtils { return newGs; } + /** + * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles + * from the original VariantContext are no longer present. 
+ * + * @param selectedVC the selected (new) VariantContext + * @param originalVC the original VariantContext + * @return a new non-null GenotypesContext + */ + public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { + final int numNewAlleles = selectedVC.getAlleles().size(); + final int numOriginalAlleles = originalVC.getAlleles().size(); + + // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong + if ( numNewAlleles > numOriginalAlleles ) + throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); + + final GenotypesContext oldGs = selectedVC.getGenotypes(); + + // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything + if ( numNewAlleles == numOriginalAlleles ) + return oldGs; + + final GenotypesContext newGs = fixPLsFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); + + return fixADFromSubsettedAlleles(newGs, originalVC, selectedVC.getAlleles()); + } + + /** + * Fix the PLs for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixPLsFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); + 
} + + /** + * Fix the AD for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); + } + + return newGTs; + } + + /** + * Fix the AD for the given Genotype + * + * @param genotype the original Genotype + * @param alleleIndexesToUse a bitset describing whether or not to keep a given index + * @param nAllelesToUse how many alleles we are keeping + * @return a non-null Genotype + */ + private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { + // if it ain't broke don't fix it + if ( !genotype.hasAD() ) + return genotype; + + final GenotypeBuilder builder = new GenotypeBuilder(genotype); + + final int[] oldAD = genotype.getAD(); + if ( oldAD.length != alleleIndexesToUse.length ) { + builder.noAD(); + } else { + final int[] newAD = new int[nAllelesToUse]; + int currentIndex = 0; + for ( int i = 0; i < oldAD.length; i++ ) { + if ( alleleIndexesToUse[i] ) + newAD[currentIndex++] = oldAD[i]; + } + builder.AD(newAD); + } + return builder.make(); + } + static private Allele 
determineReferenceAllele(List VCs) { Allele ref = null; - for ( VariantContext vc : VCs ) { - Allele myRef = vc.getReference(); + for ( final VariantContext vc : VCs ) { + final Allele myRef = vc.getReference(); if ( ref == null || ref.length() < myRef.length() ) ref = myRef; else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) @@ -1024,13 +1300,13 @@ public class GATKVariantContextUtils { // System.out.printf("myref %s%n", myRef ); // System.out.printf("extrabases %s%n", new String(extraBases)); - Map map = new HashMap(); - for ( Allele a : vc.getAlleles() ) { + Map map = new HashMap<>(); + for ( final Allele a : vc.getAlleles() ) { if ( a.isReference() ) map.put(a, refAllele); else { Allele extended = Allele.extend(a, extraBases); - for ( Allele b : allAlleles ) + for ( final Allele b : allAlleles ) if ( extended.equals(b) ) extended = b; // System.out.printf(" Extending %s => %s%n", a, extended); @@ -1050,23 +1326,23 @@ public class GATKVariantContextUtils { throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList(unsortedVCs); + return new ArrayList<>(unsortedVCs); else { - ArrayList sorted = new ArrayList(unsortedVCs); + ArrayList sorted = new ArrayList<>(unsortedVCs); Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); return sorted; } } - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniqifySamples) { + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( Genotype g : oneVC.getGenotypes() ) { - String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniqifySamples); + for ( final Genotype g : oneVC.getGenotypes() ) 
{ + final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); if ( ! mergedGenotypes.containsSample(name) ) { // only add if the name is new Genotype newG = g; - if ( uniqifySamples || alleleMapping.needsRemapping() ) { + if ( uniquifySamples || alleleMapping.needsRemapping() ) { final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); } @@ -1076,8 +1352,8 @@ public class GATKVariantContextUtils { } } - public static String mergedSampleName(String trackName, String sampleName, boolean uniqify ) { - return uniqify ? sampleName + "." + trackName : sampleName; + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { + return uniquify ? sampleName + "." + trackName : sampleName; } /** @@ -1104,8 +1380,8 @@ public class GATKVariantContextUtils { * Trim the alleles in inputVC forward and reverse, as requested * * @param inputVC a non-null input VC whose alleles might need a haircut - * @param trimForward should we trim up the alleles from the foward direction? - * @param trimReverse shold we trim up the alleles from the reverse direction? + * @param trimForward should we trim up the alleles from the forward direction? + * @param trimReverse should we trim up the alleles from the reverse direction? 
* @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles */ @Ensures("result != null") @@ -1140,8 +1416,8 @@ public class GATKVariantContextUtils { if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified return inputVC; - final List alleles = new LinkedList(); - final Map originalToTrimmedAlleleMap = new HashMap(); + final List alleles = new LinkedList<>(); + final Map originalToTrimmedAlleleMap = new HashMap<>(); for (final Allele a : inputVC.getAlleles()) { if (a.isSymbolic()) { @@ -1171,7 +1447,7 @@ public class GATKVariantContextUtils { @Requires("originalGenotypes != null && alleleMapper != null") protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { - final GenotypesContext updatedGenotypes = GenotypesContext.create(); + final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); for ( final Genotype genotype : originalGenotypes ) { final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); @@ -1300,7 +1576,7 @@ public class GATKVariantContextUtils { } private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap(keysToPreserve.size()); + Map attributes = new HashMap<>(keysToPreserve.size()); for ( final String key : keysToPreserve ) { if ( igc.hasAttribute(key) ) attributes.put(key, igc.getAttribute(key)); @@ -1343,7 +1619,7 @@ public class GATKVariantContextUtils { if (!vc1.getReference().equals(vc2.getReference())) return false; - for (Allele a :vc1.getAlternateAlleles()) { + for (final Allele a :vc1.getAlternateAlleles()) { if (!vc2.getAlternateAlleles().contains(a)) return false; } @@ -1351,17 +1627,24 @@ public class GATKVariantContextUtils { return true; } - public static Map> separateVariantContextsByType(Collection VCs) { - HashMap> mappedVCs = new HashMap>(); - for ( VariantContext vc : VCs 
) { + public static Map> separateVariantContextsByType( final Collection VCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } + + final HashMap> mappedVCs = new HashMap<>(); + for ( final VariantContext vc : VCs ) { + VariantContext.Type vcType = vc.getType(); + if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { + if( vc.getAlternateAlleles().size() > 1 ) { throw new IllegalStateException("Reference records should not have more than one alternate allele"); } + vcType = VariantContext.Type.NO_VARIATION; + } // look at previous variant contexts of different type. If: // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) // c) neither: do nothing, just add vc to its own list boolean addtoOwnList = true; - for (VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vc.getType())) + for (final VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vcType)) continue; if (!mappedVCs.containsKey(type)) @@ -1376,9 +1659,9 @@ public class GATKVariantContextUtils { // avoid having empty lists if (vcList.size() == 0) mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(otherVC); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(otherVC); break; } else if (allelesAreSubset(vc,otherVC)) { @@ -1390,9 +1673,9 @@ public class GATKVariantContextUtils { } } if (addtoOwnList) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(vc); } } @@ -1403,10 +1686,10 
@@ public class GATKVariantContextUtils { if ( allowedAttributes == null ) return vc; - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + final Map attrs = new HashMap<>(); + for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { if ( allowedAttributes.contains(attr.getKey()) ) attrs.put(attr.getKey(), attr.getValue()); } @@ -1427,8 +1710,8 @@ public class GATKVariantContextUtils { public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } public List remap(List as) { - List newAs = new ArrayList(); - for ( Allele a : as ) { + List newAs = new ArrayList<>(); + for ( final Allele a : as ) { //System.out.printf(" Remapping %s => %s%n", a, remap(a)); newAs.add(remap(a)); } @@ -1467,7 +1750,7 @@ public class GATKVariantContextUtils { if ( alleleStrings == null || alleleStrings.isEmpty() ) throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); - final List alleles = new LinkedList(); + final List alleles = new LinkedList<>(); final int length = alleleStrings.get(0).length(); boolean first = true; @@ -1503,7 +1786,7 @@ public class GATKVariantContextUtils { if ( ref.length != alt.length ) throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); - final List result = new ArrayList(ref.length); + final List result = new ArrayList<>(ref.length); for ( int i = 0; i < ref.length; i++ ) { @@ -1518,7 +1801,7 @@ public class GATKVariantContextUtils { final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); // create new genotypes with updated alleles - final Map alleleMap = new HashMap(); + 
final Map alleleMap = new HashMap<>(); alleleMap.put(vc.getReference(), newRefAllele); alleleMap.put(vc.getAlternateAllele(0), newAltAllele); final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 5efe2dfc3..c1e11e2ce 100644 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -30,6 +30,8 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.log4j.PatternLayout; import org.apache.log4j.spi.LoggingEvent; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.CommandLineUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.crypt.CryptUtils; @@ -88,6 +90,7 @@ public abstract class BaseTest { public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; + public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; @@ -450,8 +453,8 @@ public abstract class BaseTest { } public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final Pair vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); - final Pair bcfData = 
GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); + final Pair> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); + final Pair> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); } diff --git a/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java b/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java index 8a8faee8b..06bab8fc0 100644 --- a/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/ExampleToCopyUnitTest.java @@ -104,6 +104,42 @@ public class ExampleToCopyUnitTest extends BaseTest { Assert.assertTrue(size >= 0); } + /** + * DataProvider example using a class-based data structure + */ + private class MyDataProviderClass extends TestDataProvider { + private int start; + private int size; + + private MyDataProviderClass(int start, int size) { + super(MyDataProviderClass.class); + this.start = start; + this.size = size; + } + } + + @DataProvider(name = "MyClassBasedDataProvider") + public Object[][] makeMyDataProviderClass() { + // this functionality can be adapted to provide input data for whatever you might want in your data + for ( final int start : Arrays.asList(1, 10, 100) ) { + for ( final int size : Arrays.asList(1, 10, 100, 1000) ) { + new MyDataProviderClass(start, size); + } + } + + return TestDataProvider.getTests(MyDataProviderClass.class); + } + + /** + * Example testng test using MyClassBasedDataProvider + */ + @Test(dataProvider = "MyClassBasedDataProvider") + public void testMyDataProviderClass(MyDataProviderClass testSpec) { + // adaptor this code to do whatever testing you want given the arguments start and size + Assert.assertTrue(testSpec.start >= 0); + Assert.assertTrue(testSpec.size >= 0); + } + /** * A unit test that creates an artificial read for testing some code that uses reads 
*/ diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java index 772c86563..ae7e41dfe 100644 --- a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java +++ b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java @@ -54,7 +54,7 @@ public class TestNGTestTransformer implements IAnnotationTransformer { Method testMethod) { if ( annotation.getTimeOut() == 0 ) { - logger.warn("test " + testMethod.toString() + " has no specified timeout, adding default timeout " + DEFAULT_TIMEOUT / 1000 / 60 + " minutes"); + logger.warn("test " + (testMethod == null ? "" : testMethod.toString()) + " has no specified timeout, adding default timeout " + DEFAULT_TIMEOUT / 1000 / 60 + " minutes"); annotation.setTimeOut(DEFAULT_TIMEOUT); } } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 78f67967b..994a2419c 100644 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -201,6 +201,7 @@ public class WalkerTest extends BaseTest { this.testClass = getCallingTestClass(); } + // @Test(expectedExceptions) doesn't work in integration tests, so use this instead public WalkerTestSpec(String args, int nOutputFiles, Class expectedException) { this.args = args; this.nOutputFiles = nOutputFiles; @@ -388,7 +389,6 @@ public class WalkerTest extends BaseTest { private void executeTest(String testName, String testClassName, String args, Class expectedException) { CommandLineGATK instance = new CommandLineGATK(); String[] command = Utils.escapeExpressions(args); - // run the executable boolean gotAnException = false; try { diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java index 
f08e04c56..29ba95963 100644 --- a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java @@ -33,6 +33,7 @@ import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -946,4 +947,146 @@ public class ParsingEngineUnitTest extends BaseTest { FileUtils.deleteQuietly(argsFile); } } + + private class NumericRangeArgProvider { + @Argument(fullName = "intWithHardMinAndMax", minValue = 5, maxValue = 10) + public int intWithHardMinAndMax; + + @Argument(fullName = "intWithHardMin", minValue = 5) + public int intWithHardMin; + + @Argument(fullName = "intWithHardMax", maxValue = 10) + public int intWithHardMax; + + @Argument(fullName = "intWithSoftMinAndMax", minRecommendedValue = 5, maxRecommendedValue = 10) + public int intWithSoftMinAndMax; + + @Argument(fullName = "intWithSoftMin", minRecommendedValue = 5) + public int intWithSoftMin; + + @Argument(fullName = "intWithSoftMax", maxRecommendedValue = 10) + public int intWithSoftMax; + + @Argument(fullName = "intWithHardAndSoftMinAndMax", minValue = 5, minRecommendedValue = 7, maxValue = 10, maxRecommendedValue = 9) + public int intWithHardAndSoftMinAndMax; + + @Argument(fullName = "intWithHardAndSoftMin", minValue = 5, minRecommendedValue = 7) + public int intWithHardAndSoftMin; + + @Argument(fullName = "intWithHardAndSoftMax", maxValue = 10, maxRecommendedValue = 8) + public int intWithHardAndSoftMax; + + @Argument(fullName = "intWithHardMinAndMaxDefaultOutsideRange", minValue = 5, maxValue = 10) + public int intWithHardMinAndMaxDefaultOutsideRange = -1; + + @Argument(fullName = "integerWithHardMinAndMax", minValue = 5, maxValue = 10) + public Integer integerWithHardMinAndMax; + + 
@Argument(fullName = "byteWithHardMinAndMax", minValue = 5, maxValue = 10) + public byte byteWithHardMinAndMax; + + @Argument(fullName = "byteWithHardMin", minValue = 5) + public byte byteWithHardMin; + + @Argument(fullName = "byteWithHardMax", maxValue = 10) + public byte byteWithHardMax; + + @Argument(fullName = "doubleWithHardMinAndMax", minValue = 5.5, maxValue = 10.0) + public double doubleWithHardMinAndMax; + + @Argument(fullName = "doubleWithHardMin", minValue = 5.5) + public double doubleWithHardMin; + + @Argument(fullName = "doubleWithHardMax", maxValue = 10.0) + public double doubleWithHardMax; + } + + @DataProvider(name = "NumericRangeConstraintViolationDataProvider") + public Object[][] numericRangeConstraintViolationDataProvider() { + return new Object[][] { + { new String[]{"--intWithHardMinAndMax", "11"} }, + { new String[]{"--intWithHardMinAndMax", "4"} }, + { new String[]{"--intWithHardMin", "4"} }, + { new String[]{"--intWithHardMax", "11"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "11"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "4"} }, + { new String[]{"--intWithHardAndSoftMin", "4"} }, + { new String[]{"--intWithHardAndSoftMax", "11"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "11"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "4"} }, + { new String[]{"--integerWithHardMinAndMax", "11"} }, + { new String[]{"--integerWithHardMinAndMax", "4"} }, + { new String[]{"--byteWithHardMinAndMax", "11"} }, + { new String[]{"--byteWithHardMinAndMax", "4"} }, + { new String[]{"--byteWithHardMin", "4"} }, + { new String[]{"--byteWithHardMax", "11"} }, + { new String[]{"--doubleWithHardMinAndMax", "5.4"} }, + { new String[]{"--doubleWithHardMinAndMax", "10.1"} }, + { new String[]{"--doubleWithHardMin", "5.4"} }, + { new String[]{"--doubleWithHardMax", "10.1"} } + }; + } + + @Test(dataProvider = "NumericRangeConstraintViolationDataProvider", + expectedExceptions = 
ArgumentValueOutOfRangeException.class) + public void testNumericRangeWithConstraintViolation( final String[] commandLine ) { + runNumericArgumentRangeTest(commandLine); + } + + @DataProvider(name = "NumericRangeWithoutConstraintViolationDataProvider") + public Object[][] numericRangeWithoutConstraintViolationDataProvider() { + return new Object[][] { + { new String[]{"--intWithHardMinAndMax", "10"} }, + { new String[]{"--intWithHardMinAndMax", "5"} }, + { new String[]{"--intWithHardMinAndMax", "7"} }, + { new String[]{"--intWithHardMin", "11"} }, + { new String[]{"--intWithHardMax", "4"} }, + { new String[]{"--intWithSoftMinAndMax", "11"} }, + { new String[]{"--intWithSoftMinAndMax", "4"} }, + { new String[]{"--intWithSoftMin", "4"} }, + { new String[]{"--intWithSoftMax", "11"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "5"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "7"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "8"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "9"} }, + { new String[]{"--intWithHardAndSoftMinAndMax", "10"} }, + { new String[]{"--intWithHardAndSoftMin", "5"} }, + { new String[]{"--intWithHardAndSoftMin", "6"} }, + { new String[]{"--intWithHardAndSoftMin", "7"} }, + { new String[]{"--intWithHardAndSoftMax", "10"} }, + { new String[]{"--intWithHardAndSoftMax", "9"} }, + { new String[]{"--intWithHardAndSoftMax", "8"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "10"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "5"} }, + { new String[]{"--intWithHardMinAndMaxDefaultOutsideRange", "7"} }, + { new String[]{"--integerWithHardMinAndMax", "10"} }, + { new String[]{"--integerWithHardMinAndMax", "5"} }, + { new String[]{"--byteWithHardMinAndMax", "10"} }, + { new String[]{"--byteWithHardMinAndMax", "5"} }, + { new String[]{"--byteWithHardMinAndMax", "7"} }, + { new String[]{"--byteWithHardMin", "5"} }, + { new String[]{"--byteWithHardMax", "10"} }, + { new 
String[]{"--doubleWithHardMinAndMax", "5.5"} }, + { new String[]{"--doubleWithHardMinAndMax", "10.0"} }, + { new String[]{"--doubleWithHardMinAndMax", "7.5"} }, + { new String[]{"--doubleWithHardMin", "5.5"} }, + { new String[]{"--doubleWithHardMin", "15.5"} }, + { new String[]{"--doubleWithHardMax", "10.0"} }, + { new String[]{"--doubleWithHardMax", "7.5"} } + }; + } + + @Test(dataProvider = "NumericRangeWithoutConstraintViolationDataProvider") + public void testNumericRangeWithoutConstraintViolation( final String[] commandLine ) { + // These tests succeed if no exception is thrown, since no constraints have been violated + runNumericArgumentRangeTest(commandLine); + } + + private void runNumericArgumentRangeTest( final String[] commandLine ) { + parsingEngine.addArgumentSource(NumericRangeArgProvider.class); + parsingEngine.parse(commandLine); + + NumericRangeArgProvider argProvider = new NumericRangeArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index aca6cf984..f1839e6ac 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -26,9 +26,10 @@ package org.broadinstitute.sting.gatk; import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; -import org.broad.tribble.readers.AsciiLineReader; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; @@ -48,10 +49,8 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import 
org.testng.annotations.Test; -import java.io.File; -import java.io.FileInputStream; -import java.io.PrintStream; -import java.util.Arrays; +import java.io.*; +import java.util.*; /** * @@ -216,7 +215,8 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { 1, Arrays.asList("")); spec.disableShadowBCF(); final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0); - final VCFHeader header = (VCFHeader)new VCFCodec().readHeader(new AsciiLineReader(new FileInputStream(vcf))); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); final VCFHeaderLine versionLine = header.getMetaDataLine(GATKVCFUtils.GATK_COMMAND_LINE_KEY); Assert.assertNotNull(versionLine); Assert.assertTrue(versionLine.toString().contains("SelectVariants")); @@ -230,7 +230,8 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { 1, Arrays.asList("")); spec.disableShadowBCF(); final File vcf = executeTest("testMultipleGATKVersionsInVCF", spec).first.get(0); - final VCFHeader header = (VCFHeader)new VCFCodec().readHeader(new AsciiLineReader(new FileInputStream(vcf))); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); boolean foundHC = false; boolean foundSV = false; @@ -278,6 +279,12 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, "")); } + // -------------------------------------------------------------------------------- + // + // Test engine-level cigar consolidation + // + // -------------------------------------------------------------------------------- + @Test public void testGATKEngineConsolidatesCigars() { final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + @@ -297,4 +304,232 @@ public class EngineFeaturesIntegrationTest extends 
WalkerTest { // Original cigar was 0M3M0M8M. Check that it's been consolidated after running through the GATK engine: Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine"); } + + // -------------------------------------------------------------------------------- + // + // Test on-the-fly sample renaming + // + // -------------------------------------------------------------------------------- + + // On-the-fly sample renaming test case: one single-sample bam with multiple read groups + @Test + public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId())); + } + + reader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam 
newSampleFor12891", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + final String newSampleName = String.format("newSampleFor%s", inputBamID); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam, + 
// performing renaming in only SOME of the bams + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException { + // Rename samples for NA12878 and NA12892, but not for NA12891 + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + + // Special-case NA12891, which we're not renaming: + final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID); + + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + 
Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: two single-sample bams with read group collisions + @Test + public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878")); + + final Set na12878ReadGroups = new HashSet<>(); + final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + na12878ReadGroups.add(readGroup.getId()); + } + inputBamReader.close(); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + String expectedSampleName 
= ""; + if ( na12878ReadGroups.contains(readGroup.getId()) ) { + expectedSampleName = "newSampleFor12878"; + } + else { + expectedSampleName = "newSampleForNot12878"; + } + + Assert.assertEquals(readGroup.getSample(), expectedSampleName, + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException) + @Test + public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec); + } + + // On-the-fly sample renaming test case: ensure that walkers can see the remapped sample names in individual reads + @Test + public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingTestWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " --newSampleName myNewSampleName" + + " -L 
20:10000000-10001000", + 1, Arrays.asList("")); + + // Test is a success if our custom walker doesn't throw an exception + executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker { + @Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true) + String newSampleName = null; + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( ! newSampleName.equals(read.getReadGroup().getSample()) ) { + throw new IllegalStateException(String.format("Encountered read with the wrong sample name. 
Expected %s found %s", + newSampleName, read.getReadGroup().getSample())); + } + + return 1; + } + + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 3f74e0eae..84bc6e080 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -42,17 +42,16 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; /** * Tests selected functionality in the GenomeAnalysisEngine class */ public class GenomeAnalysisEngineUnitTest extends BaseTest { - @Test(expectedExceptions=ArgumentException.class) + @Test(expectedExceptions=UserException.class) public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); @@ -64,7 +63,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.checkForDuplicateSamFiles(); } - @Test(expectedExceptions=ArgumentException.class) + @Test(expectedExceptions=UserException.class) public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); @@ -78,6 +77,20 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.checkForDuplicateSamFiles(); } + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + final File 
relativePathToBAMFile = new File("public/testdata/exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); + samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + @Test public void testEmptyIntervalSetHandling() throws Exception { GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); @@ -90,6 +103,64 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.validateSuppliedIntervals(); } + @Test + public void testLoadWellFormedSampleRenameMapFile() throws IOException { + final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", + "/foo/bar/second.bam newSample2", + "/foo/bar2/third.bam newSample3")); + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + + Assert.assertEquals(renameMap.size(), 3, "Sample rename map was wrong size after loading from file"); + + final Iterator expectedResultsIterator = Arrays.asList("/foo/bar/first.bam", "newSample1", "/foo/bar/second.bam", "newSample2", "/foo/bar2/third.bam", "newSample3").iterator(); + while ( expectedResultsIterator.hasNext() ) { + final String expectedKey = expectedResultsIterator.next(); + final String expectedValue = expectedResultsIterator.next(); + + Assert.assertNotNull(renameMap.get(new SAMReaderID(expectedKey, new Tags())), String.format("Entry for %s not found in sample rename map", expectedKey)); + Assert.assertEquals(renameMap.get(new SAMReaderID(expectedKey, new Tags())), expectedValue, "Wrong value in sample rename map for " + expectedKey); + } + } + + @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") + public Object[][] 
generateMalformedSampleRenameMapFiles() throws IOException { + final List tests = new ArrayList(); + + tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", + new File("/foo/bar/nonexistent")}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine1", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine2", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam newSample extraField"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", + createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", + createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", + "/path/to/dupe.bam newSample2"))}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) + public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { + logger.info("Executing test " + testName); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } /////////////////////////////////////////////////// // Test the ReadTransformer ordering enforcement // diff --git a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java index 56725147e..02d0c66b9 100644 --- 
a/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/ReadMetricsUnitTest.java @@ -158,7 +158,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); @@ -193,7 +193,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); @@ -234,7 +234,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); @@ -281,7 +281,7 @@ public class ReadMetricsUnitTest extends BaseTest { new ValidationExclusion(), filters, new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 8d33aa8b6..52285fb2e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -183,7 +183,8 @@ public class SAMDataSourceUnitTest extends BaseTest { false, (byte) -1, removeProgramRecords, - false); + false, + null); List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: 
default program records differ from removeProgramRecords = false"); @@ -203,7 +204,8 @@ public class SAMDataSourceUnitTest extends BaseTest { false, (byte) -1, removeProgramRecords, - false); + false, + null); List doRemoveProgramRecords = data.getHeader().getProgramRecords(); assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java new file mode 100644 index 000000000..a594573e5 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMReaderIDUnitTest.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class SAMReaderIDUnitTest extends BaseTest { + + @Test + public void testSAMReaderIDHashingAndEquality() { + // Test to make sure that two SAMReaderIDs that point at the same file via an absolute vs. relative + // path are equal according to equals() and have the same hash code + final File relativePathToBAMFile = new File("public/testdata/exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + final SAMReaderID relativePathSAMReaderID = new SAMReaderID(relativePathToBAMFile, new Tags()); + final SAMReaderID absolutePathSAMReaderID = new SAMReaderID(absolutePathToBAMFile, new Tags()); + + Assert.assertEquals(relativePathSAMReaderID, absolutePathSAMReaderID, "Absolute-path and relative-path SAMReaderIDs not equal according to equals()"); + Assert.assertEquals(relativePathSAMReaderID.hashCode(), absolutePathSAMReaderID.hashCode(), "Absolute-path and relative-path SAMReaderIDs have different hash codes"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java index adcc21291..988355813 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/utils/TestFeatureReader.java @@ -34,7 +34,7 @@ import java.io.IOException; /** * Feature reader with additional test utilities. The iterators can be checked to see if they are closed. 
*/ -public class TestFeatureReader extends TribbleIndexedFeatureReader { +public class TestFeatureReader extends TribbleIndexedFeatureReader { public TestFeatureReader(String featurePath, FeatureCodec codec) throws IOException { super(featurePath, codec, true); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index e4b6c37cc..30c0c83b5 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -481,7 +481,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { new ValidationExclusion(), new ArrayList(), new ArrayList(), - false, (byte)30, false, true); + false, (byte)30, false, true, null); engine.setReadsDataSource(dataSource); final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java new file mode 100644 index 000000000..24a274cee --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SnpEffUtilUnitTest.java @@ -0,0 +1,111 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.annotator; + +/** + * Created with IntelliJ IDEA. + * User: farjoun + * Date: 6/5/13 + * Time: 2:31 PM + * To change this template use File | Settings | File Templates. + */ + + +import org.broadinstitute.sting.gatk.walkers.annotator.SnpEff.EffectType; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + +public class SnpEffUtilUnitTest { + + + @DataProvider(name="effects") + public Object[][] childParentpairs() { + List tests = new ArrayList(); + + tests.add(new Object[]{EffectType.GENE,EffectType.CHROMOSOME}); + tests.add(new Object[]{EffectType.UTR_3_PRIME,EffectType.TRANSCRIPT}); + tests.add(new Object[]{EffectType.CODON_CHANGE,EffectType.CDS}); + tests.add(new Object[]{EffectType.STOP_GAINED,EffectType.EXON}); + tests.add(new Object[]{EffectType.SYNONYMOUS_START,EffectType.TRANSCRIPT}); + tests.add(new Object[]{EffectType.FRAME_SHIFT,EffectType.CDS}); + tests.add(new Object[]{EffectType.UPSTREAM,EffectType.INTERGENIC}); + tests.add(new Object[]{EffectType.SPLICE_SITE_DONOR,EffectType.INTRON}); + tests.add(new Object[]{EffectType.SPLICE_SITE_ACCEPTOR,EffectType.INTRON}); + tests.add(new Object[]{EffectType.STOP_LOST,EffectType.NON_SYNONYMOUS_CODING}); + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name="self") + public Object[][] childEqualsParentpairs() { + List 
tests = new ArrayList(); + + for(EffectType type:EffectType.values()){ + tests.add(new Object[]{type,type}); + } + return tests.toArray(new Object[][]{}); + } + + @DataProvider(name="noneffects") + public Object[][] nonchildParentpairs() { + List tests = new ArrayList(); + + tests.add(new Object[]{EffectType.START_GAINED,EffectType.NON_SYNONYMOUS_CODING}); + tests.add(new Object[]{EffectType.GENE,EffectType.NONE}); + tests.add(new Object[]{EffectType.UTR_3_PRIME,EffectType.CDS}); + tests.add(new Object[]{EffectType.CODON_CHANGE,EffectType.REGULATION}); + tests.add(new Object[]{EffectType.DOWNSTREAM,EffectType.REGULATION}); + tests.add(new Object[]{EffectType.SPLICE_SITE_ACCEPTOR,EffectType.EXON}); + tests.add(new Object[]{EffectType.START_GAINED,EffectType.SYNONYMOUS_START}); + tests.add(new Object[]{EffectType.NON_SYNONYMOUS_CODING,EffectType.DOWNSTREAM}); + tests.add(new Object[]{EffectType.CODON_DELETION,EffectType.INTRON}); + tests.add(new Object[]{EffectType.UTR_5_PRIME,EffectType.EXON_DELETED}); + tests.add(new Object[]{EffectType.INTRON,EffectType.NONE}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "effects") + public void testSubType(EffectType subType,EffectType parentType) { + Assert.assertTrue(SnpEffUtil.isSubTypeOf(subType,parentType),String.format("testing that %s is subtype of %s.",subType,parentType)); + } + @Test(dataProvider = "self") + public void testSubTypeSelf(EffectType subType,EffectType parentType) { + Assert.assertTrue(SnpEffUtil.isSubTypeOf(subType,parentType),String.format("testing that %s is subtype of %s.",subType,parentType)); + } + @Test(dataProvider = "effects") + public void testNonSubTypeSelf(EffectType parentType,EffectType subType) { + Assert.assertTrue(!SnpEffUtil.isSubTypeOf(subType,parentType),String.format("testing that %s is subtype of %s.",subType,parentType)); + } + @Test(dataProvider = "noneffects") + public void testNonSubType(EffectType subType,EffectType parentType) { + 
Assert.assertTrue(!SnpEffUtil.isSubTypeOf(subType, parentType), String.format("testing that %s is NOT subtype of %s.", subType, parentType)); + Assert.assertTrue(!SnpEffUtil.isSubTypeOf(parentType,subType), String.format("testing that %s is NOT subtype of %s.", parentType,subType)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java new file mode 100644 index 000000000..83d571748 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariantsUnitTest.java @@ -0,0 +1,54 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; + + +public class FilterLiftedVariantsUnitTest extends BaseTest { + + @Test + public void testIndelAtEndOfContig() { + + final List alleles = new ArrayList<>(2); + alleles.add(Allele.create("AAAAA", true)); + alleles.add(Allele.create("A", false)); + final VariantContext vc = new VariantContextBuilder("test", "1", 10, 14, alleles).make(); + + final FilterLiftedVariants filter = new FilterLiftedVariants(); + + Assert.assertFalse(filter.filterOrWrite(new byte[]{'A'}, vc)); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java index 9621aecda..b94b6cda1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocParserUnitTest.java @@ -66,7 +66,7 @@ public class GenomeLocParserUnitTest extends BaseTest { private SAMFileHeader header; @BeforeClass - public void init() { + public void init() { header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index f2718fb8c..de049fe89 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -500,7 +500,7 @@ public class MathUtilsUnitTest extends BaseTest { 
@DataProvider(name = "MedianData") public Object[][] makeMedianData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{Arrays.asList(10), 10}); @@ -510,12 +510,350 @@ public class MathUtilsUnitTest extends BaseTest { tests.add(new Object[]{values, 1}); } + for ( final List values : Utils.makePermutations(Arrays.asList(1.1,2.1,-3.1), 3, false) ) { + tests.add(new Object[]{values, 1.1}); + } + return tests.toArray(new Object[][]{}); } @Test(dataProvider = "MedianData") - public void testMedian(final List values, final int expected) { - final int actual = MathUtils.median(values); + public void testMedian(final List values, final Comparable expected) { + final Comparable actual = MathUtils.median(values); Assert.assertEquals(actual, expected, "Failed with " + values); } + + + + // man. All this to test dirichlet. + + private double[] unwrap(List stuff) { + double[] unwrapped = new double[stuff.size()]; + int idx = 0; + for ( Double d : stuff ) { + unwrapped[idx++] = d == null ? 0.0 : d; + } + + return unwrapped; + } + + /** + * The PartitionGenerator generates all of the partitions of a number n, e.g. 
+ * 5 + 0 + * 4 + 1 + * 3 + 2 + * 3 + 1 + 1 + * 2 + 2 + 1 + * 2 + 1 + 1 + 1 + * 1 + 1 + 1 + 1 + 1 + * + * This is used to help enumerate the state space over which the Dirichlet-Multinomial is defined, + * to ensure that the distribution function is properly implemented + */ + class PartitionGenerator implements Iterator> { + // generate the partitions of an integer, each partition sorted numerically + int n; + List a; + int y; + int k; + int state; + int x; + int l; + + public PartitionGenerator(int n) { + this.n = n; + this.y = n - 1; + this.k = 1; + this.a = new ArrayList(); + for ( int i = 0; i < n; i++ ) { + this.a.add(i); + } + this.state = 0; + } + + public void remove() { /* do nothing */ } + + public boolean hasNext() { return ! ( this.k == 0 && state == 0 ); } + + private String dataStr() { + return String.format("a = [%s] k = %d y = %d state = %d x = %d l = %d", + Utils.join(",",a), k, y, state, x, l); + } + + public List next() { + if ( this.state == 0 ) { + this.x = a.get(k-1)+1; + k -= 1; + this.state = 1; + } + + if ( this.state == 1 ) { + while ( 2*x <= y ) { + this.a.set(k,x); + this.y -= x; + this.k++; + } + this.l = 1+this.k; + this.state = 2; + } + + if ( this.state == 2 ) { + if ( x <= y ) { + this.a.set(k,x); + this.a.set(l,y); + x += 1; + y -= 1; + return this.a.subList(0, this.k + 2); + } else { + this.state =3; + } + } + + if ( this.state == 3 ) { + this.a.set(k,x+y); + this.y = x + y - 1; + this.state = 0; + return a.subList(0, k + 1); + } + + throw new IllegalStateException("Cannot get here"); + } + + public String toString() { + StringBuffer buf = new StringBuffer(); + buf.append("{ "); + while ( hasNext() ) { + buf.append("["); + buf.append(Utils.join(",",next())); + buf.append("],"); + } + buf.deleteCharAt(buf.lastIndexOf(",")); + buf.append(" }"); + return buf.toString(); + } + + } + + /** + * NextCounts is the enumerator over the state space of the multinomial dirichlet. 
+ * + * It filters the partition of the total sum to only those with a number of terms + * equal to the number of categories. + * + * It then generates all permutations of that partition. + * + * In so doing it enumerates over the full state space. + */ + class NextCounts implements Iterator { + + private PartitionGenerator partitioner; + private int numCategories; + private int[] next; + + public NextCounts(int numCategories, int totalCounts) { + partitioner = new PartitionGenerator(totalCounts); + this.numCategories = numCategories; + next = nextFromPartitioner(); + } + + public void remove() { /* do nothing */ } + + public boolean hasNext() { return next != null; } + + public int[] next() { + int[] toReturn = clone(next); + next = nextPermutation(); + if ( next == null ) { + next = nextFromPartitioner(); + } + + return toReturn; + } + + private int[] clone(int[] arr) { + int[] a = new int[arr.length]; + for ( int idx = 0; idx < a.length ; idx ++) { + a[idx] = arr[idx]; + } + + return a; + } + + private int[] nextFromPartitioner() { + if ( partitioner.hasNext() ) { + List nxt = partitioner.next(); + while ( partitioner.hasNext() && nxt.size() > numCategories ) { + nxt = partitioner.next(); + } + + if ( nxt.size() > numCategories ) { + return null; + } else { + int[] buf = new int[numCategories]; + for ( int idx = 0; idx < nxt.size(); idx++ ) { + buf[idx] = nxt.get(idx); + } + Arrays.sort(buf); + return buf; + } + } + + return null; + } + + public int[] nextPermutation() { + return MathUtilsUnitTest.nextPermutation(next); + } + + } + + public static int[] nextPermutation(int[] next) { + // the counts can swap among each other. 
The int[] is originally in ascending order + // this generates the next array in lexicographic order descending + + // locate the last occurrence where next[k] < next[k+1] + int gt = -1; + for ( int idx = 0; idx < next.length-1; idx++) { + if ( next[idx] < next[idx+1] ) { + gt = idx; + } + } + + if ( gt == -1 ) { + return null; + } + + int largestLessThan = gt+1; + for ( int idx = 1 + largestLessThan; idx < next.length; idx++) { + if ( next[gt] < next[idx] ) { + largestLessThan = idx; + } + } + + int val = next[gt]; + next[gt] = next[largestLessThan]; + next[largestLessThan] = val; + + // reverse the tail of the array + int[] newTail = new int[next.length-gt-1]; + int ctr = 0; + for ( int idx = next.length-1; idx > gt; idx-- ) { + newTail[ctr++] = next[idx]; + } + + for ( int idx = 0; idx < newTail.length; idx++) { + next[gt+idx+1] = newTail[idx]; + } + + return next; + } + + + // before testing the dirichlet multinomial, we need to test the + // classes used to test the dirichlet multinomial + + @Test + public void testPartitioner() { + int[] numsToTest = new int[]{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + int[] expectedSizes = new int[]{1, 2, 3, 5, 7, 11, 15, 22, 30, 42, 56, 77, 101, 135, 176, 231, 297, 385, 490, 627}; + for ( int testNum = 0; testNum < numsToTest.length; testNum++ ) { + PartitionGenerator gen = new PartitionGenerator(numsToTest[testNum]); + int size = 0; + while ( gen.hasNext() ) { + logger.debug(gen.dataStr()); + size += 1; + gen.next(); + } + Assert.assertEquals(size,expectedSizes[testNum], + String.format("Expected %d partitions, observed %s",expectedSizes[testNum],new PartitionGenerator(numsToTest[testNum]).toString())); + } + } + + @Test + public void testNextPermutation() { + int[] arr = new int[]{1,2,3,4}; + int[][] gens = new int[][] { + new int[]{1,2,3,4}, + new int[]{1,2,4,3}, + new int[]{1,3,2,4}, + new int[]{1,3,4,2}, + new int[]{1,4,2,3}, + new int[]{1,4,3,2}, + new int[]{2,1,3,4}, + new int[]{2,1,4,3}, + new 
int[]{2,3,1,4}, + new int[]{2,3,4,1}, + new int[]{2,4,1,3}, + new int[]{2,4,3,1}, + new int[]{3,1,2,4}, + new int[]{3,1,4,2}, + new int[]{3,2,1,4}, + new int[]{3,2,4,1}, + new int[]{3,4,1,2}, + new int[]{3,4,2,1}, + new int[]{4,1,2,3}, + new int[]{4,1,3,2}, + new int[]{4,2,1,3}, + new int[]{4,2,3,1}, + new int[]{4,3,1,2}, + new int[]{4,3,2,1} }; + for ( int gen = 0; gen < gens.length; gen ++ ) { + for ( int idx = 0; idx < 3; idx++ ) { + Assert.assertEquals(arr[idx],gens[gen][idx], + String.format("Error at generation %d, expected %s, observed %s",gen,Arrays.toString(gens[gen]),Arrays.toString(arr))); + } + arr = nextPermutation(arr); + } + } + + private double[] addEpsilon(double[] counts) { + double[] d = new double[counts.length]; + for ( int i = 0; i < counts.length; i ++ ) { + d[i] = counts[i] + 1e-3; + } + return d; + } + + @Test + public void testDirichletMultinomial() { + List testAlleles = Arrays.asList( + new double[]{80,240}, + new double[]{1,10000}, + new double[]{0,500}, + new double[]{5140,20480}, + new double[]{5000,800,200}, + new double[]{6,3,1000}, + new double[]{100,400,300,800}, + new double[]{8000,100,20,80,2}, + new double[]{90,20000,400,20,4,1280,720,1} + ); + + Assert.assertTrue(! Double.isInfinite(MathUtils.log10Gamma(1e-3)) && ! Double.isNaN(MathUtils.log10Gamma(1e-3))); + + int[] numAlleleSampled = new int[]{2,5,10,20,25}; + for ( double[] alleles : testAlleles ) { + for ( int count : numAlleleSampled ) { + // test that everything sums to one. Generate all multinomial draws + List likelihoods = new ArrayList(100000); + NextCounts generator = new NextCounts(alleles.length,count); + double maxLog = Double.MIN_VALUE; + //List countLog = new ArrayList(200); + while ( generator.hasNext() ) { + int[] thisCount = generator.next(); + //countLog.add(Arrays.toString(thisCount)); + Double likelihood = MathUtils.dirichletMultinomial(addEpsilon(alleles),thisCount); + Assert.assertTrue(! Double.isNaN(likelihood) && ! 
Double.isInfinite(likelihood), + String.format("Likelihood for counts %s and nAlleles %d was %s", + Arrays.toString(thisCount),alleles.length,Double.toString(likelihood))); + if ( likelihood > maxLog ) + maxLog = likelihood; + likelihoods.add(likelihood); + } + //System.out.printf("%d likelihoods and max is (probability) %e\n",likelihoods.size(),Math.pow(10,maxLog)); + Assert.assertEquals(MathUtils.sumLog10(unwrap(likelihoods)),1.0,1e-7, + String.format("Counts %d and alleles %d have nLikelihoods %d. \n Counts: %s", + count,alleles.length,likelihoods.size(), "NODEBUG"/*,countLog*/)); + } + } + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java index 2470364c4..fb238ef54 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -31,6 +31,7 @@ package org.broadinstitute.sting.utils.activeregion; import net.sf.picard.reference.ReferenceSequenceFile; import org.apache.commons.lang.ArrayUtils; +import org.broad.tribble.readers.LineIterator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -48,7 +49,10 @@ import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; public class BandPassActivityProfileUnitTest extends BaseTest { @@ -261,7 +265,7 @@ public class BandPassActivityProfileUnitTest extends BaseTest { final File file = new File(path); final VCFCodec codec = new VCFCodec(); - final Pair reader = GATKVCFUtils.readAllVCs(file, codec); + final Pair> reader = GATKVCFUtils.readAllVCs(file, 
codec); final List incRegions = new ArrayList(); final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java index 5cd13b818..1ac79dcf3 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java @@ -26,7 +26,9 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.annotation.Strand; -import org.broad.tribble.readers.AsciiLineReader; +import org.broad.tribble.readers.LineIterator; +import org.broad.tribble.readers.LineIteratorImpl; +import org.broad.tribble.readers.LineReaderUtil; import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; @@ -53,15 +55,13 @@ public class HapMapUnitTest extends BaseTest { @Test public void testReadHeader() { RawHapMapCodec codec = new RawHapMapCodec(); - AsciiLineReader reader = getReader(); + final LineIterator reader = getLineIterator(); try { - String header = reader.readLine(); - reader.close(); - Assert.assertTrue(header.equals(codec.readHeader(getReader()))); - } catch (IOException e) { - Assert.fail("Unable to read from file " + hapMapFile); + String header = reader.next(); + Assert.assertTrue(header.equals(codec.readActualHeader(getLineIterator()))); + } finally { + codec.close(reader); } - reader.close(); } @Test @@ -114,22 +114,20 @@ public class HapMapUnitTest extends BaseTest { public void testReadCorrectNumberOfRecords() { // setup the record for reading our 500 line file (499 records, 1 header line) RawHapMapCodec codec = new RawHapMapCodec(); - AsciiLineReader reader = getReader(); + final LineIterator reader = getLineIterator(); - String line; int count = 0; try { codec.readHeader(reader); - 
line = reader.readLine(); - while (line != null) { - codec.decode(line); - line = reader.readLine(); + while (reader.hasNext()) { + codec.decode(reader.next()); ++count; } } catch (IOException e) { Assert.fail("IOException " + e.getMessage()); + } finally { + codec.close(reader); } - reader.close(); Assert.assertEquals(count,499); } @@ -137,25 +135,26 @@ public class HapMapUnitTest extends BaseTest { public void testGetSampleNames() { // setup the record for reading our 500 line file (499 records, 1 header line) RawHapMapCodec codec = new RawHapMapCodec(); - AsciiLineReader reader = getReader(); + final LineIterator reader = getLineIterator(); String line; try { codec.readHeader(reader); - line = reader.readLine(); + line = reader.next(); RawHapMapFeature feature = (RawHapMapFeature) codec.decode(line); Assert.assertEquals(feature.getSampleIDs().length,87); } catch (IOException e) { Assert.fail("IOException " + e.getMessage()); + } finally { + codec.close(reader); } - reader.close(); } - public AsciiLineReader getReader() { + public LineIterator getLineIterator() { try { - return new AsciiLineReader(new PositionalBufferedStream(new FileInputStream(hapMapFile))); + return new LineIteratorImpl(LineReaderUtil.fromBufferedStream(new PositionalBufferedStream(new FileInputStream(hapMapFile)))); } catch (FileNotFoundException e) { Assert.fail("Unable to open hapmap file : " + hapMapFile); } diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java index e2e253d0f..93de7c9cf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -224,7 +224,7 @@ public class FragmentUtilsUnitTest extends BaseTest { } @Test(enabled = !DEBUG, dataProvider = "MergeFragmentsTest") - public void testMergingTwoReads(final String name, 
final GATKSAMRecord read1, GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { + public void testMergingTwoReads(final String name, final GATKSAMRecord read1, final GATKSAMRecord read2, final GATKSAMRecord expectedMerged) { final GATKSAMRecord actual = FragmentUtils.mergeOverlappingPairedFragments(read1, read2); if ( expectedMerged == null ) { @@ -349,4 +349,42 @@ public class FragmentUtilsUnitTest extends BaseTest { read.setReadGroup(new GATKSAMReadGroupRecord("foo")); return read; } + + + private static final byte highQuality = 30; + private static final byte overlappingQuality = 20; + + @DataProvider(name = "AdjustFragmentsTest") + public Object[][] createAdjustFragmentsTest() throws Exception { + List tests = new ArrayList(); + + final String leftFlank = "CCC"; + final String rightFlank = "AAA"; + final String allOverlappingBases = "ACGTACGTGGAACCTTAG"; + for ( int overlapSize = 1; overlapSize < allOverlappingBases.length(); overlapSize++ ) { + final String overlappingBases = allOverlappingBases.substring(0, overlapSize); + final byte[] overlappingBaseQuals = new byte[overlapSize]; + for ( int i = 0; i < overlapSize; i++ ) overlappingBaseQuals[i] = highQuality; + final GATKSAMRecord read1 = makeOverlappingRead(leftFlank, highQuality, overlappingBases, overlappingBaseQuals, "", highQuality, 1); + final GATKSAMRecord read2 = makeOverlappingRead("", highQuality, overlappingBases, overlappingBaseQuals, rightFlank, highQuality, leftFlank.length() + 1); + tests.add(new Object[]{read1, read2, overlapSize}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "AdjustFragmentsTest") + public void testAdjustingTwoReads(final GATKSAMRecord read1, final GATKSAMRecord read2, final int overlapSize) { + FragmentUtils.adjustQualsOfOverlappingPairedFragments(read1, read2); + + for ( int i = 0; i < read1.getReadLength() - overlapSize; i++ ) + Assert.assertEquals(read1.getBaseQualities()[i], highQuality); + for ( int i = 
read1.getReadLength() - overlapSize; i < read1.getReadLength(); i++ ) + Assert.assertEquals(read1.getBaseQualities()[i], overlappingQuality); + + for ( int i = 0; i < overlapSize; i++ ) + Assert.assertEquals(read2.getBaseQualities()[i], overlappingQuality); + for ( int i = overlapSize; i < read2.getReadLength(); i++ ) + Assert.assertEquals(read2.getBaseQualities()[i], highQuality); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index 18fa8a302..02b11b970 100644 --- a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -296,7 +296,6 @@ public class ReadBackedPileupUnitTest { testRBPCounts(pileup, new RBPCountTest(params.nReads + 2, params.nMapq0 + 1, params.nDeletions + 1)); } - private void testRBPCounts(final ReadBackedPileup rbp, RBPCountTest expected) { for ( int cycles = 0; cycles < 3; cycles++ ) { // multiple cycles to make sure caching is working @@ -306,4 +305,24 @@ public class ReadBackedPileupUnitTest { Assert.assertEquals(rbp.getNumberOfMappingQualityZeroReads(), expected.nMapq0); } } + + @Test + public void testRBPMappingQuals() { + + // create a read with high MQ + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 10); + read.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read.setBaseQualities(Utils.dupBytes((byte) 30, 10)); + read.setCigarString("10M"); + read.setMappingQuality(200); // set a MQ higher than max signed byte + + // now create the RBP + final List elts = new LinkedList<>(); + elts.add(new PileupElement(read, 0, read.getCigar().getCigarElement(0), 0, 0)); + final Map pileupsBySample = new HashMap<>(); + pileupsBySample.put("foo", new ReadBackedPileupImpl(loc, elts)); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, 
pileupsBySample); + + Assert.assertEquals(pileup.getMappingQuals()[0], 200); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java index a2aec1b1e..68ba0b624 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialBAMBuilderUnitTest.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; -import org.apache.commons.collections.IteratorUtils; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.DataProvider; diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java index 051d0bcec..57020424c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVCFUtilsUnitTest.java @@ -25,6 +25,10 @@ package org.broadinstitute.sting.utils.variant; +import org.broad.tribble.index.DynamicIndexCreator; +import org.broad.tribble.index.IndexCreator; +import org.broad.tribble.index.interval.IntervalIndexCreator; +import org.broad.tribble.index.linear.LinearIndexCreator; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -35,8 +39,10 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; import 
java.util.Collections; import java.util.Set; @@ -83,4 +89,41 @@ public class GATKVCFUtilsUnitTest extends BaseTest { Assert.assertTrue(lines2.contains(line1)); Assert.assertTrue(lines2.contains(line2)); } + + private class IndexCreatorTest extends TestDataProvider { + private final GATKVCFIndexType type; + private final int parameter; + private final Class expectedClass; + private final int expectedDefaultBinSize; + private final int expectedBinSize; + + private IndexCreatorTest(GATKVCFIndexType type, int parameter, Class expectedClass, int expectedDefaultBinSize, int expectedBinSize) { + super(IndexCreatorTest.class); + + this.type = type; + this.parameter = parameter; + this.expectedClass = expectedClass; + this.expectedDefaultBinSize = expectedDefaultBinSize; + this.expectedBinSize = expectedBinSize; + } + } + + @DataProvider(name = "indexCreator") + public Object[][] indexCreatorData() { + new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0, DynamicIndexCreator.class, -1, -1); + new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0, DynamicIndexCreator.class, -1, -1); + new IndexCreatorTest(GATKVCFIndexType.LINEAR, 100, LinearIndexCreator.class, LinearIndexCreator.DEFAULT_BIN_WIDTH, 100); + new IndexCreatorTest(GATKVCFIndexType.INTERVAL, 200, IntervalIndexCreator.class, IntervalIndexCreator.DEFAULT_FEATURE_COUNT, 200); + + return IndexCreatorTest.getTests(IndexCreatorTest.class); + } + + @Test(dataProvider = "indexCreator") + public void testGetIndexCreator(IndexCreatorTest spec) { + File dummy = new File(""); + IndexCreator ic = GATKVCFUtils.getIndexCreator(spec.type, spec.parameter, dummy); + Assert.assertEquals(ic.getClass(), spec.expectedClass, "Wrong IndexCreator type"); + Assert.assertEquals(ic.defaultBinSize(), spec.expectedDefaultBinSize, "Wrong default bin size"); + Assert.assertEquals(ic.getBinSize(), spec.expectedBinSize, "Wrong bin size"); + } } \ No newline at end of file diff --git 
a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 937698d82..30f112241 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.vcf.VCFConstants; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -56,11 +57,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { ATCATC = Allele.create("ATCATC"); } - private Genotype makeG(String sample, Allele a1, Allele a2) { - return GenotypeBuilder.create(sample, Arrays.asList(a1, a2)); - } - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, double... pls) { + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); } @@ -107,7 +104,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { private MergeAllelesTest(List... 
arg) { super(MergeAllelesTest.class); - LinkedList> all = new LinkedList>(Arrays.asList(arg)); + LinkedList> all = new LinkedList<>(Arrays.asList(arg)); expected = all.pollLast(); inputs = all; } @@ -185,7 +182,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -243,7 +240,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false); Assert.assertEquals(merged.getID(), cfg.expected); } @@ -358,7 +355,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -485,7 +482,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = GATKVariantContextUtils.simpleMerge( cfg.inputs, cfg.priority, 
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -526,10 +523,10 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false); // test genotypes - Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); + Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); } // TODO: remove after testing @@ -540,7 +537,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { // // final VariantContext merged = VariantContextUtils.simpleMerge( // Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); // } // -------------------------------------------------------------------------------- @@ -559,7 +556,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, 
set, false, false, false); if ( annotate ) Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); @@ -570,7 +567,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } private static final List vcs2priority(final Collection vcs) { - final List priority = new ArrayList(); + final List priority = new ArrayList<>(); for ( final VariantContext vc : vcs ) { priority.add(vc.getSource()); @@ -997,7 +994,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "PrimitiveAlleleSplittingData") public Object[][] makePrimitiveAlleleSplittingData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); // no split tests.add(new Object[]{"A", "C", 0, null}); @@ -1039,6 +1036,26 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } + @Test(enabled = !DEBUG) + public void testFillInNonRefSymbolicAlleles() { + final int start = 10; + final String ref = "A"; + final String alt = "C"; + final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">")); + + List VCs = Arrays.asList(vcAlt, vcRef); + VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.emptyList()); + + // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele + for( final VariantContext vc : VCs ) { + Assert.assertTrue(vc.getAlternateAlleles().size() == 1); + Assert.assertTrue(vc.getAlternateAllele(0).isNonReference()); + Assert.assertTrue(!vc.getReference().isSymbolic()); + Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic()); + } + } + // -------------------------------------------------------------------------------- // // test allele remapping @@ -1047,7 +1064,7 @@ public class GATKVariantContextUtilsUnitTest 
extends BaseTest { @DataProvider(name = "AlleleRemappingData") public Object[][] makeAlleleRemappingData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final Allele originalBase1 = Allele.create((byte)'A'); final Allele originalBase2 = Allele.create((byte)'T'); @@ -1055,7 +1072,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { for ( final byte base1 : BaseUtils.BASES ) { for ( final byte base2 : BaseUtils.BASES ) { for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { - Map map = new HashMap(2); + Map map = new HashMap<>(2); map.put(originalBase1, Allele.create(base1)); map.put(originalBase2, Allele.create(base2)); @@ -1303,4 +1320,112 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } } + + // -------------------------------------------------------------------------------- + // + // Test updatePLsAndAD + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "updatePLsAndADData") + public Object[][] makeUpdatePLsAndADData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new 
GenotypeBuilder("NA12878").DP(10).GQ(100).make(); + + // make sure we don't screw up the simple case where no selection happens + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test cases + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, 
-40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + + final int[] homRef3AllelesAD = new int[]{20, 0, 1}; + final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; + final int[] homC3AllelesAD = new int[]{0, 20, 1}; + final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; + final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG + final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, 
-50}).AD(new int[]{10, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "updatePLsAndADData") + public void testUpdatePLsAndADData(final VariantContext originalVC, + final VariantContext selectedVC, + final List expectedGenotypes) { + final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); + final GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java index f4cef7730..f29a1106c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VCFIntegrationTest.java @@ -25,11 +25,24 @@ package org.broadinstitute.sting.utils.variant; +import 
org.broad.tribble.index.AbstractIndex; +import org.broad.tribble.index.ChrIndex; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.index.interval.IntervalTreeIndex; +import org.broad.tribble.index.linear.LinearIndex; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.variant.vcf.VCFCodec; +import org.testng.Assert; +import org.testng.TestException; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import java.lang.reflect.Field; import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; public class VCFIntegrationTest extends WalkerTest { @@ -141,4 +154,122 @@ public class VCFIntegrationTest extends WalkerTest { spec1.disableShadowBCF(); executeTest("Test reading VCF without header lines with additional args " + moreArgs, spec1); } + + // + // + // IndexCreator tests + // + // + + private class VCFIndexCreatorTest extends TestDataProvider { + private final GATKVCFIndexType type; + private final int parameter; + + private VCFIndexCreatorTest(GATKVCFIndexType type, int parameter) { + super(VCFIndexCreatorTest.class); + + this.type = type; + this.parameter = parameter; + } + + public String toString() { + return String.format("Index Type %s, Index Parameter %s", type, parameter); + } + + public Index getIndex(final File vcfFile) { + switch (type) { + case DYNAMIC_SEEK : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + case DYNAMIC_SIZE : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SIZE); + case LINEAR : return IndexFactory.createLinearIndex(vcfFile, new VCFCodec(), parameter); + case INTERVAL : return IndexFactory.createIntervalIndex(vcfFile, new VCFCodec(), parameter); + default : throw new TestException("Invalid index type"); + } + } + } + + @DataProvider(name 
= "IndexDataProvider") + public Object[][] indexCreatorData() { + new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0); + new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0); + new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 100); + new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 10000); + new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 20); + new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 2000); + + return TestDataProvider.getTests(VCFIndexCreatorTest.class); + } + + @Test(dataProvider = "IndexDataProvider") + public void testVCFIndexCreation(VCFIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException { + + final String commandLine = " -T SelectVariants" + + " -R " + b37KGReference + + " --no_cmdline_in_header" + + " -L 20" + + " -V " + b37_NA12878_OMNI + + " --variant_index_type " + testSpec.type + + " --variant_index_parameter " + testSpec.parameter + + " -o %s "; + final String name = "testVCFIndexCreation: " + testSpec.toString(); + + final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("")); + spec.disableShadowBCF(); + + File outVCF = executeTest(name, spec).first.get(0); + File outIdx = new File(outVCF.getAbsolutePath() + ".idx"); + + final Index actualIndex = IndexFactory.loadIndex(outIdx.getAbsolutePath()); + final Index expectedIndex = testSpec.getIndex(outVCF); + + if (testSpec.type == GATKVCFIndexType.LINEAR) // type is an enum: comparing it to a String via equals() is always false + Assert.assertTrue(actualIndex instanceof LinearIndex, "Index is not a LinearIndex"); + else if (testSpec.type == GATKVCFIndexType.INTERVAL) + Assert.assertTrue(actualIndex instanceof IntervalTreeIndex, "Index is not a IntervalTreeIndex"); + // dynamic indices ultimately resolve to one of LinearIndex or IntervalTreeIndex + + Assert.assertTrue(equivalentAbstractIndices((AbstractIndex)actualIndex, (AbstractIndex)expectedIndex), "Indices are not equivalent"); + + if (actualIndex instanceof LinearIndex && expectedIndex instanceof LinearIndex) { + 
Assert.assertTrue(equivalentLinearIndices((LinearIndex)actualIndex, (LinearIndex)expectedIndex, "20"), "Linear indices are not equivalent"); + } + else if (actualIndex instanceof IntervalTreeIndex && expectedIndex instanceof IntervalTreeIndex) { + Assert.assertTrue(equivalentIntervalIndices((IntervalTreeIndex)actualIndex, (IntervalTreeIndex)expectedIndex, "20"), "Interval indices are not equivalent"); + } + else { + Assert.fail("Indices are not of the same type"); + } + } + + private static boolean equivalentAbstractIndices(AbstractIndex thisIndex, AbstractIndex otherIndex){ + return thisIndex.getVersion() == otherIndex.getVersion() && + thisIndex.getIndexedFile().equals(otherIndex.getIndexedFile()) && + thisIndex.getIndexedFileSize() == otherIndex.getIndexedFileSize() && + thisIndex.getIndexedFileMD5().equals(otherIndex.getIndexedFileMD5()) && + thisIndex.getFlags() == otherIndex.getFlags(); + } + + private static boolean equivalentLinearIndices(LinearIndex thisIndex, LinearIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { + org.broad.tribble.index.linear.LinearIndex.ChrIndex thisChr = (org.broad.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(thisIndex, chr); + org.broad.tribble.index.linear.LinearIndex.ChrIndex otherChr = (org.broad.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(otherIndex, chr); + + return thisChr.getName().equals(otherChr.getName()) && + //thisChr.getTotalSize() == otherChr.getTotalSize() && TODO: why does this differ? 
+ thisChr.getNFeatures() == otherChr.getNFeatures() && + thisChr.getNBlocks() == otherChr.getNBlocks(); + } + + private static boolean equivalentIntervalIndices(IntervalTreeIndex thisIndex, IntervalTreeIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { + org.broad.tribble.index.interval.IntervalTreeIndex.ChrIndex thisChr = (org.broad.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(thisIndex, chr); + org.broad.tribble.index.interval.IntervalTreeIndex.ChrIndex otherChr = (org.broad.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(otherIndex, chr); + + // TODO: compare trees? + return thisChr.getName().equals(otherChr.getName()); + } + + private static ChrIndex getChrIndex(AbstractIndex index, String chr) throws NoSuchFieldException, IllegalAccessException { + Field f = AbstractIndex.class.getDeclaredField("chrIndices"); + f.setAccessible(true); + LinkedHashMap chrIndices = (LinkedHashMap) f.get(index); + return chrIndices.get(chr); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java index 51a47d86d..a1b75a3f1 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -105,7 +105,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { public void run(T vc); } - private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { + private void runBenchmark(FeatureCodec codec, FunctionToBenchmark func) { // TODO -- update for new Tribble interface // try { // InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes()); @@ -129,7 +129,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { public void timeV14(int rep) { for ( int i = 0; i < rep; i++ ) { FunctionToBenchmark func = getV14FunctionToBenchmark(); 
- FeatureCodec codec = new VCFCodec(); + final VCFCodec codec = new VCFCodec(); runBenchmark(codec, func); } } @@ -147,7 +147,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { Set samples; public void run(final VariantContext vc) { if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); VariantContext sub = vc.subContextFromSamples(samples); sub.getNSamples(); } @@ -176,7 +176,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { Set samples; public void run(final VariantContext vc) { if ( samples == null ) - samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); + samples = new HashSet<>(new ArrayList<>(vc.getSampleNames()).subList(0, nSamplesToTake)); vc.getGenotypes(samples).size(); } }; @@ -221,7 +221,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { case MERGE: return new FunctionToBenchmark() { public void run(final VariantContext vc) { - List toMerge = new ArrayList(); + List toMerge = new ArrayList<>(); for ( int i = 0; i < dupsToMerge; i++ ) { GenotypesContext gc = GenotypesContext.create(vc.getNSamples()); @@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { GATKVariantContextUtils.simpleMerge(toMerge, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true); + true, false, "set", false, true, false); } }; @@ -363,7 +363,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { // toMerge, null, // org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, // org.broadinstitute.variant.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED, -// true, false, "set", false, true); +// true, false, "set", false, true, false); 
// } // }; // diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl index ba4198292..a942145d7 100755 --- a/public/perl/liftOverVCF.pl +++ b/public/perl/liftOverVCF.pl @@ -36,7 +36,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -V:variant $in -o $unsorted_vcf -chain $chain -dict $newRef.dict -U LENIENT_VCF_PROCESSING"; if ($recordOriginalLocation) { $cmd .= " -recordOriginalLocation"; } @@ -66,7 +66,7 @@ system($cmd) == 0 or quit("The sorting step failed. Please correct the necessar # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -V:variant $sorted_vcf -o $out -U LENIENT_VCF_PROCESSING"; system($cmd) == 0 or quit("The filtering step failed. 
Please correct the necessary errors before retrying."); # clean up diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 1736adc17..d0c917a9e 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -95,6 +95,7 @@ class GATKResourcesBundle extends QScript { def isBAM(file: File) = file.getName.endsWith(".bam") def isOUT(file: File) = file.getName.endsWith(".out") def isFASTA(file: File) = file.getName.endsWith(".fasta") + def isIntervalList(file: File) = file.getName.endsWith(".interval_list") var RESOURCES: List[Resource] = Nil def addResource(comp: Resource) { RESOURCES = comp :: RESOURCES } @@ -148,8 +149,8 @@ class GATKResourcesBundle extends QScript { // // standard VCF files. Will be lifted to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_137_b37.leftAligned.vcf", - "dbsnp_137", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_138_b37.leftAligned.vcf", + "dbsnp_138", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", "1000G_omni2.5", b37, true, false)) @@ -167,11 +168,18 @@ class GATKResourcesBundle extends QScript { "Mills_and_1000G_gold_standard.indels", b37, true, false)) // - // CEU trio (NA12878,NA12891,NA12892) best practices results (including PBT) + // CEU trio (NA12878,NA12891,NA12892) best practices results // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/callsets/CEUtrio_BestPractices/CEUTrio.HiSeq.WGS.b37.snps_and_indels.recalibrated.filtered.phased.CURRENT.vcf", - 
"CEUTrio.HiSeq.WGS.b37.bestPractices.phased",b37,true,false)) + addResource(new Resource("/humgen/1kg/processing/production_wgs_final/trio/CEU/CEU.wgs.HaplotypeCaller.20131118.snps_indels.high_coverage_pcr_free.genotypes.vcf", + "CEUTrio.HiSeq.WGS.b37.bestPractices",b37,true,false)) + + // + // NA12878 knowledgebase snapshot + // + + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/knowledgeBase/snapshots/NA12878.wgs.broad_truth_set.20131119.snps_and_indels.genotypes.vcf", + "NA12878.knowledgebase.snapshot.20131119",b37,true,false)) // // example call set for documentation guide tutorial @@ -180,11 +188,17 @@ class GATKResourcesBundle extends QScript { "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // - // Test BAM file, specific to each reference + // Test BAM file, only for the b37 reference // addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.NA12878.bam", "IGNORE", b37, false, false)) + // + // Exome targets file, only for the b37 reference + // + addResource(new Resource("/seq/references/HybSelOligos/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", + "Broad.human.exome", b37, true, false, false)) + // // refGene files specific to each reference // @@ -217,7 +231,7 @@ class GATKResourcesBundle extends QScript { val currentLink = new File(BUNDLE_ROOT + "/current") - if ( currentLink.exists ) currentLink.delete() + if ( currentLink.exists ) add(new deleteLink(currentLink)) add(new linkFile(bundleDir, currentLink)) } @@ -275,14 +289,16 @@ class GATKResourcesBundle extends QScript { } } } + } else if ( isIntervalList(resource.file) ) { + val out = destFile(BUNDLE_DIR, resource.ref, resource.destname(resource.ref)) + add(new cpFile(resource.file, out)) } else { //throw new ReviewedStingException("Unknown file type: " + resource) } } - createCurrentLink(BUNDLE_DIR) - } else { + 
createCurrentLink(BUNDLE_DIR) createBundleDirectories(DOWNLOAD_DIR) createDownloadsFromBundle(BUNDLE_DIR, DOWNLOAD_DIR) } @@ -354,6 +370,10 @@ class GATKResourcesBundle extends QScript { def commandLine = "cp %s %s".format(in.getAbsolutePath, out.getAbsolutePath) } + class deleteLink(@Input val in: File) extends CommandLineFunction { + def commandLine = "rm %s".format(in.getAbsolutePath) + } + class linkFile(@Input val in: File, @Output val out: File) extends CommandLineFunction { def commandLine = "ln -s %s %s".format(in.getAbsolutePath, out.getAbsolutePath) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 1c48e3bc7..fc1d4599e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -105,7 +105,7 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon this.functions ++= functions } - def addAll(functions: Seq[QFunction]) { + def addAll(functions: Traversable[QFunction]) { functions.foreach( f => add(f) ) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala index 37c4a5bbe..80dd53302 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala @@ -33,10 +33,10 @@ import java.io.File import scala.tools.nsc.reporters.AbstractReporter import java.lang.String import org.apache.log4j.Level -import scala.tools.nsc.util.{FakePos, NoPosition, Position} import org.broadinstitute.sting.queue.util.TextFormatUtils._ import org.broadinstitute.sting.utils.classloader.JVMUtils -import tools.util.StringOps +import scala.reflect.internal.util.{FakePos, NoPosition, Position, StringOps} +import org.broadinstitute.sting.utils.exceptions.UserException /** * Plugin manager for 
QScripts which loads QScripts into the current class loader. @@ -47,13 +47,21 @@ class QScriptManager() extends Logging { * Heavily based on scala/src/compiler/scala/tools/ant/Scalac.scala */ def loadScripts(scripts: Seq[File], tempDir: File) { + // Make sure the scripts actually exist. + scripts.foreach{ + file => if( !file.exists()) throw new UserException.CouldNotReadInputFile(file, "it does not exist.") + } + if (scripts.size > 0) { val settings = new Settings((error: String) => logger.error(error)) settings.deprecation.value = true settings.outdir.value = tempDir.getPath // Set the classpath to the current class path. - JVMUtils.getClasspathURLs.foreach(url => settings.classpath.append(url.getPath)) + JVMUtils.getClasspathURLs.foreach(url => { + settings.bootclasspath.append(url.getPath) + settings.classpath.append(url.getPath) + }) val reporter = new QScriptManager.Log4JReporter(settings) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala index 9cfd69247..b405c91a2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/drmaa/DrmaaJobRunner.scala @@ -50,7 +50,7 @@ class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) ex session.synchronized { val drmaaJob: JobTemplate = session.createJobTemplate - drmaaJob.setJobName(function.description.take(jobNameLength).replaceAll(jobNameFilter, "_")) + drmaaJob.setJobName(function.jobRunnerJobName.take(jobNameLength).replaceAll(jobNameFilter, "_")) // Set the current working directory drmaaJob.setWorkingDirectory(function.commandDirectory.getPath) @@ -160,7 +160,7 @@ class DrmaaJobRunner(val session: Session, val function: CommandLineFunction) ex // resource of the designated queue to SIGTERM session.control(jobId, Session.TERMINATE) } catch { - case e => + case e: 
Exception => logger.error("Unable to kill job " + jobId, e) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala index 1140c4945..e9f141880 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/lsf/Lsf706JobRunner.scala @@ -71,7 +71,7 @@ class Lsf706JobRunner(val function: CommandLineFunction) extends CommandLineJobR for (i <- 0 until LibLsf.LSF_RLIM_NLIMITS) request.rLimits(i) = LibLsf.DEFAULT_RLIMIT; - request.jobName = function.description.take(LibBat.MAX_JOB_NAME_LEN) + request.jobName = function.jobRunnerJobName.take(LibBat.MAX_JOB_NAME_LEN) request.options |= LibBat.SUB_JOB_NAME // Set the output file for stdout @@ -361,7 +361,7 @@ object Lsf706JobRunner extends Logging { if (LibBat.lsb_signaljob(runner.jobId, SIGTERM) < 0) logger.error(LibBat.lsb_sperror("Unable to kill job " + runner.jobId)) } catch { - case e => + case e: Exception=> logger.error("Unable to kill job " + runner.jobId, e) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala index 13b3c7cb3..e3528f54f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/shell/ShellJobRunner.scala @@ -83,8 +83,8 @@ class ShellJobRunner(val function: CommandLineFunction) extends CommandLineJobRu try { controller.tryDestroy() } catch { - case e => - logger.error("Unable to kill shell job: " + function.description) + case e: Exception => + logger.error("Unable to kill shell job: " + function.description, e) } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala 
b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala index 30fd4c81f..940d98860 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/CatVariantsGatherer.scala @@ -48,6 +48,9 @@ class CatVariantsGatherer extends CatVariants with GatherFunction with RetryMemo this.variant = this.gatherParts.zipWithIndex map { case (input, index) => new TaggedFile(input, "input"+index) } this.outputFile = this.originalOutput this.assumeSorted = true + this.variant_index_type = originalGATK.variant_index_type + this.variant_index_parameter = originalGATK.variant_index_parameter + super.freezeFieldValues() } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala index 92d71536e..3fd9a3c6d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala @@ -38,6 +38,7 @@ import net.sf.picard.analysis.MetricAccumulationLevel */ class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction { analysisName = "CalculateHsMetrics" + javaMainClass = "net.sf.picard.analysis.directed.CalculateHsMetrics" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala index 7c4c3f26a..14c10c581 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala @@ -37,6 +37,7 @@ import java.io.File */ class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction { analysisName = "CollectGcBiasMetrics" + javaMainClass = "net.sf.picard.analysis.CollectGcBiasMetrics" @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala index cac436906..3c122ee60 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala @@ -37,6 +37,7 @@ import java.io.File */ class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardMetricsFunction{ analysisName = "CollectMultipleMetrics" + javaMainClass = "net.sf.picard.analysis.CollectMultipleMetrics" @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) var input: Seq[File] = Nil diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala index 1f3df2dfa..e09e0c100 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala @@ -81,6 +81,9 @@ class SamToFastq extends org.broadinstitute.sting.queue.function.JavaCommandLine @Argument(shortName = "inpa", fullName = "include_non_primary_alignments", required = false, doc = "If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.") var includeNonPrimaryAlignments: Boolean = false + @Argument(shortName = "il", fullName = "interleave", required = false, doc = "Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe which end it came from") + var interleave: Boolean = false + override def inputBams = input override def outputBam = null this.sortOrder = null @@ -98,5 +101,7 @@ class SamToFastq extends org.broadinstitute.sting.queue.function.JavaCommandLine conditional(readOneMaxBasesToWrite >= 0, "READ1_MAX_BASES_TO_WRITE=" + readOneMaxBasesToWrite) + conditional(readTwoTrim >= 0, "READ2_TRIM=" + readTwoTrim) + conditional(readTwoMaxBasesToWrite >= 0, "READ2_MAX_BASES_TO_WRITE=" + readTwoMaxBasesToWrite) + - conditional(includeNonPrimaryAlignments, "INCLUDE_NON_PRIMARY_ALIGNMENTS=" + includeNonPrimaryAlignments) + conditional(includeNonPrimaryAlignments, "INCLUDE_NON_PRIMARY_ALIGNMENTS=" + includeNonPrimaryAlignments) + + conditional(interleave, "INTERLEAVE=" + interleave) + } \ No newline at end of file diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 81c76dd29..3afd289af 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -32,6 +32,7 @@ import org.broadinstitute.sting.queue.{QException, QSettings} import java.lang.IllegalStateException import org.broadinstitute.sting.queue.util._ import org.broadinstitute.sting.utils.io.IOUtils +import scala.language.reflectiveCalls /** * The base interface for all functions in Queue. @@ -149,6 +150,11 @@ trait QFunction extends Logging with QJobReport { case _ => analysisName } } + + /** + * The name of the job as submitted to the job runner + */ + def jobRunnerJobName = shortDescription /** * Returns true if the function is done. diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala index 6dcc69854..ddff95f21 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala @@ -39,7 +39,7 @@ class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extend @Argument(doc="The samples to extract from the VCF") var extractSamples : List[String] = samples var out : PrintWriter = _ - var columns : List[Int] = 0 to 8 toList + var columns : List[Int] = (0 to 8).toList def run = { out = new PrintWriter(new PrintStream(outputVCF)) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala index 4d3bf719c..96a5973be 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala +++ 
b/public/scala/src/org/broadinstitute/sting/queue/util/EmailMessage.scala @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.util import org.apache.commons.mail.{MultiPartEmail, EmailAttachment} -import java.io.{FileReader, File} +import java.io.{IOException, FileReader, File} import javax.mail.internet.InternetAddress import scala.collection.JavaConversions._ @@ -105,7 +105,7 @@ class EmailMessage extends Logging { try { Retry.attempt(() => send(settings), .5) } catch { - case e => logger.error("Error sending message: %n%s".format(this.toString), e) + case e: RetryException=> logger.error("Error sending message: %n%s".format(this.toString), e) } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala index 4acd27497..cb7b95b76 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/PrimitiveOptionConversions.scala @@ -25,6 +25,8 @@ package org.broadinstitute.sting.queue.util +import scala.language.implicitConversions + /** * An importable object that provides automatic primitive to option conversion. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 0b6fc44d9..b078bcd4f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -73,14 +73,29 @@ object QScriptUtils { /** * Check if there are multiple samples in a BAM file */ - def hasMultipleSamples(readGroups: java.util.List[SAMReadGroupRecord]): Boolean = { + def hasMultipleSamples(readGroups: Seq[SAMReadGroupRecord]): Boolean = { var sample: String = "" for (r <- readGroups) { if (sample.isEmpty) sample = r.getSample else if (sample != r.getSample) - return true; + return true } false } + + /** + * Returns all distinct samples in the BAM file + * + * @param bam the bam file + * @return a set with all distinct samples (in no particular order) + */ + def getSamplesFromBAM(bam: File) : Set[String] = { + val reader = new SAMFileReader(bam) + var samples: Set[String] = Set() + for (rg <- reader.getFileHeader.getReadGroups) { + samples += rg.getSample + } + samples + } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala b/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala index b112ed9a3..5b9e42a1e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/Retry.scala @@ -49,7 +49,7 @@ object Retry extends Logging { result = f() success = true } catch { - case e => { + case e: Exception=> { count += 1 if (count < tries) { val minutes = wait(count-1) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala index 35f872848..ff99cb346 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala +++ 
b/public/scala/src/org/broadinstitute/sting/queue/util/StringFileConversions.scala @@ -26,6 +26,7 @@ package org.broadinstitute.sting.queue.util import java.io.{Serializable, File} +import scala.language.implicitConversions /** * Converts String to/from File @@ -77,21 +78,6 @@ object StringFileConversions { }) } - implicit def stringsAsFiles(x: Set[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable] with Serializable]): Set[File] = { - x.map(_ match { - case string: String => stringAsFile(string) - case file: File => file - case null => null - }) - } - - implicit def filesAsStrings(x: Set[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable] with Serializable]): Set[String] = { - x.map(_ match { - case file: File => fileAsString(file) - case string: String => string - case null => null - }) - } } /** @@ -124,11 +110,4 @@ trait StringFileConversions { StringFileConversions.filesAsStringsList(x) } - implicit def stringsAsFiles(x: Set[Comparable[_ >: File with String <: Comparable[_ >: File with String <: Comparable[_ >: File with String <: Serializable] with Serializable] with Serializable] with Serializable]): Set[File] = { - StringFileConversions.stringsAsFiles(x) - } - - implicit def filesAsStrings(x: Set[Comparable[_ >: String with File <: Comparable[_ >: String with File <: Comparable[_ >: String with File <: Serializable] with Serializable] with Serializable] with Serializable]): Set[String] = { - StringFileConversions.filesAsStrings(x) - } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala index 73c4ccf64..5802faff2 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala +++ 
b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala @@ -31,14 +31,12 @@ import scala.io.Source._ import net.sf.samtools.SAMFileReader import org.broadinstitute.variant.vcf.{VCFHeader, VCFCodec} import scala.collection.JavaConversions._ -import org.broad.tribble.{FeatureCodec, AbstractFeatureReader} -import org.broadinstitute.variant.variantcontext.VariantContext +import org.broad.tribble.AbstractFeatureReader object VCF_BAM_utilities { def getSamplesFromVCF(vcfFile: File): List[String] = { - val codec: FeatureCodec[VariantContext] = new VCFCodec().asInstanceOf[FeatureCodec[VariantContext]] - AbstractFeatureReader.getFeatureReader(vcfFile.getPath, codec).getHeader.asInstanceOf[VCFHeader].getGenotypeSamples.toList + AbstractFeatureReader.getFeatureReader(vcfFile.getPath, new VCFCodec()).getHeader.asInstanceOf[VCFHeader].getGenotypeSamples.toList } def getSamplesInBAM(bam: File): List[String] = { diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala index e9a288117..251b1c511 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PipelineTest.scala @@ -182,7 +182,7 @@ object PipelineTest extends BaseTest with Logging { println("Executing test %s with Queue arguments: %s".format(name, Utils.join(" ",command))) CommandLineProgram.start(instance, command) } catch { - case e => + case e: Exception => gotAnException = true if (expectedException != null) { // we expect an exception @@ -224,7 +224,7 @@ object PipelineTest extends BaseTest with Logging { try { commandLine.shutdown() } catch { - case _ => /* ignore */ + case _: Throwable => /* ignore */ }) } }) diff --git a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala 
b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala index 22a7a8a04..5ee02b8bc 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/util/StringFileConversionsUnitTest.scala @@ -87,44 +87,6 @@ class StringFileConversionsUnitTest { Assert.assertEquals(strings, Seq(null, null)) } - @Test - def testStringToFileSet() { - var files = Set(new File("foo")) - files += "bar" - Assert.assertEquals(files, Set(new File("foo"), new File("bar"))) - - files = Set(new File("foo")) - files += null.asInstanceOf[String] - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files += "foo" - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files += null.asInstanceOf[String] - Assert.assertEquals(files, Set(null)) - } - - @Test - def testFileToStringSet() { - var strings = Set("foo") - strings += new File("bar") - Assert.assertEquals(strings, Set("foo", "bar")) - - strings = Set("foo") - strings += null.asInstanceOf[File] - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings += new File("foo") - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings += null.asInstanceOf[File] - Assert.assertEquals(strings, Set(null)) - } - @Test def testStringListToFileList() { var files = Seq(new File("foo")) @@ -163,41 +125,4 @@ class StringFileConversionsUnitTest { Assert.assertEquals(strings, Seq(null, null)) } - @Test - def testStringSetToFileSet() { - var files = Set(new File("foo")) - files ++= Set("bar") - Assert.assertEquals(files, Set(new File("foo"), new File("bar"))) - - files = Set(new File("foo")) - files ++= Set[String](null) - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = Set[File](null) - files ++= Set("foo") - Assert.assertEquals(files, Set(new File("foo"), null)) - - files = 
Set[File](null) - files ++= Set[String](null) - Assert.assertEquals(files, Set(null)) - } - - @Test - def testFileSetToStringSet() { - var strings = Set("foo") - strings ++= Set(new File("bar")) - Assert.assertEquals(strings, Set("foo", "bar")) - - strings = Set("foo") - strings ++= Set[File](null) - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings ++= Set(new File("foo")) - Assert.assertEquals(strings, Set("foo", null)) - - strings = Set[String](null) - strings ++= Set[File](null) - Assert.assertEquals(strings, Set(null)) - } } diff --git a/settings/helpTemplates/common.html b/settings/helpTemplates/common.html index 677fdf861..f4fb74af1 100644 --- a/settings/helpTemplates/common.html +++ b/settings/helpTemplates/common.html @@ -58,7 +58,7 @@ @@ -82,7 +82,7 @@

See also Guide Index | - Technical Documentation Index | + Tool Documentation Index | Support Forum

diff --git a/settings/helpTemplates/generic.index.template.html b/settings/helpTemplates/generic.index.template.html index b3e3d0212..a5650d55e 100644 --- a/settings/helpTemplates/generic.index.template.html +++ b/settings/helpTemplates/generic.index.template.html @@ -53,8 +53,8 @@ -<@makeHeader title="Technical Documentation Index" isIndex=true /> -

Technical Documentation Index +<@makeHeader title="GATK | Tool Documentation Index" isIndex=true /> +

Tool Documentation Index ${version}

diff --git a/settings/helpTemplates/generic.template.html b/settings/helpTemplates/generic.template.html index b05ad65c0..eea741669 100644 --- a/settings/helpTemplates/generic.template.html +++ b/settings/helpTemplates/generic.template.html @@ -88,7 +88,7 @@ -<@makeHeader title="${name} documentation" isIndex=false /> +<@makeHeader title="GATK | ${name} documentation" isIndex=false />