From 94800771e3c48e39fcef5280e5c31919031e8066 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 15 Jan 2013 10:19:18 -0500 Subject: [PATCH] 1. Initial implementation of bam writing for the HaplotypeCaller with -bam argument; currently only assembled haplotypes are emitted. 2. Framework is set up in the VariantAnnotator for the HaplotypeCaller to be able to call in to annotate dbSNP plus comp RODs. Until the HC uses meta data though, this won't work. --- .../annotator/VariantAnnotatorEngine.java | 27 ++++++--- .../haplotypecaller/HaplotypeCaller.java | 57 ++++++++++++++++++- .../HaplotypeCallerIntegrationTest.java | 5 ++ .../broadinstitute/sting/utils/Haplotype.java | 12 ++++ 4 files changed, 92 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 99dadea54..f03a25c04 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -52,6 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -214,10 +215,10 @@ public class VariantAnnotatorEngine { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences - vc = annotateDBs(tracker, ref, vc, infoAnnotations); + vc = annotateDBs(tracker, ref.getLocus(), vc, infoAnnotations); // annotate expressions where available - annotateExpressions(tracker, ref, infoAnnotations); + annotateExpressions(tracker, ref.getLocus(), infoAnnotations); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { @@ -254,10 +255,22 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(null, null, null, vc, perReadAlleleLikelihoodMap)).make(); } - private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { + public VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc) { + final Map newInfoAnnotations = new HashMap(0); + vc = annotateDBs(tracker, loc, vc, newInfoAnnotations); + + if ( !newInfoAnnotations.isEmpty() ) { + final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(newInfoAnnotations); + vc = builder.make(); + } + + return vc; + } + + private VariantContext annotateDBs(final RefMetaDataTracker tracker, final GenomeLoc loc, VariantContext vc, final Map infoAnnotations) { for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) { - final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType()); + final String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), loc), vc.getType()); // add the ID if appropriate if ( rsID != null ) { @@ -273,7 +286,7 @@ public class VariantAnnotatorEngine { } } else { boolean overlapsComp = false; - for ( VariantContext comp : tracker.getValues(dbSet.getKey(), ref.getLocus()) ) { + for ( VariantContext comp : tracker.getValues(dbSet.getKey(), loc) ) { if ( !comp.isFiltered() && ( !requireStrictAlleleMatch || comp.getAlleles().equals(vc.getAlleles()) ) ) { overlapsComp = true; break; @@ -287,9 +300,9 @@ public class VariantAnnotatorEngine { return vc; } - private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { + private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { for ( VAExpression expression : requestedExpressions ) { - Collection VCs = tracker.getValues(expression.binding, ref.getLocus()); + Collection VCs = tracker.getValues(expression.binding, loc); if ( VCs.size() == 0 ) continue; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 439a9b3b8..00db62bff 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -47,6 +47,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; +import com.sun.corba.se.impl.logging.UtilSystemException; +import net.sf.samtools.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -57,6 +59,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -67,6 +70,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -142,6 +146,17 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) protected PrintStream graphWriter = null; + /** + * The assembled haplotypes will be written as BAM to this file if requested. Really for debugging purposes only. Note that the output here + * does not include uninformative reads so that not every input read is emitted to the bam. + */ + @Hidden + @Output(fullName="bamOutput", shortName="bam", doc="File to which assembled haplotypes should be written", required = false) + protected StingSAMFileWriter bamWriter = null; + private SAMFileHeader bamHeader = null; + private long uniqueNameCounter = 1; + private final String readGroupId = "ArtificialHaplotype"; + /** * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. */ @@ -242,6 +257,8 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // the genotyping engine private GenotypingEngine genotypingEngine = null; + private VariantAnnotatorEngine annotationEngine = null; + // fasta reference reader to supplement the edges of the reference sequence private CachingIndexedFastaSequenceFile referenceReader; @@ -286,7 +303,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); // initialize the output VCF header - final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); Set headerInfo = new HashSet(); @@ -320,6 +337,21 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter, minKmer ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); genotypingEngine = new GenotypingEngine( DEBUG, annotationEngine, USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ); + + if ( bamWriter != null ) { + // prepare the bam header + bamHeader = new SAMFileHeader(); + bamHeader.setSequenceDictionary(getToolkit().getSAMFileHeader().getSequenceDictionary()); + final List readGroups = new ArrayList(1); + final SAMReadGroupRecord rg = new SAMReadGroupRecord(readGroupId); + rg.setSample("HC"); + rg.setSequencingCenter("BI"); + readGroups.add(rg); + bamHeader.setReadGroups(readGroups); + bamHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); + bamWriter.writeHeader(bamHeader); + bamWriter.setPresorted(true); + } } //--------------------------------------------------------------------------------------------------------------- @@ -408,7 +440,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem //--------------------------------------------------------------------------------------------------------------- @Override - public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { + public Integer map( final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) { if ( justDetermineActiveRegions ) // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; @@ -461,9 +493,30 @@ public class HaplotypeCaller extends ActiveRegionWalker implem activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { + annotationEngine.annotateDBs(metaDataTracker, getToolkit().getGenomeLocParser().createGenomeLoc(call), call); vcfWriter.add( call ); } + if ( bamWriter != null ) { + Collections.sort( haplotypes, new Haplotype.HaplotypePositionComparator() ); + final GenomeLoc paddedRefLoc = getPaddedLoc(activeRegion); + for ( Haplotype haplotype : haplotypes ) { + // TODO -- clean up this code + final GATKSAMRecord record = new GATKSAMRecord(bamHeader); + record.setReadBases(haplotype.getBases()); + record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef()); + record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length)); + record.setCigar(haplotype.getCigar()); + record.setMappingQuality(bestHaplotypes.contains(haplotype) ? 60 : 0); + record.setReadName("HC" + uniqueNameCounter++); + record.setReadUnmappedFlag(false); + record.setReferenceIndex(activeRegion.getReferenceLoc().getContigIndex()); + record.setAttribute(SAMTag.RG.toString(), readGroupId); + record.setFlags(16); + bamWriter.addAlignment(record); + } + } + if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); } return 1; // One active region was processed during this map call diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 8f5e275e6..e39975ea0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -75,6 +75,11 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } + @Test(enabled = false) + public void testHaplotypeCallerSingleSampleWithDbsnp() { + HCTest(NA12878_BAM, "-D " + b37dbSNP132, ""); + } + @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index efe9460cb..2706f2f99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -191,6 +191,10 @@ public class Haplotype { public static class HaplotypeBaseComparator implements Comparator, Serializable { @Override public int compare( final Haplotype hap1, final Haplotype hap2 ) { + return compareHaplotypeBases(hap1, hap2); + } + + public static int compareHaplotypeBases(final Haplotype hap1, final Haplotype hap2) { final byte[] arr1 = hap1.getBases(); final byte[] arr2 = hap2.getBases(); // compares byte arrays using lexical ordering @@ -203,6 +207,14 @@ public class Haplotype { } } + public static class HaplotypePositionComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final int comp = hap1.getAlignmentStartHapwrtRef() - hap2.getAlignmentStartHapwrtRef(); + return comp == 0 ? HaplotypeBaseComparator.compareHaplotypeBases(hap1, hap2) : comp; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref,