From 12c0de61702ceb8c7d588e2e12867aade78f01f1 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 30 Jun 2010 01:20:56 +0000 Subject: [PATCH] Added ability to clean using only known indels. Added integration test for it. Fixed vcf->vc conversion for indels which was busted. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3678 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/refdata/VariantContextAdaptors.java | 7 +- .../gatk/walkers/indels/IndelRealigner.java | 212 ++++++++++-------- .../indels/IndelRealignerIntegrationTest.java | 18 +- 3 files changed, 131 insertions(+), 106 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 84da1493a..dace443ad 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -13,10 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.CalledGenotype; -import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter; -import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall; -import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import java.util.*; @@ -218,13 +215,13 @@ public class VariantContextAdaptors { } else if ( !vcf.isIndel() ) { refAllele = Allele.create(ref.getBase(), true); } else if ( vcf.isDeletion() ) { - int start = (int)(ref.getLocus().getStart() - ref.getWindow().getStart() + 1); + int start = vcf.getPosition() - (int)ref.getWindow().getStart() + 1; int delLength = 0; for ( VCFGenotypeEncoding enc : vcf.getAlternateAlleles() ) { if ( enc.getLength() > delLength ) delLength = enc.getLength(); } - if ( delLength > ref.getWindow().getStop() - ref.getLocus().getStop() ) + if ( delLength > ref.getWindow().getStop() - vcf.getPosition() ) throw new IllegalArgumentException("Length of deletion is larger than reference context provided at " + ref.getLocus()); refAllele = deletionAllele(ref, start, delLength); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 2ae7e39d9..5b10eee1d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -34,6 +34,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.text.TextFormattingUtils; @@ -51,6 +53,7 @@ import java.util.*; * Unlike most mappers, this walker uses the full alignment context to determine whether an * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. */ +@Reference(window=@Window(start=-1,stop=150)) public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_CIGAR_TAG = "OC"; @@ -71,6 +74,9 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]") protected Integer compressionLevel = 5; + @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", required=false, doc="Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references.") + protected boolean USE_KNOWN_INDELS_ONLY = false; + @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false) protected int MAX_RECORDS_IN_RAM = 500000; @@ -102,10 +108,10 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="realignReadsWithBadMates", required=false, doc="Should we try to realign paired-end reads whose mates map to other chromosomes?") protected boolean REALIGN_BADLY_MATED_READS = false; - @Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") + @Argument(fullName="noPGTag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") protected boolean NO_PG_TAG = false; - @Argument(fullName="no_original_alignment_tags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.") + @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.") protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.") @@ -260,7 +266,7 @@ public class IndelRealigner extends ReadWalker { } else { readsToClean.add(read, ref.getBases()); // add the rods to the list of known variants - populateKnownIndels(metaDataTracker, null); + populateKnownIndels(metaDataTracker, ref); } if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) { @@ -403,108 +409,23 @@ public class IndelRealigner extends ReadWalker { final ArrayList altReads = new ArrayList(); // reads that don't perfectly match final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? final Set altConsenses = new LinkedHashSet(); // list of alt consenses - long totalRawMismatchSum = 0; - // if there are any known indels for this region, get them - for ( VariantContext knownIndel : knownIndelsToTry.values() ) { - if ( knownIndel == null || !knownIndel.isIndel() ) - continue; - byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); - Consensus c = createAlternateConsensus((int)(knownIndel.getLocation().getStart() - leftmostIndex), reference, indelStr, knownIndel.isDeletion()); - if ( c != null ) - altConsenses.add(c); - } + // if there are any known indels for this region, get them and create alternate consenses + generateAlternateConsensesFromKnownIndels(altConsenses, leftmostIndex, reference); - // decide which reads potentially need to be cleaned - for ( final SAMRecord read : reads ) { + // decide which reads potentially need to be cleaned; + // if there are reads with a single indel in them, add that indel to the list of alternate consenses + long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference); - // if ( debugOn ) { - // System.out.println(read.getReadName()+" "+read.getCigarString()+" "+read.getAlignmentStart()+"-"+read.getAlignmentEnd()); - // System.out.println(reference.substring((int)(read.getAlignmentStart()-leftmostIndex),(int)(read.getAlignmentEnd()-leftmostIndex))); - // System.out.println(read.getReadString()); - // } + // use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference + if ( !USE_KNOWN_INDELS_ONLY ) + generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference); - // we can not deal with screwy records - if ( read.getCigar().numCigarElements() == 0 ) { - refReads.add(read); - continue; - } - - final AlignedRead aRead = new AlignedRead(read); - - // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence - int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); - if ( numBlocks == 2 ) { - Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0); - aRead.setCigar(newCigar, false); - } - - final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex; - final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); - // if ( debugOn ) System.out.println("mismatchScore="+mismatchScore); - - // if this doesn't match perfectly to the reference, let's try to clean it - if ( rawMismatchScore > 0 ) { - altReads.add(aRead); - //logger.debug("Adding " + aRead.getRead().getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads"); - - if ( !read.getDuplicateReadFlag() ) - totalRawMismatchSum += rawMismatchScore; - aRead.setMismatchScoreToReference(rawMismatchScore); - aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef)); - - // if it has an indel, let's see if that's the best consensus - if ( numBlocks == 2 ) { - Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases()); - if ( c != null ) - altConsenses.add(c); - - } - else { - // if ( debugOn ) System.out.println("Going to test..."); - altAlignmentsToTest.add(aRead); - } - } - // otherwise, we can emit it as is - else { - // if ( debugOn ) System.out.println("Emitting as is..."); - //logger.debug("Adding " + aRead.getRead().getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); - refReads.add(read); - } - } - - // choose alternate consensuses - if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { - for ( AlignedRead aRead : altAlignmentsToTest ) { - // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadBases()); - if ( c != null ) { - // if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ; - altConsenses.add(c); - } else { - // if ( debugOn ) System.out.println("FAILED to create Alt consensus from SW"); - } - } - } else { - // choose alternate consenses randomly - int readsSeen = 0; - while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsenses.size() <= MAX_CONSENSUSES) { - int index = generator.nextInt(altAlignmentsToTest.size()); - AlignedRead aRead = altAlignmentsToTest.remove(index); - // do a pairwise alignment against the reference - SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); - Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadBases()); - if ( c != null ) - altConsenses.add(c); - } - } + // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); Consensus bestConsensus = null; Iterator iter = altConsenses.iterator(); - // if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n"); - while ( iter.hasNext() ) { Consensus consensus = iter.next(); //logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str)); @@ -648,6 +569,103 @@ public class IndelRealigner extends ReadWalker { } } + private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final long leftmostIndex, final byte[] reference) { + for ( VariantContext knownIndel : knownIndelsToTry.values() ) { + if ( knownIndel == null || !knownIndel.isIndel() ) + continue; + byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); + int start = (int)(knownIndel.getLocation().getStart() - leftmostIndex) + 1; + Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel.isDeletion()); + if ( c != null ) + altConsensesToPopulate.add(c); + } + } + + private long determineReadsThatNeedCleaning(final List reads, + final ArrayList refReadsToPopulate, + final ArrayList altReadsToPopulate, + final LinkedList altAlignmentsToTest, + final Set altConsenses, + final long leftmostIndex, + final byte[] reference) { + + long totalRawMismatchSum = 0L; + + for ( final SAMRecord read : reads ) { + + // we can not deal with screwy records + if ( read.getCigar().numCigarElements() == 0 ) { + refReadsToPopulate.add(read); + continue; + } + + final AlignedRead aRead = new AlignedRead(read); + + // first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence + int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read); + if ( numBlocks == 2 ) { + Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0); + aRead.setCigar(newCigar, false); + } + + final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex; + final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE); + + // if this doesn't match perfectly to the reference, let's try to clean it + if ( rawMismatchScore > 0 ) { + altReadsToPopulate.add(aRead); + //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads"); + + if ( !read.getDuplicateReadFlag() ) + totalRawMismatchSum += rawMismatchScore; + aRead.setMismatchScoreToReference(rawMismatchScore); + aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef)); + + // if it has an indel, let's see if that's the best consensus + if ( !USE_KNOWN_INDELS_ONLY && numBlocks == 2 ) { + Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases()); + if ( c != null ) + altConsenses.add(c); + } else { + altAlignmentsToTest.add(aRead); + } + } + // otherwise, we can emit it as is + else { + //logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads"); + refReadsToPopulate.add(read); + } + } + + return totalRawMismatchSum; + } + + private void generateAlternateConsensesFromReads(final LinkedList altAlignmentsToTest, final Set altConsensesToPopulate, final byte[] reference) { + + // if we are under the limit, use all reads to generate alternate consenses + if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) { + for ( AlignedRead aRead : altAlignmentsToTest ) + createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } + // otherwise, choose reads for alternate consenses randomly + else { + int readsSeen = 0; + while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { + int index = generator.nextInt(altAlignmentsToTest.size()); + AlignedRead aRead = altAlignmentsToTest.remove(index); + createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); + } + } + } + + private void createAndAddAlternateConsensus(final byte[] read, final Set altConsensesToPopulate, final byte[] reference) { + // do a pairwise alignment against the reference + SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); + Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read); + if ( c != null ) + altConsensesToPopulate.add(c); + } + // create a Consensus from cigar/read strings which originate somewhere on the reference private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) { if ( indexOnRef < 0 ) diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index b682ceb67..14a74d341 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -9,21 +9,31 @@ public class IndelRealignerIntegrationTest extends WalkerTest { @Test public void testRealignerLod5() { - String[] md5lod5 = {"56f1fb75cae706a5a6278257ea2f2598", "18fca887d1eb7dc300e717ae03b9da62"}; + String[] md5s = {"56f1fb75cae706a5a6278257ea2f2598", "18fca887d1eb7dc300e717ae03b9da62"}; WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10030000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", 2, - Arrays.asList(md5lod5)); + Arrays.asList(md5s)); executeTest("test realigner lod5", spec); } @Test public void testRealignerLod50() { - String[] md5lod50 = {"56f1fb75cae706a5a6278257ea2f2598", "9537e4f195ce5840136f60fb61201369"}; + String[] md5s = {"56f1fb75cae706a5a6278257ea2f2598", "9537e4f195ce5840136f60fb61201369"}; WalkerTestSpec spec = new WalkerTestSpec( "-T IndelRealigner -noPG -LOD 50 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10030000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe", 2, - Arrays.asList(md5lod50)); + Arrays.asList(md5s)); executeTest("test realigner lod50", spec); } + + @Test + public void testRealignerKnownsOnly() { + String[] md5s = {"54621fe4a53a559908ff20f6f0d0e758", "1091436c40d5ba557d85662999cc0c1d"}; + WalkerTestSpec spec = new WalkerTestSpec( + "-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly", + 2, + Arrays.asList(md5s)); + executeTest("test realigner known indels only", spec); + } } \ No newline at end of file