Added ability to clean using only known indels. Added integration test for it. Fixed vcf->vc conversion for indels which was busted.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3678 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
610cc7ae2b
commit
12c0de6170
|
|
@ -13,10 +13,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.LikelihoodObject;
|
||||
import org.broadinstitute.sting.utils.genotype.geli.GeliTextWriter;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFSingleCall;
|
||||
import org.broadinstitute.sting.utils.genotype.glf.GLFWriter;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -218,13 +215,13 @@ public class VariantContextAdaptors {
|
|||
} else if ( !vcf.isIndel() ) {
|
||||
refAllele = Allele.create(ref.getBase(), true);
|
||||
} else if ( vcf.isDeletion() ) {
|
||||
int start = (int)(ref.getLocus().getStart() - ref.getWindow().getStart() + 1);
|
||||
int start = vcf.getPosition() - (int)ref.getWindow().getStart() + 1;
|
||||
int delLength = 0;
|
||||
for ( VCFGenotypeEncoding enc : vcf.getAlternateAlleles() ) {
|
||||
if ( enc.getLength() > delLength )
|
||||
delLength = enc.getLength();
|
||||
}
|
||||
if ( delLength > ref.getWindow().getStop() - ref.getLocus().getStop() )
|
||||
if ( delLength > ref.getWindow().getStop() - vcf.getPosition() )
|
||||
throw new IllegalArgumentException("Length of deletion is larger than reference context provided at " + ref.getLocus());
|
||||
|
||||
refAllele = deletionAllele(ref, start, delLength);
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.*;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
|
@ -51,6 +53,7 @@ import java.util.*;
|
|||
* Unlike most mappers, this walker uses the full alignment context to determine whether an
|
||||
* appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly.
|
||||
*/
|
||||
@Reference(window=@Window(start=-1,stop=150))
|
||||
public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||
|
||||
public static final String ORIGINAL_CIGAR_TAG = "OC";
|
||||
|
|
@ -71,6 +74,9 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName="bam_compression", shortName="compress", required=false, doc="Compression level to use for output bams [default:5]")
|
||||
protected Integer compressionLevel = 5;
|
||||
|
||||
@Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", required=false, doc="Don't run 'Smith-Waterman' to generate alternate consenses; use only known indels provided as RODs for constructing the alternate references.")
|
||||
protected boolean USE_KNOWN_INDELS_ONLY = false;
|
||||
|
||||
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
|
||||
"If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false)
|
||||
protected int MAX_RECORDS_IN_RAM = 500000;
|
||||
|
|
@ -102,10 +108,10 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
@Argument(fullName="realignReadsWithBadMates", required=false, doc="Should we try to realign paired-end reads whose mates map to other chromosomes?")
|
||||
protected boolean REALIGN_BADLY_MATED_READS = false;
|
||||
|
||||
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
@Argument(fullName="noPGTag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
protected boolean NO_PG_TAG = false;
|
||||
|
||||
@Argument(fullName="no_original_alignment_tags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.")
|
||||
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.")
|
||||
protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false;
|
||||
|
||||
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.")
|
||||
|
|
@ -260,7 +266,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
} else {
|
||||
readsToClean.add(read, ref.getBases());
|
||||
// add the rods to the list of known variants
|
||||
populateKnownIndels(metaDataTracker, null);
|
||||
populateKnownIndels(metaDataTracker, ref);
|
||||
}
|
||||
|
||||
if ( readsToClean.size() + readsNotToClean.size() >= MAX_READS ) {
|
||||
|
|
@ -403,108 +409,23 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
final ArrayList<AlignedRead> altReads = new ArrayList<AlignedRead>(); // reads that don't perfectly match
|
||||
final LinkedList<AlignedRead> altAlignmentsToTest = new LinkedList<AlignedRead>(); // should we try to make an alt consensus from the read?
|
||||
final Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
|
||||
long totalRawMismatchSum = 0;
|
||||
|
||||
// if there are any known indels for this region, get them
|
||||
for ( VariantContext knownIndel : knownIndelsToTry.values() ) {
|
||||
if ( knownIndel == null || !knownIndel.isIndel() )
|
||||
continue;
|
||||
byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
|
||||
Consensus c = createAlternateConsensus((int)(knownIndel.getLocation().getStart() - leftmostIndex), reference, indelStr, knownIndel.isDeletion());
|
||||
if ( c != null )
|
||||
altConsenses.add(c);
|
||||
}
|
||||
// if there are any known indels for this region, get them and create alternate consenses
|
||||
generateAlternateConsensesFromKnownIndels(altConsenses, leftmostIndex, reference);
|
||||
|
||||
// decide which reads potentially need to be cleaned
|
||||
for ( final SAMRecord read : reads ) {
|
||||
// decide which reads potentially need to be cleaned;
|
||||
// if there are reads with a single indel in them, add that indel to the list of alternate consenses
|
||||
long totalRawMismatchSum = determineReadsThatNeedCleaning(reads, refReads, altReads, altAlignmentsToTest, altConsenses, leftmostIndex, reference);
|
||||
|
||||
// if ( debugOn ) {
|
||||
// System.out.println(read.getReadName()+" "+read.getCigarString()+" "+read.getAlignmentStart()+"-"+read.getAlignmentEnd());
|
||||
// System.out.println(reference.substring((int)(read.getAlignmentStart()-leftmostIndex),(int)(read.getAlignmentEnd()-leftmostIndex)));
|
||||
// System.out.println(read.getReadString());
|
||||
// }
|
||||
// use 'Smith-Waterman' to create alternate consenses from reads that mismatch the reference
|
||||
if ( !USE_KNOWN_INDELS_ONLY )
|
||||
generateAlternateConsensesFromReads(altAlignmentsToTest, altConsenses, reference);
|
||||
|
||||
// we can not deal with screwy records
|
||||
if ( read.getCigar().numCigarElements() == 0 ) {
|
||||
refReads.add(read);
|
||||
continue;
|
||||
}
|
||||
|
||||
final AlignedRead aRead = new AlignedRead(read);
|
||||
|
||||
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
|
||||
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
|
||||
if ( numBlocks == 2 ) {
|
||||
Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
||||
aRead.setCigar(newCigar, false);
|
||||
}
|
||||
|
||||
final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex;
|
||||
final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE);
|
||||
// if ( debugOn ) System.out.println("mismatchScore="+mismatchScore);
|
||||
|
||||
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||
if ( rawMismatchScore > 0 ) {
|
||||
altReads.add(aRead);
|
||||
//logger.debug("Adding " + aRead.getRead().getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads");
|
||||
|
||||
if ( !read.getDuplicateReadFlag() )
|
||||
totalRawMismatchSum += rawMismatchScore;
|
||||
aRead.setMismatchScoreToReference(rawMismatchScore);
|
||||
aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef));
|
||||
|
||||
// if it has an indel, let's see if that's the best consensus
|
||||
if ( numBlocks == 2 ) {
|
||||
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases());
|
||||
if ( c != null )
|
||||
altConsenses.add(c);
|
||||
|
||||
}
|
||||
else {
|
||||
// if ( debugOn ) System.out.println("Going to test...");
|
||||
altAlignmentsToTest.add(aRead);
|
||||
}
|
||||
}
|
||||
// otherwise, we can emit it as is
|
||||
else {
|
||||
// if ( debugOn ) System.out.println("Emitting as is...");
|
||||
//logger.debug("Adding " + aRead.getRead().getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads");
|
||||
refReads.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
// choose alternate consensuses
|
||||
if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) {
|
||||
for ( AlignedRead aRead : altAlignmentsToTest ) {
|
||||
// do a pairwise alignment against the reference
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadBases());
|
||||
if ( c != null ) {
|
||||
// if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ;
|
||||
altConsenses.add(c);
|
||||
} else {
|
||||
// if ( debugOn ) System.out.println("FAILED to create Alt consensus from SW");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// choose alternate consenses randomly
|
||||
int readsSeen = 0;
|
||||
while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsenses.size() <= MAX_CONSENSUSES) {
|
||||
int index = generator.nextInt(altAlignmentsToTest.size());
|
||||
AlignedRead aRead = altAlignmentsToTest.remove(index);
|
||||
// do a pairwise alignment against the reference
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getReadBases());
|
||||
if ( c != null )
|
||||
altConsenses.add(c);
|
||||
}
|
||||
}
|
||||
// if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n");
|
||||
|
||||
Consensus bestConsensus = null;
|
||||
Iterator<Consensus> iter = altConsenses.iterator();
|
||||
|
||||
// if ( debugOn ) System.out.println("------\nChecking consenses...\n--------\n");
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
Consensus consensus = iter.next();
|
||||
//logger.debug("Trying new consensus: " + consensus.cigar + " " + new String(consensus.str));
|
||||
|
|
@ -648,6 +569,103 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
private void generateAlternateConsensesFromKnownIndels(final Set<Consensus> altConsensesToPopulate, final long leftmostIndex, final byte[] reference) {
|
||||
for ( VariantContext knownIndel : knownIndelsToTry.values() ) {
|
||||
if ( knownIndel == null || !knownIndel.isIndel() )
|
||||
continue;
|
||||
byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
|
||||
int start = (int)(knownIndel.getLocation().getStart() - leftmostIndex) + 1;
|
||||
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel.isDeletion());
|
||||
if ( c != null )
|
||||
altConsensesToPopulate.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
private long determineReadsThatNeedCleaning(final List<SAMRecord> reads,
|
||||
final ArrayList<SAMRecord> refReadsToPopulate,
|
||||
final ArrayList<AlignedRead> altReadsToPopulate,
|
||||
final LinkedList<AlignedRead> altAlignmentsToTest,
|
||||
final Set<Consensus> altConsenses,
|
||||
final long leftmostIndex,
|
||||
final byte[] reference) {
|
||||
|
||||
long totalRawMismatchSum = 0L;
|
||||
|
||||
for ( final SAMRecord read : reads ) {
|
||||
|
||||
// we can not deal with screwy records
|
||||
if ( read.getCigar().numCigarElements() == 0 ) {
|
||||
refReadsToPopulate.add(read);
|
||||
continue;
|
||||
}
|
||||
|
||||
final AlignedRead aRead = new AlignedRead(read);
|
||||
|
||||
// first, move existing indels (for 1 indel reads only) to leftmost position within identical sequence
|
||||
int numBlocks = AlignmentUtils.getNumAlignmentBlocks(read);
|
||||
if ( numBlocks == 2 ) {
|
||||
Cigar newCigar = AlignmentUtils.leftAlignIndel(unclipCigar(read.getCigar()), reference, read.getReadBases(), read.getAlignmentStart()-(int)leftmostIndex, 0);
|
||||
aRead.setCigar(newCigar, false);
|
||||
}
|
||||
|
||||
final int startOnRef = read.getAlignmentStart()-(int)leftmostIndex;
|
||||
final int rawMismatchScore = mismatchQualitySumIgnoreCigar(aRead, reference, startOnRef, Integer.MAX_VALUE);
|
||||
|
||||
// if this doesn't match perfectly to the reference, let's try to clean it
|
||||
if ( rawMismatchScore > 0 ) {
|
||||
altReadsToPopulate.add(aRead);
|
||||
//logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to non-ref reads");
|
||||
|
||||
if ( !read.getDuplicateReadFlag() )
|
||||
totalRawMismatchSum += rawMismatchScore;
|
||||
aRead.setMismatchScoreToReference(rawMismatchScore);
|
||||
aRead.setAlignerMismatchScore(AlignmentUtils.mismatchingQualities(aRead.getRead(), reference, startOnRef));
|
||||
|
||||
// if it has an indel, let's see if that's the best consensus
|
||||
if ( !USE_KNOWN_INDELS_ONLY && numBlocks == 2 ) {
|
||||
Consensus c = createAlternateConsensus(startOnRef, aRead.getCigar(), reference, aRead.getReadBases());
|
||||
if ( c != null )
|
||||
altConsenses.add(c);
|
||||
} else {
|
||||
altAlignmentsToTest.add(aRead);
|
||||
}
|
||||
}
|
||||
// otherwise, we can emit it as is
|
||||
else {
|
||||
//logger.debug("Adding " + read.getReadName() + " with raw mismatch score " + rawMismatchScore + " to ref reads");
|
||||
refReadsToPopulate.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
return totalRawMismatchSum;
|
||||
}
|
||||
|
||||
private void generateAlternateConsensesFromReads(final LinkedList<AlignedRead> altAlignmentsToTest, final Set<Consensus> altConsensesToPopulate, final byte[] reference) {
|
||||
|
||||
// if we are under the limit, use all reads to generate alternate consenses
|
||||
if ( altAlignmentsToTest.size() <= MAX_READS_FOR_CONSENSUSES ) {
|
||||
for ( AlignedRead aRead : altAlignmentsToTest )
|
||||
createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference);
|
||||
}
|
||||
// otherwise, choose reads for alternate consenses randomly
|
||||
else {
|
||||
int readsSeen = 0;
|
||||
while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) {
|
||||
int index = generator.nextInt(altAlignmentsToTest.size());
|
||||
AlignedRead aRead = altAlignmentsToTest.remove(index);
|
||||
createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void createAndAddAlternateConsensus(final byte[] read, final Set<Consensus> altConsensesToPopulate, final byte[] reference) {
|
||||
// do a pairwise alignment against the reference
|
||||
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, read, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
|
||||
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, read);
|
||||
if ( c != null )
|
||||
altConsensesToPopulate.add(c);
|
||||
}
|
||||
|
||||
// create a Consensus from cigar/read strings which originate somewhere on the reference
|
||||
private Consensus createAlternateConsensus(final int indexOnRef, final Cigar c, final byte[] reference, final byte[] readStr) {
|
||||
if ( indexOnRef < 0 )
|
||||
|
|
|
|||
|
|
@ -9,21 +9,31 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testRealignerLod5() {
|
||||
String[] md5lod5 = {"56f1fb75cae706a5a6278257ea2f2598", "18fca887d1eb7dc300e717ae03b9da62"};
|
||||
String[] md5s = {"56f1fb75cae706a5a6278257ea2f2598", "18fca887d1eb7dc300e717ae03b9da62"};
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T IndelRealigner -noPG -LOD 5 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10030000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe",
|
||||
2,
|
||||
Arrays.asList(md5lod5));
|
||||
Arrays.asList(md5s));
|
||||
executeTest("test realigner lod5", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRealignerLod50() {
|
||||
String[] md5lod50 = {"56f1fb75cae706a5a6278257ea2f2598", "9537e4f195ce5840136f60fb61201369"};
|
||||
String[] md5s = {"56f1fb75cae706a5a6278257ea2f2598", "9537e4f195ce5840136f60fb61201369"};
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T IndelRealigner -noPG -LOD 50 -maxConsensuses 100 -greedy 100 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10030000 -compress 1 -targetIntervals " + validationDataLocation + "cleaner.test.intervals -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe",
|
||||
2,
|
||||
Arrays.asList(md5lod50));
|
||||
Arrays.asList(md5s));
|
||||
executeTest("test realigner lod50", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRealignerKnownsOnly() {
|
||||
String[] md5s = {"54621fe4a53a559908ff20f6f0d0e758", "1091436c40d5ba557d85662999cc0c1d"};
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T IndelRealigner -noPG -LOD 1.0 -R " + oneKGLocation + "reference/human_b36_both.fasta -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam -L 1:10023000-10076000 -compress 1 -targetIntervals " + validationDataLocation + "NA12878.indels.intervals -B knownIndels,VCF," + validationDataLocation + "NA12878.indels.vcf -O %s -stats %s --sortInCoordinateOrderEvenThoughItIsHighlyUnsafe -knownsOnly",
|
||||
2,
|
||||
Arrays.asList(md5s));
|
||||
executeTest("test realigner known indels only", spec);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue