The new cleaner can now use known indels to create alternate consenses for cleaning.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2816 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-02-10 04:39:15 +00:00
parent 0250338ce7
commit 6652b992f7
1 changed files with 58 additions and 9 deletions

View File

@ -151,20 +151,22 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
generator = new Random(RANDOM_SEED); generator = new Random(RANDOM_SEED);
// set up the rods (since this is a ReadWalker we don't get rods from the traversal) // set up the rods (since this is a ReadWalker we don't get rods from the traversal)
logger.info("Reading and parsing known indel rod files...");
List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>(); List<ReferenceOrderedData<? extends ReferenceOrderedDatum>> rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum>>();
ReferenceOrderedData.parseBindings(knownIndels, rods); ReferenceOrderedData.parseBindings(knownIndels, rods);
for ( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods ) { for ( ReferenceOrderedData<? extends ReferenceOrderedDatum> rod : rods ) {
if ( !(rod instanceof VariationRod) ) if ( !VariationRod.class.isAssignableFrom(rod.getType()) )
continue; continue;
SeekableRODIterator<? extends ReferenceOrderedDatum> iter = rod.iterator(); SeekableRODIterator<? extends ReferenceOrderedDatum> iter = rod.iterator();
while ( iter.hasNext() ) { while ( iter.hasNext() ) {
RODRecordList<? extends ReferenceOrderedDatum> records = iter.next(); RODRecordList<? extends ReferenceOrderedDatum> records = iter.next();
for ( ReferenceOrderedDatum record : records ) { for ( ReferenceOrderedDatum record : records ) {
if (((VariationRod)record).isIndel()) if ( ((VariationRod)record).isIndel() )
knownIndelsToTry.add((VariationRod)record); knownIndelsToTry.add((VariationRod)record);
} }
} }
} }
logger.info("Finished reading and parsing known indel rod files");
if ( OUT_INDELS != null ) { if ( OUT_INDELS != null ) {
try { try {
@ -251,6 +253,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
readLoc = GenomeLocParser.createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); readLoc = GenomeLocParser.createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart());
if ( readLoc.isBefore(currentInterval) || Utils.is454Read(read) ) { if ( readLoc.isBefore(currentInterval) || Utils.is454Read(read) ) {
// TODO -- it would be nice if we could use indels from 454 reads as alternate consenses
emit(read); emit(read);
return 0; return 0;
} }
@ -391,6 +394,21 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses Set<Consensus> altConsenses = new LinkedHashSet<Consensus>(); // list of alt consenses
int totalMismatchSum = 0; int totalMismatchSum = 0;
// if there are any known indels for this region, get them
while ( knownIndelsToTry.size() > 0 ) {
VariationRod knownIndel = knownIndelsToTry.first();
if ( knownIndel.getLocation().isBefore(readsToClean.getLocation()) ) {
knownIndelsToTry.remove(knownIndel);
} else if ( knownIndel.getLocation().overlapsP(readsToClean.getLocation()) ) {
knownIndelsToTry.remove(knownIndel);
String indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAlleleList().get(0) : Utils.dupString('-', knownIndel.getAlleleList().get(0).length());
Consensus c = createAlternateConsensus((int)(knownIndel.getLocation().getStart() - leftmostIndex), reference, indelStr, knownIndel.isDeletion());
if ( c != null )
altConsenses.add(c);
} else {
break;
}
}
// decide which reads potentially need to be cleaned // decide which reads potentially need to be cleaned
for ( SAMRecord read : reads ) { for ( SAMRecord read : reads ) {
@ -430,7 +448,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// if it has an indel, let's see if that's the best consensus // if it has an indel, let's see if that's the best consensus
if ( numBlocks == 2 ) { if ( numBlocks == 2 ) {
Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases()); Consensus c = createAlternateConsensus(aRead.getAlignmentStart() - (int)leftmostIndex, aRead.getCigar(), reference, aRead.getRead().getReadBases());
if ( c==null) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName()); if ( c == null ) {} //System.out.println("ERROR: Failed to create alt consensus for read "+aRead.getRead().getReadName());
else altConsenses.add(c); else altConsenses.add(c);
} }
else { else {
@ -451,7 +469,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// do a pairwise alignment against the reference // do a pairwise alignment against the reference
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
if ( c != null) { if ( c != null ) {
// if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ; // if ( debugOn ) System.out.println("NEW consensus generated by SW: "+c.str ) ;
altConsenses.add(c); altConsenses.add(c);
} else { } else {
@ -467,7 +485,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
// do a pairwise alignment against the reference // do a pairwise alignment against the reference
SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND); SWPairwiseAlignment swConsensus = new SWPairwiseAlignment(reference, aRead.getRead().getReadBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND);
Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases()); Consensus c = createAlternateConsensus(swConsensus.getAlignmentStart2wrt1(), swConsensus.getCigar(), reference, aRead.getRead().getReadBases());
if ( c != null) if ( c != null )
altConsenses.add(c); altConsenses.add(c);
} }
} }
@ -653,13 +671,43 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
for (int i = refIdx; i < reference.length; i++) for (int i = refIdx; i < reference.length; i++)
sb.append((char)reference[i]); sb.append((char)reference[i]);
byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the cuurent read byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read
// if ( debugOn ) System.out.println("Alt consensus generated: "+altConsensus);
return new Consensus(altConsensus, c, indexOnRef); return new Consensus(altConsensus, c, indexOnRef);
} }
private Consensus createAlternateConsensus(int indexOnRef, byte[] reference, String indelStr, boolean isDeletion) {
if ( indexOnRef < 0 )
return null;
// create the new consensus
StringBuilder sb = new StringBuilder();
Cigar cigar = new Cigar();
int refIdx;
for (refIdx = 0; refIdx < indexOnRef; refIdx++)
sb.append((char)reference[refIdx]);
if ( indexOnRef > 0 )
cigar.add(new CigarElement(indexOnRef, CigarOperator.M));
if ( isDeletion ) {
refIdx += indelStr.length();
cigar.add(new CigarElement(indelStr.length(), CigarOperator.D));
}
else {
sb.append(indelStr);
cigar.add(new CigarElement(indelStr.length(), CigarOperator.I));
}
if ( reference.length - refIdx > 0 )
cigar.add(new CigarElement(reference.length - refIdx, CigarOperator.M));
for (; refIdx < reference.length; refIdx++)
sb.append((char)reference[refIdx]);
byte[] altConsensus = StringUtil.stringToBytes(sb.toString()); // alternative consensus sequence we just built from the current read
return new Consensus(altConsensus, cigar, indexOnRef);
}
private Pair<Integer, Integer> findBestOffset(byte[] ref, AlignedRead read) { private Pair<Integer, Integer> findBestOffset(byte[] ref, AlignedRead read) {
int attempts = ref.length - read.getReadLength() + 1; int attempts = ref.length - read.getReadLength() + 1;
int bestScore = mismatchQualitySumIgnoreCigar(read, ref, 0); int bestScore = mismatchQualitySumIgnoreCigar(read, ref, 0);
@ -1162,6 +1210,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
System.arraycopy(reference, 0, newReference, 0, reference.length); System.arraycopy(reference, 0, newReference, 0, reference.length);
System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases); System.arraycopy(ref, ref.length-neededBases, newReference, reference.length, neededBases);
reference = newReference; reference = newReference;
loc = GenomeLocParser.createGenomeLoc(loc.getContigIndex(), loc.getStart(), loc.getStop()+neededBases);
} }
} }
} }