Some internal refactoring. The walker now "safely" ignores duplicate records (NOT duplicate reads, but the same record appearing more than once in a malformed bam file) that result from the bug/feature in CleanedReadInjector.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1949 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
asivache 2009-10-30 17:50:51 +00:00
parent 7654051aee
commit ea8d5c7077
1 changed file with 140 additions and 126 deletions

View File

@ -486,10 +486,10 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
private String bases; private String bases;
private Type type; private Type type;
private Set<SAMRecord> reads = new HashSet<SAMRecord>(); // keep track of reads that have this indel private Set<ExpandedSAMRecord> reads = new HashSet<ExpandedSAMRecord>(); // keep track of reads that have this indel
private Set<String> samples = new HashSet<String>(); // which samples had the indel described by this object private Set<String> samples = new HashSet<String>(); // which samples had the indel described by this object
public IndelVariant(SAMRecord read , Type type, String bases) { public IndelVariant(ExpandedSAMRecord read , Type type, String bases) {
this.type = type; this.type = type;
this.bases = bases.toUpperCase(); this.bases = bases.toUpperCase();
addObservation(read); addObservation(read);
@ -500,11 +500,19 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
* this indel was observed in as well. * this indel was observed in as well.
* @param read * @param read
*/ */
public void addObservation(SAMRecord read) { public void addObservation(ExpandedSAMRecord read) {
if ( reads.contains(read) ) throw new StingException("Attempting to add indel observation that was already registered"); if ( reads.contains(read) ) {
//TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam
// seeing the same read again can mean only one thing: the input bam file is corrupted and contains
// duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector
// so this is a short-term patch: don't cry, but just ignore the duplicate record
//throw new StingException("Attempting to add indel observation that was already registered");
return;
}
reads.add(read); reads.add(read);
String sample = null; String sample = null;
if ( read.getReadGroup() != null ) sample = read.getReadGroup().getSample(); if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample();
if ( sample != null ) samples.add(sample); if ( sample != null ) samples.add(sample);
} }
@ -535,7 +543,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
return sb.toString(); return sb.toString();
} }
public Set<SAMRecord> getReadSet() { return reads; } public Set<ExpandedSAMRecord> getReadSet() { return reads; }
public int getCount() { return reads.size(); } public int getCount() { return reads.size(); }
@ -614,16 +622,14 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
long left = Math.max( pos-nqs, context.getStart() ); long left = Math.max( pos-nqs, context.getStart() );
long right = Math.min(pos+nqs-1, context.getStop()); long right = Math.min(pos+nqs-1, context.getStop());
//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size()); //if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size());
Iterator<SAMRecord> read_iter = context.getReads().iterator(); Iterator<ExpandedSAMRecord> read_iter = context.getReads().iterator();
Iterator<byte[]> flag_iter = context.getMMFlags().iterator();
Iterator<Integer> mm_iter = context.getTotalMMs().iterator();
Iterator <byte[]> qual_iter = context.getExpandedQuals().iterator();
while ( read_iter.hasNext() ) { while ( read_iter.hasNext() ) {
SAMRecord read = read_iter.next(); ExpandedSAMRecord rec = read_iter.next();
byte[] flags = flag_iter.next(); SAMRecord read = rec.getSAMRecord();
byte[] quals = qual_iter.next(); byte[] flags = rec.getExpandedMMFlags();
Integer mm = mm_iter.next(); byte[] quals = rec.getExpandedQuals();
int mm = rec.getMMCount();
if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue; if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue;
@ -642,19 +648,19 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
} }
if ( read_has_consensus ) { if ( read_has_consensus ) {
consensus_indel_read_total_mm += mm.intValue(); consensus_indel_read_total_mm += mm;
consensus_indel_read_total_mapq += read.getMappingQuality(); consensus_indel_read_total_mapq += read.getMappingQuality();
if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++; if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++;
else consensus_indel_read_orientation_cnt.first++; else consensus_indel_read_orientation_cnt.first++;
} }
if ( read_has_a_variant ) { if ( read_has_a_variant ) {
all_indel_read_total_mm += mm.intValue(); all_indel_read_total_mm += mm;
all_indel_read_total_mapq += read.getMappingQuality(); all_indel_read_total_mapq += read.getMappingQuality();
if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++; if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++;
else all_indel_read_orientation_cnt.first++; else all_indel_read_orientation_cnt.first++;
} }
all_read_total_mm+= mm.intValue(); all_read_total_mm+= mm;
all_read_total_mapq += read.getMappingQuality(); all_read_total_mapq += read.getMappingQuality();
if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++; if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++;
else all_read_orientation_cnt.first++; else all_read_orientation_cnt.first++;
@ -893,11 +899,12 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
} }
class WindowContext { interface IndelListener {
private List<SAMRecord> reads; public void addObservation(int pos, IndelVariant.Type t, String bases, ExpandedSAMRecord r);
private List<byte[]> mismatch_flags; }
private List<byte[]> expanded_quals;
private List<Integer> mms; class WindowContext implements IndelListener {
private Set<ExpandedSAMRecord> reads;
private long start=0; // where the window starts on the ref, 1-based private long start=0; // where the window starts on the ref, 1-based
private CircularArray< List< IndelVariant > > indels; private CircularArray< List< IndelVariant > > indels;
@ -907,11 +914,8 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
public WindowContext(long start, int length) { public WindowContext(long start, int length) {
this.start = start; this.start = start;
indels = new CircularArray< List<IndelVariant> >(length); indels = new CircularArray< List<IndelVariant> >(length);
reads = new LinkedList<SAMRecord>(); // reads = new LinkedList<SAMRecord>();
mismatch_flags = new LinkedList<byte[]>(); reads = new HashSet<ExpandedSAMRecord>();
// offsets = new LinkedList<Integer>();
mms = new LinkedList<Integer>();
expanded_quals = new LinkedList<byte[]>();
} }
/** Returns 1-based reference start position of the interval this object keeps context for. /** Returns 1-based reference start position of the interval this object keeps context for.
@ -931,10 +935,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
*/ */
public void clear() { public void clear() {
start = 0; start = 0;
mms.clear();
reads.clear(); reads.clear();
mismatch_flags.clear();
expanded_quals.clear();
indels.clear(); indels.clear();
} }
@ -952,10 +953,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
return false; return false;
} }
public List<SAMRecord> getReads() { return reads; } public Set<ExpandedSAMRecord> getReads() { return reads; }
public List<byte[]> getMMFlags() { return mismatch_flags; }
public List<Integer> getTotalMMs() { return mms; }
public List<byte[]> getExpandedQuals() { return expanded_quals; }
/** Returns the number of reads spanning over the specified reference position /** Returns the number of reads spanning over the specified reference position
* (regardless of whether they have a base or indel at that specific location) * (regardless of whether they have a base or indel at that specific location)
@ -963,8 +961,8 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
*/ */
public int coverageAt(final long refPos) { public int coverageAt(final long refPos) {
int cov = 0; int cov = 0;
for ( SAMRecord read : reads ) { for ( ExpandedSAMRecord read : reads ) {
if ( read.getAlignmentStart() > refPos || read.getAlignmentEnd() < refPos ) continue; if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) continue;
cov++; cov++;
} }
return cov; return cov;
@ -983,39 +981,85 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
IndelVariant indel = indels.get(0).get(0); IndelVariant indel = indels.get(0).get(0);
throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+ throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+
(indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getReadName()); (indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName());
} }
Iterator<SAMRecord> read_iter = reads.iterator(); Iterator<ExpandedSAMRecord> read_iter = reads.iterator();
Iterator<Integer> mm_iter = mms.iterator();
Iterator<byte[]> flags_iter = mismatch_flags.iterator();
Iterator<byte[]> quals_iter = expanded_quals.iterator();
while ( read_iter.hasNext() ) { while ( read_iter.hasNext() ) {
SAMRecord r = read_iter.next(); ExpandedSAMRecord r = read_iter.next();
mm_iter.next(); if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope
flags_iter.next();
quals_iter.next();
if ( r.getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope
read_iter.remove(); read_iter.remove();
mm_iter.remove();
flags_iter.remove();
quals_iter.remove();
} }
} }
} }
public void add(SAMRecord read, char [] ref) { public void add(SAMRecord read, char [] ref) {
ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this);
//TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed
if ( reads.contains(er)) return; // ignore duplicate records
reads.add(er);
}
public void addObservation(int pos, IndelVariant.Type type, String bases, ExpandedSAMRecord rec) {
List<IndelVariant> indelsAtSite;
try {
indelsAtSite = indels.get(pos);
} catch (IndexOutOfBoundsException e) {
SAMRecord r = rec.getSAMRecord();
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+
"; window end="+getStop());
throw e;
}
if ( indelsAtSite == null ) {
indelsAtSite = new ArrayList<IndelVariant>();
indels.set(pos, indelsAtSite);
}
boolean found = false;
for ( IndelVariant v : indelsAtSite ) {
if ( ! v.equals(type, bases) ) continue;
v.addObservation(rec);
found = true;
break;
}
if ( ! found ) {
IndelVariant v = new IndelVariant(rec, type, bases);
indelsAtSite.add(v);
}
}
public List<IndelVariant> indelsAt( final long refPos ) {
List<IndelVariant> l = indels.get((int)( refPos - start ));
if ( l == null ) return emptyIndelList;
else return l;
}
}
class ExpandedSAMRecord {
private SAMRecord read;
private byte[] mismatch_flags;
private byte[] expanded_quals;
private int mms;
public ExpandedSAMRecord(SAMRecord r, char [] ref, long offset, IndelListener l) {
read = r;
final long rStart = read.getAlignmentStart(); final long rStart = read.getAlignmentStart();
final long rStop = read.getAlignmentEnd(); final long rStop = read.getAlignmentEnd();
final String readBases = read.getReadString().toUpperCase(); final String readBases = read.getReadString().toUpperCase();
byte flags[] = new byte[(int)(rStop-rStart+1)]; mismatch_flags = new byte[(int)(rStop-rStart+1)];
byte quals[] = new byte[(int)(rStop-rStart+1)]; expanded_quals = new byte[(int)(rStop-rStart+1)];
int localStart = (int)( rStart - start ); // start of the alignment wrt start of the current window, 0-based
// now let's extract indels: // now let's extract indels:
@ -1025,8 +1069,6 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
int posOnRead = 0; int posOnRead = 0;
int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read: int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
// its start on the actual full reference contig is r.getAlignmentStart() // its start on the actual full reference contig is r.getAlignmentStart()
int mm=0; // number of single-base mismatches in the current read (indels do not count!)
for ( int i = 0 ; i < nCigarElems ; i++ ) { for ( int i = 0 ; i < nCigarElems ; i++ ) {
final CigarElement ce = c.getCigarElement(i); final CigarElement ce = c.getCigarElement(i);
@ -1049,17 +1091,17 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
case D: case D:
type = IndelVariant.Type.D; type = IndelVariant.Type.D;
bases = new String( ref, posOnRef, ce.getLength() ); bases = new String( ref, posOnRef, ce.getLength() );
for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) flags[posOnRef] = quals[posOnRef] = -1; for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1;
break; break;
case M: case M:
for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) { for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) {
if ( readBases.charAt(posOnRead) != //note: readBases was uppercased above! if ( readBases.charAt(posOnRead) != //note: readBases was uppercased above!
Character.toUpperCase(ref[posOnRef]) ) { // mismatch! Character.toUpperCase(ref[posOnRef]) ) { // mismatch!
mm++; mms++;
flags[posOnRef] = 1; mismatch_flags[posOnRef] = 1;
} }
quals[posOnRef] = read.getBaseQualities()[posOnRead]; expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead];
} }
break; // advance along the gapless block in the alignment break; // advance along the gapless block in the alignment
default : default :
@ -1074,54 +1116,26 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
// note that here we will be assigning indels to the first deleted base or to the first // note that here we will be assigning indels to the first deleted base or to the first
// base after insertion, not to the last base before the event! // base after insertion, not to the last base before the event!
addIndelObservation(localStart+eventPosition, type, bases, read); l.addObservation((int)(offset+eventPosition), type, bases, this);
}
reads.add(read);
mms.add(mm);
mismatch_flags.add(flags);
expanded_quals.add(quals);
// offsets.add(localStart);
}
private void addIndelObservation(int pos, IndelVariant.Type type, String bases, SAMRecord r) {
List<IndelVariant> indelsAtSite;
try {
indelsAtSite = indels.get(pos);
} catch (IndexOutOfBoundsException e) {
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+
"; window end="+getStop());
throw e;
}
if ( indelsAtSite == null ) {
indelsAtSite = new ArrayList<IndelVariant>();
indels.set(pos, indelsAtSite);
}
boolean found = false;
for ( IndelVariant v : indelsAtSite ) {
if ( ! v.equals(type, bases) ) continue;
v.addObservation(r);
found = true;
break;
}
if ( ! found ) {
IndelVariant v = new IndelVariant(r, type, bases);
indelsAtSite.add(v);
} }
} }
public List<IndelVariant> indelsAt( final long refPos ) { public SAMRecord getSAMRecord() { return read; }
List<IndelVariant> l = indels.get((int)( refPos - start ));
if ( l == null ) return emptyIndelList; public byte [] getExpandedMMFlags() { return mismatch_flags; }
else return l;
public byte [] getExpandedQuals() { return expanded_quals; }
public int getMMCount() { return mms; }
public boolean equals(Object o) {
if ( this == o ) return true;
if ( read == null ) return false;
if ( o instanceof SAMRecord ) return read.equals(o);
if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read);
return false;
} }
} }
} }