Some internal refactoring. The walker now "safely" ignores duplicate records (NOT duplicate reads, but the same record appearing more than once in a malformed BAM file!) that result from the bug/feature in CleanedReadInjector.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1949 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7654051aee
commit
ea8d5c7077
|
|
@ -486,10 +486,10 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
private String bases;
|
private String bases;
|
||||||
private Type type;
|
private Type type;
|
||||||
|
|
||||||
private Set<SAMRecord> reads = new HashSet<SAMRecord>(); // keep track of reads that have this indel
|
private Set<ExpandedSAMRecord> reads = new HashSet<ExpandedSAMRecord>(); // keep track of reads that have this indel
|
||||||
private Set<String> samples = new HashSet<String>(); // which samples had the indel described by this object
|
private Set<String> samples = new HashSet<String>(); // which samples had the indel described by this object
|
||||||
|
|
||||||
public IndelVariant(SAMRecord read , Type type, String bases) {
|
public IndelVariant(ExpandedSAMRecord read , Type type, String bases) {
|
||||||
this.type = type;
|
this.type = type;
|
||||||
this.bases = bases.toUpperCase();
|
this.bases = bases.toUpperCase();
|
||||||
addObservation(read);
|
addObservation(read);
|
||||||
|
|
@ -500,11 +500,19 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
* this indel was observed in as well.
|
* this indel was observed in as well.
|
||||||
* @param read
|
* @param read
|
||||||
*/
|
*/
|
||||||
public void addObservation(SAMRecord read) {
|
public void addObservation(ExpandedSAMRecord read) {
|
||||||
if ( reads.contains(read) ) throw new StingException("Attempting to add indel observation that was already registered");
|
if ( reads.contains(read) ) {
|
||||||
|
//TODO fix CleanedReadInjector and reinstate exception here: duplicate records may signal a problem with the bam
|
||||||
|
// seeing the same read again can mean only one thing: the input bam file is corrupted and contains
|
||||||
|
// duplicate records. We KNOW that this may happen for the time being due to bug in CleanedReadInjector
|
||||||
|
// so this is a short-term patch: don't cry, but just ignore the duplicate record
|
||||||
|
|
||||||
|
//throw new StingException("Attempting to add indel observation that was already registered");
|
||||||
|
return;
|
||||||
|
}
|
||||||
reads.add(read);
|
reads.add(read);
|
||||||
String sample = null;
|
String sample = null;
|
||||||
if ( read.getReadGroup() != null ) sample = read.getReadGroup().getSample();
|
if ( read.getSAMRecord().getReadGroup() != null ) sample = read.getSAMRecord().getReadGroup().getSample();
|
||||||
if ( sample != null ) samples.add(sample);
|
if ( sample != null ) samples.add(sample);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -535,7 +543,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<SAMRecord> getReadSet() { return reads; }
|
public Set<ExpandedSAMRecord> getReadSet() { return reads; }
|
||||||
|
|
||||||
public int getCount() { return reads.size(); }
|
public int getCount() { return reads.size(); }
|
||||||
|
|
||||||
|
|
@ -614,16 +622,14 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
long left = Math.max( pos-nqs, context.getStart() );
|
long left = Math.max( pos-nqs, context.getStart() );
|
||||||
long right = Math.min(pos+nqs-1, context.getStop());
|
long right = Math.min(pos+nqs-1, context.getStop());
|
||||||
//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size());
|
//if ( pos == 3534096 ) System.out.println("pos="+pos +" total reads: "+context.getReads().size());
|
||||||
Iterator<SAMRecord> read_iter = context.getReads().iterator();
|
Iterator<ExpandedSAMRecord> read_iter = context.getReads().iterator();
|
||||||
Iterator<byte[]> flag_iter = context.getMMFlags().iterator();
|
|
||||||
Iterator<Integer> mm_iter = context.getTotalMMs().iterator();
|
|
||||||
Iterator <byte[]> qual_iter = context.getExpandedQuals().iterator();
|
|
||||||
|
|
||||||
while ( read_iter.hasNext() ) {
|
while ( read_iter.hasNext() ) {
|
||||||
SAMRecord read = read_iter.next();
|
ExpandedSAMRecord rec = read_iter.next();
|
||||||
byte[] flags = flag_iter.next();
|
SAMRecord read = rec.getSAMRecord();
|
||||||
byte[] quals = qual_iter.next();
|
byte[] flags = rec.getExpandedMMFlags();
|
||||||
Integer mm = mm_iter.next();
|
byte[] quals = rec.getExpandedQuals();
|
||||||
|
int mm = rec.getMMCount();
|
||||||
|
|
||||||
if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue;
|
if( read.getAlignmentStart() > pos || read.getAlignmentEnd() < pos ) continue;
|
||||||
|
|
||||||
|
|
@ -642,19 +648,19 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( read_has_consensus ) {
|
if ( read_has_consensus ) {
|
||||||
consensus_indel_read_total_mm += mm.intValue();
|
consensus_indel_read_total_mm += mm;
|
||||||
consensus_indel_read_total_mapq += read.getMappingQuality();
|
consensus_indel_read_total_mapq += read.getMappingQuality();
|
||||||
if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++;
|
if ( read.getReadNegativeStrandFlag() ) consensus_indel_read_orientation_cnt.second++;
|
||||||
else consensus_indel_read_orientation_cnt.first++;
|
else consensus_indel_read_orientation_cnt.first++;
|
||||||
}
|
}
|
||||||
if ( read_has_a_variant ) {
|
if ( read_has_a_variant ) {
|
||||||
all_indel_read_total_mm += mm.intValue();
|
all_indel_read_total_mm += mm;
|
||||||
all_indel_read_total_mapq += read.getMappingQuality();
|
all_indel_read_total_mapq += read.getMappingQuality();
|
||||||
if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++;
|
if ( read.getReadNegativeStrandFlag() ) all_indel_read_orientation_cnt.second++;
|
||||||
else all_indel_read_orientation_cnt.first++;
|
else all_indel_read_orientation_cnt.first++;
|
||||||
}
|
}
|
||||||
|
|
||||||
all_read_total_mm+= mm.intValue();
|
all_read_total_mm+= mm;
|
||||||
all_read_total_mapq += read.getMappingQuality();
|
all_read_total_mapq += read.getMappingQuality();
|
||||||
if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++;
|
if ( read.getReadNegativeStrandFlag() ) all_read_orientation_cnt.second++;
|
||||||
else all_read_orientation_cnt.first++;
|
else all_read_orientation_cnt.first++;
|
||||||
|
|
@ -893,11 +899,12 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class WindowContext {
|
interface IndelListener {
|
||||||
private List<SAMRecord> reads;
|
public void addObservation(int pos, IndelVariant.Type t, String bases, ExpandedSAMRecord r);
|
||||||
private List<byte[]> mismatch_flags;
|
}
|
||||||
private List<byte[]> expanded_quals;
|
|
||||||
private List<Integer> mms;
|
class WindowContext implements IndelListener {
|
||||||
|
private Set<ExpandedSAMRecord> reads;
|
||||||
private long start=0; // where the window starts on the ref, 1-based
|
private long start=0; // where the window starts on the ref, 1-based
|
||||||
private CircularArray< List< IndelVariant > > indels;
|
private CircularArray< List< IndelVariant > > indels;
|
||||||
|
|
||||||
|
|
@ -907,11 +914,8 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
public WindowContext(long start, int length) {
|
public WindowContext(long start, int length) {
|
||||||
this.start = start;
|
this.start = start;
|
||||||
indels = new CircularArray< List<IndelVariant> >(length);
|
indels = new CircularArray< List<IndelVariant> >(length);
|
||||||
reads = new LinkedList<SAMRecord>();
|
// reads = new LinkedList<SAMRecord>();
|
||||||
mismatch_flags = new LinkedList<byte[]>();
|
reads = new HashSet<ExpandedSAMRecord>();
|
||||||
// offsets = new LinkedList<Integer>();
|
|
||||||
mms = new LinkedList<Integer>();
|
|
||||||
expanded_quals = new LinkedList<byte[]>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns 1-based reference start position of the interval this object keeps context for.
|
/** Returns 1-based reference start position of the interval this object keeps context for.
|
||||||
|
|
@ -931,10 +935,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
*/
|
*/
|
||||||
public void clear() {
|
public void clear() {
|
||||||
start = 0;
|
start = 0;
|
||||||
mms.clear();
|
|
||||||
reads.clear();
|
reads.clear();
|
||||||
mismatch_flags.clear();
|
|
||||||
expanded_quals.clear();
|
|
||||||
indels.clear();
|
indels.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -952,10 +953,7 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<SAMRecord> getReads() { return reads; }
|
public Set<ExpandedSAMRecord> getReads() { return reads; }
|
||||||
public List<byte[]> getMMFlags() { return mismatch_flags; }
|
|
||||||
public List<Integer> getTotalMMs() { return mms; }
|
|
||||||
public List<byte[]> getExpandedQuals() { return expanded_quals; }
|
|
||||||
|
|
||||||
/** Returns the number of reads spanning over the specified reference position
|
/** Returns the number of reads spanning over the specified reference position
|
||||||
* (regardless of whether they have a base or indel at that specific location)
|
* (regardless of whether they have a base or indel at that specific location)
|
||||||
|
|
@ -963,8 +961,8 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
*/
|
*/
|
||||||
public int coverageAt(final long refPos) {
|
public int coverageAt(final long refPos) {
|
||||||
int cov = 0;
|
int cov = 0;
|
||||||
for ( SAMRecord read : reads ) {
|
for ( ExpandedSAMRecord read : reads ) {
|
||||||
if ( read.getAlignmentStart() > refPos || read.getAlignmentEnd() < refPos ) continue;
|
if ( read.getSAMRecord().getAlignmentStart() > refPos || read.getSAMRecord().getAlignmentEnd() < refPos ) continue;
|
||||||
cov++;
|
cov++;
|
||||||
}
|
}
|
||||||
return cov;
|
return cov;
|
||||||
|
|
@ -983,39 +981,85 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
IndelVariant indel = indels.get(0).get(0);
|
IndelVariant indel = indels.get(0).get(0);
|
||||||
|
|
||||||
throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+
|
throw new StingException("Indel found at the first position ("+start+") after a shift was performed: currently not supported: "+
|
||||||
(indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getReadName());
|
(indel.getType()==IndelVariant.Type.I?"+":"-")+indel.getBases()+"; reads: "+indel.getReadSet().iterator().next().getSAMRecord().getReadName());
|
||||||
}
|
}
|
||||||
|
|
||||||
Iterator<SAMRecord> read_iter = reads.iterator();
|
Iterator<ExpandedSAMRecord> read_iter = reads.iterator();
|
||||||
Iterator<Integer> mm_iter = mms.iterator();
|
|
||||||
Iterator<byte[]> flags_iter = mismatch_flags.iterator();
|
|
||||||
Iterator<byte[]> quals_iter = expanded_quals.iterator();
|
|
||||||
|
|
||||||
while ( read_iter.hasNext() ) {
|
while ( read_iter.hasNext() ) {
|
||||||
SAMRecord r = read_iter.next();
|
ExpandedSAMRecord r = read_iter.next();
|
||||||
mm_iter.next();
|
if ( r.getSAMRecord().getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope
|
||||||
flags_iter.next();
|
|
||||||
quals_iter.next();
|
|
||||||
|
|
||||||
if ( r.getAlignmentEnd() < start ) { // discard reads and associated data that went out of scope
|
|
||||||
read_iter.remove();
|
read_iter.remove();
|
||||||
mm_iter.remove();
|
|
||||||
flags_iter.remove();
|
|
||||||
quals_iter.remove();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void add(SAMRecord read, char [] ref) {
|
public void add(SAMRecord read, char [] ref) {
|
||||||
|
|
||||||
|
ExpandedSAMRecord er = new ExpandedSAMRecord(read,ref,read.getAlignmentStart()-start,this);
|
||||||
|
//TODO duplicate records may actually indicate a problem with input bam file; throw an exception when the bug in CleanedReadInjector is fixed
|
||||||
|
if ( reads.contains(er)) return; // ignore duplicate records
|
||||||
|
reads.add(er);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addObservation(int pos, IndelVariant.Type type, String bases, ExpandedSAMRecord rec) {
|
||||||
|
List<IndelVariant> indelsAtSite;
|
||||||
|
try {
|
||||||
|
indelsAtSite = indels.get(pos);
|
||||||
|
} catch (IndexOutOfBoundsException e) {
|
||||||
|
SAMRecord r = rec.getSAMRecord();
|
||||||
|
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
|
||||||
|
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
|
||||||
|
r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+
|
||||||
|
"; window end="+getStop());
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( indelsAtSite == null ) {
|
||||||
|
indelsAtSite = new ArrayList<IndelVariant>();
|
||||||
|
indels.set(pos, indelsAtSite);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean found = false;
|
||||||
|
for ( IndelVariant v : indelsAtSite ) {
|
||||||
|
if ( ! v.equals(type, bases) ) continue;
|
||||||
|
|
||||||
|
v.addObservation(rec);
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! found ) {
|
||||||
|
IndelVariant v = new IndelVariant(rec, type, bases);
|
||||||
|
indelsAtSite.add(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<IndelVariant> indelsAt( final long refPos ) {
|
||||||
|
List<IndelVariant> l = indels.get((int)( refPos - start ));
|
||||||
|
if ( l == null ) return emptyIndelList;
|
||||||
|
else return l;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ExpandedSAMRecord {
|
||||||
|
private SAMRecord read;
|
||||||
|
private byte[] mismatch_flags;
|
||||||
|
private byte[] expanded_quals;
|
||||||
|
private int mms;
|
||||||
|
|
||||||
|
public ExpandedSAMRecord(SAMRecord r, char [] ref, long offset, IndelListener l) {
|
||||||
|
|
||||||
|
read = r;
|
||||||
final long rStart = read.getAlignmentStart();
|
final long rStart = read.getAlignmentStart();
|
||||||
final long rStop = read.getAlignmentEnd();
|
final long rStop = read.getAlignmentEnd();
|
||||||
final String readBases = read.getReadString().toUpperCase();
|
final String readBases = read.getReadString().toUpperCase();
|
||||||
|
|
||||||
byte flags[] = new byte[(int)(rStop-rStart+1)];
|
mismatch_flags = new byte[(int)(rStop-rStart+1)];
|
||||||
byte quals[] = new byte[(int)(rStop-rStart+1)];
|
expanded_quals = new byte[(int)(rStop-rStart+1)];
|
||||||
|
|
||||||
int localStart = (int)( rStart - start ); // start of the alignment wrt start of the current window, 0-based
|
|
||||||
|
|
||||||
// now let's extract indels:
|
// now let's extract indels:
|
||||||
|
|
||||||
|
|
@ -1025,8 +1069,6 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
int posOnRead = 0;
|
int posOnRead = 0;
|
||||||
int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
|
int posOnRef = 0; // the chunk of reference ref[] that we have access to is aligned with the read:
|
||||||
// its start on the actual full reference contig is r.getAlignmentStart()
|
// its start on the actual full reference contig is r.getAlignmentStart()
|
||||||
int mm=0; // number of single-base mismatches in the current read (indels do not count!)
|
|
||||||
|
|
||||||
for ( int i = 0 ; i < nCigarElems ; i++ ) {
|
for ( int i = 0 ; i < nCigarElems ; i++ ) {
|
||||||
|
|
||||||
final CigarElement ce = c.getCigarElement(i);
|
final CigarElement ce = c.getCigarElement(i);
|
||||||
|
|
@ -1049,17 +1091,17 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
case D:
|
case D:
|
||||||
type = IndelVariant.Type.D;
|
type = IndelVariant.Type.D;
|
||||||
bases = new String( ref, posOnRef, ce.getLength() );
|
bases = new String( ref, posOnRef, ce.getLength() );
|
||||||
for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) flags[posOnRef] = quals[posOnRef] = -1;
|
for( int k = 0 ; k < ce.getLength(); k++, posOnRef++ ) mismatch_flags[posOnRef] = expanded_quals[posOnRef] = -1;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case M:
|
case M:
|
||||||
for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) {
|
for ( int k = 0; k < ce.getLength(); k++, posOnRef++, posOnRead++ ) {
|
||||||
if ( readBases.charAt(posOnRead) != //note: readBases was uppercased above!
|
if ( readBases.charAt(posOnRead) != //note: readBases was uppercased above!
|
||||||
Character.toUpperCase(ref[posOnRef]) ) { // mismatch!
|
Character.toUpperCase(ref[posOnRef]) ) { // mismatch!
|
||||||
mm++;
|
mms++;
|
||||||
flags[posOnRef] = 1;
|
mismatch_flags[posOnRef] = 1;
|
||||||
}
|
}
|
||||||
quals[posOnRef] = read.getBaseQualities()[posOnRead];
|
expanded_quals[posOnRef] = read.getBaseQualities()[posOnRead];
|
||||||
}
|
}
|
||||||
break; // advance along the gapless block in the alignment
|
break; // advance along the gapless block in the alignment
|
||||||
default :
|
default :
|
||||||
|
|
@ -1074,54 +1116,26 @@ public class IndelGenotyperV2Walker extends ReadWalker<Integer,Integer> {
|
||||||
|
|
||||||
// note that here we will be assigning indels to the first deleted base or to the first
|
// note that here we will be assigning indels to the first deleted base or to the first
|
||||||
// base after insertion, not to the last base before the event!
|
// base after insertion, not to the last base before the event!
|
||||||
addIndelObservation(localStart+eventPosition, type, bases, read);
|
l.addObservation((int)(offset+eventPosition), type, bases, this);
|
||||||
}
|
|
||||||
reads.add(read);
|
|
||||||
mms.add(mm);
|
|
||||||
mismatch_flags.add(flags);
|
|
||||||
expanded_quals.add(quals);
|
|
||||||
// offsets.add(localStart);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addIndelObservation(int pos, IndelVariant.Type type, String bases, SAMRecord r) {
|
|
||||||
List<IndelVariant> indelsAtSite;
|
|
||||||
try {
|
|
||||||
indelsAtSite = indels.get(pos);
|
|
||||||
} catch (IndexOutOfBoundsException e) {
|
|
||||||
System.out.println("Read "+r.getReadName()+": out of coverage window bounds.Probably window is too small.\n"+
|
|
||||||
"Read length="+r.getReadLength()+"; cigar="+r.getCigarString()+"; start="+
|
|
||||||
r.getAlignmentStart()+"; end="+r.getAlignmentEnd()+"; window start="+getStart()+
|
|
||||||
"; window end="+getStop());
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( indelsAtSite == null ) {
|
|
||||||
indelsAtSite = new ArrayList<IndelVariant>();
|
|
||||||
indels.set(pos, indelsAtSite);
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean found = false;
|
|
||||||
for ( IndelVariant v : indelsAtSite ) {
|
|
||||||
if ( ! v.equals(type, bases) ) continue;
|
|
||||||
|
|
||||||
v.addObservation(r);
|
|
||||||
found = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( ! found ) {
|
|
||||||
IndelVariant v = new IndelVariant(r, type, bases);
|
|
||||||
indelsAtSite.add(v);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<IndelVariant> indelsAt( final long refPos ) {
|
public SAMRecord getSAMRecord() { return read; }
|
||||||
List<IndelVariant> l = indels.get((int)( refPos - start ));
|
|
||||||
if ( l == null ) return emptyIndelList;
|
public byte [] getExpandedMMFlags() { return mismatch_flags; }
|
||||||
else return l;
|
|
||||||
|
public byte [] getExpandedQuals() { return expanded_quals; }
|
||||||
|
|
||||||
|
public int getMMCount() { return mms; }
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if ( this == o ) return true;
|
||||||
|
if ( read == null ) return false;
|
||||||
|
if ( o instanceof SAMRecord ) return read.equals(o);
|
||||||
|
if ( o instanceof ExpandedSAMRecord ) return read.equals(((ExpandedSAMRecord)o).read);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue