Changed reads traversals from providing a LocusContext from which the reference sequence

could be extracted to a char[] containing the reference bases.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@657 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-05-11 22:45:11 +00:00
parent 052819bed5
commit 23e9e29964
21 changed files with 115 additions and 61 deletions

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.dataSources.shards.Shard;
import edu.mit.broad.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
/**
* Created by IntelliJ IDEA.
@ -54,6 +55,8 @@ public class ReferenceProvider {
* Gets the bases of the reference that are aligned to the given read.
* @param read the read for which to extract reference information.
* @return The bases corresponding to this read, or null if the read is unmapped.
* If the alignment goes off the end of the contig, return just the portion
* mapped to the reference.
*/
public char[] getReferenceBases( SAMRecord read ) {
if( read.getReadUnmappedFlag() )
@ -63,6 +66,10 @@ public class ReferenceProvider {
int start = read.getAlignmentStart();
int stop = read.getAlignmentEnd();
SAMSequenceRecord sequenceRecord = sequenceFile.getSequenceDictionary().getSequence(contig);
if( stop > sequenceRecord.getSequenceLength() )
stop = sequenceRecord.getSequenceLength();
ReferenceSequence alignmentToReference = sequenceFile.getSubsequenceAt( contig, start, stop );
return StringUtil.bytesToString(alignmentToReference.getBases()).toCharArray();
}

View File

@ -17,6 +17,7 @@ import java.util.LinkedList;
import java.io.File;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
/**
* Created by IntelliJ IDEA.
@ -84,11 +85,21 @@ public class TraverseByReads extends TraversalEngine {
final List<SAMRecord> reads = Arrays.asList(read);
GenomeLoc loc = new GenomeLoc(read);
char[] refBases = null;
// Jump forward in the reference to this locus location
LocusContext locus = new LocusContext(loc, reads, offsets);
if (!loc.isUnmapped() && refIter != null) {
final ReferenceIterator refSite = refIter.seekForward(loc);
locus.setReferenceContig(refSite.getCurrentContig());
// MAQ alignments sometimes go off the end of the reference. Take this
// into account by only returning a shortened version of the reference.
int refStart = read.getAlignmentStart()-1;
int refLength = Math.min( read.getAlignmentEnd() - read.getAlignmentStart() + 1,
refSite.getCurrentContig().length() - read.getAlignmentStart() );
refBases = StringUtil.bytesToString( refSite.getCurrentContig().getBases(),refStart,refLength ).toCharArray();
}
GenomeLoc.removePastLocs(loc, notYetTraversedLocations);
@ -97,9 +108,9 @@ public class TraverseByReads extends TraversalEngine {
//
// execute the walker contact
//
final boolean keepMeP = walker.filter(locus, read);
final boolean keepMeP = walker.filter(refBases, read);
if (keepMeP) {
M x = walker.map(locus, read);
M x = walker.map(refBases, read);
sum = walker.reduce(x, sum);
}

View File

@ -112,9 +112,9 @@ public class TraverseReads extends TraversalEngine {
// update the number of reads we've seen
TraversalStatistics.nRecords++;
final boolean keepMeP = readWalker.filter(locus, read);
final boolean keepMeP = readWalker.filter(refSeq, read);
if (keepMeP) {
M x = readWalker.map(locus, read);
M x = readWalker.map(refSeq, read);
sum = readWalker.reduce(x, sum);
}

View File

@ -4,7 +4,7 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
public class CountReadsWalker extends ReadWalker<Integer, Integer> {
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
//System.out.println(read.format());
return 1;
}

View File

@ -4,7 +4,7 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.LocusContext;
public class PrintReadsWalker extends ReadWalker<Integer, Integer> {
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
out.println(read.format());
return 1;
}

View File

@ -17,11 +17,11 @@ public abstract class ReadWalker<MapType, ReduceType> extends Walker<MapType, Re
/** Must return true for reads that need to be processed. Reads, for which this method return false will
* be skipped by the engine and never passed to the walker.
*/
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
// We are keeping all the reads
return true;
}
// Map over the org.broadinstitute.sting.gatk.LocusContext
public abstract MapType map(LocusContext context, SAMRecord read);
public abstract MapType map(char[] ref, SAMRecord read);
}

View File

@ -22,16 +22,14 @@ public class AlignedReadsHistoWalker extends ReadWalker<Integer, Integer> {
}
}
public String walkerType() { return "ByRead"; }
// Do we actually want to operate on the context?
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
// we only want aligned reads
return !read.getReadUnmappedFlag();
}
// Map over the org.broadinstitute.sting.atk.LocusContext
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
//System.out.println(read.getAttribute("NM"));
int editDist = Integer.parseInt(read.getAttribute("NM").toString());
if (editDist <= 50)

View File

@ -12,12 +12,12 @@ public class BaseQualityDumpWalker extends ReadWalker<Integer, Integer> {
protected final int MAX_TARGET_EDIT_DISTANCE = 4; //10;
// Do we actually want to operate on the context?
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
// we only want aligned reads
return !read.getReadUnmappedFlag();
}
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
int editDist = Integer.parseInt(read.getAttribute("NM").toString());

View File

@ -23,12 +23,12 @@ public class BaseQualityHistoWalker extends ReadWalker<Integer, Integer> {
}
// Do we actually want to operate on the context?
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
return true; // We are keeping all the reads
}
// Map over the org.broadinstitute.sting.gatk.LocusContext
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
for ( byte qual : read.getBaseQualities() ) {
//System.out.println(qual);
this.qualCounts[qual]++;

View File

@ -6,7 +6,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
import net.sf.samtools.SAMRecord;
public class DisplayFourBaseReadWalker extends ReadWalker<Integer, Integer> {
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
String bases = read.getReadString();
boolean displayed = false;

View File

@ -39,7 +39,7 @@ public class IOCrusherWalker extends ReadWalker<SAMRecord, ArrayList<SAMFileWrit
/**
*
*/
public SAMRecord map(LocusContext context, SAMRecord read) {
public SAMRecord map(char[] ref, SAMRecord read) {
nReadsRead++;
return read;
}

View File

@ -75,7 +75,7 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
}
}
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
total_reads++;
@ -84,7 +84,8 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
ptWriter.receive(read); // make sure we still write the read to the output, we do not want to lose data!
return 0;
}
/* 11 May 2009 - Commented out because we no longer have the full reference sequence for sanity checking.
ReferenceSequence contig_seq = context.getReferenceContig();
if ( read.getReferenceName() != cur_contig) {
cur_contig = read.getReferenceName();
@ -94,6 +95,7 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
pileBuilder.setReferenceSequence(refstr);
out.println("loaded contig "+cur_contig+" (index="+read.getReferenceIndex()+"); length="+contig_seq.getBases().length+" tst="+contig_seq.toString());
}
*/
// if ( cur_contig.equals("chrM") || GenomeLoc.compareContigs(cur_contig,"chrY") > 0 ) continue; // skip chrM and unplaced contigs for now
@ -138,10 +140,10 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
*/
if ( ERR_MODE != null ) {
if ( ERR_MODE.equals("MM")) err = numMismatches(read,contig_seq);
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(read,contig_seq);
else if ( ERR_MODE.equals("ERR")) err = numErrors(read,contig_seq);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(read,contig_seq);
if ( ERR_MODE.equals("MM")) err = numMismatches(read,ref);
else if ( ERR_MODE.equals("MC") ) err = AlignmentUtils.numMismatches(read,ref);
else if ( ERR_MODE.equals("ERR")) err = numErrors(read,ref);
else if ( ERR_MODE.equals("MG")) err = numMismatchesGaps(read,ref);
if ( err > MAX_ERRS ) {
ptWriter.receive(read);
discarded_maxerr_count++;
@ -159,10 +161,10 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
* @return number of errors (number of mismatches plus total length of all insertions/deletions
* @throws RuntimeException
*/
private static int numErrors(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
private static int numErrors(SAMRecord r, char[] ref) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
int errs = numMismatches(r,refseq);
int errs = numMismatches(r,ref);
// now we have to add the total length of all indels:
Cigar c = r.getCigar();
@ -187,10 +189,10 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
* deletion will be counted as a single error regardless of the length)
* @throws RuntimeException
*/
private static int numMismatchesGaps(SAMRecord r,ReferenceSequence refseq) throws RuntimeException {
private static int numMismatchesGaps(SAMRecord r,char[] ref) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
int errs = numMismatches(r,refseq);
int errs = numMismatches(r,ref);
// now we have to add the total length of all indels:
Cigar c = r.getCigar();
@ -209,7 +211,7 @@ public class IndelInspector extends ReadWalker<Integer, Integer> {
}
/** This method is a HACK: it is designed to work around the current bug in NM tags created at CRD */
private static int numMismatches(SAMRecord r, ReferenceSequence refseq) throws RuntimeException {
private static int numMismatches(SAMRecord r, char[] refseq) throws RuntimeException {
// NM currently stores the total number of mismatches in all blocks + 1
Integer i = (Integer)r.getAttribute("NM");

View File

@ -24,18 +24,21 @@ public class IndelIntervalWalker extends ReadWalker<IndelIntervalWalker.Interval
public void initialize() {}
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
return ( !read.getReadUnmappedFlag() && // mapped
read.getReadLength() <= maxReadLength && // not too big
read.getAlignmentBlocks().size() > 1); // indel
}
public Interval map(LocusContext context, SAMRecord read) {
public Interval map(char[] ref, SAMRecord read) {
List<AlignmentBlock> blocks = read.getAlignmentBlocks();
long indelLeftEdge = read.getAlignmentStart() + blocks.get(0).getLength() - 1;
long indelRightEdge = read.getAlignmentEnd() - blocks.get(blocks.size()-1).getLength() + 1;
GenomeLoc indelLoc = new GenomeLoc(context.getLocation().getContigIndex(), indelLeftEdge, indelRightEdge);
return new Interval(context.getLocation(), indelLoc);
GenomeLoc indelLoc = new GenomeLoc(read.getReferenceIndex(), indelLeftEdge, indelRightEdge);
GenomeLoc refLoc = new GenomeLoc(read.getReferenceIndex(), read.getAlignmentStart(), read.getAlignmentEnd());
return new Interval(refLoc, indelLoc);
}
public Interval reduceInit() {

View File

@ -90,7 +90,7 @@ public class LogisticRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWr
}
public SAMRecord map(LocusContext context, SAMRecord read) {
public SAMRecord map(char[] ref, SAMRecord read) {
SAMRecord recalRead = read;
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();

View File

@ -13,20 +13,18 @@ import java.util.List;
@WalkerName("CountMismatches")
public class MismatchCounterWalker extends ReadWalker<Integer, Integer> {
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
int nMismatches = 0;
ReferenceSequence refseq = context.getReferenceContig();
int start = read.getAlignmentStart()-1;
int stop = read.getAlignmentEnd();
// sometimes BWA outputs screwy reads
if ( stop >= context.getReferenceContig().getBases().length )
if ( stop - start > ref.length )
return 0;
if ( read.getAlignmentBlocks().size() == 1 ) {
// No indels
List<Byte> refSeq = Utils.subseq(context.getReferenceContig().getBases(), start, stop);
List<Byte> refSeq = Utils.subseq(ref);
List<Byte> readBases = Utils.subseq(read.getReadBases());
assert(refSeq.size() == readBases.size());

View File

@ -19,12 +19,12 @@ public class MismatchHistoWalker extends ReadWalker<Integer, Integer> {
protected final int MAX_TARGET_EDIT_DISTANCE = 10;
// Do we actually want to operate on the context?
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
// we only want aligned reads
return !read.getReadUnmappedFlag();
}
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
int editDist = Integer.parseInt(read.getAttribute("NM").toString());
@ -33,20 +33,18 @@ public class MismatchHistoWalker extends ReadWalker<Integer, Integer> {
editDist >= MIN_TARGET_EDIT_DISTANCE &&
editDist <= MAX_TARGET_EDIT_DISTANCE ) {
ReferenceSequence refseq = context.getReferenceContig();
int start = read.getAlignmentStart()-1;
int stop = read.getAlignmentEnd();
// sometimes BWA outputs screwy reads
if ( stop >= context.getReferenceContig().getBases().length )
if ( stop - start > ref.length )
return 0;
List<Byte> refSeq = Utils.subseq(context.getReferenceContig().getBases(), start, stop);
List<Byte> refSeq = Utils.subseq(ref);
List<Byte> readBases = Utils.subseq(read.getReadBases());
assert(refSeq.size() == readBases.size());
// it's actually faster to reallocate a resized array than to use ArrayLists...
if ( refSeq.size() > mismatchCounts.length ) {
if ( ref.length > mismatchCounts.length ) {
int oldLength = mismatchCounts.length;
mismatchCounts = (long[])resizeArray(mismatchCounts, refSeq.size());
for ( int i = oldLength; i < refSeq.size(); i++ )

View File

@ -27,12 +27,11 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
/**
* Ignore reads with indels or clipping
*
* @param context the reference context
* @param read the read to assess
* @return true if the read can be processed, false if it should be ignored
*/
public boolean filter(LocusContext context, SAMRecord read) {
return (read.getCigar().numCigarElements() == 1 && read.getAlignmentStart() + read.getReadLength() < context.getReferenceContig().length());
public boolean filter(char[] ref, SAMRecord read) {
return (read.getCigar().numCigarElements() == 1 && read.getReadLength() > ref.length);
}
/**
@ -41,17 +40,15 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
* of this array is always "true" so that we can figure out how many reads we
* processed in a thread-safe manner.
*
* @param context the reference context
* @param read the read to assess
* @return An array of length (read_length + 1) indicating where the mismatches occur.
* Last element is for internal use so the reduce() function can figure out how
* many reads we processed.
*/
public boolean[] map(LocusContext context, SAMRecord read) {
public boolean[] map(char[] ref, SAMRecord read) {
boolean[] errorsPerCycle = new boolean[read.getReadLength() + 1];
byte[] bases = read.getReadBases();
byte[] contig = context.getReferenceContig().getBases();
byte[] sq = (byte[]) read.getAttribute("SQ");
if (printVisualHits) {
@ -61,16 +58,16 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
}
System.out.println();
for (int cycle = 0, offset = (int) context.getPosition() - 1; cycle < bases.length; cycle++, offset++) {
byte compBase = convertIUPACBaseToSimpleBase(contig[offset]);
for (int cycle = 0; cycle < bases.length; cycle++) {
byte compBase = convertIUPACBaseToSimpleBase((byte)ref[cycle]);
System.out.print((char) compBase);
}
System.out.println("\n");
}
for (int cycle = 0, offset = (int) context.getPosition() - 1; cycle < bases.length; cycle++, offset++) {
byte compBase = convertIUPACBaseToSimpleBase(contig[offset]);
for (int cycle = 0; cycle < bases.length; cycle++) {
byte compBase = convertIUPACBaseToSimpleBase((byte)ref[cycle]);
if (compBase != '.') {
if (useNextBestBase || useNextRandomBase) {

View File

@ -23,12 +23,12 @@ public class ReadFilterWalker extends ReadWalker<Integer,Integer> {
}
@Override
public boolean filter(LocusContext context, SAMRecord read) {
public boolean filter(char[] ref, SAMRecord read) {
return read.getReadLength() <= max_len;
}
@Override
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
writer.addAlignment(read);
return 1;
}

View File

@ -19,7 +19,7 @@ public class SAMToFastqAndSqWalker extends ReadWalker<Integer, Integer> {
private boolean ready = false;
public Integer map(LocusContext context, SAMRecord read) {
public Integer map(char[] ref, SAMRecord read) {
if (!ready) {
try {
fastqbw = new PrintWriter(FASTQFILE);

View File

@ -50,6 +50,39 @@ public class AlignmentUtils {
return mm_count;
}
/**
* mhanna - 11 May 2009 - stubbed out competing method that works with partial references.
* Computes number of mismatches in the read alignment to the refence <code>ref</code>
* specified in the record <code>r</code>. Indels are completely <i>ignored</i> by this method:
* only base mismatches in the alignment segments where both sequences are present are counted.
* @param r
* @return
*/
public static int numMismatches(SAMRecord r, char[] ref) {
if ( r.getReadUnmappedFlag() ) return 1000000;
int i_ref = 0; // position on the ref
int i_read = 0; // position on the read
int mm_count = 0; // number of mismatches
Cigar c = r.getCigar();
for ( int k = 0 ; k < c.numCigarElements() ; k++ ) {
CigarElement ce = c.getCigarElement(k);
switch( ce.getOperator() ) {
case M:
for ( int l = 0 ; l < ce.getLength() ; l++, i_ref++, i_read++ ) {
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) == 'N' ) continue; // do not count N's ?
if ( Character.toUpperCase(r.getReadString().charAt(i_read) ) !=
Character.toUpperCase(ref[i_ref]) ) mm_count++;
}
break;
case I: i_read += ce.getLength(); break;
case D: i_ref += ce.getLength(); break;
default: throw new RuntimeException("Unrecognized cigar element");
}
}
return mm_count;
}
// IMPORTANT NOTE: ALTHOUGH THIS METHOD IS EXTREMELY SIMILAR TO THE ONE ABOVE, WE NEED
// TWO SEPARATE IMPLEMENTATIONS IN ORDER TO PREVENT JAVA STRINGS FROM FORCING US TO
// PERFORM EXPENSIVE ARRAY COPYING WHEN TRYING TO GET A BYTE ARRAY...

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.util.StringUtil;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import java.util.*;
@ -119,14 +120,20 @@ public class Utils {
return c;
}
public static ArrayList<Byte> subseq(char[] fullArray) {
byte[] fullByteArray = new byte[fullArray.length];
StringUtil.charsToBytes(fullArray,0,fullArray.length,fullByteArray,0);
return subseq(fullByteArray);
}
public static ArrayList<Byte> subseq(byte[] fullArray) {
return subseq(fullArray, 0, fullArray.length);
return subseq(fullArray, 0, fullArray.length-1);
}
public static ArrayList<Byte> subseq(byte[] fullArray, int start, int end) {
assert end < fullArray.length;
ArrayList<Byte> dest = new ArrayList<Byte>(end - start + 1);
for (int i = start; i < end; i++) {
for (int i = start; i <= end; i++) {
dest.add(fullArray[i]);
}
return dest;