Oops. Slight twiddle of the math here so that I'm not asking if bestBase == nextBestBase.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@336 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-04-08 19:56:54 +00:00
parent d4ab95c098
commit 0e7d962eca
1 changed files with 49 additions and 6 deletions

View File

@ -8,16 +8,41 @@ import net.sf.samtools.SAMRecord;
import edu.mit.broad.picard.reference.ReferenceSequence;
/**
* ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing
* ReadErrorRateWalker assesses the error rate per read position ('cycle') by comparing the
* read to its home on the reference and noting the mismatch rate. It ignores reads with
* indels in them, treats high and low-quality references bases the same, and does not count
* ambiguous bases as mismatches. It's also thread-safe, so you can process a slew of reads
* in short order.
*
* @author Kiran Garimella
*/
public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
@Argument(fullName="useNextBestBase",required=false,defaultValue="false")
public boolean useNextBestBase;
/**
* Ignore reads with indels or clipping
*
* @param context the reference context
* @param read the read to assess
* @return true if the read can be processed, false if it should be ignored
*/
public boolean filter(LocusContext context, SAMRecord read) {
return (read.getCigar().numCigarElements() == 1);
}
/**
* For each read, return a boolean array indicating the locations of the mismatch.
* Length of the array is one element longer than the read length. The last element
* of this array is always "true" so that we can figure out how many reads we
* processed in a thread-safe manner.
*
* @param context the reference context
* @param read the read to assess
* @return An array of length (read_length + 1) indicating where the mismatches occur.
* Last element is for internal use so the reduce() function can figure out how
* many reads we processed.
*/
public boolean[] map(LocusContext context, SAMRecord read) {
boolean[] errorsPerCycle = new boolean[read.getReadLength() + 1];
@ -55,21 +80,23 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
}
if (nextBestBase != '.') {
errorsPerCycle[cycle] = !(bases[cycle] == compBase || bases[cycle] == nextBestBase);
totalMismatches = !(bases[cycle] == compBase || bases[cycle] == nextBestBase) ? 1 : 0;
errorsPerCycle[cycle] = !(bases[cycle] == compBase || nextBestBase == compBase);
totalMismatches = (!(bases[cycle] == compBase || nextBestBase == compBase)) ? 1 : 0;
}
} else {
errorsPerCycle[cycle] = (bases[cycle] != compBase);
totalMismatches += (bases[cycle] != compBase) ? 1 : 0;
errorsPerCycle[cycle] = !(bases[cycle] == compBase);
totalMismatches += (!(bases[cycle] == compBase)) ? 1 : 0;
}
}
}
/*
if (totalMismatches > 4) {
for (int cycle = 0; cycle < bases.length; cycle++) { System.out.print((char) bases[cycle]); } System.out.print("\n");
for (int cycle = 0, offset = (int) context.getPosition(); cycle < bases.length; cycle++, offset++) { System.out.print((char) contig[offset]); } System.out.print("\n");
System.out.println(totalMismatches + "\n");
}
*/
// We encode that we saw a read in the last position of the array.
// That way we know what to normalize by, and we get thread safety!
@ -78,10 +105,22 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
return errorsPerCycle;
}
/**
* We don't initialize the array here because we need to know how long the read is first.
*
* @return null
*/
public int[] reduceInit() {
return null;
}
/**
* Summarize the error rate data.
*
* @param value the read mismatch array
* @param sum the summed mismatch array
* @return the summed mismatch array with the new read mismatch array added
*/
public int[] reduce(boolean[] value, int[] sum) {
if (sum == null) {
sum = new int[value.length];
@ -94,11 +133,15 @@ public class ReadErrorRateWalker extends ReadWalker<boolean[], int[]> {
return sum;
}
/**
* We've processed all the reads. Emit the final, normalized error rate data.
*
* @param sum the summed mismatch array
*/
public void onTraversalDone(int[] sum) {
for (int cycle = 0; cycle < sum.length - 1; cycle++) {
double errorrate = ((double) sum[cycle])/((double) sum[sum.length - 1]);
System.out.println("[ERROR_RATE] " + cycle + " " + errorrate);
}
}
}