Quick QC walkers to look at the error profile of indels in the read

This commit is contained in:
Mauricio Carneiro 2012-04-25 17:09:36 -04:00
parent 24173d860a
commit 08dbd756f3
3 changed files with 203 additions and 0 deletions

View File

@ -0,0 +1,94 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
/**
* Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of reads ending in each category.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ReadEndIndels \
* -o output.grp \
* -I input.bam \
* [-L input.intervals]
* </pre>
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountReadEventsWalker extends ReadWalker<Map<CigarOperator, ArrayList<Integer>> , Map<Integer, Map<CigarOperator, Long>>> {
@Output (doc = "GATKReport table output")
PrintStream out;
public Map<CigarOperator, ArrayList<Integer>> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) {
return ReadUtils.getCigarOperatorForAllBases(read);
}
public Map<Integer, Map<CigarOperator, Long>> reduceInit() {
return new HashMap<Integer, Map<CigarOperator, Long>>();
}
public Map<Integer, Map<CigarOperator, Long>> reduce(Map<CigarOperator, ArrayList<Integer>> value, Map<Integer, Map<CigarOperator, Long>> sum) {
for (Map.Entry<CigarOperator, ArrayList<Integer>> entry : value.entrySet()) {
CigarOperator op = entry.getKey();
ArrayList<Integer> positions = entry.getValue();
for (int p : positions) {
Map<CigarOperator, Long> operatorCount = sum.get(p);
if (operatorCount == null) {
operatorCount = new HashMap<CigarOperator, Long>();
sum.put(p, operatorCount);
}
Long count = operatorCount.get(op);
if (count == null)
count = 0L;
count++;
operatorCount.put(op, count);
}
}
return sum;
}
@Override
public void onTraversalDone(Map<Integer, Map<CigarOperator, Long>> result) {
GATKReport report = GATKReport.newSimpleReport("Events", "Position", "Event", "Observations");
for (Map.Entry<Integer, Map<CigarOperator, Long>> entry : result.entrySet()) {
int position = entry.getKey();
Map<CigarOperator, Long> operatorCount = entry.getValue();
for (Map.Entry<CigarOperator, Long> subEntry: operatorCount.entrySet()) {
String operator = subEntry.getKey().name();
Long observations = subEntry.getValue();
report.addRow(position, operator, observations);
}
}
report.print(out);
}
}

View File

@ -0,0 +1,70 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.List;
/**
* Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of reads ending in each category.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ReadEndIndels \
* -o output.txt \
* -I input.bam \
* [-L input.intervals]
* </pre>
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountTerminusEventWalker extends ReadWalker<Pair<Long, Long>, Pair<Long, Long>> {
public Pair<Long, Long> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) {
List<CigarElement> cigarElements = read.getCigar().getCigarElements();
CigarElement lastElement = null;
for (CigarElement element : cigarElements) {
if (element.getOperator() != CigarOperator.HARD_CLIP)
lastElement = element;
}
if (lastElement == null)
throw new UserException.MalformedBAM(read, "read does not have any bases, it's all hard clips");
long endsInIndel = lastElement.getOperator() == CigarOperator.INSERTION || lastElement.getOperator() == CigarOperator.DELETION? 1 : 0;
long endsInSC = lastElement.getOperator() == CigarOperator.SOFT_CLIP ? 1 : 0;
return new Pair<Long, Long>(endsInIndel, endsInSC);
}
public Pair<Long, Long> reduceInit() { return new Pair<Long, Long>(0L, 0L); }
public Pair<Long, Long> reduce(Pair<Long, Long> value, Pair<Long, Long> sum) {
sum.set(sum.getFirst() + value.getFirst(), sum.getSecond() + value.getSecond());
return sum;
}
@Override
public void onTraversalDone(Pair<Long, Long> result) {
System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond()));
}
}

View File

@ -783,4 +783,43 @@ public class ReadUtils {
return location;
}
/**
* Creates a map with each event in the read (cigar operator) and the read coordinate where it happened.
*
* Example:
* D -> 2, 34, 75
* I -> 55
* S -> 0, 101
* H -> 101
*
* @param read the read
* @return a map with the properties described above. See example
*/
public static Map<CigarOperator, ArrayList<Integer>> getCigarOperatorForAllBases (GATKSAMRecord read) {
Map<CigarOperator, ArrayList<Integer>> events = new HashMap<CigarOperator, ArrayList<Integer>>();
int position = 0;
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
CigarOperator op = cigarElement.getOperator();
if (op.consumesReadBases()) {
ArrayList<Integer> list = events.get(op);
if (list == null) {
list = new ArrayList<Integer>();
events.put(op, list);
}
for (int i = position; i < cigarElement.getLength(); i++)
list.add(position++);
}
else {
ArrayList<Integer> list = events.get(op);
if (list == null) {
list = new ArrayList<Integer>();
events.put(op, list);
}
list.add(position);
}
}
return events;
}
}