Quick QC walkers to look at the error profile of indels in the read
This commit is contained in:
parent
24173d860a
commit
08dbd756f3
|
|
@ -0,0 +1,94 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of reads ending in each category.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ReadEndIndels \
|
||||
* -o output.grp \
|
||||
* -I input.bam \
|
||||
* [-L input.intervals]
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CountReadEventsWalker extends ReadWalker<Map<CigarOperator, ArrayList<Integer>> , Map<Integer, Map<CigarOperator, Long>>> {
|
||||
@Output (doc = "GATKReport table output")
|
||||
PrintStream out;
|
||||
|
||||
public Map<CigarOperator, ArrayList<Integer>> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) {
|
||||
return ReadUtils.getCigarOperatorForAllBases(read);
|
||||
}
|
||||
|
||||
public Map<Integer, Map<CigarOperator, Long>> reduceInit() {
|
||||
return new HashMap<Integer, Map<CigarOperator, Long>>();
|
||||
}
|
||||
|
||||
public Map<Integer, Map<CigarOperator, Long>> reduce(Map<CigarOperator, ArrayList<Integer>> value, Map<Integer, Map<CigarOperator, Long>> sum) {
|
||||
for (Map.Entry<CigarOperator, ArrayList<Integer>> entry : value.entrySet()) {
|
||||
CigarOperator op = entry.getKey();
|
||||
ArrayList<Integer> positions = entry.getValue();
|
||||
|
||||
for (int p : positions) {
|
||||
Map<CigarOperator, Long> operatorCount = sum.get(p);
|
||||
if (operatorCount == null) {
|
||||
operatorCount = new HashMap<CigarOperator, Long>();
|
||||
sum.put(p, operatorCount);
|
||||
}
|
||||
|
||||
Long count = operatorCount.get(op);
|
||||
if (count == null)
|
||||
count = 0L;
|
||||
count++;
|
||||
operatorCount.put(op, count);
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Map<Integer, Map<CigarOperator, Long>> result) {
|
||||
GATKReport report = GATKReport.newSimpleReport("Events", "Position", "Event", "Observations");
|
||||
for (Map.Entry<Integer, Map<CigarOperator, Long>> entry : result.entrySet()) {
|
||||
int position = entry.getKey();
|
||||
Map<CigarOperator, Long> operatorCount = entry.getValue();
|
||||
|
||||
for (Map.Entry<CigarOperator, Long> subEntry: operatorCount.entrySet()) {
|
||||
String operator = subEntry.getKey().name();
|
||||
Long observations = subEntry.getValue();
|
||||
report.addRow(position, operator, observations);
|
||||
}
|
||||
}
|
||||
report.print(out);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.qc;
|
||||
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Requires;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Number of reads ending in each category.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ReadEndIndels \
|
||||
* -o output.txt \
|
||||
* -I input.bam \
|
||||
* [-L input.intervals]
|
||||
* </pre>
|
||||
*/
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE})
|
||||
public class CountTerminusEventWalker extends ReadWalker<Pair<Long, Long>, Pair<Long, Long>> {
|
||||
public Pair<Long, Long> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) {
|
||||
List<CigarElement> cigarElements = read.getCigar().getCigarElements();
|
||||
|
||||
CigarElement lastElement = null;
|
||||
for (CigarElement element : cigarElements) {
|
||||
if (element.getOperator() != CigarOperator.HARD_CLIP)
|
||||
lastElement = element;
|
||||
}
|
||||
|
||||
if (lastElement == null)
|
||||
throw new UserException.MalformedBAM(read, "read does not have any bases, it's all hard clips");
|
||||
|
||||
long endsInIndel = lastElement.getOperator() == CigarOperator.INSERTION || lastElement.getOperator() == CigarOperator.DELETION? 1 : 0;
|
||||
long endsInSC = lastElement.getOperator() == CigarOperator.SOFT_CLIP ? 1 : 0;
|
||||
|
||||
return new Pair<Long, Long>(endsInIndel, endsInSC);
|
||||
}
|
||||
|
||||
public Pair<Long, Long> reduceInit() { return new Pair<Long, Long>(0L, 0L); }
|
||||
|
||||
public Pair<Long, Long> reduce(Pair<Long, Long> value, Pair<Long, Long> sum) {
|
||||
sum.set(sum.getFirst() + value.getFirst(), sum.getSecond() + value.getSecond());
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Pair<Long, Long> result) {
|
||||
System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond()));
|
||||
}
|
||||
}
|
||||
|
|
@ -783,4 +783,43 @@ public class ReadUtils {
|
|||
return location;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a map with each event in the read (cigar operator) and the read coordinate where it happened.
|
||||
*
|
||||
* Example:
|
||||
* D -> 2, 34, 75
|
||||
* I -> 55
|
||||
* S -> 0, 101
|
||||
* H -> 101
|
||||
*
|
||||
* @param read the read
|
||||
* @return a map with the properties described above. See example
|
||||
*/
|
||||
public static Map<CigarOperator, ArrayList<Integer>> getCigarOperatorForAllBases (GATKSAMRecord read) {
|
||||
Map<CigarOperator, ArrayList<Integer>> events = new HashMap<CigarOperator, ArrayList<Integer>>();
|
||||
|
||||
int position = 0;
|
||||
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
|
||||
CigarOperator op = cigarElement.getOperator();
|
||||
if (op.consumesReadBases()) {
|
||||
ArrayList<Integer> list = events.get(op);
|
||||
if (list == null) {
|
||||
list = new ArrayList<Integer>();
|
||||
events.put(op, list);
|
||||
}
|
||||
for (int i = position; i < cigarElement.getLength(); i++)
|
||||
list.add(position++);
|
||||
}
|
||||
else {
|
||||
ArrayList<Integer> list = events.get(op);
|
||||
if (list == null) {
|
||||
list = new ArrayList<Integer>();
|
||||
events.put(op, list);
|
||||
}
|
||||
list.add(position);
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue