diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java new file mode 100755 index 000000000..c5ab0426d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +/** + * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads ending in each category. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ReadEndIndels \
+ *   -o output.grp \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ */ + + +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountReadEventsWalker extends ReadWalker> , Map>> { + @Output (doc = "GATKReport table output") + PrintStream out; + + public Map> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + return ReadUtils.getCigarOperatorForAllBases(read); + } + + public Map> reduceInit() { + return new HashMap>(); + } + + public Map> reduce(Map> value, Map> sum) { + for (Map.Entry> entry : value.entrySet()) { + CigarOperator op = entry.getKey(); + ArrayList positions = entry.getValue(); + + for (int p : positions) { + Map operatorCount = sum.get(p); + if (operatorCount == null) { + operatorCount = new HashMap(); + sum.put(p, operatorCount); + } + + Long count = operatorCount.get(op); + if (count == null) + count = 0L; + count++; + operatorCount.put(op, count); + } + } + return sum; + } + + @Override + public void onTraversalDone(Map> result) { + GATKReport report = GATKReport.newSimpleReport("Events", "Position", "Event", "Observations"); + for (Map.Entry> entry : result.entrySet()) { + int position = entry.getKey(); + Map operatorCount = entry.getValue(); + + for (Map.Entry subEntry: operatorCount.entrySet()) { + String operator = subEntry.getKey().name(); + Long observations = subEntry.getValue(); + report.addRow(position, operator, observations); + } + } + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java new file mode 100755 index 000000000..9208cbae8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java @@ -0,0 +1,70 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.List; + +/** + * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads ending in each category. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ReadEndIndels \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountTerminusEventWalker extends ReadWalker, Pair> { + public Pair map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + List cigarElements = read.getCigar().getCigarElements(); + + CigarElement lastElement = null; + for (CigarElement element : cigarElements) { + if (element.getOperator() != CigarOperator.HARD_CLIP) + lastElement = element; + } + + if (lastElement == null) + throw new UserException.MalformedBAM(read, "read does not have any bases, it's all hard clips"); + + long endsInIndel = lastElement.getOperator() == CigarOperator.INSERTION || lastElement.getOperator() == CigarOperator.DELETION? 1 : 0; + long endsInSC = lastElement.getOperator() == CigarOperator.SOFT_CLIP ? 1 : 0; + + return new Pair(endsInIndel, endsInSC); + } + + public Pair reduceInit() { return new Pair(0L, 0L); } + + public Pair reduce(Pair value, Pair sum) { + sum.set(sum.getFirst() + value.getFirst(), sum.getSecond() + value.getSecond()); + return sum; + } + + @Override + public void onTraversalDone(Pair result) { + System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index c2f7117f8..4e2fd1446 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -783,4 +783,43 @@ public class ReadUtils { return location; } + /** + * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. + * + * Example: + * D -> 2, 34, 75 + * I -> 55 + * S -> 0, 101 + * H -> 101 + * + * @param read the read + * @return a map with the properties described above. See example + */ + public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { + Map> events = new HashMap>(); + + int position = 0; + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + CigarOperator op = cigarElement.getOperator(); + if (op.consumesReadBases()) { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + for (int i = position; i < cigarElement.getLength(); i++) + list.add(position++); + } + else { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + list.add(position); + } + } + return events; + } + }