From b979bd2cedadf9fb4aaa294a56811f4108f113b3 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 2 Dec 2009 03:39:24 +0000 Subject: [PATCH] - Optimized implementation of -byReadGroup in DoCWalker - Added implementation of -bySample in DoCWalker - Removed CoverageBySample and added a watered down version to the examples directory git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2209 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/examples/CoverageBySample.java | 68 +++++++ .../gatk/walkers/DepthOfCoverageWalker.java | 77 ++++++-- .../gatk/walkers/CoverageBySample.java | 185 ------------------ 3 files changed, 127 insertions(+), 203 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java delete mode 100644 java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java diff --git a/java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java b/java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java new file mode 100644 index 000000000..54eaa7e90 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/examples/CoverageBySample.java @@ -0,0 +1,68 @@ + +package org.broadinstitute.sting.gatk.examples; + +import net.sf.samtools.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.utils.pileup.*; + +import java.util.*; + +public class CoverageBySample extends LocusWalker { + + private HashSet sampleNames = new HashSet(); + + public boolean requiresReads() { return true; } + + public void initialize() { + + List read_groups = this.getToolkit().getSAMFileHeader().getReadGroups(); + + for ( SAMReadGroupRecord record : read_groups ) { + String sample = record.getSample(); + if ( sample != null ) + sampleNames.add(sample); + } + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + HashMap depthBySample = new HashMap(); + for ( String sample : sampleNames ) + depthBySample.put(sample, 0); + + ReadBackedPileup pileup = context.getPileup(); + for ( PileupElement p : pileup ) { + + SAMReadGroupRecord readGroup = p.getRead().getReadGroup(); + if ( readGroup == null ) + continue; + + String sample = readGroup.getSample(); + if ( sample != null ) { + int oldDepth = depthBySample.get(sample); + depthBySample.put(sample, oldDepth + 1); + } + } + + for ( Map.Entry sample : depthBySample.entrySet() ) { + out.printf(" %s %8d%n", sample.getKey(), sample.getValue()); + } + + return 1; + } + + + public void onTraversalDone(Integer result) { + out.println("Processed " + result + " loci."); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java index e20799d8b..060374fad 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/DepthOfCoverageWalker.java @@ -32,9 +32,7 @@ import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.pileup.*; import net.sf.samtools.SAMReadGroupRecord; -import java.util.HashMap; -import java.util.Collections; -import java.util.ArrayList; +import java.util.*; /** * Display the depth of coverage at a given locus. @@ -48,16 +46,25 @@ public class DepthOfCoverageWalker extends LocusWalker } @Argument(fullName="printStyle", shortName = "s", doc="Printing style: NONE, COMPACT, or DETAILED", required=false) - public printType printStyle = printType.COMPACT; + protected printType printStyle = printType.COMPACT; @Argument(fullName="excludeDeletions", shortName = "ed", doc="If true, we will exclude reads with deletions at a locus in coverage",required=false) - public boolean excludeDeletionsInCoverage = false; + protected boolean excludeDeletionsInCoverage = false; @Argument(fullName="minMAPQ", shortName = "minMAPQ", doc="If provided, we will exclude reads with MAPQ < this value at a locus in coverage",required=false) - public int excludeMAPQBelowThis = -1; + protected int excludeMAPQBelowThis = -1; @Argument(fullName="byReadGroup", shortName="byRG", doc="List read depths for each read group") - public boolean byReadGroup = false; + protected boolean byReadGroup = false; + + @Argument(fullName="bySample", shortName="bySample", doc="List read depths for each sample") + protected boolean bySample = false; + + + private TreeSet readGroupNames = new TreeSet(); + private TreeSet sampleNames = new TreeSet(); + + public boolean includeReadsWithDeletionAtLoci() { return ! excludeDeletionsInCoverage; } @@ -70,15 +77,32 @@ public class DepthOfCoverageWalker extends LocusWalker out.printf("locus nCleanReads nDeletionReads nLowMAPQReads%n"); break; } + + if ( byReadGroup ) { + List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); + for ( SAMReadGroupRecord record : readGroups ) + readGroupNames.add(record.getReadGroupId()); + } + + if ( bySample ) { + List readGroups = this.getToolkit().getSAMFileHeader().getReadGroups(); + for ( SAMReadGroupRecord record : readGroups ) { + String sample = record.getSample(); + if ( sample != null ) + sampleNames.add(sample); + } + } } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { int nCleanReads = 0, nBadMAPQReads = 0, nDeletionReads = 0; HashMap depthByReadGroup = new HashMap(); - for (SAMReadGroupRecord rg : this.getToolkit().getSAMFileHeader().getReadGroups()) { - depthByReadGroup.put(rg.getReadGroupId(), 0); - } + for ( String readGroupName : readGroupNames ) + depthByReadGroup.put(readGroupName, 0); + HashMap depthBySample = new HashMap(); + for ( String sample : sampleNames ) + depthBySample.put(sample, 0); ReadBackedPileup pileup = context.getPileup(); for ( PileupElement p : pileup ) { @@ -87,9 +111,23 @@ public class DepthOfCoverageWalker extends LocusWalker else if ( p.isDeletion() ) nDeletionReads++; else nCleanReads++; - String readGroupName = p.getRead().getReadGroup().getReadGroupId(); - int oldDepth = depthByReadGroup.get(readGroupName); - depthByReadGroup.put(readGroupName, oldDepth + 1); + SAMReadGroupRecord readGroup = p.getRead().getReadGroup(); + if ( readGroup == null ) + continue; + + if ( byReadGroup ) { + String readGroupName = readGroup.getReadGroupId(); + int oldDepth = depthByReadGroup.get(readGroupName); + depthByReadGroup.put(readGroupName, oldDepth + 1); + } + + if ( bySample ) { + String sample = readGroup.getSample(); + if ( sample != null ) { + int oldDepth = depthBySample.get(sample); + depthBySample.put(sample, oldDepth + 1); + } + } } int nTotalReads = nCleanReads + (excludeDeletionsInCoverage ? 0 : nDeletionReads); @@ -103,15 +141,18 @@ public class DepthOfCoverageWalker extends LocusWalker break; } - if (byReadGroup) { - ArrayList rgs = new ArrayList(depthByReadGroup.keySet()); - Collections.sort(rgs); - - for (String rg : rgs) { + if ( byReadGroup ) { + for ( String rg : readGroupNames ) { out.printf(" %s %8d%n", rg, depthByReadGroup.get(rg)); } } + if ( bySample ) { + for ( String sample : sampleNames ) { + out.printf(" %s %8d%n", sample, depthBySample.get(sample)); + } + } + return nTotalReads; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java deleted file mode 100644 index 214bac361..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CoverageBySample.java +++ /dev/null @@ -1,185 +0,0 @@ - -package org.broadinstitute.sting.playground.gatk.walkers; - -import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.*; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.BaseUtils; - -import java.util.*; - -public class CoverageBySample extends LocusWalker -{ - List sample_names = null; - - public boolean requiresReads() { return true; } - - private SAMFileHeader header; - - public void initialize() - { - GenomeAnalysisEngine toolkit = this.getToolkit(); - this.header = toolkit.getSAMFileHeader(); - List read_groups = header.getReadGroups(); - - sample_names = new ArrayList(); - HashSet unique_sample_names = new HashSet(); - - for (int i = 0; i < read_groups.size(); i++) - { - String sample_name = read_groups.get(i).getSample(); - if (unique_sample_names.contains(sample_name)) { continue; } - unique_sample_names.add(sample_name); - sample_names.add(sample_name); - } - } - - public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) - { - String line = context.getLocation().getContig() + " " + context.getLocation().getStart() + " " ; - - AlignmentContext[] contexts = filterAlignmentContext(context, sample_names, 0); - - HashMap counts = countReadsBySample(context); - for (int i = 0; i < contexts.length; i++) - { - List reads = contexts[i].getReads(); - List offsets = contexts[i].getOffsets(); - - out.printf("%s %s ", context.getLocation(), sample_names.get(i)); - - int[] forward_counts = new int[4]; - int[] backward_counts = new int[4]; - - for (int j = 0; j < reads.size(); j++) - { - SAMRecord read = reads.get(j); - int offset = offsets.get(j); - boolean backward = read.getReadNegativeStrandFlag(); - char base = Character.toUpperCase((char)(read.getReadBases()[offset])); - - if (BaseUtils.simpleBaseToBaseIndex(base) == -1) { continue; } - - if (backward) { base = Character.toLowerCase(base); } - - if (! backward) { forward_counts[BaseUtils.simpleBaseToBaseIndex(base)]++; } - else { backward_counts[BaseUtils.simpleBaseToBaseIndex(base)]++; } - - //out.printf("%c", base); - } - out.printf("A[%d] C[%d] G[%d] T[%d] a[%d] c[%d] g[%d] t[%d]", - forward_counts[0], - forward_counts[1], - forward_counts[2], - forward_counts[3], - backward_counts[0], - backward_counts[1], - backward_counts[2], - backward_counts[3]); - out.printf("\n"); - } - return ""; - } - - private HashMap countReadsBySample(AlignmentContext context) - { - HashMap counts = new HashMap(); - for (int i = 0; i < sample_names.size(); i++) - { - counts.put(sample_names.get(i), 0); - } - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - String sample = read.getReadGroup().getSample(); - counts.put(sample, counts.get(sample)+1); - } - return counts; - } - - private AlignmentContext[] filterAlignmentContext(AlignmentContext context, List sample_names, int downsample) - { - HashMap index = new HashMap(); - for (int i = 0; i < sample_names.size(); i++) - { - index.put(sample_names.get(i), i); - } - - AlignmentContext[] contexts = new AlignmentContext[sample_names.size()]; - ArrayList[] reads = new ArrayList[sample_names.size()]; - ArrayList[] offsets = new ArrayList[sample_names.size()]; - - for (int i = 0; i < sample_names.size(); i++) - { - reads[i] = new ArrayList(); - offsets[i] = new ArrayList(); - } - - for (int i = 0; i < context.getReads().size(); i++) - { - SAMRecord read = context.getReads().get(i); - Integer offset = context.getOffsets().get(i); - - assert(header != null); - assert(read.getReadGroup() != null); - - String sample = read.getReadGroup().getSample(); - //if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } - reads[index.get(sample)].add(read); - offsets[index.get(sample)].add(offset); - } - - if (downsample != 0) - { - for (int j = 0; j < reads.length; j++) - { - List perm = new ArrayList(); - for (int i = 0; i < reads[j].size(); i++) { perm.add(i); } - perm = Utils.RandomSubset(perm, downsample); - - ArrayList downsampled_reads = new ArrayList(); - ArrayList downsampled_offsets = new ArrayList(); - - for (int i = 0; i < perm.size(); i++) - { - downsampled_reads.add(reads[j].get(perm.get(i))); - downsampled_offsets.add(offsets[j].get(perm.get(i))); - } - - reads[j] = downsampled_reads; - offsets[j] = downsampled_offsets; - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - else - { - for (int j = 0; j < reads.length; j++) - { - contexts[j] = new AlignmentContext(context.getLocation(), reads[j], offsets[j]); - } - } - - return contexts; - } - - public void onTraversalDone(String result) - { - return; - } - - public String reduceInit() - { - return ""; - } - - public String reduce(String line, String sum) - { - return ""; - } - - -}