diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java index 29b815e9c..1f014a665 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java @@ -29,18 +29,23 @@ public class PoolCaller extends LocusWalker @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10; @Argument(fullName="lodThreshold", shortName="lod", required=false, doc="lod threshold for outputting individual genotypes") public Double lodThreshold = 2.0; @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT; - @Argument(fullName="individual_output_prefix", shortName="individual_output_prefix", required=true, doc="prefix to write individual SNP calls to") public String INDIVIDUAL_OUTPUT_PREFIX; + @Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT; + @Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null; - private Random random; private SAMFileHeader header; private PrintStream discovery_output_file; + private PrintStream individual_output_file; + + AlleleFrequencyEstimate[] calls; + ArrayList caller_sums; public void initialize() { try { discovery_output_file = new PrintStream(DISCOVERY_OUTPUT); + individual_output_file = new PrintStream(INDIVIDUAL_OUTPUT); } catch (Exception e) { @@ -60,6 +65,7 @@ public class PoolCaller extends LocusWalker sample_names = new ArrayList(); callers = new ArrayList(); + caller_sums = new ArrayList(); random = new Random(42); @@ -68,37 +74,37 @@ public class PoolCaller extends LocusWalker for (int i = 0; i < read_groups.size(); i++) { String sample_name = read_groups.get(i).getSample(); + + if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); } + if (unique_sample_names.contains(sample_name)) { continue; } unique_sample_names.add(sample_name); sample_names.add(sample_name); System.out.println("SAMPLE: " + sample_name); SingleSampleGenotyper caller = new SingleSampleGenotyper(); - caller.callsFileName = INDIVIDUAL_OUTPUT_PREFIX + "." + sample_name + ".calls"; - caller.metricsFileName = INDIVIDUAL_OUTPUT_PREFIX + "." + sample_name + ".metrics"; + caller.callsFileName = null; + caller.metricsFileName = null; caller.lodThreshold = lodThreshold; caller.fourBaseMode = false; caller.printMetrics = false; + caller.SAMPLE_NAME_REGEX = SAMPLE_NAME_REGEX; caller.initialize(); + caller.calls_file = individual_output_file; + caller_sums.add(caller.reduceInit()); callers.add(caller); } } public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) { - - // 1. seperate each context. - LocusContext[] contexts = new LocusContext[sample_names.size()]; - for (int i = 0; i < sample_names.size(); i++) - { - contexts[i] = filterLocusContext(context, sample_names.get(i), 0); - } + LocusContext[] contexts = filterLocusContext(context, sample_names, 0); // EM Loop: - AlleleFrequencyEstimate[] calls = null; double EM_alt_freq; double EM_N = 0; + calls = null; // this line is kinda hacky if (MAX_ITERATIONS == 1) { EM_alt_freq = -1; } @@ -127,7 +133,6 @@ public class PoolCaller extends LocusWalker likelihood += calls[i].posterior(); - if (! FRACTIONAL_COUNTS) { //System.out.printf("DBG: %s %f %f\n", @@ -173,10 +178,8 @@ public class PoolCaller extends LocusWalker ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]); if (calls[i].depth == 0) { continue; } //if (calls[i].lodVsRef < lodThreshold) { continue; } - out.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases()); + //discovery_output_file.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases()); } - - System.out.printf("EVAL %s\n", context.getLocation()); //for (int i = 0; i < likelihood_trajectory.length; i++) //{ @@ -207,10 +210,23 @@ public class PoolCaller extends LocusWalker return new LocusContext(location, reads, offsets); } - private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample) + private LocusContext[] filterLocusContext(LocusContext context, List sample_names, int downsample) { - ArrayList reads = new ArrayList(); - ArrayList offsets = new ArrayList(); + HashMap index = new HashMap(); + for (int i = 0; i < sample_names.size(); i++) + { + index.put(sample_names.get(i), i); + } + + LocusContext[] contexts = new LocusContext[sample_names.size()]; + ArrayList[] reads = new ArrayList[sample_names.size()]; + ArrayList[] offsets = new ArrayList[sample_names.size()]; + + for (int i = 0; i < sample_names.size(); i++) + { + reads[i] = new ArrayList(); + offsets[i] = new ArrayList(); + } for (int i = 0; i < context.getReads().size(); i++) { @@ -219,51 +235,71 @@ public class PoolCaller extends LocusWalker String RG = (String)(read.getAttribute("RG")); assert(header != null); - //System.out.printf("RG: %s\n", RG); assert(header.getReadGroup(RG) != null); String sample = header.getReadGroup(RG).getSample(); - if (sample == sample_name) - { - reads.add(read); - offsets.add(offset); - } + if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); } + reads[index.get(sample)].add(read); + offsets[index.get(sample)].add(offset); } if (downsample != 0) { - List perm = new ArrayList(); - for (int i = 0; i < reads.size(); i++) { perm.add(i); } - perm = Utils.RandomSubset(perm, downsample); - - ArrayList downsampled_reads = new ArrayList(); - ArrayList downsampled_offsets = new ArrayList(); - - for (int i = 0; i < perm.size(); i++) - { - downsampled_reads.add(reads.get(perm.get(i))); - downsampled_offsets.add(offsets.get(perm.get(i))); - } - - reads = downsampled_reads; - offsets = downsampled_offsets; + for (int j = 0; j < reads.length; j++) + { + List perm = new ArrayList(); + for (int i = 0; i < reads[j].size(); i++) { perm.add(i); } + perm = Utils.RandomSubset(perm, downsample); + + ArrayList downsampled_reads = new ArrayList(); + ArrayList downsampled_offsets = new ArrayList(); + + for (int i = 0; i < perm.size(); i++) + { + downsampled_reads.add(reads[j].get(perm.get(i))); + downsampled_offsets.add(offsets[j].get(perm.get(i))); + } + + reads[j] = downsampled_reads; + offsets[j] = downsampled_offsets; + contexts[j] = new LocusContext(context.getLocation(), reads[j], offsets[j]); + } } + else + { + for (int j = 0; j < reads.length; j++) + { + contexts[j] = new LocusContext(context.getLocation(), reads[j], offsets[j]); + } + } - return new LocusContext(context.getLocation(), reads, offsets); + return contexts; } public void onTraversalDone() { - return; + discovery_output_file.flush(); + discovery_output_file.close(); + individual_output_file.flush(); + individual_output_file.close(); + return; } public String reduceInit() { - return ""; + for (int i = 0; i < callers.size(); i++) + { + callers.get(i).reduceInit(); + } + return ""; } public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) { + for (int i = 0; i < callers.size(); i++) + { + caller_sums.set(i, callers.get(i).reduce(calls[i], caller_sums.get(i))); + } return ""; } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java index ab2532f03..5a0c4453d 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java @@ -34,6 +34,7 @@ public class SingleSampleGenotyper extends LocusWalker= 5) { calls_file.print(alleleFreq.asGFFString()); - /* - String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N); - System.out.print("DEBUG " + gtype + " "); - if (gtype.contentEquals("het")) { - System.out.println(alleleFreq.ref + "" + alleleFreq.alt); - } else if (gtype.contentEquals("hom")) { - System.out.println(alleleFreq.ref + "" + alleleFreq.ref); - } else { - System.out.println(alleleFreq.alt + "" + alleleFreq.alt); - } - */ + //String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N); + //System.out.print("DEBUG " + gtype + " "); + //if (gtype.contentEquals("het")) { + // System.out.println(alleleFreq.ref + "" + alleleFreq.alt); + //} else if (gtype.contentEquals("hom")) { + // System.out.println(alleleFreq.ref + "" + alleleFreq.ref); + //} else { + // System.out.println(alleleFreq.alt + "" + alleleFreq.alt); + //} } return ""; } + */ } diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index 7b34245a1..e07d8f8d8 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -147,17 +147,25 @@ public class AlleleFrequencyEstimate { return s; } - public String asTabularString() { - return String.format("RESULT %s %c %c %f %f %f %f %d %s\n", + public String asTabularStringHeader() + { + return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases"; + } + + public String asTabularString() + { + return String.format("%s %s %c %c %s %f %f %f %f %d %s", location, + sample_name, ref, alt, + genotype(), qhat, qstar, lodVsRef, lodVsNextBest, depth, - notes); + bases); } public String toString() { return asTabularString(); }