Massive speed-up, clean-up and tabular output.

This program is going to rule.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@731 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-05-16 16:52:40 +00:00
parent 3b57a35009
commit 527df6e57b
3 changed files with 111 additions and 59 deletions

View File

@ -29,18 +29,23 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
@Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10; @Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10;
@Argument(fullName="lodThreshold", shortName="lod", required=false, doc="lod threshold for outputting individual genotypes") public Double lodThreshold = 2.0; @Argument(fullName="lodThreshold", shortName="lod", required=false, doc="lod threshold for outputting individual genotypes") public Double lodThreshold = 2.0;
@Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT; @Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT;
@Argument(fullName="individual_output_prefix", shortName="individual_output_prefix", required=true, doc="prefix to write individual SNP calls to") public String INDIVIDUAL_OUTPUT_PREFIX; @Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT;
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
private Random random; private Random random;
private SAMFileHeader header; private SAMFileHeader header;
private PrintStream discovery_output_file; private PrintStream discovery_output_file;
private PrintStream individual_output_file;
AlleleFrequencyEstimate[] calls;
ArrayList<String> caller_sums;
public void initialize() public void initialize()
{ {
try try
{ {
discovery_output_file = new PrintStream(DISCOVERY_OUTPUT); discovery_output_file = new PrintStream(DISCOVERY_OUTPUT);
individual_output_file = new PrintStream(INDIVIDUAL_OUTPUT);
} }
catch (Exception e) catch (Exception e)
{ {
@ -60,6 +65,7 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
sample_names = new ArrayList<String>(); sample_names = new ArrayList<String>();
callers = new ArrayList<SingleSampleGenotyper>(); callers = new ArrayList<SingleSampleGenotyper>();
caller_sums = new ArrayList<String>();
random = new Random(42); random = new Random(42);
@ -68,37 +74,37 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
for (int i = 0; i < read_groups.size(); i++) for (int i = 0; i < read_groups.size(); i++)
{ {
String sample_name = read_groups.get(i).getSample(); String sample_name = read_groups.get(i).getSample();
if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
if (unique_sample_names.contains(sample_name)) { continue; } if (unique_sample_names.contains(sample_name)) { continue; }
unique_sample_names.add(sample_name); unique_sample_names.add(sample_name);
sample_names.add(sample_name); sample_names.add(sample_name);
System.out.println("SAMPLE: " + sample_name); System.out.println("SAMPLE: " + sample_name);
SingleSampleGenotyper caller = new SingleSampleGenotyper(); SingleSampleGenotyper caller = new SingleSampleGenotyper();
caller.callsFileName = INDIVIDUAL_OUTPUT_PREFIX + "." + sample_name + ".calls"; caller.callsFileName = null;
caller.metricsFileName = INDIVIDUAL_OUTPUT_PREFIX + "." + sample_name + ".metrics"; caller.metricsFileName = null;
caller.lodThreshold = lodThreshold; caller.lodThreshold = lodThreshold;
caller.fourBaseMode = false; caller.fourBaseMode = false;
caller.printMetrics = false; caller.printMetrics = false;
caller.SAMPLE_NAME_REGEX = SAMPLE_NAME_REGEX;
caller.initialize(); caller.initialize();
caller.calls_file = individual_output_file;
caller_sums.add(caller.reduceInit());
callers.add(caller); callers.add(caller);
} }
} }
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context) public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
{ {
// 1. seperate each context. // 1. seperate each context.
LocusContext[] contexts = new LocusContext[sample_names.size()]; LocusContext[] contexts = filterLocusContext(context, sample_names, 0);
for (int i = 0; i < sample_names.size(); i++)
{
contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
}
// EM Loop: // EM Loop:
AlleleFrequencyEstimate[] calls = null;
double EM_alt_freq; double EM_alt_freq;
double EM_N = 0; double EM_N = 0;
calls = null;
// this line is kinda hacky // this line is kinda hacky
if (MAX_ITERATIONS == 1) { EM_alt_freq = -1; } if (MAX_ITERATIONS == 1) { EM_alt_freq = -1; }
@ -127,7 +133,6 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
likelihood += calls[i].posterior(); likelihood += calls[i].posterior();
if (! FRACTIONAL_COUNTS) if (! FRACTIONAL_COUNTS)
{ {
//System.out.printf("DBG: %s %f %f\n", //System.out.printf("DBG: %s %f %f\n",
@ -173,10 +178,8 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]); ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]);
if (calls[i].depth == 0) { continue; } if (calls[i].depth == 0) { continue; }
//if (calls[i].lodVsRef < lodThreshold) { continue; } //if (calls[i].lodVsRef < lodThreshold) { continue; }
out.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases()); //discovery_output_file.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases());
} }
System.out.printf("EVAL %s\n", context.getLocation());
//for (int i = 0; i < likelihood_trajectory.length; i++) //for (int i = 0; i < likelihood_trajectory.length; i++)
//{ //{
@ -207,10 +210,23 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
return new LocusContext(location, reads, offsets); return new LocusContext(location, reads, offsets);
} }
private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample) private LocusContext[] filterLocusContext(LocusContext context, List<String> sample_names, int downsample)
{ {
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>(); HashMap<String,Integer> index = new HashMap<String,Integer>();
ArrayList<Integer> offsets = new ArrayList<Integer>(); for (int i = 0; i < sample_names.size(); i++)
{
index.put(sample_names.get(i), i);
}
LocusContext[] contexts = new LocusContext[sample_names.size()];
ArrayList<SAMRecord>[] reads = new ArrayList[sample_names.size()];
ArrayList<Integer>[] offsets = new ArrayList[sample_names.size()];
for (int i = 0; i < sample_names.size(); i++)
{
reads[i] = new ArrayList<SAMRecord>();
offsets[i] = new ArrayList<Integer>();
}
for (int i = 0; i < context.getReads().size(); i++) for (int i = 0; i < context.getReads().size(); i++)
{ {
@ -219,51 +235,71 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
String RG = (String)(read.getAttribute("RG")); String RG = (String)(read.getAttribute("RG"));
assert(header != null); assert(header != null);
//System.out.printf("RG: %s\n", RG);
assert(header.getReadGroup(RG) != null); assert(header.getReadGroup(RG) != null);
String sample = header.getReadGroup(RG).getSample(); String sample = header.getReadGroup(RG).getSample();
if (sample == sample_name) if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
{ reads[index.get(sample)].add(read);
reads.add(read); offsets[index.get(sample)].add(offset);
offsets.add(offset);
}
} }
if (downsample != 0) if (downsample != 0)
{ {
List<Integer> perm = new ArrayList<Integer>(); for (int j = 0; j < reads.length; j++)
for (int i = 0; i < reads.size(); i++) { perm.add(i); } {
perm = Utils.RandomSubset(perm, downsample); List<Integer> perm = new ArrayList<Integer>();
for (int i = 0; i < reads[j].size(); i++) { perm.add(i); }
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>(); perm = Utils.RandomSubset(perm, downsample);
ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
for (int i = 0; i < perm.size(); i++) ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
{
downsampled_reads.add(reads.get(perm.get(i))); for (int i = 0; i < perm.size(); i++)
downsampled_offsets.add(offsets.get(perm.get(i))); {
} downsampled_reads.add(reads[j].get(perm.get(i)));
downsampled_offsets.add(offsets[j].get(perm.get(i)));
reads = downsampled_reads; }
offsets = downsampled_offsets;
reads[j] = downsampled_reads;
offsets[j] = downsampled_offsets;
contexts[j] = new LocusContext(context.getLocation(), reads[j], offsets[j]);
}
} }
else
{
for (int j = 0; j < reads.length; j++)
{
contexts[j] = new LocusContext(context.getLocation(), reads[j], offsets[j]);
}
}
return new LocusContext(context.getLocation(), reads, offsets); return contexts;
} }
public void onTraversalDone() public void onTraversalDone()
{ {
return; discovery_output_file.flush();
discovery_output_file.close();
individual_output_file.flush();
individual_output_file.close();
return;
} }
public String reduceInit() public String reduceInit()
{ {
return ""; for (int i = 0; i < callers.size(); i++)
{
callers.get(i).reduceInit();
}
return "";
} }
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{ {
for (int i = 0; i < callers.size(); i++)
{
caller_sums.set(i, callers.get(i).reduce(calls[i], caller_sums.get(i)));
}
return ""; return "";
} }

View File

@ -34,6 +34,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
@Argument(fullName="qHom", shortName="qHom", doc="qHom", required=false) public Double qHom = 0.04; @Argument(fullName="qHom", shortName="qHom", doc="qHom", required=false) public Double qHom = 0.04;
@Argument(fullName="qHet", shortName="qHet", doc="qHet", required=false) public Double qHet = 0.49; @Argument(fullName="qHet", shortName="qHet", doc="qHet", required=false) public Double qHet = 0.49;
@Argument(fullName="qHomNonRef", shortName="qHomNonRef", doc="qHomNonRef", required=false) public Double qHomNonRef = 0.97; @Argument(fullName="qHomNonRef", shortName="qHomNonRef", doc="qHomNonRef", required=false) public Double qHomNonRef = 0.97;
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
public AlleleMetrics metrics; public AlleleMetrics metrics;
public PrintStream calls_file; public PrintStream calls_file;
@ -47,8 +48,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
try try
{ {
sample_name = null; sample_name = null;
metrics = new AlleleMetrics(metricsFileName, lodThreshold); if (metricsFileName != null) { metrics = new AlleleMetrics(metricsFileName, lodThreshold); }
calls_file = new PrintStream(callsFileName); if (callsFileName != null) { calls_file = new PrintStream(callsFileName); }
} }
catch (Exception e) catch (Exception e)
{ {
@ -70,6 +71,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
if (read_group_record != null) if (read_group_record != null)
{ {
String local_sample_name = read.getHeader().getReadGroup(RG).getSample(); String local_sample_name = read.getHeader().getReadGroup(RG).getSample();
if (SAMPLE_NAME_REGEX != null) { local_sample_name = local_sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
if (sample_name == null) { sample_name = local_sample_name; } if (sample_name == null) { sample_name = local_sample_name; }
else else
{ {
@ -355,6 +357,13 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
return ""; return "";
} }
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
calls_file.println(alleleFreq.asTabularString());
return "";
}
/*
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{ {
// Print RESULT data for confident calls // Print RESULT data for confident calls
@ -401,19 +410,18 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
if (alleleFreq.lodVsRef >= 5) { if (alleleFreq.lodVsRef >= 5) {
calls_file.print(alleleFreq.asGFFString()); calls_file.print(alleleFreq.asGFFString());
/* //String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N);
String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N); //System.out.print("DEBUG " + gtype + " ");
System.out.print("DEBUG " + gtype + " "); //if (gtype.contentEquals("het")) {
if (gtype.contentEquals("het")) { // System.out.println(alleleFreq.ref + "" + alleleFreq.alt);
System.out.println(alleleFreq.ref + "" + alleleFreq.alt); //} else if (gtype.contentEquals("hom")) {
} else if (gtype.contentEquals("hom")) { // System.out.println(alleleFreq.ref + "" + alleleFreq.ref);
System.out.println(alleleFreq.ref + "" + alleleFreq.ref); //} else {
} else { // System.out.println(alleleFreq.alt + "" + alleleFreq.alt);
System.out.println(alleleFreq.alt + "" + alleleFreq.alt); //}
}
*/
} }
return ""; return "";
} }
*/
} }

View File

@ -147,17 +147,25 @@ public class AlleleFrequencyEstimate {
return s; return s;
} }
public String asTabularString() { public String asTabularStringHeader()
return String.format("RESULT %s %c %c %f %f %f %f %d %s\n", {
return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases";
}
public String asTabularString()
{
return String.format("%s %s %c %c %s %f %f %f %f %d %s",
location, location,
sample_name,
ref, ref,
alt, alt,
genotype(),
qhat, qhat,
qstar, qstar,
lodVsRef, lodVsRef,
lodVsNextBest, lodVsNextBest,
depth, depth,
notes); bases);
} }
public String toString() { return asTabularString(); } public String toString() { return asTabularString(); }