Massive speed-up, clean-up and tabular output.

This program is going to rule.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@731 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-05-16 16:52:40 +00:00
parent 3b57a35009
commit 527df6e57b
3 changed files with 111 additions and 59 deletions

View File

@ -29,18 +29,23 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
@Argument(required=false, shortName="max_iterations", doc="Maximum number of iterations for EM") public int MAX_ITERATIONS = 10;
@Argument(fullName="lodThreshold", shortName="lod", required=false, doc="lod threshold for outputting individual genotypes") public Double lodThreshold = 2.0;
@Argument(fullName="discovery_output", shortName="discovery_output", required=true, doc="file to write SNP discovery output to") public String DISCOVERY_OUTPUT;
@Argument(fullName="individual_output_prefix", shortName="individual_output_prefix", required=true, doc="prefix to write individual SNP calls to") public String INDIVIDUAL_OUTPUT_PREFIX;
@Argument(fullName="individual_output", shortName="individual_output", required=true, doc="file to write individual SNP calls to") public String INDIVIDUAL_OUTPUT;
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
private Random random;
private SAMFileHeader header;
private PrintStream discovery_output_file;
private PrintStream individual_output_file;
AlleleFrequencyEstimate[] calls;
ArrayList<String> caller_sums;
public void initialize()
{
try
{
discovery_output_file = new PrintStream(DISCOVERY_OUTPUT);
individual_output_file = new PrintStream(INDIVIDUAL_OUTPUT);
}
catch (Exception e)
{
@ -60,6 +65,7 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
sample_names = new ArrayList<String>();
callers = new ArrayList<SingleSampleGenotyper>();
caller_sums = new ArrayList<String>();
random = new Random(42);
@ -68,37 +74,37 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
for (int i = 0; i < read_groups.size(); i++)
{
String sample_name = read_groups.get(i).getSample();
if (SAMPLE_NAME_REGEX != null) { sample_name = sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
if (unique_sample_names.contains(sample_name)) { continue; }
unique_sample_names.add(sample_name);
sample_names.add(sample_name);
System.out.println("SAMPLE: " + sample_name);
SingleSampleGenotyper caller = new SingleSampleGenotyper();
caller.callsFileName = INDIVIDUAL_OUTPUT_PREFIX + "." + sample_name + ".calls";
caller.metricsFileName = INDIVIDUAL_OUTPUT_PREFIX + "." + sample_name + ".metrics";
caller.callsFileName = null;
caller.metricsFileName = null;
caller.lodThreshold = lodThreshold;
caller.fourBaseMode = false;
caller.printMetrics = false;
caller.SAMPLE_NAME_REGEX = SAMPLE_NAME_REGEX;
caller.initialize();
caller.calls_file = individual_output_file;
caller_sums.add(caller.reduceInit());
callers.add(caller);
}
}
public AlleleFrequencyEstimate map(RefMetaDataTracker tracker, char ref, LocusContext context)
{
// 1. separate each context.
LocusContext[] contexts = new LocusContext[sample_names.size()];
for (int i = 0; i < sample_names.size(); i++)
{
contexts[i] = filterLocusContext(context, sample_names.get(i), 0);
}
LocusContext[] contexts = filterLocusContext(context, sample_names, 0);
// EM Loop:
AlleleFrequencyEstimate[] calls = null;
double EM_alt_freq;
double EM_N = 0;
calls = null;
// this line is kinda hacky
if (MAX_ITERATIONS == 1) { EM_alt_freq = -1; }
@ -127,7 +133,6 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
likelihood += calls[i].posterior();
if (! FRACTIONAL_COUNTS)
{
//System.out.printf("DBG: %s %f %f\n",
@ -173,10 +178,8 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
ReadBackedPileup pileup = new ReadBackedPileup(ref, contexts[i]);
if (calls[i].depth == 0) { continue; }
//if (calls[i].lodVsRef < lodThreshold) { continue; }
out.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases());
//discovery_output_file.printf("%s %s %c %f %s %f %f %f %f %f %s\n", context.getLocation(), sample_names.get(i), ref, EM_alt_freq, calls[i].genotype(), calls[i].lodVsRef, calls[i].lodVsNextBest, calls[i].pBest, calls[i].pRef, discovery_lod, pileup.getBases());
}
System.out.printf("EVAL %s\n", context.getLocation());
//for (int i = 0; i < likelihood_trajectory.length; i++)
//{
@ -207,10 +210,23 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
return new LocusContext(location, reads, offsets);
}
private LocusContext filterLocusContext(LocusContext context, String sample_name, int downsample)
private LocusContext[] filterLocusContext(LocusContext context, List<String> sample_names, int downsample)
{
ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
ArrayList<Integer> offsets = new ArrayList<Integer>();
HashMap<String,Integer> index = new HashMap<String,Integer>();
for (int i = 0; i < sample_names.size(); i++)
{
index.put(sample_names.get(i), i);
}
LocusContext[] contexts = new LocusContext[sample_names.size()];
ArrayList<SAMRecord>[] reads = new ArrayList[sample_names.size()];
ArrayList<Integer>[] offsets = new ArrayList[sample_names.size()];
for (int i = 0; i < sample_names.size(); i++)
{
reads[i] = new ArrayList<SAMRecord>();
offsets[i] = new ArrayList<Integer>();
}
for (int i = 0; i < context.getReads().size(); i++)
{
@ -219,51 +235,71 @@ public class PoolCaller extends LocusWalker<AlleleFrequencyEstimate, String>
String RG = (String)(read.getAttribute("RG"));
assert(header != null);
//System.out.printf("RG: %s\n", RG);
assert(header.getReadGroup(RG) != null);
String sample = header.getReadGroup(RG).getSample();
if (sample == sample_name)
{
reads.add(read);
offsets.add(offset);
}
if (SAMPLE_NAME_REGEX != null) { sample = sample.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
reads[index.get(sample)].add(read);
offsets[index.get(sample)].add(offset);
}
if (downsample != 0)
{
List<Integer> perm = new ArrayList<Integer>();
for (int i = 0; i < reads.size(); i++) { perm.add(i); }
perm = Utils.RandomSubset(perm, downsample);
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
for (int i = 0; i < perm.size(); i++)
{
downsampled_reads.add(reads.get(perm.get(i)));
downsampled_offsets.add(offsets.get(perm.get(i)));
}
reads = downsampled_reads;
offsets = downsampled_offsets;
for (int j = 0; j < reads.length; j++)
{
List<Integer> perm = new ArrayList<Integer>();
for (int i = 0; i < reads[j].size(); i++) { perm.add(i); }
perm = Utils.RandomSubset(perm, downsample);
ArrayList<SAMRecord> downsampled_reads = new ArrayList<SAMRecord>();
ArrayList<Integer> downsampled_offsets = new ArrayList<Integer>();
for (int i = 0; i < perm.size(); i++)
{
downsampled_reads.add(reads[j].get(perm.get(i)));
downsampled_offsets.add(offsets[j].get(perm.get(i)));
}
reads[j] = downsampled_reads;
offsets[j] = downsampled_offsets;
contexts[j] = new LocusContext(context.getLocation(), reads[j], offsets[j]);
}
}
else
{
for (int j = 0; j < reads.length; j++)
{
contexts[j] = new LocusContext(context.getLocation(), reads[j], offsets[j]);
}
}
return new LocusContext(context.getLocation(), reads, offsets);
return contexts;
}
/**
 * Called once after the traversal completes; flushes and closes the two
 * output streams opened in initialize() so buffered output is not lost.
 */
public void onTraversalDone()
{
    // NOTE(review): the rendered diff carried the old unconditional "return;"
    // as the first statement, which made the flush/close calls unreachable
    // and leaked both streams; the early return has been removed.
    // Guard against null in case initialize() failed to open a stream.
    if (discovery_output_file != null)
    {
        discovery_output_file.flush();
        discovery_output_file.close();
    }
    if (individual_output_file != null)
    {
        individual_output_file.flush();
        individual_output_file.close();
    }
}
/**
 * Initializes the reduce state by delegating to each per-sample genotyper's
 * reduceInit().
 *
 * @return the (unused) empty accumulator string
 */
public String reduceInit()
{
    // NOTE(review): the rendered diff carried the old 'return "";' before the
    // loop, making the per-caller initialization unreachable; removed.
    // The per-caller results are discarded here — caller_sums is already
    // seeded with each caller's reduceInit() value in initialize().
    for (int i = 0; i < callers.size(); i++)
    {
        callers.get(i).reduceInit();
    }
    return "";
}
/**
 * Reduce step for the pool caller: feeds each per-sample call for the
 * current locus into the corresponding SingleSampleGenotyper's reduce(),
 * accumulating its per-sample sum string in caller_sums.
 *
 * NOTE(review): this reads the *field* "calls", not the alleleFreq
 * parameter; map() appears to declare a local "calls" of the same name,
 * so confirm the field is actually assigned before reduce() runs —
 * otherwise calls[i] throws a NullPointerException.
 *
 * @param alleleFreq the pooled call produced by map() (unused here)
 * @param sum        accumulator (unused; always returns "")
 * @return the empty string
 */
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
for (int i = 0; i < callers.size(); i++)
{
caller_sums.set(i, callers.get(i).reduce(calls[i], caller_sums.get(i)));
}
return "";
}

View File

@ -34,6 +34,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
@Argument(fullName="qHom", shortName="qHom", doc="qHom", required=false) public Double qHom = 0.04;
@Argument(fullName="qHet", shortName="qHet", doc="qHet", required=false) public Double qHet = 0.49;
@Argument(fullName="qHomNonRef", shortName="qHomNonRef", doc="qHomNonRef", required=false) public Double qHomNonRef = 0.97;
@Argument(fullName="sample_name_regex", shortName="sample_name_regex", required=false, doc="sample_name_regex") public String SAMPLE_NAME_REGEX = null;
public AlleleMetrics metrics;
public PrintStream calls_file;
@ -47,8 +48,8 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
try
{
sample_name = null;
metrics = new AlleleMetrics(metricsFileName, lodThreshold);
calls_file = new PrintStream(callsFileName);
if (metricsFileName != null) { metrics = new AlleleMetrics(metricsFileName, lodThreshold); }
if (callsFileName != null) { calls_file = new PrintStream(callsFileName); }
}
catch (Exception e)
{
@ -70,6 +71,7 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
if (read_group_record != null)
{
String local_sample_name = read.getHeader().getReadGroup(RG).getSample();
if (SAMPLE_NAME_REGEX != null) { local_sample_name = local_sample_name.replaceAll(SAMPLE_NAME_REGEX, "$1"); }
if (sample_name == null) { sample_name = local_sample_name; }
else
{
@ -355,6 +357,13 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
return "";
}
/**
 * Reduce step: writes the call for the current locus to calls_file in
 * tabular form. The accumulator is unused; always returns "".
 *
 * @param alleleFreq the call produced by map() for this locus
 * @param sum        ignored accumulator
 * @return the empty string
 */
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
    // calls_file is only constructed in initialize() when callsFileName is
    // non-null (PoolCaller nulls callsFileName and assigns calls_file
    // directly), so guard against the unassigned case instead of NPE-ing.
    if (calls_file != null)
    {
        calls_file.println(alleleFreq.asTabularString());
    }
    return "";
}
/*
public String reduce(AlleleFrequencyEstimate alleleFreq, String sum)
{
// Print RESULT data for confident calls
@ -401,19 +410,18 @@ public class SingleSampleGenotyper extends LocusWalker<AlleleFrequencyEstimate,
if (alleleFreq.lodVsRef >= 5) {
calls_file.print(alleleFreq.asGFFString());
/*
String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N);
System.out.print("DEBUG " + gtype + " ");
if (gtype.contentEquals("het")) {
System.out.println(alleleFreq.ref + "" + alleleFreq.alt);
} else if (gtype.contentEquals("hom")) {
System.out.println(alleleFreq.ref + "" + alleleFreq.ref);
} else {
System.out.println(alleleFreq.alt + "" + alleleFreq.alt);
}
*/
//String gtype = genotypeTypeString(alleleFreq.qstar, alleleFreq.N);
//System.out.print("DEBUG " + gtype + " ");
//if (gtype.contentEquals("het")) {
// System.out.println(alleleFreq.ref + "" + alleleFreq.alt);
//} else if (gtype.contentEquals("hom")) {
// System.out.println(alleleFreq.ref + "" + alleleFreq.ref);
//} else {
// System.out.println(alleleFreq.alt + "" + alleleFreq.alt);
//}
}
return "";
}
*/
}

View File

@ -147,17 +147,25 @@ public class AlleleFrequencyEstimate {
return s;
}
public String asTabularString() {
return String.format("RESULT %s %c %c %f %f %f %f %d %s\n",
/**
 * Returns the whitespace-separated column header line whose field order
 * matches the line produced by asTabularString().
 */
public String asTabularStringHeader()
{
return "location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases";
}
public String asTabularString()
{
return String.format("%s %s %c %c %s %f %f %f %f %d %s",
location,
sample_name,
ref,
alt,
genotype(),
qhat,
qstar,
lodVsRef,
lodVsNextBest,
depth,
notes);
bases);
}
/** Delegates to asTabularString() so printing an estimate yields the tabular line. */
@Override
public String toString()
{
    return asTabularString();
}