tools for optimizing calls.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2674 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2010-01-24 20:16:55 +00:00
parent 877957761f
commit 1d6d2b26f7
2 changed files with 499 additions and 0 deletions

View File

@ -0,0 +1,175 @@
package org.broadinstitute.sting.playground.tools.vcf;
//import org.broadinstitute.sting.playground.tools.vcf.VCFOptimize.Cut;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import edu.mit.broad.picard.util.Interval;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import java.io.*;
import java.util.*;
import net.sf.picard.PicardException;
import net.sf.picard.io.IoUtil;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileWalker;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.commons.math.optimization.*;
import org.apache.commons.math.optimization.direct.*;
import org.apache.commons.math.analysis.MultivariateRealFunction;
// First draft of a program for working with VCF files in various ways.
/**
* @author jmaguire
*/
class VCFApplyCuts extends CommandLineProgram
{
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "cuts", shortName = "cuts", doc = "file to read cuts from", required = true) public String cuts_filename;
@Argument(fullName = "output", shortName = "output", doc = "file to write filtered VCF to", required = true) public String output_filename;
class Cut
{
public double lod;
public double slod;
public int freq;
public Cut(double lod, double slod)
{
this.lod = lod;
this.slod = slod;
this.freq = -1;
}
public Cut(double lod, double slod, int freq)
{
this.lod = lod;
this.slod = slod;
this.freq = freq;
}
public Cut(String record)
{
String[] tokens = record.split("\\s+");
this.freq = Integer.parseInt(tokens[0]);
this.lod = Double.parseDouble(tokens[1]);
this.slod = Double.parseDouble(tokens[2]);
}
public String toString()
{
return String.format("%d %f %f", freq, lod, slod);
}
}
private boolean applyCuts(ArrayList<Cut> cuts, VCFHeader header, VCFRecord record)
{
Map<String,String> info = record.getInfoValues();
if (! info.containsKey("AC"))
{
throw new RuntimeException("AC not present in record: \n" + record.toStringEncoding(header));
}
if (! info.containsKey("DP"))
{
throw new RuntimeException("DP not present in record: \n" + record.toStringEncoding(header));
}
if (! info.containsKey("SB"))
{
throw new RuntimeException("SB not present in record: \n" + record.toStringEncoding(header));
}
boolean transition = VCFTool.isTransition(record);
int freq = Integer.parseInt(record.getInfoValues().get("AC"));
double LOD = record.getQual();
double depth = Double.parseDouble(record.getInfoValues().get("DP"));
double SLOD = Double.parseDouble(record.getInfoValues().get("SB"));
for (int i = 0; i < cuts.size(); i++)
{
Cut cut = cuts.get(i);
if (cut.freq == freq)
{
if ((LOD >= cut.lod) && (-1*SLOD >= cut.slod)) { return true; }
}
}
return false;
}
@Override
protected int execute()
{
// load cuts.
ArrayList<Cut> cuts = null;
Scanner cuts_file = null;
try
{
cuts = new ArrayList<Cut>();
cuts_file = new Scanner(new File(cuts_filename));
}
catch (Exception e)
{
throw new RuntimeException(e);
}
while (cuts_file.hasNextLine())
{
String line = cuts_file.nextLine();
Cut cut = new Cut(line);
cuts.add(cut);
}
VCFReader reader = null;
if (autocorrect) { reader = new VCFReader(VCFHomogenizer.create(filename)); }
else { reader = new VCFReader(new File(filename)); }
VCFHeader header = reader.getHeader();
VCFWriter writer = new VCFWriter(new File(output_filename));
writer.writeHeader(header);
Date start_time = new Date();
int n_records_processed = 0;
int n_records_passed = 0;
while(reader.hasNext())
{
VCFRecord record = reader.next();
if (applyCuts(cuts, header, record) == true)
{
writer.addRecord(record);
n_records_passed += 1;
}
n_records_processed += 1;
}
System.out.printf("Processed %d records\n", n_records_processed);
System.out.printf("Passed %d records\n", n_records_passed);
writer.close();
return 0;
}
}

View File

@ -0,0 +1,324 @@
package org.broadinstitute.sting.playground.tools.vcf;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import edu.mit.broad.picard.util.Interval;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import java.io.*;
import java.util.*;
import net.sf.picard.PicardException;
import net.sf.picard.io.IoUtil;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileWalker;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.commons.math.optimization.*;
import org.apache.commons.math.optimization.direct.*;
import org.apache.commons.math.analysis.MultivariateRealFunction;
// First draft of a program for working with VCF files in various ways.
/**
* @author jmaguire
*/
class VCFOptimize extends CommandLineProgram
{
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "target_TsTv", shortName = "target_TsTv", doc = "Minimum acceptable TsTv", required=false) public double target_TsTv = 2.07;
@Argument(fullName = "output", shortName = "output", doc = "file to write cuts to", required = true) public String output_filename;
// Debugging argument:
@Argument(fullName = "n_records", shortName = "n_records", doc = "Number of records to load (debugging)", required=false) public int n_records_to_process = Integer.MAX_VALUE;
class OptimizationRecord
{
public boolean transition;
public int freq;
public double[] features;
public OptimizationRecord(boolean transition, int freq, double[] features)
{
this.transition = transition;
this.freq = freq;
this.features = features.clone();
}
public OptimizationRecord clone()
{
return new OptimizationRecord(transition, freq, features.clone());
}
}
private OptimizationRecord pack(VCFHeader header, VCFRecord input)
{
Map<String,String> info = input.getInfoValues();
if (! info.containsKey("AC"))
{
throw new RuntimeException("AC not present in record: \n" + input.toStringEncoding(header));
}
if (! info.containsKey("DP"))
{
throw new RuntimeException("DP not present in record: \n" + input.toStringEncoding(header));
}
if (! info.containsKey("SB"))
{
throw new RuntimeException("SB not present in record: \n" + input.toStringEncoding(header));
}
boolean transition = VCFTool.isTransition(input);
int freq = Integer.parseInt(input.getInfoValues().get("AC"));
double LOD = input.getQual();
double depth = Double.parseDouble(input.getInfoValues().get("DP"));
double SLOD = Double.parseDouble(input.getInfoValues().get("SB"));
double[] features = new double[2];
features[0] = LOD;
features[1] = -1*SLOD;
return new OptimizationRecord(transition, freq, features);
}
// This is the objective function we're searching in.
// if (tstv>=min) { return #snps; } else { return -inf; }
public double tstv(double[] point, OptimizationRecord[] records)
{
double transitions = 0;
double transversions = 0;
double total = 0;
for (int i = 0; i < records.length; i++)
{
int j = 0;
for (j = 0; j < point.length; j++)
{
// if (records == null) { System.out.printf("records==null\n"); }
// if (records[i] == null) { System.out.printf("records[%d]==null\n", i); }
// if (records[i].features == null) { System.out.printf("records[%d].features==null\n", i); }
if (records[i].features[j] < point[j]) { break; }
}
if (j == point.length)
{
if (records[i].transition == true) { transitions += 1; }
else { transversions += 1; }
total += 1;
}
}
double tstv = transitions / transversions;
return tstv;
}
// This is the objective function we're searching in.
// if (tstv>=min) { return #snps; } else { return -inf; }
public double num_calls(double[] point, OptimizationRecord[] records)
{
double total = 0;
for (int i = 0; i < records.length; i++)
{
int j = 0;
for (j = 0; j < point.length; j++)
{
// if (records == null) { System.out.printf("records==null\n"); }
// if (records[i] == null) { System.out.printf("records[%d]==null\n", i); }
// if (records[i].features == null) { System.out.printf("records[%d].features==null\n", i); }
if (records[i].features[j] < point[j]) { break; }
}
if (j == point.length)
{
total += 1;
}
}
return total;
}
public class Cut
{
public double lod;
public double slod;
public int freq;
public Cut(double lod, double slod)
{
this.lod = lod;
this.slod = slod;
this.freq = -1;
}
public Cut(double lod, double slod, int freq)
{
this.lod = lod;
this.slod = slod;
this.freq = freq;
}
public Cut(String record)
{
String[] tokens = record.split("\\s+");
this.freq = Integer.parseInt(tokens[0]);
this.lod = Double.parseDouble(tokens[1]);
this.slod = Double.parseDouble(tokens[2]);
}
public String toString()
{
return String.format("%d %f %f", freq, lod, slod);
}
}
// Just a simple grid search.
private Cut optimize(OptimizationRecord[] records, double min_TsTv, int freq)
{
double best_lod = Double.NaN;
double best_slod = Double.NaN;
double best_tstv = Double.NEGATIVE_INFINITY;
double best_num_calls = Double.NEGATIVE_INFINITY;
boolean flag = false;
double[] lods = new double[records.length];
double[] slods = new double[records.length];
for (int i = 0; i < lods.length; i++)
{
lods[i] = records[i].features[0];
slods[i] = records[i].features[1];
}
Arrays.sort(lods);
Arrays.sort(slods);
int num_breaks = 100;
double[] lod_breaks = new double[num_breaks];
double[] slod_breaks = new double[num_breaks];
int bin_size = 1 + (records.length / num_breaks);
//System.out.printf("BREAKS i j lod slod\n");
int j = 0;
for (int i = 0; i < records.length; i += bin_size)
{
lod_breaks[j] = lods[i];
slod_breaks[j] = slods[i];
j += 1;
//System.out.printf("BREAKS %d %d %f %f\n", i, j, lods[i], slods[i]);
}
//System.out.printf("\n");
//for (double lod = 0; lod < 8000; lod += 10)
for (int lod_idx = 0; lod_idx < num_breaks; lod_idx += 1)
{
double lod = lod_breaks[lod_idx];
//for (double slod = -4000; slod < 1000; slod += 10)
for (int slod_idx = 0; slod_idx < num_breaks; slod_idx += 1)
{
double slod = slod_breaks[slod_idx];
double[] point = new double[2];
point[0] = lod;
point[1] = slod;
double tstv = tstv(point, records);
double num_calls = num_calls(point, records);
if ((tstv >= min_TsTv) && (num_calls > best_num_calls))
{
best_lod=lod;
best_slod=slod;
best_tstv=tstv;
best_num_calls=num_calls;
flag=true;
}
else if ((tstv >= best_tstv) && (!flag))
{
best_lod=lod;
best_slod=slod;
best_tstv=tstv;
best_num_calls=num_calls;
}
}
}
//System.out.printf("Found optimum: lod=%f slod=%f num_calls=%f tstv=%f\n", best_lod, best_slod, best_num_calls, best_tstv);
System.out.printf("%d %f %f %f %f\n", freq, best_lod, best_slod, best_num_calls, best_tstv);
return new Cut(best_lod, best_slod);
}
@Override
protected int execute()
{
System.out.println("Loading " + filename + "...");
VCFReader reader = null;
if (autocorrect) { reader = new VCFReader(VCFHomogenizer.create(filename)); }
else { reader = new VCFReader(new File(filename)); }
PrintWriter output = null;
try
{
output = new PrintWriter(new FileWriter(output_filename));
}
catch (Exception e)
{
throw new RuntimeException(e);
}
VCFHeader header = reader.getHeader();
HashMap<Integer,ArrayList<OptimizationRecord>> records = new HashMap<Integer,ArrayList<OptimizationRecord>>();
Date start_time = new Date();
int n_records_processed = 0;
int max_freq = 0;
while(reader.hasNext())
{
VCFRecord record = reader.next();
OptimizationRecord optimization_record = pack(header, record);
if (optimization_record.freq > max_freq) { max_freq = optimization_record.freq; }
if (! records.containsKey(optimization_record.freq)) { records.put(optimization_record.freq, new ArrayList<OptimizationRecord>()); }
records.get(optimization_record.freq).add(optimization_record.clone());
n_records_processed += 1;
if (n_records_processed == n_records_to_process) { break; }
}
System.out.printf("Loaded %d records\n", n_records_processed);
//for (int freq = 1; freq <= 5; freq += 1)
for (int freq = 1; freq <= max_freq; freq += 1)
{
if (records.get(freq) == null) { System.out.printf("Skipping AAF %d (no calls)\n", freq); continue; }
System.out.printf("\nOptimizing AAF %d...\n", freq);
OptimizationRecord[] fnord = new OptimizationRecord[records.get(freq).size()];
Cut cut = optimize(records.get(freq).toArray(fnord), target_TsTv, freq);
cut.freq = freq;
output.println(cut);
}
output.flush();
output.close();
return 0;
}
}