- update to match recent changes in the VCF parser

- compute Het Error Rate in VCFConcordance
- changes to the frequency-specific optimizer




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2839 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2010-02-15 14:27:01 +00:00
parent 8072e9aed5
commit 0ef50bcae7
4 changed files with 257 additions and 25 deletions

View File

@ -56,6 +56,30 @@ class GenotypeConcordance
}
}
public String toLine()
{
int on_diag = 0;
int on_diag_not_homref = 0;
int off_diag = 0;
int total = 0;
int total_not_homref = 0;
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
if (i == j) { on_diag += counts[i][j]; }
if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; }
if (i != j) { off_diag += counts[i][j]; }
if (i != 0 || j != 0) { total_not_homref += counts[i][j]; }
total += counts[i][j];
}
}
String s = String.format("SNP %s %d %d %f %f\n", this.name, total, total_not_homref, this.errorRate(), this.hetErrorRate());
return s;
}
public String toString()
{
String s = this.name + "\n";
@ -108,6 +132,23 @@ class GenotypeConcordance
return error_rate;
}
public double hetErrorRate()
{
int true_hets = 0;
int correct_hets = 0;
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
if (j == 1) { true_hets += counts[i][j]; }
}
}
correct_hets = counts[1][1];
double het_error_rate = 1.0 - ((double)correct_hets / (double)true_hets);
return het_error_rate;
}
public int total()
{
int total = 0;

View File

@ -109,6 +109,9 @@ class VCFHomogenizer extends InputStream
//System.out.println("input : " + input);
// Make it tab-delimited
input = input.replaceAll(" +", "\t");
/////////
// Header corrections
if (input.startsWith("##format=VCFv3.2")) { return "##format=VCRv3.2\n"; }
@ -131,6 +134,7 @@ class VCFHomogenizer extends InputStream
// if alt is "N", make it "."
if (tokens[4].equals("N")) { tokens[4] = "."; }
if (tokens[5].equals(".")) { tokens[5] = "-1"; }
String ref = tokens[3];
String alt = tokens[4];
@ -165,10 +169,15 @@ class VCFHomogenizer extends InputStream
String[] info_tokens = info.split(";");
for (int i = 0; i < info_tokens.length; i++)
{
// Fix the case where AC includes the ref count first.
if (info_tokens[i].startsWith("AC="))
if (info_tokens[i].startsWith("R2="))
{
// Fix NaN's in RNaN's in R2.
String new_token = info_tokens[i].replace("NaN", "0.0");
info_tokens[i] = new_token;
}
else if (info_tokens[i].startsWith("AC="))
{
// Fix the case where AC includes the ref count first.
String[] ACs = info_tokens[i].replaceAll("^AC=", "").split(",");
if (ACs.length == alts.length+1)
{
@ -179,6 +188,7 @@ class VCFHomogenizer extends InputStream
if (j != (ACs.length-1)) { new_ACs += ","; }
}
info_tokens[i] = "AC=" + new_ACs;
continue;
}
}

View File

@ -25,7 +25,7 @@ import net.sf.samtools.SAMSequenceRecord;
//import org.apache.commons.math.optimization.direct.*;
//import org.apache.commons.math.analysis.MultivariateRealFunction;
// First draft of a program for working with VCF files in various ways.
// Program for frequency-specific VCF-files.
/**
@ -39,9 +39,12 @@ class VCFOptimize extends CommandLineProgram
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "target_TsTv", shortName = "target_TsTv", doc = "Minimum acceptable TsTv", required=false) public double target_TsTv = 2.07;
@Argument(fullName = "output", shortName = "output", doc = "file to write cuts to", required = true) public String output_filename;
@Argument(fullName = "min_calls", shortName = "min_calls", doc = "Minimum signifigant number of calls", required=false) public int min_calls = 100;
@Argument(fullName = "num_breaks", shortName = "num_breaks", doc = "Number of breaks to search over", required=false) public int num_breaks = 100;
// Debugging argument:
// Debugging arguments:
@Argument(fullName = "n_records", shortName = "n_records", doc = "Number of records to load (debugging)", required=false) public int n_records_to_process = Integer.MAX_VALUE;
@Argument(fullName = "verbose", shortName = "verbose", doc = "print detailed debugging info.", required=false) public boolean verbose = false;
class OptimizationRecord
{
@ -187,11 +190,6 @@ class VCFOptimize extends CommandLineProgram
// Just a simple grid search.
private Cut optimize(OptimizationRecord[] records, double min_TsTv, int freq)
{
double best_lod = Double.NaN;
double best_slod = Double.NaN;
double best_tstv = Double.NEGATIVE_INFINITY;
double best_num_calls = Double.NEGATIVE_INFINITY;
boolean flag = false;
double[] lods = new double[records.length];
@ -205,7 +203,6 @@ class VCFOptimize extends CommandLineProgram
Arrays.sort(lods);
Arrays.sort(slods);
int num_breaks = 100;
double[] lod_breaks = new double[num_breaks];
double[] slod_breaks = new double[num_breaks];
int bin_size = 1 + (records.length / num_breaks);
@ -221,6 +218,19 @@ class VCFOptimize extends CommandLineProgram
}
//System.out.printf("\n");
double best_lod = lod_breaks[0];
double best_slod = slod_breaks[0];
int best_lod_idx = 0;
int best_slod_idx = 0;
double[] point = new double[2];
point[0] = best_lod;
point[1] = best_slod;
double best_tstv = tstv(point, records);
double best_num_calls = num_calls(point, records);
boolean flag = false;
//for (double lod = 0; lod < 8000; lod += 10)
for (int lod_idx = 0; lod_idx < num_breaks; lod_idx += 1)
@ -231,17 +241,22 @@ class VCFOptimize extends CommandLineProgram
{
double slod = slod_breaks[slod_idx];
double[] point = new double[2];
point = new double[2];
point[0] = lod;
point[1] = slod;
double tstv = tstv(point, records);
double num_calls = num_calls(point, records);
if (num_calls < min_calls) { continue; }
if ((tstv >= min_TsTv) && (num_calls > best_num_calls))
{
best_lod=lod;
best_slod=slod;
best_tstv=tstv;
best_num_calls=num_calls;
best_lod_idx = lod_idx;
best_slod_idx = slod_idx;
flag=true;
}
else if ((tstv >= best_tstv) && (!flag))
@ -250,12 +265,24 @@ class VCFOptimize extends CommandLineProgram
best_slod=slod;
best_tstv=tstv;
best_num_calls=num_calls;
best_lod_idx = lod_idx;
best_slod_idx = slod_idx;
}
if (verbose)
{
System.out.printf("DEBUG: %d | %d %d | %f %f %f %f | %f %f %f %f\n",
freq,
lod_idx, slod_idx,
lod, slod, num_calls, tstv,
best_lod, best_slod, best_num_calls, best_tstv);
}
}
}
//System.out.printf("Found optimum: lod=%f slod=%f num_calls=%f tstv=%f\n", best_lod, best_slod, best_num_calls, best_tstv);
System.out.printf("%d %f %f %f %f\n", freq, best_lod, best_slod, best_num_calls, best_tstv);
System.out.printf("%d %d %d %f %f %f %f\n", freq, best_lod_idx, best_slod_idx, best_lod, best_slod, best_num_calls, best_tstv);
return new Cut(best_lod, best_slod);
}

View File

@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import java.io.*;
import java.util.*;
import java.util.zip.*;
import net.sf.picard.PicardException;
import net.sf.picard.io.IoUtil;
@ -33,8 +34,9 @@ class VCFValidate extends CommandLineProgram
{
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "print", shortName = "print", doc = "print the vcf records to stdout", required = false) public Boolean print = false;
@Argument(fullName = "print", shortName = "print", doc = "print the vcf records to output", required = false) public Boolean print = false;
@Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false;
@Argument(fullName = "out", shortName = "out", doc = "if --print, write to this file (default is /dev/stdout)", required = false) public String out = "/dev/stdout";
@Override
protected int execute()
@ -48,12 +50,19 @@ class VCFValidate extends CommandLineProgram
VCFHeader header = reader.getHeader();
VCFWriter writer = null;
if (print)
{
writer = new VCFWriter(new File(out));
writer.writeHeader(header);
}
Date start_time = new Date();
int n_records_processed = 0;
while(reader.hasNext())
{
VCFRecord record = reader.next();
if (print) { System.out.println(record.toStringEncoding(header)); }
if (print) { writer.addRecord(record); }
if ((profile) && (n_records_processed % 10000 == 0))
{
@ -67,6 +76,8 @@ class VCFValidate extends CommandLineProgram
n_records_processed += 1;
}
if (print) { writer.close(); }
if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); }
else { System.out.println(filename + " is VALID."); }
@ -193,12 +204,15 @@ class VCFStats extends CommandLineProgram
System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high);
System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high);
System.out.printf("\n");
System.out.printf("table\tAAF\tCount\tTs/Tv\n");
for (int AC = 1; AC <= highest_AC; AC++)
{
System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC]));
}
System.out.printf("\n");
System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n");
for (int DP = 1; DP <= highest_DP; DP++)
{
@ -387,7 +401,11 @@ class VCFGrep extends CommandLineProgram
HashSet<String> loci = new HashSet<String>();
try
{
Scanner loci_reader = new Scanner(new File(loci_filename));
Scanner loci_reader;
if (loci_filename.endsWith(".gz")) { loci_reader = new Scanner(new GZIPInputStream(new FileInputStream(loci_filename))); }
else { loci_reader = new Scanner(new File(loci_filename)); }
while(loci_reader.hasNextLine())
{
String line = loci_reader.nextLine();
@ -404,7 +422,9 @@ class VCFGrep extends CommandLineProgram
{
PrintStream output = new PrintStream(new File(out_filename));
Scanner reader = new Scanner(new File(in_filename));
Scanner reader;
if (in_filename.endsWith(".gz")) { reader = new Scanner(new GZIPInputStream(new FileInputStream(in_filename))); }
else { reader = new Scanner(new File(in_filename)); }
while(reader.hasNextLine())
{
String line = reader.nextLine();
@ -709,6 +729,7 @@ class VCFConcordance extends CommandLineProgram
@Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false;
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
@Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null;
@Argument(fullName = "r2_bin_size", shortName = "r2_bin_size", doc = "size of an r2 bin for calculating error rates", required = false) public double r2_bin_size = 0.01;
@Override
@ -722,6 +743,22 @@ class VCFConcordance extends CommandLineProgram
HashMap<String,GenotypeConcordance> individual = new HashMap<String,GenotypeConcordance>();
HashMap<Long,GenotypeConcordance> AAF = new HashMap<Long,GenotypeConcordance>();
HashMap<Long,GenotypeConcordance> Qual = new HashMap<Long,GenotypeConcordance>();
HashMap<Long,GenotypeConcordance> R2 = new HashMap<Long,GenotypeConcordance>();
int shared_ts = 0;
int shared_tv = 0;
int shared_dbsnp = 0;
int shared_total = 0;
int unique1_ts = 0;
int unique1_tv = 0;
int unique1_dbsnp = 0;
int unique1_total = 0;
int unique2_ts = 0;
int unique2_tv = 0;
int unique2_dbsnp = 0;
int unique2_total = 0;
//
/////////////////////////////////
@ -777,6 +814,10 @@ class VCFConcordance extends CommandLineProgram
VCFRecord record1 = reader1.next();
VCFRecord record2 = reader2.next();
int number_sites_unique_to_file1 = 0;
int number_sites_unique_to_file2 = 0;
int number_sites_shared = 0;
while(true)
{
if (record1 == null) { break; }
@ -786,9 +827,10 @@ class VCFConcordance extends CommandLineProgram
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
int comparison = interval1.compareTo(interval2);
//int comparison = interval1.compareTo(interval2);
int comparison = VCFTool.compareIntervals(interval1, interval2);
//System.out.println("DBG: " + interval1 + " " + interval2);
//System.out.println("DBG: " + interval1 + " " + interval2 + " " + comparison);
if (comparison == 0)
{
@ -799,13 +841,31 @@ class VCFConcordance extends CommandLineProgram
{
record1 = reader1.next();
record2 = reader2.next();
continue;
}
char ref = record1.getReferenceBase();
String[] sample_names1 = record1.getSampleNames();
String[] sample_names2 = record2.getSampleNames();
Map<String,String> info1 = record1.getInfoValues();
Map<String,String> info2 = record2.getInfoValues();
double r2_1 = 0;
double r2_2 = 0;
if (info1.containsKey("R2")) { r2_1 = Double.parseDouble(info1.get("R2")); }
if (info2.containsKey("R2")) { r2_2 = Double.parseDouble(info2.get("R2")); }
number_sites_shared += 1;
if (VCFTool.isTransition(record1)) { shared_ts += 1; }
else { shared_tv += 1; }
if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { shared_dbsnp += 1; }
shared_total += 1;
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
List<VCFGenotypeRecord> genotypes2 = record2.getVCFGenotypeRecords();
@ -908,31 +968,59 @@ class VCFConcordance extends CommandLineProgram
if (verbose)
{
output.printf("SNP " + SNP.toString());
//output.printf("SNP " + SNP.toString());
output.printf("SNP " + SNP.toLine());
}
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
AAF.get(n_alt).add(SNP);
long r2_index = (long)(r2_1 / r2_bin_size);
if (! R2.containsKey(r2_index)) { R2.put(r2_index, new GenotypeConcordance(Double.toString(r2_1))); }
R2.get(r2_index).add(SNP);
//System.out.printf("DBG: %f %f\n", r2_1, r2_2);
//System.out.printf("DBG: %f %d %s\n", r2_1, r2_index, SNP.toString());
record1 = reader1.next();
record2 = reader2.next();
}
else if (comparison > 0)
{
if (record2.isFiltered()) { record2 = reader2.next(); continue; }
// interval1 is later than interval2.
Map<String,String> info2 = record2.getInfoValues();
number_sites_unique_to_file2 += 1;
if (VCFTool.isTransition(record2)) { unique2_ts += 1; }
else { unique2_tv += 1; }
if ((info2.get("DB") != null) && (Integer.parseInt(info2.get("DB")) == 1)) { unique2_dbsnp += 1; }
unique2_total += 1;
//if (verbose) { output.printf("DBG: skipping %s\n", record2.toStringEncoding(header2)); }
record2 = reader2.next();
}
else if (comparison < 0)
{
if (record1.isFiltered()) { record1 = reader1.next(); continue; }
// interval2 is later than interval1.
Map<String,String> info1 = record1.getInfoValues();
number_sites_unique_to_file1 += 1;
if (VCFTool.isTransition(record1)) { unique1_ts += 1; }
else { unique1_tv += 1; }
if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { unique1_dbsnp += 1; }
unique1_total += 1;
//if (verbose) { output.printf("DBG: skipping %s\n", record1.toStringEncoding(header1)); }
record1 = reader1.next();
}
}
// Now output the statistics.
{
if (verbose)
{
output.printf("\n");
@ -959,8 +1047,28 @@ class VCFConcordance extends CommandLineProgram
output.print("QUAL " + Qual.get(qual).toString());
}
output.printf("\n");
output.printf("\n");
Object[] R2s = R2.keySet().toArray();
for (int i = 0; i < AAFs.length; i++)
{
Long r2 = (Long)R2s[i];
output.print("R2 " + R2.get(r2).toString());
}
}
output.printf("Number of sites shared : %d %f %f\n", number_sites_shared,
(double)shared_ts/(double)shared_tv,
(double)shared_dbsnp/(double)(shared_ts+shared_tv));
output.printf("Number of sites unique to %s: %d %f %f\n", filename1, number_sites_unique_to_file1,
(double)unique1_ts/(double)unique1_tv,
(double)unique1_dbsnp/(double)(unique1_ts+unique1_tv));
output.printf("Number of sites unique to %s: %d %f %f\n", filename2, number_sites_unique_to_file2,
(double)unique2_ts/(double)unique2_tv,
(double)unique2_dbsnp/(double)(unique2_ts+unique2_tv));
output.printf("\n");
Object[] individuals = individual.keySet().toArray();
for (int i = 0; i < individuals.length; i++)
@ -974,7 +1082,7 @@ class VCFConcordance extends CommandLineProgram
for (int i = 0; i < AAFs.length; i++)
{
Long aaf = (Long)AAFs[i];
output.printf("AAF %d %f %d %d\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef());
output.printf("AAF %d %f %d %d %f\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef(), AAF.get(aaf).hetErrorRate());
}
output.printf("\n");
@ -985,7 +1093,13 @@ class VCFConcordance extends CommandLineProgram
output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef());
}
}
output.printf("\n");
Object[] R2s = R2.keySet().toArray();
for (int i = 0; i < R2s.length; i++)
{
Long r2 = (Long)R2s[i];
output.printf("R2 %f %f %d %d\n", (double)r2 * r2_bin_size, R2.get(r2).errorRate(), R2.get(r2).total(), R2.get(r2).totalNonHomRef());
}
output.flush();
output.close();
@ -995,11 +1109,14 @@ class VCFConcordance extends CommandLineProgram
}
}
public class VCFTool
{
public static void main(String args[])
{
// silence log4j messages.
//appender = new FileAppender(layout, clp.toFile, false);
//logger.addAppender(appender);
SetupSequenceDictionary();
String mode = args[0];
@ -1096,6 +1213,13 @@ public class VCFTool
System.exit(0);
}
if (mode.equals("merge"))
{
VCFMerge cm = new VCFMerge();
CommandLineProgram.start(cm,realArgs);
System.exit(0);
}
System.out.printf("ERROR: mode %s not defined.\n", mode);
System.exit(-1);
@ -1386,5 +1510,35 @@ public class VCFTool
return ans;
}
public static int compareIntervals(Interval a, Interval b)
{
int chr_a;
int chr_b;
if (a.getSequence().equals("X")) { chr_a = 23; }
else if (a.getSequence().equals("Y")) { chr_a = 24; }
else if (a.getSequence().equals("M")) { chr_a = 25; }
else { chr_a = Integer.parseInt(a.getSequence()); }
if (b.getSequence().equals("X")) { chr_b = 23; }
else if (b.getSequence().equals("Y")) { chr_b = 24; }
else if (b.getSequence().equals("M")) { chr_b = 25; }
else { chr_b = Integer.parseInt(b.getSequence()); }
int start_a = a.getStart();
int start_b = b.getStart();
int end_a = a.getEnd();
int end_b = b.getEnd();
if (chr_a < chr_b) { return -1; }
else if (chr_a > chr_b) { return 1; }
else if (start_a < start_b) { return -1; }
else if (start_a > start_b) { return 1; }
else if (end_a < end_b) { return -1; }
else if (end_a > end_b) { return 1; }
else { return 0; }
}
}