- update to match recent changes in the VCF parser
- compute Het Error Rate in VCFConcordance - changes to the frequency-specific optimizer git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2839 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
8072e9aed5
commit
0ef50bcae7
|
|
@ -56,6 +56,30 @@ class GenotypeConcordance
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toLine()
|
||||||
|
{
|
||||||
|
int on_diag = 0;
|
||||||
|
int on_diag_not_homref = 0;
|
||||||
|
int off_diag = 0;
|
||||||
|
int total = 0;
|
||||||
|
int total_not_homref = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < 3; i++)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < 3; j++)
|
||||||
|
{
|
||||||
|
if (i == j) { on_diag += counts[i][j]; }
|
||||||
|
if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; }
|
||||||
|
if (i != j) { off_diag += counts[i][j]; }
|
||||||
|
if (i != 0 || j != 0) { total_not_homref += counts[i][j]; }
|
||||||
|
total += counts[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String s = String.format("SNP %s %d %d %f %f\n", this.name, total, total_not_homref, this.errorRate(), this.hetErrorRate());
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
public String toString()
|
public String toString()
|
||||||
{
|
{
|
||||||
String s = this.name + "\n";
|
String s = this.name + "\n";
|
||||||
|
|
@ -108,6 +132,23 @@ class GenotypeConcordance
|
||||||
return error_rate;
|
return error_rate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public double hetErrorRate()
|
||||||
|
{
|
||||||
|
int true_hets = 0;
|
||||||
|
int correct_hets = 0;
|
||||||
|
for (int i = 0; i < 3; i++)
|
||||||
|
{
|
||||||
|
for (int j = 0; j < 3; j++)
|
||||||
|
{
|
||||||
|
if (j == 1) { true_hets += counts[i][j]; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
correct_hets = counts[1][1];
|
||||||
|
double het_error_rate = 1.0 - ((double)correct_hets / (double)true_hets);
|
||||||
|
return het_error_rate;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public int total()
|
public int total()
|
||||||
{
|
{
|
||||||
int total = 0;
|
int total = 0;
|
||||||
|
|
|
||||||
|
|
@ -109,6 +109,9 @@ class VCFHomogenizer extends InputStream
|
||||||
|
|
||||||
//System.out.println("input : " + input);
|
//System.out.println("input : " + input);
|
||||||
|
|
||||||
|
// Make it tab-delimited
|
||||||
|
input = input.replaceAll(" +", "\t");
|
||||||
|
|
||||||
/////////
|
/////////
|
||||||
// Header corrections
|
// Header corrections
|
||||||
if (input.startsWith("##format=VCFv3.2")) { return "##format=VCRv3.2\n"; }
|
if (input.startsWith("##format=VCFv3.2")) { return "##format=VCRv3.2\n"; }
|
||||||
|
|
@ -131,6 +134,7 @@ class VCFHomogenizer extends InputStream
|
||||||
|
|
||||||
// if alt is "N", make it "."
|
// if alt is "N", make it "."
|
||||||
if (tokens[4].equals("N")) { tokens[4] = "."; }
|
if (tokens[4].equals("N")) { tokens[4] = "."; }
|
||||||
|
if (tokens[5].equals(".")) { tokens[5] = "-1"; }
|
||||||
|
|
||||||
String ref = tokens[3];
|
String ref = tokens[3];
|
||||||
String alt = tokens[4];
|
String alt = tokens[4];
|
||||||
|
|
@ -165,10 +169,15 @@ class VCFHomogenizer extends InputStream
|
||||||
String[] info_tokens = info.split(";");
|
String[] info_tokens = info.split(";");
|
||||||
for (int i = 0; i < info_tokens.length; i++)
|
for (int i = 0; i < info_tokens.length; i++)
|
||||||
{
|
{
|
||||||
|
if (info_tokens[i].startsWith("R2="))
|
||||||
// Fix the case where AC includes the ref count first.
|
|
||||||
if (info_tokens[i].startsWith("AC="))
|
|
||||||
{
|
{
|
||||||
|
// Fix NaN's in RNaN's in R2.
|
||||||
|
String new_token = info_tokens[i].replace("NaN", "0.0");
|
||||||
|
info_tokens[i] = new_token;
|
||||||
|
}
|
||||||
|
else if (info_tokens[i].startsWith("AC="))
|
||||||
|
{
|
||||||
|
// Fix the case where AC includes the ref count first.
|
||||||
String[] ACs = info_tokens[i].replaceAll("^AC=", "").split(",");
|
String[] ACs = info_tokens[i].replaceAll("^AC=", "").split(",");
|
||||||
if (ACs.length == alts.length+1)
|
if (ACs.length == alts.length+1)
|
||||||
{
|
{
|
||||||
|
|
@ -179,6 +188,7 @@ class VCFHomogenizer extends InputStream
|
||||||
if (j != (ACs.length-1)) { new_ACs += ","; }
|
if (j != (ACs.length-1)) { new_ACs += ","; }
|
||||||
}
|
}
|
||||||
info_tokens[i] = "AC=" + new_ACs;
|
info_tokens[i] = "AC=" + new_ACs;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ import net.sf.samtools.SAMSequenceRecord;
|
||||||
//import org.apache.commons.math.optimization.direct.*;
|
//import org.apache.commons.math.optimization.direct.*;
|
||||||
//import org.apache.commons.math.analysis.MultivariateRealFunction;
|
//import org.apache.commons.math.analysis.MultivariateRealFunction;
|
||||||
|
|
||||||
// First draft of a program for working with VCF files in various ways.
|
// Program for frequency-specific VCF-files.
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -39,9 +39,12 @@ class VCFOptimize extends CommandLineProgram
|
||||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||||
@Argument(fullName = "target_TsTv", shortName = "target_TsTv", doc = "Minimum acceptable TsTv", required=false) public double target_TsTv = 2.07;
|
@Argument(fullName = "target_TsTv", shortName = "target_TsTv", doc = "Minimum acceptable TsTv", required=false) public double target_TsTv = 2.07;
|
||||||
@Argument(fullName = "output", shortName = "output", doc = "file to write cuts to", required = true) public String output_filename;
|
@Argument(fullName = "output", shortName = "output", doc = "file to write cuts to", required = true) public String output_filename;
|
||||||
|
@Argument(fullName = "min_calls", shortName = "min_calls", doc = "Minimum signifigant number of calls", required=false) public int min_calls = 100;
|
||||||
|
@Argument(fullName = "num_breaks", shortName = "num_breaks", doc = "Number of breaks to search over", required=false) public int num_breaks = 100;
|
||||||
|
|
||||||
// Debugging argument:
|
// Debugging arguments:
|
||||||
@Argument(fullName = "n_records", shortName = "n_records", doc = "Number of records to load (debugging)", required=false) public int n_records_to_process = Integer.MAX_VALUE;
|
@Argument(fullName = "n_records", shortName = "n_records", doc = "Number of records to load (debugging)", required=false) public int n_records_to_process = Integer.MAX_VALUE;
|
||||||
|
@Argument(fullName = "verbose", shortName = "verbose", doc = "print detailed debugging info.", required=false) public boolean verbose = false;
|
||||||
|
|
||||||
class OptimizationRecord
|
class OptimizationRecord
|
||||||
{
|
{
|
||||||
|
|
@ -187,11 +190,6 @@ class VCFOptimize extends CommandLineProgram
|
||||||
// Just a simple grid search.
|
// Just a simple grid search.
|
||||||
private Cut optimize(OptimizationRecord[] records, double min_TsTv, int freq)
|
private Cut optimize(OptimizationRecord[] records, double min_TsTv, int freq)
|
||||||
{
|
{
|
||||||
double best_lod = Double.NaN;
|
|
||||||
double best_slod = Double.NaN;
|
|
||||||
double best_tstv = Double.NEGATIVE_INFINITY;
|
|
||||||
double best_num_calls = Double.NEGATIVE_INFINITY;
|
|
||||||
boolean flag = false;
|
|
||||||
|
|
||||||
|
|
||||||
double[] lods = new double[records.length];
|
double[] lods = new double[records.length];
|
||||||
|
|
@ -205,7 +203,6 @@ class VCFOptimize extends CommandLineProgram
|
||||||
Arrays.sort(lods);
|
Arrays.sort(lods);
|
||||||
Arrays.sort(slods);
|
Arrays.sort(slods);
|
||||||
|
|
||||||
int num_breaks = 100;
|
|
||||||
double[] lod_breaks = new double[num_breaks];
|
double[] lod_breaks = new double[num_breaks];
|
||||||
double[] slod_breaks = new double[num_breaks];
|
double[] slod_breaks = new double[num_breaks];
|
||||||
int bin_size = 1 + (records.length / num_breaks);
|
int bin_size = 1 + (records.length / num_breaks);
|
||||||
|
|
@ -221,6 +218,19 @@ class VCFOptimize extends CommandLineProgram
|
||||||
}
|
}
|
||||||
//System.out.printf("\n");
|
//System.out.printf("\n");
|
||||||
|
|
||||||
|
double best_lod = lod_breaks[0];
|
||||||
|
double best_slod = slod_breaks[0];
|
||||||
|
|
||||||
|
int best_lod_idx = 0;
|
||||||
|
int best_slod_idx = 0;
|
||||||
|
|
||||||
|
double[] point = new double[2];
|
||||||
|
point[0] = best_lod;
|
||||||
|
point[1] = best_slod;
|
||||||
|
|
||||||
|
double best_tstv = tstv(point, records);
|
||||||
|
double best_num_calls = num_calls(point, records);
|
||||||
|
boolean flag = false;
|
||||||
|
|
||||||
//for (double lod = 0; lod < 8000; lod += 10)
|
//for (double lod = 0; lod < 8000; lod += 10)
|
||||||
for (int lod_idx = 0; lod_idx < num_breaks; lod_idx += 1)
|
for (int lod_idx = 0; lod_idx < num_breaks; lod_idx += 1)
|
||||||
|
|
@ -231,17 +241,22 @@ class VCFOptimize extends CommandLineProgram
|
||||||
{
|
{
|
||||||
double slod = slod_breaks[slod_idx];
|
double slod = slod_breaks[slod_idx];
|
||||||
|
|
||||||
double[] point = new double[2];
|
point = new double[2];
|
||||||
point[0] = lod;
|
point[0] = lod;
|
||||||
point[1] = slod;
|
point[1] = slod;
|
||||||
double tstv = tstv(point, records);
|
double tstv = tstv(point, records);
|
||||||
double num_calls = num_calls(point, records);
|
double num_calls = num_calls(point, records);
|
||||||
|
|
||||||
|
if (num_calls < min_calls) { continue; }
|
||||||
|
|
||||||
if ((tstv >= min_TsTv) && (num_calls > best_num_calls))
|
if ((tstv >= min_TsTv) && (num_calls > best_num_calls))
|
||||||
{
|
{
|
||||||
best_lod=lod;
|
best_lod=lod;
|
||||||
best_slod=slod;
|
best_slod=slod;
|
||||||
best_tstv=tstv;
|
best_tstv=tstv;
|
||||||
best_num_calls=num_calls;
|
best_num_calls=num_calls;
|
||||||
|
best_lod_idx = lod_idx;
|
||||||
|
best_slod_idx = slod_idx;
|
||||||
flag=true;
|
flag=true;
|
||||||
}
|
}
|
||||||
else if ((tstv >= best_tstv) && (!flag))
|
else if ((tstv >= best_tstv) && (!flag))
|
||||||
|
|
@ -250,12 +265,24 @@ class VCFOptimize extends CommandLineProgram
|
||||||
best_slod=slod;
|
best_slod=slod;
|
||||||
best_tstv=tstv;
|
best_tstv=tstv;
|
||||||
best_num_calls=num_calls;
|
best_num_calls=num_calls;
|
||||||
|
best_lod_idx = lod_idx;
|
||||||
|
best_slod_idx = slod_idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (verbose)
|
||||||
|
{
|
||||||
|
System.out.printf("DEBUG: %d | %d %d | %f %f %f %f | %f %f %f %f\n",
|
||||||
|
freq,
|
||||||
|
lod_idx, slod_idx,
|
||||||
|
lod, slod, num_calls, tstv,
|
||||||
|
best_lod, best_slod, best_num_calls, best_tstv);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//System.out.printf("Found optimum: lod=%f slod=%f num_calls=%f tstv=%f\n", best_lod, best_slod, best_num_calls, best_tstv);
|
//System.out.printf("Found optimum: lod=%f slod=%f num_calls=%f tstv=%f\n", best_lod, best_slod, best_num_calls, best_tstv);
|
||||||
System.out.printf("%d %f %f %f %f\n", freq, best_lod, best_slod, best_num_calls, best_tstv);
|
System.out.printf("%d %d %d %f %f %f %f\n", freq, best_lod_idx, best_slod_idx, best_lod, best_slod, best_num_calls, best_tstv);
|
||||||
|
|
||||||
return new Cut(best_lod, best_slod);
|
return new Cut(best_lod, best_slod);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.zip.*;
|
||||||
|
|
||||||
import net.sf.picard.PicardException;
|
import net.sf.picard.PicardException;
|
||||||
import net.sf.picard.io.IoUtil;
|
import net.sf.picard.io.IoUtil;
|
||||||
|
|
@ -33,8 +34,9 @@ class VCFValidate extends CommandLineProgram
|
||||||
{
|
{
|
||||||
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
||||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||||
@Argument(fullName = "print", shortName = "print", doc = "print the vcf records to stdout", required = false) public Boolean print = false;
|
@Argument(fullName = "print", shortName = "print", doc = "print the vcf records to output", required = false) public Boolean print = false;
|
||||||
@Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false;
|
@Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false;
|
||||||
|
@Argument(fullName = "out", shortName = "out", doc = "if --print, write to this file (default is /dev/stdout)", required = false) public String out = "/dev/stdout";
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int execute()
|
protected int execute()
|
||||||
|
|
@ -48,12 +50,19 @@ class VCFValidate extends CommandLineProgram
|
||||||
|
|
||||||
VCFHeader header = reader.getHeader();
|
VCFHeader header = reader.getHeader();
|
||||||
|
|
||||||
|
VCFWriter writer = null;
|
||||||
|
if (print)
|
||||||
|
{
|
||||||
|
writer = new VCFWriter(new File(out));
|
||||||
|
writer.writeHeader(header);
|
||||||
|
}
|
||||||
|
|
||||||
Date start_time = new Date();
|
Date start_time = new Date();
|
||||||
int n_records_processed = 0;
|
int n_records_processed = 0;
|
||||||
while(reader.hasNext())
|
while(reader.hasNext())
|
||||||
{
|
{
|
||||||
VCFRecord record = reader.next();
|
VCFRecord record = reader.next();
|
||||||
if (print) { System.out.println(record.toStringEncoding(header)); }
|
if (print) { writer.addRecord(record); }
|
||||||
|
|
||||||
if ((profile) && (n_records_processed % 10000 == 0))
|
if ((profile) && (n_records_processed % 10000 == 0))
|
||||||
{
|
{
|
||||||
|
|
@ -67,6 +76,8 @@ class VCFValidate extends CommandLineProgram
|
||||||
n_records_processed += 1;
|
n_records_processed += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (print) { writer.close(); }
|
||||||
|
|
||||||
if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); }
|
if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); }
|
||||||
else { System.out.println(filename + " is VALID."); }
|
else { System.out.println(filename + " is VALID."); }
|
||||||
|
|
||||||
|
|
@ -193,12 +204,15 @@ class VCFStats extends CommandLineProgram
|
||||||
System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high);
|
System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high);
|
||||||
System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high);
|
System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high);
|
||||||
System.out.printf("\n");
|
System.out.printf("\n");
|
||||||
|
|
||||||
System.out.printf("table\tAAF\tCount\tTs/Tv\n");
|
System.out.printf("table\tAAF\tCount\tTs/Tv\n");
|
||||||
for (int AC = 1; AC <= highest_AC; AC++)
|
for (int AC = 1; AC <= highest_AC; AC++)
|
||||||
{
|
{
|
||||||
System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC]));
|
System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC]));
|
||||||
}
|
}
|
||||||
System.out.printf("\n");
|
System.out.printf("\n");
|
||||||
|
|
||||||
|
|
||||||
System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n");
|
System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n");
|
||||||
for (int DP = 1; DP <= highest_DP; DP++)
|
for (int DP = 1; DP <= highest_DP; DP++)
|
||||||
{
|
{
|
||||||
|
|
@ -387,7 +401,11 @@ class VCFGrep extends CommandLineProgram
|
||||||
HashSet<String> loci = new HashSet<String>();
|
HashSet<String> loci = new HashSet<String>();
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
Scanner loci_reader = new Scanner(new File(loci_filename));
|
Scanner loci_reader;
|
||||||
|
|
||||||
|
if (loci_filename.endsWith(".gz")) { loci_reader = new Scanner(new GZIPInputStream(new FileInputStream(loci_filename))); }
|
||||||
|
else { loci_reader = new Scanner(new File(loci_filename)); }
|
||||||
|
|
||||||
while(loci_reader.hasNextLine())
|
while(loci_reader.hasNextLine())
|
||||||
{
|
{
|
||||||
String line = loci_reader.nextLine();
|
String line = loci_reader.nextLine();
|
||||||
|
|
@ -404,7 +422,9 @@ class VCFGrep extends CommandLineProgram
|
||||||
{
|
{
|
||||||
PrintStream output = new PrintStream(new File(out_filename));
|
PrintStream output = new PrintStream(new File(out_filename));
|
||||||
|
|
||||||
Scanner reader = new Scanner(new File(in_filename));
|
Scanner reader;
|
||||||
|
if (in_filename.endsWith(".gz")) { reader = new Scanner(new GZIPInputStream(new FileInputStream(in_filename))); }
|
||||||
|
else { reader = new Scanner(new File(in_filename)); }
|
||||||
while(reader.hasNextLine())
|
while(reader.hasNextLine())
|
||||||
{
|
{
|
||||||
String line = reader.nextLine();
|
String line = reader.nextLine();
|
||||||
|
|
@ -709,6 +729,7 @@ class VCFConcordance extends CommandLineProgram
|
||||||
@Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false;
|
@Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false;
|
||||||
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
||||||
@Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null;
|
@Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null;
|
||||||
|
@Argument(fullName = "r2_bin_size", shortName = "r2_bin_size", doc = "size of an r2 bin for calculating error rates", required = false) public double r2_bin_size = 0.01;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -722,6 +743,22 @@ class VCFConcordance extends CommandLineProgram
|
||||||
HashMap<String,GenotypeConcordance> individual = new HashMap<String,GenotypeConcordance>();
|
HashMap<String,GenotypeConcordance> individual = new HashMap<String,GenotypeConcordance>();
|
||||||
HashMap<Long,GenotypeConcordance> AAF = new HashMap<Long,GenotypeConcordance>();
|
HashMap<Long,GenotypeConcordance> AAF = new HashMap<Long,GenotypeConcordance>();
|
||||||
HashMap<Long,GenotypeConcordance> Qual = new HashMap<Long,GenotypeConcordance>();
|
HashMap<Long,GenotypeConcordance> Qual = new HashMap<Long,GenotypeConcordance>();
|
||||||
|
HashMap<Long,GenotypeConcordance> R2 = new HashMap<Long,GenotypeConcordance>();
|
||||||
|
|
||||||
|
int shared_ts = 0;
|
||||||
|
int shared_tv = 0;
|
||||||
|
int shared_dbsnp = 0;
|
||||||
|
int shared_total = 0;
|
||||||
|
|
||||||
|
int unique1_ts = 0;
|
||||||
|
int unique1_tv = 0;
|
||||||
|
int unique1_dbsnp = 0;
|
||||||
|
int unique1_total = 0;
|
||||||
|
|
||||||
|
int unique2_ts = 0;
|
||||||
|
int unique2_tv = 0;
|
||||||
|
int unique2_dbsnp = 0;
|
||||||
|
int unique2_total = 0;
|
||||||
|
|
||||||
//
|
//
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
|
|
@ -777,6 +814,10 @@ class VCFConcordance extends CommandLineProgram
|
||||||
VCFRecord record1 = reader1.next();
|
VCFRecord record1 = reader1.next();
|
||||||
VCFRecord record2 = reader2.next();
|
VCFRecord record2 = reader2.next();
|
||||||
|
|
||||||
|
int number_sites_unique_to_file1 = 0;
|
||||||
|
int number_sites_unique_to_file2 = 0;
|
||||||
|
int number_sites_shared = 0;
|
||||||
|
|
||||||
while(true)
|
while(true)
|
||||||
{
|
{
|
||||||
if (record1 == null) { break; }
|
if (record1 == null) { break; }
|
||||||
|
|
@ -786,9 +827,10 @@ class VCFConcordance extends CommandLineProgram
|
||||||
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
||||||
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
||||||
|
|
||||||
int comparison = interval1.compareTo(interval2);
|
//int comparison = interval1.compareTo(interval2);
|
||||||
|
int comparison = VCFTool.compareIntervals(interval1, interval2);
|
||||||
|
|
||||||
//System.out.println("DBG: " + interval1 + " " + interval2);
|
//System.out.println("DBG: " + interval1 + " " + interval2 + " " + comparison);
|
||||||
|
|
||||||
if (comparison == 0)
|
if (comparison == 0)
|
||||||
{
|
{
|
||||||
|
|
@ -799,13 +841,31 @@ class VCFConcordance extends CommandLineProgram
|
||||||
{
|
{
|
||||||
record1 = reader1.next();
|
record1 = reader1.next();
|
||||||
record2 = reader2.next();
|
record2 = reader2.next();
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
char ref = record1.getReferenceBase();
|
char ref = record1.getReferenceBase();
|
||||||
|
|
||||||
String[] sample_names1 = record1.getSampleNames();
|
String[] sample_names1 = record1.getSampleNames();
|
||||||
String[] sample_names2 = record2.getSampleNames();
|
String[] sample_names2 = record2.getSampleNames();
|
||||||
|
|
||||||
|
|
||||||
|
Map<String,String> info1 = record1.getInfoValues();
|
||||||
|
Map<String,String> info2 = record2.getInfoValues();
|
||||||
|
double r2_1 = 0;
|
||||||
|
double r2_2 = 0;
|
||||||
|
if (info1.containsKey("R2")) { r2_1 = Double.parseDouble(info1.get("R2")); }
|
||||||
|
if (info2.containsKey("R2")) { r2_2 = Double.parseDouble(info2.get("R2")); }
|
||||||
|
|
||||||
|
|
||||||
|
number_sites_shared += 1;
|
||||||
|
if (VCFTool.isTransition(record1)) { shared_ts += 1; }
|
||||||
|
else { shared_tv += 1; }
|
||||||
|
if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { shared_dbsnp += 1; }
|
||||||
|
shared_total += 1;
|
||||||
|
|
||||||
|
|
||||||
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
|
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
|
||||||
List<VCFGenotypeRecord> genotypes2 = record2.getVCFGenotypeRecords();
|
List<VCFGenotypeRecord> genotypes2 = record2.getVCFGenotypeRecords();
|
||||||
|
|
||||||
|
|
@ -908,31 +968,59 @@ class VCFConcordance extends CommandLineProgram
|
||||||
|
|
||||||
if (verbose)
|
if (verbose)
|
||||||
{
|
{
|
||||||
output.printf("SNP " + SNP.toString());
|
//output.printf("SNP " + SNP.toString());
|
||||||
|
output.printf("SNP " + SNP.toLine());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
|
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
|
||||||
AAF.get(n_alt).add(SNP);
|
AAF.get(n_alt).add(SNP);
|
||||||
|
|
||||||
|
long r2_index = (long)(r2_1 / r2_bin_size);
|
||||||
|
if (! R2.containsKey(r2_index)) { R2.put(r2_index, new GenotypeConcordance(Double.toString(r2_1))); }
|
||||||
|
R2.get(r2_index).add(SNP);
|
||||||
|
|
||||||
|
//System.out.printf("DBG: %f %f\n", r2_1, r2_2);
|
||||||
|
//System.out.printf("DBG: %f %d %s\n", r2_1, r2_index, SNP.toString());
|
||||||
|
|
||||||
record1 = reader1.next();
|
record1 = reader1.next();
|
||||||
record2 = reader2.next();
|
record2 = reader2.next();
|
||||||
}
|
}
|
||||||
else if (comparison > 0)
|
else if (comparison > 0)
|
||||||
{
|
{
|
||||||
|
if (record2.isFiltered()) { record2 = reader2.next(); continue; }
|
||||||
|
|
||||||
// interval1 is later than interval2.
|
// interval1 is later than interval2.
|
||||||
|
Map<String,String> info2 = record2.getInfoValues();
|
||||||
|
number_sites_unique_to_file2 += 1;
|
||||||
|
if (VCFTool.isTransition(record2)) { unique2_ts += 1; }
|
||||||
|
else { unique2_tv += 1; }
|
||||||
|
if ((info2.get("DB") != null) && (Integer.parseInt(info2.get("DB")) == 1)) { unique2_dbsnp += 1; }
|
||||||
|
unique2_total += 1;
|
||||||
|
|
||||||
|
//if (verbose) { output.printf("DBG: skipping %s\n", record2.toStringEncoding(header2)); }
|
||||||
|
|
||||||
record2 = reader2.next();
|
record2 = reader2.next();
|
||||||
}
|
}
|
||||||
else if (comparison < 0)
|
else if (comparison < 0)
|
||||||
{
|
{
|
||||||
|
if (record1.isFiltered()) { record1 = reader1.next(); continue; }
|
||||||
|
|
||||||
// interval2 is later than interval1.
|
// interval2 is later than interval1.
|
||||||
|
Map<String,String> info1 = record1.getInfoValues();
|
||||||
|
number_sites_unique_to_file1 += 1;
|
||||||
|
if (VCFTool.isTransition(record1)) { unique1_ts += 1; }
|
||||||
|
else { unique1_tv += 1; }
|
||||||
|
if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { unique1_dbsnp += 1; }
|
||||||
|
unique1_total += 1;
|
||||||
|
|
||||||
|
//if (verbose) { output.printf("DBG: skipping %s\n", record1.toStringEncoding(header1)); }
|
||||||
|
|
||||||
record1 = reader1.next();
|
record1 = reader1.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Now output the statistics.
|
// Now output the statistics.
|
||||||
{
|
|
||||||
if (verbose)
|
if (verbose)
|
||||||
{
|
{
|
||||||
output.printf("\n");
|
output.printf("\n");
|
||||||
|
|
@ -959,8 +1047,28 @@ class VCFConcordance extends CommandLineProgram
|
||||||
output.print("QUAL " + Qual.get(qual).toString());
|
output.print("QUAL " + Qual.get(qual).toString());
|
||||||
}
|
}
|
||||||
output.printf("\n");
|
output.printf("\n");
|
||||||
|
|
||||||
|
output.printf("\n");
|
||||||
|
Object[] R2s = R2.keySet().toArray();
|
||||||
|
for (int i = 0; i < AAFs.length; i++)
|
||||||
|
{
|
||||||
|
Long r2 = (Long)R2s[i];
|
||||||
|
output.print("R2 " + R2.get(r2).toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
output.printf("Number of sites shared : %d %f %f\n", number_sites_shared,
|
||||||
|
(double)shared_ts/(double)shared_tv,
|
||||||
|
(double)shared_dbsnp/(double)(shared_ts+shared_tv));
|
||||||
|
|
||||||
|
output.printf("Number of sites unique to %s: %d %f %f\n", filename1, number_sites_unique_to_file1,
|
||||||
|
(double)unique1_ts/(double)unique1_tv,
|
||||||
|
(double)unique1_dbsnp/(double)(unique1_ts+unique1_tv));
|
||||||
|
|
||||||
|
output.printf("Number of sites unique to %s: %d %f %f\n", filename2, number_sites_unique_to_file2,
|
||||||
|
(double)unique2_ts/(double)unique2_tv,
|
||||||
|
(double)unique2_dbsnp/(double)(unique2_ts+unique2_tv));
|
||||||
|
|
||||||
output.printf("\n");
|
output.printf("\n");
|
||||||
Object[] individuals = individual.keySet().toArray();
|
Object[] individuals = individual.keySet().toArray();
|
||||||
for (int i = 0; i < individuals.length; i++)
|
for (int i = 0; i < individuals.length; i++)
|
||||||
|
|
@ -974,7 +1082,7 @@ class VCFConcordance extends CommandLineProgram
|
||||||
for (int i = 0; i < AAFs.length; i++)
|
for (int i = 0; i < AAFs.length; i++)
|
||||||
{
|
{
|
||||||
Long aaf = (Long)AAFs[i];
|
Long aaf = (Long)AAFs[i];
|
||||||
output.printf("AAF %d %f %d %d\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef());
|
output.printf("AAF %d %f %d %d %f\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef(), AAF.get(aaf).hetErrorRate());
|
||||||
}
|
}
|
||||||
|
|
||||||
output.printf("\n");
|
output.printf("\n");
|
||||||
|
|
@ -985,7 +1093,13 @@ class VCFConcordance extends CommandLineProgram
|
||||||
output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef());
|
output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
output.printf("\n");
|
||||||
|
Object[] R2s = R2.keySet().toArray();
|
||||||
|
for (int i = 0; i < R2s.length; i++)
|
||||||
|
{
|
||||||
|
Long r2 = (Long)R2s[i];
|
||||||
|
output.printf("R2 %f %f %d %d\n", (double)r2 * r2_bin_size, R2.get(r2).errorRate(), R2.get(r2).total(), R2.get(r2).totalNonHomRef());
|
||||||
|
}
|
||||||
|
|
||||||
output.flush();
|
output.flush();
|
||||||
output.close();
|
output.close();
|
||||||
|
|
@ -995,11 +1109,14 @@ class VCFConcordance extends CommandLineProgram
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public class VCFTool
|
public class VCFTool
|
||||||
{
|
{
|
||||||
public static void main(String args[])
|
public static void main(String args[])
|
||||||
{
|
{
|
||||||
|
// silence log4j messages.
|
||||||
|
//appender = new FileAppender(layout, clp.toFile, false);
|
||||||
|
//logger.addAppender(appender);
|
||||||
|
|
||||||
SetupSequenceDictionary();
|
SetupSequenceDictionary();
|
||||||
|
|
||||||
String mode = args[0];
|
String mode = args[0];
|
||||||
|
|
@ -1096,6 +1213,13 @@ public class VCFTool
|
||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mode.equals("merge"))
|
||||||
|
{
|
||||||
|
VCFMerge cm = new VCFMerge();
|
||||||
|
CommandLineProgram.start(cm,realArgs);
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
System.out.printf("ERROR: mode %s not defined.\n", mode);
|
System.out.printf("ERROR: mode %s not defined.\n", mode);
|
||||||
System.exit(-1);
|
System.exit(-1);
|
||||||
|
|
||||||
|
|
@ -1386,5 +1510,35 @@ public class VCFTool
|
||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int compareIntervals(Interval a, Interval b)
|
||||||
|
{
|
||||||
|
int chr_a;
|
||||||
|
int chr_b;
|
||||||
|
|
||||||
|
if (a.getSequence().equals("X")) { chr_a = 23; }
|
||||||
|
else if (a.getSequence().equals("Y")) { chr_a = 24; }
|
||||||
|
else if (a.getSequence().equals("M")) { chr_a = 25; }
|
||||||
|
else { chr_a = Integer.parseInt(a.getSequence()); }
|
||||||
|
|
||||||
|
if (b.getSequence().equals("X")) { chr_b = 23; }
|
||||||
|
else if (b.getSequence().equals("Y")) { chr_b = 24; }
|
||||||
|
else if (b.getSequence().equals("M")) { chr_b = 25; }
|
||||||
|
else { chr_b = Integer.parseInt(b.getSequence()); }
|
||||||
|
|
||||||
|
int start_a = a.getStart();
|
||||||
|
int start_b = b.getStart();
|
||||||
|
|
||||||
|
int end_a = a.getEnd();
|
||||||
|
int end_b = b.getEnd();
|
||||||
|
|
||||||
|
if (chr_a < chr_b) { return -1; }
|
||||||
|
else if (chr_a > chr_b) { return 1; }
|
||||||
|
else if (start_a < start_b) { return -1; }
|
||||||
|
else if (start_a > start_b) { return 1; }
|
||||||
|
else if (end_a < end_b) { return -1; }
|
||||||
|
else if (end_a > end_b) { return 1; }
|
||||||
|
else { return 0; }
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue