diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/GenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/tools/vcf/GenotypeConcordance.java index e42c6a81d..3161007d4 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/vcf/GenotypeConcordance.java +++ b/java/src/org/broadinstitute/sting/playground/tools/vcf/GenotypeConcordance.java @@ -56,6 +56,30 @@ class GenotypeConcordance } } + public String toLine() + { + int on_diag = 0; + int on_diag_not_homref = 0; + int off_diag = 0; + int total = 0; + int total_not_homref = 0; + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + if (i == j) { on_diag += counts[i][j]; } + if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; } + if (i != j) { off_diag += counts[i][j]; } + if (i != 0 || j != 0) { total_not_homref += counts[i][j]; } + total += counts[i][j]; + } + } + + String s = String.format("SNP %s %d %d %f %f\n", this.name, total, total_not_homref, this.errorRate(), this.hetErrorRate()); + return s; + } + public String toString() { String s = this.name + "\n"; @@ -108,6 +132,23 @@ class GenotypeConcordance return error_rate; } + public double hetErrorRate() + { + int true_hets = 0; + int correct_hets = 0; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + if (j == 1) { true_hets += counts[i][j]; } + } + } + correct_hets = counts[1][1]; + double het_error_rate = 1.0 - ((double)correct_hets / (double)true_hets); + return het_error_rate; + } + + public int total() { int total = 0; diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFHomogenizer.java b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFHomogenizer.java index e020f6288..cc0d72ce0 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFHomogenizer.java +++ b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFHomogenizer.java @@ -109,6 +109,9 @@ class VCFHomogenizer extends InputStream //System.out.println("input : " + input); + // Make it tab-delimited + input = input.replaceAll(" +", "\t"); + ///////// // Header corrections if (input.startsWith("##format=VCFv3.2")) { return "##format=VCRv3.2\n"; } @@ -131,6 +134,7 @@ class VCFHomogenizer extends InputStream // if alt is "N", make it "." if (tokens[4].equals("N")) { tokens[4] = "."; } + if (tokens[5].equals(".")) { tokens[5] = "-1"; } String ref = tokens[3]; String alt = tokens[4]; @@ -165,10 +169,15 @@ class VCFHomogenizer extends InputStream String[] info_tokens = info.split(";"); for (int i = 0; i < info_tokens.length; i++) { - - // Fix the case where AC includes the ref count first. - if (info_tokens[i].startsWith("AC=")) + if (info_tokens[i].startsWith("R2=")) { + // Fix NaN's in RNaN's in R2. + String new_token = info_tokens[i].replace("NaN", "0.0"); + info_tokens[i] = new_token; + } + else if (info_tokens[i].startsWith("AC=")) + { + // Fix the case where AC includes the ref count first. String[] ACs = info_tokens[i].replaceAll("^AC=", "").split(","); if (ACs.length == alts.length+1) { @@ -179,6 +188,7 @@ class VCFHomogenizer extends InputStream if (j != (ACs.length-1)) { new_ACs += ","; } } info_tokens[i] = "AC=" + new_ACs; + continue; } } diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFOptimize.java b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFOptimize.java index adde68227..4ed7448aa 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFOptimize.java +++ b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFOptimize.java @@ -25,7 +25,7 @@ import net.sf.samtools.SAMSequenceRecord; //import org.apache.commons.math.optimization.direct.*; //import org.apache.commons.math.analysis.MultivariateRealFunction; -// First draft of a program for working with VCF files in various ways. +// Program for frequency-specific VCF-files. /** @@ -39,9 +39,12 @@ class VCFOptimize extends CommandLineProgram @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; @Argument(fullName = "target_TsTv", shortName = "target_TsTv", doc = "Minimum acceptable TsTv", required=false) public double target_TsTv = 2.07; @Argument(fullName = "output", shortName = "output", doc = "file to write cuts to", required = true) public String output_filename; + @Argument(fullName = "min_calls", shortName = "min_calls", doc = "Minimum signifigant number of calls", required=false) public int min_calls = 100; + @Argument(fullName = "num_breaks", shortName = "num_breaks", doc = "Number of breaks to search over", required=false) public int num_breaks = 100; - // Debugging argument: + // Debugging arguments: @Argument(fullName = "n_records", shortName = "n_records", doc = "Number of records to load (debugging)", required=false) public int n_records_to_process = Integer.MAX_VALUE; + @Argument(fullName = "verbose", shortName = "verbose", doc = "print detailed debugging info.", required=false) public boolean verbose = false; class OptimizationRecord { @@ -187,11 +190,6 @@ class VCFOptimize extends CommandLineProgram // Just a simple grid search. private Cut optimize(OptimizationRecord[] records, double min_TsTv, int freq) { - double best_lod = Double.NaN; - double best_slod = Double.NaN; - double best_tstv = Double.NEGATIVE_INFINITY; - double best_num_calls = Double.NEGATIVE_INFINITY; - boolean flag = false; double[] lods = new double[records.length]; @@ -205,7 +203,6 @@ class VCFOptimize extends CommandLineProgram Arrays.sort(lods); Arrays.sort(slods); - int num_breaks = 100; double[] lod_breaks = new double[num_breaks]; double[] slod_breaks = new double[num_breaks]; int bin_size = 1 + (records.length / num_breaks); @@ -221,6 +218,19 @@ class VCFOptimize extends CommandLineProgram } //System.out.printf("\n"); + double best_lod = lod_breaks[0]; + double best_slod = slod_breaks[0]; + + int best_lod_idx = 0; + int best_slod_idx = 0; + + double[] point = new double[2]; + point[0] = best_lod; + point[1] = best_slod; + + double best_tstv = tstv(point, records); + double best_num_calls = num_calls(point, records); + boolean flag = false; //for (double lod = 0; lod < 8000; lod += 10) for (int lod_idx = 0; lod_idx < num_breaks; lod_idx += 1) @@ -231,17 +241,22 @@ class VCFOptimize extends CommandLineProgram { double slod = slod_breaks[slod_idx]; - double[] point = new double[2]; + point = new double[2]; point[0] = lod; point[1] = slod; double tstv = tstv(point, records); double num_calls = num_calls(point, records); + + if (num_calls < min_calls) { continue; } + if ((tstv >= min_TsTv) && (num_calls > best_num_calls)) { best_lod=lod; best_slod=slod; best_tstv=tstv; best_num_calls=num_calls; + best_lod_idx = lod_idx; + best_slod_idx = slod_idx; flag=true; } else if ((tstv >= best_tstv) && (!flag)) @@ -250,12 +265,24 @@ class VCFOptimize extends CommandLineProgram best_slod=slod; best_tstv=tstv; best_num_calls=num_calls; + best_lod_idx = lod_idx; + best_slod_idx = slod_idx; + } + + + if (verbose) + { + System.out.printf("DEBUG: %d | %d %d | %f %f %f %f | %f %f %f %f\n", + freq, + lod_idx, slod_idx, + lod, slod, num_calls, tstv, + best_lod, best_slod, best_num_calls, best_tstv); } } } //System.out.printf("Found optimum: lod=%f slod=%f num_calls=%f tstv=%f\n", best_lod, best_slod, best_num_calls, best_tstv); - System.out.printf("%d %f %f %f %f\n", freq, best_lod, best_slod, best_num_calls, best_tstv); + System.out.printf("%d %d %d %f %f %f %f\n", freq, best_lod_idx, best_slod_idx, best_lod, best_slod, best_num_calls, best_tstv); return new Cut(best_lod, best_slod); } diff --git a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java index 331a955a3..1ba3688a4 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java +++ b/java/src/org/broadinstitute/sting/playground/tools/vcf/VCFTool.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import java.io.*; import java.util.*; +import java.util.zip.*; import net.sf.picard.PicardException; import net.sf.picard.io.IoUtil; @@ -33,8 +34,9 @@ class VCFValidate extends CommandLineProgram { @Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename; @Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false; - @Argument(fullName = "print", shortName = "print", doc = "print the vcf records to stdout", required = false) public Boolean print = false; + @Argument(fullName = "print", shortName = "print", doc = "print the vcf records to output", required = false) public Boolean print = false; @Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false; + @Argument(fullName = "out", shortName = "out", doc = "if --print, write to this file (default is /dev/stdout)", required = false) public String out = "/dev/stdout"; @Override protected int execute() @@ -48,12 +50,19 @@ class VCFValidate extends CommandLineProgram VCFHeader header = reader.getHeader(); + VCFWriter writer = null; + if (print) + { + writer = new VCFWriter(new File(out)); + writer.writeHeader(header); + } + Date start_time = new Date(); int n_records_processed = 0; while(reader.hasNext()) { VCFRecord record = reader.next(); - if (print) { System.out.println(record.toStringEncoding(header)); } + if (print) { writer.addRecord(record); } if ((profile) && (n_records_processed % 10000 == 0)) { @@ -67,6 +76,8 @@ class VCFValidate extends CommandLineProgram n_records_processed += 1; } + if (print) { writer.close(); } + if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); } else { System.out.println(filename + " is VALID."); } @@ -193,12 +204,15 @@ class VCFStats extends CommandLineProgram System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high); System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high); System.out.printf("\n"); + System.out.printf("table\tAAF\tCount\tTs/Tv\n"); for (int AC = 1; AC <= highest_AC; AC++) { System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC])); } System.out.printf("\n"); + + System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n"); for (int DP = 1; DP <= highest_DP; DP++) { @@ -387,7 +401,11 @@ class VCFGrep extends CommandLineProgram HashSet loci = new HashSet(); try { - Scanner loci_reader = new Scanner(new File(loci_filename)); + Scanner loci_reader; + + if (loci_filename.endsWith(".gz")) { loci_reader = new Scanner(new GZIPInputStream(new FileInputStream(loci_filename))); } + else { loci_reader = new Scanner(new File(loci_filename)); } + while(loci_reader.hasNextLine()) { String line = loci_reader.nextLine(); @@ -404,7 +422,9 @@ class VCFGrep extends CommandLineProgram { PrintStream output = new PrintStream(new File(out_filename)); - Scanner reader = new Scanner(new File(in_filename)); + Scanner reader; + if (in_filename.endsWith(".gz")) { reader = new Scanner(new GZIPInputStream(new FileInputStream(in_filename))); } + else { reader = new Scanner(new File(in_filename)); } while(reader.hasNextLine()) { String line = reader.nextLine(); @@ -709,6 +729,7 @@ class VCFConcordance extends CommandLineProgram @Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false; @Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1; @Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null; + @Argument(fullName = "r2_bin_size", shortName = "r2_bin_size", doc = "size of an r2 bin for calculating error rates", required = false) public double r2_bin_size = 0.01; @Override @@ -722,6 +743,22 @@ class VCFConcordance extends CommandLineProgram HashMap individual = new HashMap(); HashMap AAF = new HashMap(); HashMap Qual = new HashMap(); + HashMap R2 = new HashMap(); + + int shared_ts = 0; + int shared_tv = 0; + int shared_dbsnp = 0; + int shared_total = 0; + + int unique1_ts = 0; + int unique1_tv = 0; + int unique1_dbsnp = 0; + int unique1_total = 0; + + int unique2_ts = 0; + int unique2_tv = 0; + int unique2_dbsnp = 0; + int unique2_total = 0; // ///////////////////////////////// @@ -777,6 +814,10 @@ class VCFConcordance extends CommandLineProgram VCFRecord record1 = reader1.next(); VCFRecord record2 = reader2.next(); + int number_sites_unique_to_file1 = 0; + int number_sites_unique_to_file2 = 0; + int number_sites_shared = 0; + while(true) { if (record1 == null) { break; } @@ -786,9 +827,10 @@ class VCFConcordance extends CommandLineProgram Interval interval1 = VCFTool.getIntervalFromRecord(record1); Interval interval2 = VCFTool.getIntervalFromRecord(record2); - int comparison = interval1.compareTo(interval2); + //int comparison = interval1.compareTo(interval2); + int comparison = VCFTool.compareIntervals(interval1, interval2); - //System.out.println("DBG: " + interval1 + " " + interval2); + //System.out.println("DBG: " + interval1 + " " + interval2 + " " + comparison); if (comparison == 0) { @@ -799,13 +841,31 @@ class VCFConcordance extends CommandLineProgram { record1 = reader1.next(); record2 = reader2.next(); + continue; } + char ref = record1.getReferenceBase(); String[] sample_names1 = record1.getSampleNames(); String[] sample_names2 = record2.getSampleNames(); + + Map info1 = record1.getInfoValues(); + Map info2 = record2.getInfoValues(); + double r2_1 = 0; + double r2_2 = 0; + if (info1.containsKey("R2")) { r2_1 = Double.parseDouble(info1.get("R2")); } + if (info2.containsKey("R2")) { r2_2 = Double.parseDouble(info2.get("R2")); } + + + number_sites_shared += 1; + if (VCFTool.isTransition(record1)) { shared_ts += 1; } + else { shared_tv += 1; } + if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { shared_dbsnp += 1; } + shared_total += 1; + + List genotypes1 = record1.getVCFGenotypeRecords(); List genotypes2 = record2.getVCFGenotypeRecords(); @@ -908,31 +968,59 @@ class VCFConcordance extends CommandLineProgram if (verbose) { - output.printf("SNP " + SNP.toString()); + //output.printf("SNP " + SNP.toString()); + output.printf("SNP " + SNP.toLine()); } if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); } AAF.get(n_alt).add(SNP); + long r2_index = (long)(r2_1 / r2_bin_size); + if (! R2.containsKey(r2_index)) { R2.put(r2_index, new GenotypeConcordance(Double.toString(r2_1))); } + R2.get(r2_index).add(SNP); + + //System.out.printf("DBG: %f %f\n", r2_1, r2_2); + //System.out.printf("DBG: %f %d %s\n", r2_1, r2_index, SNP.toString()); + record1 = reader1.next(); record2 = reader2.next(); } else if (comparison > 0) { + if (record2.isFiltered()) { record2 = reader2.next(); continue; } + // interval1 is later than interval2. + Map info2 = record2.getInfoValues(); + number_sites_unique_to_file2 += 1; + if (VCFTool.isTransition(record2)) { unique2_ts += 1; } + else { unique2_tv += 1; } + if ((info2.get("DB") != null) && (Integer.parseInt(info2.get("DB")) == 1)) { unique2_dbsnp += 1; } + unique2_total += 1; + + //if (verbose) { output.printf("DBG: skipping %s\n", record2.toStringEncoding(header2)); } + record2 = reader2.next(); } else if (comparison < 0) { + if (record1.isFiltered()) { record1 = reader1.next(); continue; } + // interval2 is later than interval1. + Map info1 = record1.getInfoValues(); + number_sites_unique_to_file1 += 1; + if (VCFTool.isTransition(record1)) { unique1_ts += 1; } + else { unique1_tv += 1; } + if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { unique1_dbsnp += 1; } + unique1_total += 1; + + //if (verbose) { output.printf("DBG: skipping %s\n", record1.toStringEncoding(header1)); } + record1 = reader1.next(); } - } // Now output the statistics. - { if (verbose) { output.printf("\n"); @@ -959,8 +1047,28 @@ class VCFConcordance extends CommandLineProgram output.print("QUAL " + Qual.get(qual).toString()); } output.printf("\n"); + + output.printf("\n"); + Object[] R2s = R2.keySet().toArray(); + for (int i = 0; i < AAFs.length; i++) + { + Long r2 = (Long)R2s[i]; + output.print("R2 " + R2.get(r2).toString()); + } } + output.printf("Number of sites shared : %d %f %f\n", number_sites_shared, + (double)shared_ts/(double)shared_tv, + (double)shared_dbsnp/(double)(shared_ts+shared_tv)); + + output.printf("Number of sites unique to %s: %d %f %f\n", filename1, number_sites_unique_to_file1, + (double)unique1_ts/(double)unique1_tv, + (double)unique1_dbsnp/(double)(unique1_ts+unique1_tv)); + + output.printf("Number of sites unique to %s: %d %f %f\n", filename2, number_sites_unique_to_file2, + (double)unique2_ts/(double)unique2_tv, + (double)unique2_dbsnp/(double)(unique2_ts+unique2_tv)); + output.printf("\n"); Object[] individuals = individual.keySet().toArray(); for (int i = 0; i < individuals.length; i++) @@ -974,7 +1082,7 @@ class VCFConcordance extends CommandLineProgram for (int i = 0; i < AAFs.length; i++) { Long aaf = (Long)AAFs[i]; - output.printf("AAF %d %f %d %d\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef()); + output.printf("AAF %d %f %d %d %f\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef(), AAF.get(aaf).hetErrorRate()); } output.printf("\n"); @@ -985,7 +1093,13 @@ class VCFConcordance extends CommandLineProgram output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef()); } - } + output.printf("\n"); + Object[] R2s = R2.keySet().toArray(); + for (int i = 0; i < R2s.length; i++) + { + Long r2 = (Long)R2s[i]; + output.printf("R2 %f %f %d %d\n", (double)r2 * r2_bin_size, R2.get(r2).errorRate(), R2.get(r2).total(), R2.get(r2).totalNonHomRef()); + } output.flush(); output.close(); @@ -995,11 +1109,14 @@ class VCFConcordance extends CommandLineProgram } } - public class VCFTool { public static void main(String args[]) { + // silence log4j messages. + //appender = new FileAppender(layout, clp.toFile, false); + //logger.addAppender(appender); + SetupSequenceDictionary(); String mode = args[0]; @@ -1096,6 +1213,13 @@ public class VCFTool System.exit(0); } + if (mode.equals("merge")) + { + VCFMerge cm = new VCFMerge(); + CommandLineProgram.start(cm,realArgs); + System.exit(0); + } + System.out.printf("ERROR: mode %s not defined.\n", mode); System.exit(-1); @@ -1386,5 +1510,35 @@ public class VCFTool return ans; } + public static int compareIntervals(Interval a, Interval b) + { + int chr_a; + int chr_b; + + if (a.getSequence().equals("X")) { chr_a = 23; } + else if (a.getSequence().equals("Y")) { chr_a = 24; } + else if (a.getSequence().equals("M")) { chr_a = 25; } + else { chr_a = Integer.parseInt(a.getSequence()); } + + if (b.getSequence().equals("X")) { chr_b = 23; } + else if (b.getSequence().equals("Y")) { chr_b = 24; } + else if (b.getSequence().equals("M")) { chr_b = 25; } + else { chr_b = Integer.parseInt(b.getSequence()); } + + int start_a = a.getStart(); + int start_b = b.getStart(); + + int end_a = a.getEnd(); + int end_b = b.getEnd(); + + if (chr_a < chr_b) { return -1; } + else if (chr_a > chr_b) { return 1; } + else if (start_a < start_b) { return -1; } + else if (start_a > start_b) { return 1; } + else if (end_a < end_b) { return -1; } + else if (end_a > end_b) { return 1; } + else { return 0; } + } + }