lots of new stuff, some generally useful, some one-off.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2673 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
78890c0bee
commit
877957761f
|
|
@ -157,6 +157,37 @@ class VCFHomogenizer extends InputStream
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/////////
|
||||||
|
// Info-level corrections
|
||||||
|
|
||||||
|
String info = tokens[7];
|
||||||
|
String new_info = "";
|
||||||
|
String[] info_tokens = info.split(";");
|
||||||
|
for (int i = 0; i < info_tokens.length; i++)
|
||||||
|
{
|
||||||
|
|
||||||
|
// Fix the case where AC includes the ref count first.
|
||||||
|
if (info_tokens[i].startsWith("AC="))
|
||||||
|
{
|
||||||
|
String[] ACs = info_tokens[i].replaceAll("^AC=", "").split(",");
|
||||||
|
if (ACs.length == alts.length+1)
|
||||||
|
{
|
||||||
|
String new_ACs = "";
|
||||||
|
for (int j = 1; j < ACs.length; j++)
|
||||||
|
{
|
||||||
|
new_ACs += ACs[j];
|
||||||
|
if (j != (ACs.length-1)) { new_ACs += ","; }
|
||||||
|
}
|
||||||
|
info_tokens[i] = "AC=" + new_ACs;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
new_info += info_tokens[i];
|
||||||
|
if (i != (info_tokens.length-1)) { new_info += ";"; }
|
||||||
|
}
|
||||||
|
tokens[7] = new_info;
|
||||||
|
|
||||||
|
|
||||||
/////////
|
/////////
|
||||||
// Now put it back together and emit.
|
// Now put it back together and emit.
|
||||||
String output = tokens[0];
|
String output = tokens[0];
|
||||||
|
|
|
||||||
|
|
@ -66,11 +66,14 @@ class VCFSequenomAnalysis extends CommandLineProgram
|
||||||
VCFRecord record1 = reader1.next();
|
VCFRecord record1 = reader1.next();
|
||||||
VCFRecord record2 = reader2.next();
|
VCFRecord record2 = reader2.next();
|
||||||
|
|
||||||
|
|
||||||
while(true)
|
while(true)
|
||||||
{
|
{
|
||||||
if (record1 == null) { break; }
|
if (record1 == null) { break; }
|
||||||
if (record2 == null) { break; }
|
if (record2 == null) { break; }
|
||||||
|
|
||||||
|
String[] sample_names = record2.getSampleNames();
|
||||||
|
|
||||||
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
||||||
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
||||||
|
|
||||||
|
|
@ -97,7 +100,7 @@ class VCFSequenomAnalysis extends CommandLineProgram
|
||||||
int n_alt_sequenom = VCFTool.Compute_n_alt(record1);
|
int n_alt_sequenom = VCFTool.Compute_n_alt(record1);
|
||||||
int n_alt_sequencing = VCFTool.Compute_n_alt(record2);
|
int n_alt_sequencing = VCFTool.Compute_n_alt(record2);
|
||||||
|
|
||||||
double HWE_sequenom = VCFTool.Compute_HWE(record1);
|
double HWE_sequenom = VCFTool.Compute_HWE(record1, sample_names);
|
||||||
double HWE_sequencing = VCFTool.Compute_HWE(record2);
|
double HWE_sequencing = VCFTool.Compute_HWE(record2);
|
||||||
|
|
||||||
boolean isPolymorphic_sequenom = (n_alt_sequenom > 0) ? true : false;
|
boolean isPolymorphic_sequenom = (n_alt_sequenom > 0) ? true : false;
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||||
|
|
||||||
import edu.mit.broad.picard.util.Interval;
|
import edu.mit.broad.picard.util.Interval;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
@ -76,7 +77,6 @@ class VCFValidate extends CommandLineProgram
|
||||||
class VCFStats extends CommandLineProgram
|
class VCFStats extends CommandLineProgram
|
||||||
{
|
{
|
||||||
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
||||||
//@Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename;
|
|
||||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||||
@Argument(fullName = "locus", shortName = "locus", doc = "file listing loci to extract", required = true) public String locus_string;
|
@Argument(fullName = "locus", shortName = "locus", doc = "file listing loci to extract", required = true) public String locus_string;
|
||||||
|
|
||||||
|
|
@ -102,25 +102,56 @@ class VCFStats extends CommandLineProgram
|
||||||
// Stats collectors
|
// Stats collectors
|
||||||
int transitions = 0;
|
int transitions = 0;
|
||||||
int transversions = 0;
|
int transversions = 0;
|
||||||
|
int dbsnp = 0;
|
||||||
int total_snps = 0;
|
int total_snps = 0;
|
||||||
int[] AC_histogram = new int[1000]; int highest_AC = 0;
|
int[] AC_histogram = new int[1000]; int highest_AC = 0;
|
||||||
|
int[] DP_histogram = new int[1000000]; int highest_DP = 0;
|
||||||
|
|
||||||
|
int[] AC_transitions = new int[1000];
|
||||||
|
int[] DP_transitions = new int[1000];
|
||||||
|
|
||||||
|
int depth_sum = 0;
|
||||||
|
|
||||||
boolean before = true;
|
boolean before = true;
|
||||||
|
|
||||||
while(reader.hasNext())
|
while(reader.hasNext())
|
||||||
{
|
{
|
||||||
VCFRecord record = reader.next();
|
VCFRecord record = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
record = reader.next();
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
System.err.printf("WARNING: %s\n", e.toString());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
Interval this_locus = VCFTool.getIntervalFromRecord(record);
|
Interval this_locus = VCFTool.getIntervalFromRecord(record);
|
||||||
if (locus.intersects(this_locus))
|
if (locus.intersects(this_locus))
|
||||||
{
|
{
|
||||||
before = false;
|
before = false;
|
||||||
|
|
||||||
Map<String,String> info = record.getInfoValues();
|
Map<String,String> info = record.getInfoValues();
|
||||||
int AC = Integer.parseInt(info.get("AC"));
|
|
||||||
|
int AC = 0;
|
||||||
|
int DP = 0;
|
||||||
|
int DB = 0;
|
||||||
|
|
||||||
|
if (info.containsKey("AC")) { AC = Integer.parseInt(info.get("AC")); }
|
||||||
|
if (info.containsKey("DP")) { DP = Integer.parseInt(info.get("DP")); }
|
||||||
|
if (info.containsKey("DB")) { DB = Integer.parseInt(info.get("DB")); }
|
||||||
|
|
||||||
|
depth_sum += DP;
|
||||||
|
|
||||||
|
dbsnp += DB; // 1 if in dbsnp, 0 otherwise
|
||||||
|
|
||||||
AC_histogram[AC] += 1;
|
AC_histogram[AC] += 1;
|
||||||
if (AC > highest_AC) { highest_AC = AC; }
|
if (AC > highest_AC) { highest_AC = AC; }
|
||||||
|
|
||||||
if (VCFTool.isTransition(record)) { transitions += 1; }
|
DP_histogram[DP] += 1;
|
||||||
|
if (DP > highest_DP) { highest_DP = DP; }
|
||||||
|
|
||||||
|
if (VCFTool.isTransition(record)) { transitions += 1; AC_transitions[AC] += 1; DP_transitions[DP] += 1; }
|
||||||
else { transversions += 1; }
|
else { transversions += 1; }
|
||||||
|
|
||||||
total_snps += 1;
|
total_snps += 1;
|
||||||
|
|
@ -129,13 +160,49 @@ class VCFStats extends CommandLineProgram
|
||||||
else if ((before == false) && (this_locus.compareTo(locus) > 0)) { break; }
|
else if ((before == false) && (this_locus.compareTo(locus) > 0)) { break; }
|
||||||
}
|
}
|
||||||
|
|
||||||
System.out.printf("Total SNPs : %d\n", total_snps);
|
double mean_depth = (double)depth_sum / (double)total_snps;
|
||||||
System.out.printf("Ts/Tv : %.02f\n", (double)transitions / (double)transversions);
|
double snp_rate = 1.0 / ((double)total_snps / (double)locus.length());
|
||||||
|
|
||||||
|
int DP_running_sum = 0;
|
||||||
|
int DP_1percent_low = -1;
|
||||||
|
int DP_5percent_low = -1;
|
||||||
|
for (int DP = 1; DP <= highest_DP; DP++)
|
||||||
|
{
|
||||||
|
if ((DP_1percent_low == -1) && (DP_running_sum >= 0.01*(double)total_snps)) { DP_1percent_low = DP; }
|
||||||
|
if ((DP_5percent_low == -1) && (DP_running_sum >= 0.05*(double)total_snps)) { DP_5percent_low = DP; }
|
||||||
|
DP_running_sum += DP_histogram[DP];
|
||||||
|
}
|
||||||
|
|
||||||
|
DP_running_sum = 0;
|
||||||
|
int DP_1percent_high = -1;
|
||||||
|
int DP_5percent_high = -1;
|
||||||
|
for (int DP = highest_DP; DP >= 0; DP--)
|
||||||
|
{
|
||||||
|
if ((DP_1percent_high == -1) && (DP_running_sum >= 0.01*(double)total_snps)) { DP_1percent_high = DP; }
|
||||||
|
if ((DP_5percent_high == -1) && (DP_running_sum >= 0.05*(double)total_snps)) { DP_5percent_high = DP; }
|
||||||
|
DP_running_sum += DP_histogram[DP];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
System.out.printf("Locus : %s\n", locus.toString());
|
||||||
|
System.out.printf("Total SNPs : %d\n", total_snps);
|
||||||
|
System.out.printf("SNP Rate : 1/%f\n", snp_rate);
|
||||||
|
System.out.printf("Ts/Tv : %.02f\n", (double)transitions / (double)transversions);
|
||||||
|
System.out.printf("%%dbsnp : %.02f\n", 100.0 * (double)dbsnp / (double)total_snps);
|
||||||
|
System.out.printf("Average Depth : %f\n", mean_depth);
|
||||||
|
System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high);
|
||||||
|
System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high);
|
||||||
System.out.printf("\n");
|
System.out.printf("\n");
|
||||||
System.out.printf("AAF\tCount\n");
|
System.out.printf("table\tAAF\tCount\tTs/Tv\n");
|
||||||
for (int AC = 1; AC <= highest_AC; AC++)
|
for (int AC = 1; AC <= highest_AC; AC++)
|
||||||
{
|
{
|
||||||
System.out.printf("%d\t%d\n", AC, AC_histogram[AC]);
|
System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC]));
|
||||||
|
}
|
||||||
|
System.out.printf("\n");
|
||||||
|
System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n");
|
||||||
|
for (int DP = 1; DP <= highest_DP; DP++)
|
||||||
|
{
|
||||||
|
System.out.printf("%d\t%d\t%f\n", DP, DP_histogram[DP], (double)DP_transitions[DP]/(double)(DP_histogram[DP]-DP_transitions[DP]));
|
||||||
}
|
}
|
||||||
System.out.printf("\n");
|
System.out.printf("\n");
|
||||||
|
|
||||||
|
|
@ -307,7 +374,6 @@ class FixRefFields extends CommandLineProgram
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class VCFGrep extends CommandLineProgram
|
class VCFGrep extends CommandLineProgram
|
||||||
{
|
{
|
||||||
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
||||||
|
|
@ -315,6 +381,58 @@ class VCFGrep extends CommandLineProgram
|
||||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||||
@Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename;
|
@Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int execute()
|
||||||
|
{
|
||||||
|
HashSet<String> loci = new HashSet<String>();
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Scanner loci_reader = new Scanner(new File(loci_filename));
|
||||||
|
while(loci_reader.hasNextLine())
|
||||||
|
{
|
||||||
|
String line = loci_reader.nextLine();
|
||||||
|
line = line.replaceAll("\\s+", "");
|
||||||
|
loci.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
PrintStream output = new PrintStream(new File(out_filename));
|
||||||
|
|
||||||
|
Scanner reader = new Scanner(new File(in_filename));
|
||||||
|
while(reader.hasNextLine())
|
||||||
|
{
|
||||||
|
String line = reader.nextLine();
|
||||||
|
|
||||||
|
if (line.matches("^\\#.*$")) { output.print(line + "\n"); continue; }
|
||||||
|
|
||||||
|
String[] tokens = line.split("\\s+");
|
||||||
|
String locus = tokens[0] + ":" + tokens[1];
|
||||||
|
if (loci.contains(locus)) { output.print(line + "\n"); continue; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
class VCFGrep_old extends CommandLineProgram
|
||||||
|
{
|
||||||
|
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
||||||
|
@Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename;
|
||||||
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||||
|
@Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int execute()
|
protected int execute()
|
||||||
{
|
{
|
||||||
|
|
@ -579,193 +697,6 @@ class VCFSimpleStats extends CommandLineProgram
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
{
|
|
||||||
@Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1;
|
|
||||||
@Argument(fullName = "vcf2", shortName = "vcf2", doc = "file to open", required = true) public String filename2;
|
|
||||||
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
|
|
||||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
||||||
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected int execute()
|
|
||||||
{
|
|
||||||
//System.out.println("Loading " + filename + "...");
|
|
||||||
|
|
||||||
PrintStream output = null;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
output = new PrintStream(new FileOutputStream(output_filename));
|
|
||||||
}
|
|
||||||
catch (Exception e)
|
|
||||||
{
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
VCFReader reader1;
|
|
||||||
VCFReader reader2;
|
|
||||||
|
|
||||||
if (autocorrect)
|
|
||||||
{
|
|
||||||
reader1 = new VCFReader(VCFHomogenizer.create(filename1));
|
|
||||||
reader2 = new VCFReader(VCFHomogenizer.create(filename2));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
reader1 = new VCFReader(new File(filename1));
|
|
||||||
reader2 = new VCFReader(new File(filename2));
|
|
||||||
}
|
|
||||||
|
|
||||||
VCFHeader header1 = reader1.getHeader();
|
|
||||||
VCFHeader header2 = reader2.getHeader();
|
|
||||||
|
|
||||||
VCFRecord record1 = reader1.next();
|
|
||||||
VCFRecord record2 = reader2.next();
|
|
||||||
|
|
||||||
int TP = 0;
|
|
||||||
int FP = 0;
|
|
||||||
int TN = 0;
|
|
||||||
int FN = 0;
|
|
||||||
int total = 0;
|
|
||||||
|
|
||||||
while(true)
|
|
||||||
{
|
|
||||||
if (record1 == null) { break; }
|
|
||||||
if (record2 == null) { break; }
|
|
||||||
|
|
||||||
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
|
||||||
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
|
||||||
|
|
||||||
int comparison = interval1.compareTo(interval2);
|
|
||||||
|
|
||||||
if (comparison == 0)
|
|
||||||
{
|
|
||||||
// records match! compute concordance.
|
|
||||||
|
|
||||||
// (unless one of them is "filtered")
|
|
||||||
if (record1.isFiltered() || record2.isFiltered())
|
|
||||||
{
|
|
||||||
record1 = reader1.next();
|
|
||||||
record2 = reader2.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
char ref = record1.getReferenceBase();
|
|
||||||
|
|
||||||
String[] sample_names1 = record1.getSampleNames();
|
|
||||||
String[] sample_names2 = record2.getSampleNames();
|
|
||||||
|
|
||||||
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
|
|
||||||
List<VCFGenotypeRecord> genotypes2 = record2.getVCFGenotypeRecords();
|
|
||||||
|
|
||||||
Map<String, VCFGenotypeRecord> map2 = new HashMap<String, VCFGenotypeRecord>();
|
|
||||||
for (int i = 0; i < genotypes2.size(); i++)
|
|
||||||
{
|
|
||||||
map2.put(genotypes2.get(i).getSampleName(), genotypes2.get(i));
|
|
||||||
}
|
|
||||||
|
|
||||||
long n_ref_1 = 0;
|
|
||||||
long n_alt_1 = 0;
|
|
||||||
|
|
||||||
long n_ref_2 = 0;
|
|
||||||
long n_alt_2 = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < sample_names1.length; i++)
|
|
||||||
{
|
|
||||||
VCFGenotypeRecord rec1 = genotypes1.get(i);
|
|
||||||
VCFGenotypeRecord rec2 = map2.get(sample_names1[i]);
|
|
||||||
|
|
||||||
if (rec2 == null) { continue; }
|
|
||||||
|
|
||||||
Long gq1;
|
|
||||||
|
|
||||||
if (rec1.getFields().get("GQ") != null)
|
|
||||||
{
|
|
||||||
Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ"));
|
|
||||||
gq1 = gq1_double.longValue();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
gq1 = 0L;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<VCFGenotypeEncoding> alleles1 = rec1.getAlleles();
|
|
||||||
List<VCFGenotypeEncoding> alleles2 = rec2.getAlleles();
|
|
||||||
|
|
||||||
String g1 = "";
|
|
||||||
String g2 = "";
|
|
||||||
|
|
||||||
for (int j = 0; j < alleles1.size(); j++) { g1 += alleles1.get(j).getBases(); }
|
|
||||||
for (int j = 0; j < alleles2.size(); j++) { g2 += alleles2.get(j).getBases(); }
|
|
||||||
|
|
||||||
char[] c1 = g1.toCharArray();
|
|
||||||
char[] c2 = g2.toCharArray();
|
|
||||||
|
|
||||||
Arrays.sort(c1);
|
|
||||||
Arrays.sort(c2);
|
|
||||||
|
|
||||||
g1 = new String(c1);
|
|
||||||
g2 = new String(c2);
|
|
||||||
|
|
||||||
if ((g1.equals("..")) ||
|
|
||||||
(g2.equals("..")))
|
|
||||||
{
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (g1.charAt(0) == ref) { n_ref_1 += 1; } else { n_alt_1 += 1; }
|
|
||||||
if (g1.charAt(1) == ref) { n_ref_1 += 1; } else { n_alt_1 += 1; }
|
|
||||||
|
|
||||||
if (g2.charAt(0) == ref) { n_ref_2 += 1; } else { n_alt_2 += 1; }
|
|
||||||
if (g2.charAt(1) == ref) { n_ref_2 += 1; } else { n_alt_2 += 1; }
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((n_alt_1 > 0) && (n_alt_2 > 0)) { TP += 1; }
|
|
||||||
if ((n_alt_1 > 0) && (n_alt_2 == 0)) { FP += 1; }
|
|
||||||
if ((n_alt_1 == 0) && (n_alt_2 > 0)) { FN += 1; }
|
|
||||||
if ((n_alt_1 == 0) && (n_alt_2 == 0)) { TN += 1; }
|
|
||||||
total += 1;
|
|
||||||
|
|
||||||
if (verbose)
|
|
||||||
{
|
|
||||||
output.printf("SNP "
|
|
||||||
+ interval1.toString()
|
|
||||||
+ " " + n_ref_1 + " " + n_alt_1
|
|
||||||
+ " " + n_ref_2 + " " + n_alt_2 + "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
record1 = reader1.next();
|
|
||||||
record2 = reader2.next();
|
|
||||||
}
|
|
||||||
else if (comparison > 0)
|
|
||||||
{
|
|
||||||
// interval1 is later than interval2.
|
|
||||||
record2 = reader2.next();
|
|
||||||
}
|
|
||||||
else if (comparison < 0)
|
|
||||||
{
|
|
||||||
// interval2 is later than interval1.
|
|
||||||
record1 = reader1.next();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Now output the statistics.
|
|
||||||
|
|
||||||
output.printf("TP FP TN FN\n%d(%f) %d(%f) %d(%f) %d(%f)\n",
|
|
||||||
TP, (double)TP/(double)total,
|
|
||||||
FP, (double)FP/(double)total,
|
|
||||||
TN, (double)TN/(double)total,
|
|
||||||
FN, (double)FN/(double)total);
|
|
||||||
|
|
||||||
output.flush();
|
|
||||||
output.close();
|
|
||||||
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
class VCFConcordance extends CommandLineProgram
|
class VCFConcordance extends CommandLineProgram
|
||||||
|
|
@ -775,7 +706,9 @@ class VCFConcordance extends CommandLineProgram
|
||||||
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
|
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
|
||||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||||
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
||||||
|
@Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false;
|
||||||
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
||||||
|
@Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
@ -792,7 +725,28 @@ class VCFConcordance extends CommandLineProgram
|
||||||
|
|
||||||
//
|
//
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
|
|
||||||
|
// Optional whitelist of sample names to score; stays empty when no samples file was given.
HashSet<String> sample_mask = new HashSet<String>();
if (samples_filename != null)
{
    Scanner samples_reader = null;
    try
    {
        samples_reader = new Scanner(new File(samples_filename));
        while (samples_reader.hasNextLine())
        {
            // Strings are immutable, so the trimmed value must be captured;
            // calling replaceAll() and ignoring the result leaves the whitespace in place.
            String sample = samples_reader.nextLine().replaceAll("^\\s+|\\s+$", "");
            if (sample.length() > 0) { sample_mask.add(sample); }
        }
    }
    catch (Exception e)
    {
        throw new RuntimeException(e);
    }
    finally
    {
        if (samples_reader != null) { samples_reader.close(); }
    }
}
|
||||||
|
|
||||||
|
|
||||||
PrintStream output = null;
|
PrintStream output = null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
|
@ -833,6 +787,8 @@ class VCFConcordance extends CommandLineProgram
|
||||||
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
||||||
|
|
||||||
int comparison = interval1.compareTo(interval2);
|
int comparison = interval1.compareTo(interval2);
|
||||||
|
|
||||||
|
//System.out.println("DBG: " + interval1 + " " + interval2);
|
||||||
|
|
||||||
if (comparison == 0)
|
if (comparison == 0)
|
||||||
{
|
{
|
||||||
|
|
@ -866,13 +822,19 @@ class VCFConcordance extends CommandLineProgram
|
||||||
|
|
||||||
for (int i = 0; i < sample_names1.length; i++)
|
for (int i = 0; i < sample_names1.length; i++)
|
||||||
{
|
{
|
||||||
|
if ((samples_filename != null) &&
|
||||||
|
(! sample_mask.contains(sample_names1[i])))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
VCFGenotypeRecord rec1 = genotypes1.get(i);
|
VCFGenotypeRecord rec1 = genotypes1.get(i);
|
||||||
VCFGenotypeRecord rec2 = map2.get(sample_names1[i]);
|
VCFGenotypeRecord rec2 = map2.get(sample_names1[i]);
|
||||||
|
|
||||||
if (rec2 == null) { continue; }
|
if (rec2 == null) { continue; }
|
||||||
|
|
||||||
Long gq1;
|
Long gq1;
|
||||||
|
|
||||||
if (rec1.getFields().get("GQ") != null)
|
if (rec1.getFields().get("GQ") != null)
|
||||||
{
|
{
|
||||||
Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ"));
|
Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ"));
|
||||||
|
|
@ -883,6 +845,17 @@ class VCFConcordance extends CommandLineProgram
|
||||||
gq1 = 0L;
|
gq1 = 0L;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Long gq2;
|
||||||
|
if (rec2.getFields().get("GQ") != null)
|
||||||
|
{
|
||||||
|
Double gq2_double = Double.parseDouble(rec2.getFields().get("GQ"));
|
||||||
|
gq2 = gq2_double.longValue();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
gq2 = 0L;
|
||||||
|
}
|
||||||
|
|
||||||
List<VCFGenotypeEncoding> alleles1 = rec1.getAlleles();
|
List<VCFGenotypeEncoding> alleles1 = rec1.getAlleles();
|
||||||
List<VCFGenotypeEncoding> alleles2 = rec2.getAlleles();
|
List<VCFGenotypeEncoding> alleles2 = rec2.getAlleles();
|
||||||
|
|
||||||
|
|
@ -901,6 +874,20 @@ class VCFConcordance extends CommandLineProgram
|
||||||
g1 = new String(c1);
|
g1 = new String(c1);
|
||||||
g2 = new String(c2);
|
g2 = new String(c2);
|
||||||
|
|
||||||
|
if (list_genotypes)
|
||||||
|
{
|
||||||
|
String flag = "";
|
||||||
|
if (! g1.equals(g2)) { flag = "X"; }
|
||||||
|
output.printf("GENOTYPES "
|
||||||
|
+ interval1.toString()
|
||||||
|
+ " " + sample_names1[i]
|
||||||
|
+ " " + g1
|
||||||
|
+ " " + g2
|
||||||
|
+ " " + gq1
|
||||||
|
+ " " + gq2
|
||||||
|
+ " " + flag + "\n");
|
||||||
|
}
|
||||||
|
|
||||||
if ((g1.equals("..")) ||
|
if ((g1.equals("..")) ||
|
||||||
(g2.equals("..")))
|
(g2.equals("..")))
|
||||||
{
|
{
|
||||||
|
|
@ -917,18 +904,12 @@ class VCFConcordance extends CommandLineProgram
|
||||||
Qual.get(gq1).add(ref, g1, g2);
|
Qual.get(gq1).add(ref, g1, g2);
|
||||||
SNP.add(ref, g1, g2);
|
SNP.add(ref, g1, g2);
|
||||||
|
|
||||||
/*
|
|
||||||
if (verbose)
|
|
||||||
{
|
|
||||||
String flag = "";
|
|
||||||
if (! g1.equals(g2)) { flag = "X"; }
|
|
||||||
output.printf("GENOTYPES " + interval1.toString() + " " + sample_names1[i] + " " + g1 + " " + g2 + " " + flag + "\n");
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbose) { output.printf("SNP " + SNP.toString()); }
|
if (verbose)
|
||||||
|
{
|
||||||
|
output.printf("SNP " + SNP.toString());
|
||||||
|
}
|
||||||
|
|
||||||
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
|
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
|
||||||
AAF.get(n_alt).add(SNP);
|
AAF.get(n_alt).add(SNP);
|
||||||
|
|
@ -1019,6 +1000,8 @@ public class VCFTool
|
||||||
{
|
{
|
||||||
public static void main(String args[])
|
public static void main(String args[])
|
||||||
{
|
{
|
||||||
|
SetupSequenceDictionary();
|
||||||
|
|
||||||
String mode = args[0];
|
String mode = args[0];
|
||||||
String[] realArgs = Arrays.copyOfRange(args, 1, args.length);
|
String[] realArgs = Arrays.copyOfRange(args, 1, args.length);
|
||||||
|
|
||||||
|
|
@ -1099,6 +1082,20 @@ public class VCFTool
|
||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mode.equals("optimize"))
|
||||||
|
{
|
||||||
|
VCFOptimize cm = new VCFOptimize();
|
||||||
|
CommandLineProgram.start(cm,realArgs);
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mode.equals("apply_cuts"))
|
||||||
|
{
|
||||||
|
VCFApplyCuts cm = new VCFApplyCuts();
|
||||||
|
CommandLineProgram.start(cm,realArgs);
|
||||||
|
System.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
System.out.printf("ERROR: mode %s not defined.\n", mode);
|
System.out.printf("ERROR: mode %s not defined.\n", mode);
|
||||||
System.exit(-1);
|
System.exit(-1);
|
||||||
|
|
||||||
|
|
@ -1107,7 +1104,22 @@ public class VCFTool
|
||||||
|
|
||||||
/////////////////////////
|
/////////////////////////
|
||||||
// Some helpful utilities.
|
// Some helpful utilities.
|
||||||
|
|
||||||
|
// Total hack to set up a sequence dictionary for 1kG hg18/build36 without needing to load a fasta.
|
||||||
|
public static SAMSequenceDictionary dict;
|
||||||
|
public static void SetupSequenceDictionary()
|
||||||
|
{
|
||||||
|
dict = new SAMSequenceDictionary();
|
||||||
|
for (int i = 1; i <= 22; i++)
|
||||||
|
{
|
||||||
|
dict.addSequence(new SAMSequenceRecord(String.format("%d", i)));
|
||||||
|
}
|
||||||
|
dict.addSequence(new SAMSequenceRecord("X"));
|
||||||
|
dict.addSequence(new SAMSequenceRecord("Y"));
|
||||||
|
dict.addSequence(new SAMSequenceRecord("M"));
|
||||||
|
GenomeLocParser.setupRefContigOrdering(dict);
|
||||||
|
}
|
||||||
|
|
||||||
public static Interval getIntervalFromRecord(VCFRecord record)
|
public static Interval getIntervalFromRecord(VCFRecord record)
|
||||||
{
|
{
|
||||||
String chr = record.getLocation().getContig();
|
String chr = record.getLocation().getContig();
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue