/*
|
|
* Copyright (c) 2010 The Broad Institute
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person
|
|
* obtaining a copy of this software and associated documentation
|
|
* files (the "Software"), to deal in the Software without
|
|
* restriction, including without limitation the rights to use,
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following
|
|
* conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
package org.broadinstitute.sting.playground.tools.vcf;
|
|
import org.broad.tribble.vcf.VCFGenotypeEncoding;
|
|
import org.broad.tribble.vcf.VCFGenotypeRecord;
|
|
import org.broad.tribble.vcf.VCFHeader;
|
|
import org.broad.tribble.vcf.VCFRecord;
|
|
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
|
import org.broadinstitute.sting.commandline.Argument;
|
|
|
|
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
|
|
|
|
|
import java.io.*;
|
|
import java.util.*;
|
|
import java.util.zip.*;
|
|
|
|
import net.sf.picard.util.Interval;
|
|
import net.sf.picard.reference.ReferenceSequenceFileWalker;
|
|
import net.sf.samtools.SAMSequenceDictionary;
|
|
import net.sf.samtools.SAMSequenceRecord;
|
|
|
|
|
|
// First draft of a program for working with VCF files in various ways.
|
|
|
|
|
|
/**
|
|
* @author jmaguire
|
|
*/
|
|
|
|
|
|
class VCFValidate extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "print", shortName = "print", doc = "print the vcf records to output", required = false) public Boolean print = false;
|
|
@Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false;
|
|
@Argument(fullName = "out", shortName = "out", doc = "if --print, write to this file (default is /dev/stdout)", required = false) public String out = "/dev/stdout";
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
System.out.println("Validating " + filename + "...");
|
|
|
|
VCFReader reader = null;
|
|
|
|
if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); }
|
|
else { reader = new VCFReader(new File(filename)); }
|
|
|
|
VCFHeader header = reader.getHeader();
|
|
|
|
VCFWriter writer = null;
|
|
if (print)
|
|
{
|
|
writer = new VCFWriter(new File(out));
|
|
writer.writeHeader(header);
|
|
}
|
|
|
|
Date start_time = new Date();
|
|
int n_records_processed = 0;
|
|
while(reader.hasNext())
|
|
{
|
|
VCFRecord record = reader.next();
|
|
if (print) { writer.addRecord(record); }
|
|
|
|
if ((profile) && (n_records_processed % 10000 == 0))
|
|
{
|
|
Date current_time = new Date();
|
|
long elapsed = current_time.getTime() - start_time.getTime();
|
|
System.out.printf("RUNTIME: %d records processed in %f seconds; %f seconds per record.\n",
|
|
n_records_processed,
|
|
(double)elapsed/1000.0,
|
|
((double)elapsed/1000.0)/(double)n_records_processed);
|
|
}
|
|
n_records_processed += 1;
|
|
}
|
|
|
|
if (print) { writer.close(); }
|
|
|
|
if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); }
|
|
else { System.out.println(filename + " is VALID."); }
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
class VCFStats extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "locus", shortName = "locus", doc = "file listing loci to extract", required = true) public String locus_string;
|
|
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
VCFReader reader = null;
|
|
|
|
String[] tokens = locus_string.split("\\:|\\-");
|
|
String chr = tokens[0];
|
|
String start = tokens[1];
|
|
String stop = tokens[2];
|
|
Interval locus = new Interval(chr, Integer.parseInt(start), Integer.parseInt(stop));
|
|
|
|
if (autocorrect)
|
|
reader = new VCFReader(new File(in_filename),new VCFHomogenizer());
|
|
else
|
|
reader = new VCFReader(new File(in_filename));
|
|
|
|
VCFHeader header = reader.getHeader();
|
|
|
|
|
|
//////////////
|
|
// Stats collectors
|
|
int transitions = 0;
|
|
int transversions = 0;
|
|
int dbsnp = 0;
|
|
int total_snps = 0;
|
|
int[] AC_histogram = new int[1000]; int highest_AC = 0;
|
|
int[] DP_histogram = new int[1000000]; int highest_DP = 0;
|
|
|
|
int[] AC_transitions = new int[1000];
|
|
int[] DP_transitions = new int[1000];
|
|
|
|
int depth_sum = 0;
|
|
|
|
boolean before = true;
|
|
|
|
while(reader.hasNext())
|
|
{
|
|
VCFRecord record = null;
|
|
try
|
|
{
|
|
record = reader.next();
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
System.err.printf("WARNING: %s\n", e.toString());
|
|
continue;
|
|
}
|
|
Interval this_locus = VCFTool.getIntervalFromRecord(record);
|
|
if (locus.intersects(this_locus))
|
|
{
|
|
before = false;
|
|
|
|
Map<String,String> info = record.getInfoValues();
|
|
|
|
int AC = 0;
|
|
int DP = 0;
|
|
int DB = 0;
|
|
|
|
if (info.containsKey("AC")) { AC = Integer.parseInt(info.get("AC")); }
|
|
if (info.containsKey("DP")) { DP = Integer.parseInt(info.get("DP")); }
|
|
if (info.containsKey("DB")) { DB = Integer.parseInt(info.get("DB")); }
|
|
|
|
depth_sum += DP;
|
|
|
|
dbsnp += DB; // 1 if in dbsnp, 0 otherwise
|
|
|
|
AC_histogram[AC] += 1;
|
|
if (AC > highest_AC) { highest_AC = AC; }
|
|
|
|
DP_histogram[DP] += 1;
|
|
if (DP > highest_DP) { highest_DP = DP; }
|
|
|
|
if (VCFTool.isTransition(record)) { transitions += 1; AC_transitions[AC] += 1; DP_transitions[DP] += 1; }
|
|
else { transversions += 1; }
|
|
|
|
total_snps += 1;
|
|
//System.out.printf("%s\n", record.toStringEncoding(header));
|
|
}
|
|
else if ((before == false) && (this_locus.compareTo(locus) > 0)) { break; }
|
|
}
|
|
|
|
double mean_depth = (double)depth_sum / (double)total_snps;
|
|
double snp_rate = 1.0 / ((double)total_snps / (double)locus.length());
|
|
|
|
int DP_running_sum = 0;
|
|
int DP_1percent_low = -1;
|
|
int DP_5percent_low = -1;
|
|
for (int DP = 1; DP <= highest_DP; DP++)
|
|
{
|
|
if ((DP_1percent_low == -1) && (DP_running_sum >= 0.01*(double)total_snps)) { DP_1percent_low = DP; }
|
|
if ((DP_5percent_low == -1) && (DP_running_sum >= 0.05*(double)total_snps)) { DP_5percent_low = DP; }
|
|
DP_running_sum += DP_histogram[DP];
|
|
}
|
|
|
|
DP_running_sum = 0;
|
|
int DP_1percent_high = -1;
|
|
int DP_5percent_high = -1;
|
|
for (int DP = highest_DP; DP >= 0; DP--)
|
|
{
|
|
if ((DP_1percent_high == -1) && (DP_running_sum >= 0.01*(double)total_snps)) { DP_1percent_high = DP; }
|
|
if ((DP_5percent_high == -1) && (DP_running_sum >= 0.05*(double)total_snps)) { DP_5percent_high = DP; }
|
|
DP_running_sum += DP_histogram[DP];
|
|
}
|
|
|
|
|
|
System.out.printf("Locus : %s\n", locus.toString());
|
|
System.out.printf("Total SNPs : %d\n", total_snps);
|
|
System.out.printf("SNP Rate : 1/%f\n", snp_rate);
|
|
System.out.printf("Ts/Tv : %.02f\n", (double)transitions / (double)transversions);
|
|
System.out.printf("%%dbsnp : %.02f\n", 100.0 * (double)dbsnp / (double)total_snps);
|
|
System.out.printf("Average Depth : %f\n", mean_depth);
|
|
System.out.printf("1%% Depth bounds : %d %d\n", DP_1percent_low, DP_1percent_high);
|
|
System.out.printf("5%% Depth bounds : %d %d\n", DP_5percent_low, DP_5percent_high);
|
|
System.out.printf("\n");
|
|
|
|
System.out.printf("table\tAAF\tCount\tTs/Tv\n");
|
|
for (int AC = 1; AC <= highest_AC; AC++)
|
|
{
|
|
System.out.printf("AAF\t%d\t%d\t%f\n", AC, AC_histogram[AC], (double)AC_transitions[AC]/(double)(AC_histogram[AC]-AC_transitions[AC]));
|
|
}
|
|
System.out.printf("\n");
|
|
|
|
|
|
System.out.printf("DEPTH\ttable\tDepth\tCount\tTs/Tv\n");
|
|
for (int DP = 1; DP <= highest_DP; DP++)
|
|
{
|
|
System.out.printf("%d\t%d\t%f\n", DP, DP_histogram[DP], (double)DP_transitions[DP]/(double)(DP_histogram[DP]-DP_transitions[DP]));
|
|
}
|
|
System.out.printf("\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
}
|
|
|
|
class CheckRefFields extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "fasta", shortName = "fasta", doc = "reference FASTA", required = true) public String fasta_filename;
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
System.out.println("Checking " + filename + "...");
|
|
|
|
VCFReader reader = null;
|
|
|
|
if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); }
|
|
else { reader = new VCFReader(new File(filename)); }
|
|
|
|
ReferenceSequenceFileWalker ref = new ReferenceSequenceFileWalker(new File(fasta_filename));
|
|
String ref_seq_name = "";
|
|
byte[] ref_seq = null;
|
|
SAMSequenceDictionary ref_dict = ref.getSequenceDictionary();
|
|
|
|
VCFHeader header = reader.getHeader();
|
|
|
|
Date start_time = new Date();
|
|
int n_records_processed = 0;
|
|
while(reader.hasNext())
|
|
{
|
|
VCFRecord record = reader.next();
|
|
|
|
String chr = record.getChr();
|
|
if (! chr.equals(ref_seq_name))
|
|
{
|
|
System.out.println("Loading " + chr);
|
|
ref_seq = ref.get(ref_dict.getSequence(chr).getSequenceIndex()).getBases();
|
|
ref_seq_name = chr;
|
|
}
|
|
|
|
long offset = record.getStart();
|
|
char vcf_ref_base = record.getReference().charAt(0);
|
|
char fasta_ref_base = (char)ref_seq[(int)offset-1];
|
|
|
|
List<VCFGenotypeEncoding> alleles = record.getAlternateAlleles();
|
|
char vcf_alt_base = alleles.get(0).getBases().charAt(0);
|
|
|
|
//System.out.println(chr + " " + offset + " " + fasta_ref_base + " " + vcf_ref_base + " " + vcf_alt_base);
|
|
|
|
String ans = null;
|
|
if (vcf_ref_base != fasta_ref_base)
|
|
{
|
|
System.out.println("Error! Ref field does not match fasta. Fasta says " + fasta_ref_base);
|
|
System.out.println(record.toStringEncoding(header));
|
|
}
|
|
}
|
|
|
|
System.out.println("All reference fields correct.");
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
|
|
class FixRefFields extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "fasta", shortName = "fasta", doc = "reference FASTA", required = true) public String fasta_filename;
|
|
@Argument(fullName = "output", shortName = "output", doc = "output file", required = true) public String output_filename;
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
System.out.println("Fixing " + filename + "...");
|
|
|
|
VCFReader reader = null;
|
|
|
|
if (autocorrect) { reader = new VCFReader(new File(filename),new VCFHomogenizer()); }
|
|
else { reader = new VCFReader(new File(filename)); }
|
|
|
|
ReferenceSequenceFileWalker ref = new ReferenceSequenceFileWalker(new File(fasta_filename));
|
|
String ref_seq_name = "";
|
|
byte[] ref_seq = null;
|
|
SAMSequenceDictionary ref_dict = ref.getSequenceDictionary();
|
|
|
|
|
|
VCFHeader header = reader.getHeader();
|
|
|
|
PrintStream output;
|
|
try
|
|
{
|
|
VCFWriter writer = new VCFWriter(new File(output_filename));
|
|
writer.writeHeader(header);
|
|
writer.close();
|
|
output = new PrintStream(new FileOutputStream(output_filename, true));
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
Date start_time = new Date();
|
|
int n_records_processed = 0;
|
|
while(reader.hasNext())
|
|
{
|
|
VCFRecord record = reader.next();
|
|
|
|
String chr = record.getChr();
|
|
if (! chr.equals(ref_seq_name))
|
|
{
|
|
System.out.println("Loading " + chr);
|
|
ref_seq = ref.get(ref_dict.getSequence(chr).getSequenceIndex()).getBases();
|
|
ref_seq_name = chr;
|
|
}
|
|
|
|
long offset = record.getStart();
|
|
char vcf_ref_base = record.getReference().charAt(0);
|
|
char fasta_ref_base = (char)ref_seq[(int)offset-1];
|
|
|
|
List<VCFGenotypeEncoding> alleles = record.getAlternateAlleles();
|
|
char vcf_alt_base = alleles.get(0).getBases().charAt(0);
|
|
|
|
//System.out.println(chr + " " + offset + " " + fasta_ref_base + " " + vcf_ref_base + " " + vcf_alt_base);
|
|
|
|
String ans = null;
|
|
if ((vcf_ref_base != fasta_ref_base) && ((vcf_alt_base == fasta_ref_base) || (vcf_alt_base == '.')))
|
|
{
|
|
// swap!
|
|
String s = record.toStringEncoding(header);
|
|
String[] tokens = s.split("\\s+");
|
|
tokens[3] = Character.toString(fasta_ref_base);
|
|
tokens[4] = Character.toString(vcf_ref_base);
|
|
for (int i = 9; i < tokens.length; i++)
|
|
{
|
|
tokens[i] = tokens[i].replaceAll("0", "A");
|
|
tokens[i] = tokens[i].replaceAll("1", "B");
|
|
tokens[i] = tokens[i].replaceAll("B", "0");
|
|
tokens[i] = tokens[i].replaceAll("A", "1");
|
|
}
|
|
|
|
ans = "";
|
|
for (int i = 0; i < tokens.length; i++)
|
|
{
|
|
ans = ans + tokens[i] + "\t";
|
|
}
|
|
ans.replaceAll("\\s+$", "");
|
|
|
|
//System.out.println("from: " + s);
|
|
//System.out.println("to: " + ans);
|
|
}
|
|
else
|
|
{
|
|
ans = record.toStringEncoding(header);
|
|
}
|
|
|
|
output.println(ans);
|
|
}
|
|
|
|
output.flush();
|
|
output.close();
|
|
|
|
System.out.println("Done.");
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
class VCFGrep extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
|
@Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename;
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
HashSet<String> loci = new HashSet<String>();
|
|
try
|
|
{
|
|
Scanner loci_reader;
|
|
|
|
if (loci_filename.endsWith(".gz")) { loci_reader = new Scanner(new GZIPInputStream(new FileInputStream(loci_filename))); }
|
|
else { loci_reader = new Scanner(new File(loci_filename)); }
|
|
|
|
while(loci_reader.hasNextLine())
|
|
{
|
|
String line = loci_reader.nextLine();
|
|
line = line.replaceAll("\\s+", "");
|
|
loci.add(line);
|
|
}
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
try
|
|
{
|
|
PrintStream output = new PrintStream(new File(out_filename));
|
|
|
|
Scanner reader;
|
|
if (in_filename.endsWith(".gz")) { reader = new Scanner(new GZIPInputStream(new FileInputStream(in_filename))); }
|
|
else { reader = new Scanner(new File(in_filename)); }
|
|
while(reader.hasNextLine())
|
|
{
|
|
String line = reader.nextLine();
|
|
|
|
if (line.matches("^\\#.*$")) { output.print(line + "\n"); continue; }
|
|
|
|
String[] tokens = line.split("\\s+");
|
|
String locus = tokens[0] + ":" + tokens[1];
|
|
if (loci.contains(locus)) { output.print(line + "\n"); continue; }
|
|
}
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
}
|
|
|
|
class VCFGrep_old extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
|
|
@Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename;
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
VCFReader reader = null;
|
|
VCFWriter writer = null;
|
|
|
|
HashSet<Interval> loci = new HashSet<Interval>();
|
|
try
|
|
{
|
|
Scanner loci_reader = new Scanner(new File(loci_filename));
|
|
while(loci_reader.hasNextLine())
|
|
{
|
|
String line = loci_reader.nextLine();
|
|
String[] tokens = line.split("\\:");
|
|
|
|
String chr = tokens[0];
|
|
String off = tokens[1];
|
|
loci.add(new Interval(chr, Integer.parseInt(off), Integer.parseInt(off)));
|
|
}
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
if (autocorrect) { reader = new VCFReader(new File(in_filename),new VCFHomogenizer()); }
|
|
else { reader = new VCFReader(new File(in_filename)); }
|
|
|
|
|
|
writer = new VCFWriter(new File(out_filename));
|
|
writer.writeHeader(reader.getHeader());
|
|
|
|
while(reader.hasNext())
|
|
{
|
|
VCFRecord record = reader.next();
|
|
Interval locus = VCFTool.getIntervalFromRecord(record);
|
|
if (loci.contains(locus)) { writer.addRecord(record); }
|
|
}
|
|
writer.close();
|
|
|
|
return 0;
|
|
}
|
|
|
|
}
|
|
|
|
class PrintGQ extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
VCFReader reader;
|
|
VCFReader reader2;
|
|
|
|
reader = new VCFReader(new File(filename),new VCFHomogenizer());
|
|
|
|
VCFHeader header = reader.getHeader();
|
|
VCFRecord record = reader.next();
|
|
|
|
while(true)
|
|
{
|
|
if (record == null) { break; }
|
|
|
|
Interval interval = VCFTool.getIntervalFromRecord(record);
|
|
|
|
if (record.isFiltered())
|
|
{
|
|
record = reader.next();
|
|
}
|
|
|
|
char ref = record.getReference().charAt(0);
|
|
|
|
String[] sample_names = record.getSampleNames();
|
|
|
|
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
|
|
|
for (int i = 0; i < sample_names.length; i++)
|
|
{
|
|
VCFGenotypeRecord rec = genotypes.get(i);
|
|
|
|
String gq = rec.getFields().get("GQ");
|
|
|
|
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
|
|
|
String g = "";
|
|
|
|
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
|
char[] c = g.toCharArray();
|
|
|
|
Arrays.sort(c);
|
|
|
|
g = new String(c);
|
|
|
|
System.out.println(g + " " + gq);
|
|
}
|
|
|
|
record = reader.next();
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
class VCFSimpleStats extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1;
|
|
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
|
@Argument(fullName = "min_call_rate", shortName = "min_call_rate", doc = "what fraction of samples must have a call", required = false) public double min_call_rate = 0.9;
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
//System.out.println("Loading " + filename + "...");
|
|
|
|
PrintStream output = null;
|
|
try
|
|
{
|
|
output = new PrintStream(new FileOutputStream(output_filename));
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
VCFReader reader1;
|
|
|
|
if (autocorrect)
|
|
{
|
|
reader1 = new VCFReader(new File(filename1),new VCFHomogenizer());
|
|
}
|
|
else
|
|
{
|
|
reader1 = new VCFReader(new File(filename1));
|
|
}
|
|
|
|
VCFHeader header1 = reader1.getHeader();
|
|
|
|
VCFRecord record1 = reader1.next();
|
|
|
|
int TP = 0;
|
|
int FP = 0;
|
|
int TN = 0;
|
|
int FN = 0;
|
|
int total = 0;
|
|
int dropped = 0;
|
|
|
|
int ts = 0;
|
|
int tv = 0;
|
|
|
|
while(true)
|
|
{
|
|
if (record1 == null) { break; }
|
|
|
|
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
|
|
|
// (unless it is "filtered")
|
|
if (record1.isFiltered())
|
|
{
|
|
record1 = reader1.next();
|
|
}
|
|
|
|
char ref = record1.getReference().charAt(0);
|
|
|
|
|
|
String[] sample_names1 = record1.getSampleNames();
|
|
|
|
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
|
|
|
|
long n_ref_1 = 0;
|
|
long n_alt_1 = 0;
|
|
long n_total_1 = 0;
|
|
long n_calls_1 = 0;
|
|
long n_dropped_1 = 0;
|
|
|
|
for (int i = 0; i < sample_names1.length; i++)
|
|
{
|
|
VCFGenotypeRecord rec1 = genotypes1.get(i);
|
|
|
|
//if (rec2 == null) { continue; }
|
|
|
|
Long gq1;
|
|
|
|
if (rec1.getFields().get("GQ") != null)
|
|
{
|
|
Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ"));
|
|
gq1 = gq1_double.longValue();
|
|
}
|
|
else
|
|
{
|
|
gq1 = 0L;
|
|
}
|
|
|
|
List<VCFGenotypeEncoding> alleles1 = rec1.getAlleles();
|
|
|
|
String g1 = "";
|
|
|
|
for (int j = 0; j < alleles1.size(); j++) { g1 += alleles1.get(j).getBases(); }
|
|
|
|
char[] c1 = g1.toCharArray();
|
|
|
|
Arrays.sort(c1);
|
|
|
|
g1 = new String(c1);
|
|
|
|
n_total_1 += 1;
|
|
|
|
if (g1.equals(".."))
|
|
{
|
|
n_dropped_1 += 1;
|
|
continue;
|
|
}
|
|
|
|
n_calls_1 += 1;
|
|
|
|
if (g1.charAt(0) == ref) { n_ref_1 += 1; } else { n_alt_1 += 1; }
|
|
if (g1.charAt(1) == ref) { n_ref_1 += 1; } else { n_alt_1 += 1; }
|
|
}
|
|
|
|
if (((double)n_calls_1 / (double)n_total_1) >= min_call_rate)
|
|
{
|
|
if (n_alt_1 == 0) { FP += 1; }
|
|
if (n_alt_1 > 0) { TP += 1; }
|
|
total += 1;
|
|
|
|
if (VCFTool.isTransition(record1)) { ts += 1; }
|
|
else { tv += 1; }
|
|
}
|
|
else
|
|
{
|
|
dropped += 1;
|
|
}
|
|
|
|
if ((verbose) && (((double)n_calls_1 / (double)n_total_1) >= min_call_rate))
|
|
{
|
|
//output.printf("SNP "
|
|
// + interval1.toString()
|
|
// + " " + n_total_1 + " " + n_calls_1 + " " + (double)n_calls_1/(double)n_total_1 + " " + n_ref_1 + " " + n_alt_1 + "\n");
|
|
if (n_alt_1 == 0) { output.printf("FP: %s\n", interval1.toString()); }
|
|
if (n_alt_1 != 0) { output.printf("TP: %s\n", interval1.toString()); }
|
|
}
|
|
|
|
record1 = reader1.next();
|
|
}
|
|
|
|
|
|
// Now output the statistics.
|
|
|
|
output.printf("TP FP dropped ts tv ts/tv\n%d(%f) %d(%f) %d %d %d %f\n",
|
|
TP, (double)TP/(double)total,
|
|
FP, (double)FP/(double)total,
|
|
dropped,
|
|
ts, tv,
|
|
(double)ts/(double)tv);
|
|
|
|
output.flush();
|
|
output.close();
|
|
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
|
|
class VCFConcordance extends CommandLineProgram
|
|
{
|
|
@Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1;
|
|
@Argument(fullName = "vcf2", shortName = "vcf2", doc = "file to open", required = true) public String filename2;
|
|
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
|
|
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
|
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
|
@Argument(fullName = "list_genotypes", shortName = "list_genotypes", doc = "print each person's genotype for debugging", required = false) public Boolean list_genotypes = false;
|
|
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
|
@Argument(fullName = "samples", shortName = "samples", doc = "optional list of individuals to score", required = false) public String samples_filename = null;
|
|
@Argument(fullName = "r2_bin_size", shortName = "r2_bin_size", doc = "size of an r2 bin for calculating error rates", required = false) public double r2_bin_size = 0.01;
|
|
|
|
|
|
@Override
|
|
protected int execute()
|
|
{
|
|
//System.out.println("Loading " + filename + "...");
|
|
|
|
/////////////////////////////////
|
|
// All the various concordance counters
|
|
|
|
HashMap<String,GenotypeConcordance> individual = new HashMap<String,GenotypeConcordance>();
|
|
HashMap<Long,GenotypeConcordance> AAF = new HashMap<Long,GenotypeConcordance>();
|
|
HashMap<Long,GenotypeConcordance> Qual = new HashMap<Long,GenotypeConcordance>();
|
|
HashMap<Long,GenotypeConcordance> R2 = new HashMap<Long,GenotypeConcordance>();
|
|
|
|
int shared_ts = 0;
|
|
int shared_tv = 0;
|
|
int shared_dbsnp = 0;
|
|
int shared_total = 0;
|
|
|
|
int unique1_ts = 0;
|
|
int unique1_tv = 0;
|
|
int unique1_dbsnp = 0;
|
|
int unique1_total = 0;
|
|
|
|
int unique2_ts = 0;
|
|
int unique2_tv = 0;
|
|
int unique2_dbsnp = 0;
|
|
int unique2_total = 0;
|
|
|
|
//
|
|
/////////////////////////////////
|
|
|
|
HashSet<String> sample_mask = new HashSet<String>();
|
|
if (samples_filename != null)
|
|
{
|
|
Scanner samples_reader = null;
|
|
try
|
|
{
|
|
samples_reader = new Scanner(new File(samples_filename));
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
while(samples_reader.hasNextLine())
|
|
{
|
|
String line = samples_reader.nextLine();
|
|
line.replaceAll("^\\s+|\\s+$", "");
|
|
sample_mask.add(line);
|
|
}
|
|
}
|
|
|
|
|
|
PrintStream output = null;
|
|
try
|
|
{
|
|
output = new PrintStream(new FileOutputStream(output_filename));
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
throw new RuntimeException(e);
|
|
}
|
|
|
|
VCFReader reader1;
|
|
VCFReader reader2;
|
|
|
|
if (autocorrect)
|
|
{
|
|
reader1 = new VCFReader(new File(filename1),new VCFHomogenizer());
|
|
reader2 = new VCFReader(new File(filename2),new VCFHomogenizer());
|
|
}
|
|
else
|
|
{
|
|
reader1 = new VCFReader(new File(filename1));
|
|
reader2 = new VCFReader(new File(filename2));
|
|
}
|
|
|
|
VCFHeader header1 = reader1.getHeader();
|
|
VCFHeader header2 = reader2.getHeader();
|
|
|
|
VCFRecord record1 = reader1.next();
|
|
VCFRecord record2 = reader2.next();
|
|
|
|
int number_sites_unique_to_file1 = 0;
|
|
int number_sites_unique_to_file2 = 0;
|
|
int number_sites_shared = 0;
|
|
|
|
while(true)
|
|
{
|
|
if (record1 == null) { break; }
|
|
if (record2 == null) { break; }
|
|
|
|
|
|
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
|
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
|
|
|
//int comparison = interval1.compareTo(interval2);
|
|
int comparison = VCFTool.compareIntervals(interval1, interval2);
|
|
|
|
//System.out.println("DBG: " + interval1 + " " + interval2 + " " + comparison);
|
|
|
|
if (comparison == 0)
|
|
{
|
|
// records match! compute concordance.
|
|
|
|
// (unless one of them is "filtered")
|
|
if (record1.isFiltered() || record2.isFiltered())
|
|
{
|
|
record1 = reader1.next();
|
|
record2 = reader2.next();
|
|
continue;
|
|
}
|
|
|
|
|
|
char ref = record1.getReference().charAt(0);
|
|
|
|
String[] sample_names1 = record1.getSampleNames();
|
|
String[] sample_names2 = record2.getSampleNames();
|
|
|
|
|
|
Map<String,String> info1 = record1.getInfoValues();
|
|
Map<String,String> info2 = record2.getInfoValues();
|
|
double r2_1 = 0;
|
|
double r2_2 = 0;
|
|
if (info1.containsKey("R2")) { r2_1 = Double.parseDouble(info1.get("R2")); }
|
|
if (info2.containsKey("R2")) { r2_2 = Double.parseDouble(info2.get("R2")); }
|
|
|
|
|
|
number_sites_shared += 1;
|
|
if (VCFTool.isTransition(record1)) { shared_ts += 1; }
|
|
else { shared_tv += 1; }
|
|
if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { shared_dbsnp += 1; }
|
|
shared_total += 1;
|
|
|
|
|
|
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
|
|
List<VCFGenotypeRecord> genotypes2 = record2.getVCFGenotypeRecords();
|
|
|
|
Map<String, VCFGenotypeRecord> map2 = new HashMap<String, VCFGenotypeRecord>();
|
|
for (int i = 0; i < genotypes2.size(); i++)
|
|
{
|
|
map2.put(genotypes2.get(i).getSampleName(), genotypes2.get(i));
|
|
}
|
|
|
|
GenotypeConcordance SNP = new GenotypeConcordance(interval1.toString());
|
|
|
|
long n_ref = 0;
|
|
long n_alt = 0;
|
|
|
|
for (int i = 0; i < sample_names1.length; i++)
|
|
{
|
|
if ((samples_filename != null) &&
|
|
(! sample_mask.contains(sample_names1[i])))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
|
|
VCFGenotypeRecord rec1 = genotypes1.get(i);
|
|
VCFGenotypeRecord rec2 = map2.get(sample_names1[i]);
|
|
|
|
if (rec2 == null) { continue; }
|
|
|
|
Long gq1;
|
|
if (rec1.getFields().get("GQ") != null)
|
|
{
|
|
Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ"));
|
|
gq1 = gq1_double.longValue();
|
|
}
|
|
else
|
|
{
|
|
gq1 = 0L;
|
|
}
|
|
|
|
Long gq2;
|
|
if (rec2.getFields().get("GQ") != null)
|
|
{
|
|
Double gq2_double = Double.parseDouble(rec2.getFields().get("GQ"));
|
|
gq2 = gq2_double.longValue();
|
|
}
|
|
else
|
|
{
|
|
gq2 = 0L;
|
|
}
|
|
|
|
List<VCFGenotypeEncoding> alleles1 = rec1.getAlleles();
|
|
List<VCFGenotypeEncoding> alleles2 = rec2.getAlleles();
|
|
|
|
String g1 = "";
|
|
String g2 = "";
|
|
|
|
for (int j = 0; j < alleles1.size(); j++) { g1 += alleles1.get(j).getBases(); }
|
|
for (int j = 0; j < alleles2.size(); j++) { g2 += alleles2.get(j).getBases(); }
|
|
|
|
char[] c1 = g1.toCharArray();
|
|
char[] c2 = g2.toCharArray();
|
|
|
|
Arrays.sort(c1);
|
|
Arrays.sort(c2);
|
|
|
|
g1 = new String(c1);
|
|
g2 = new String(c2);
|
|
|
|
if (list_genotypes)
|
|
{
|
|
String flag = "";
|
|
if (! g1.equals(g2)) { flag = "X"; }
|
|
output.printf("GENOTYPES "
|
|
+ interval1.toString()
|
|
+ " " + sample_names1[i]
|
|
+ " " + g1
|
|
+ " " + g2
|
|
+ " " + gq1
|
|
+ " " + gq2
|
|
+ " " + flag + "\n");
|
|
}
|
|
|
|
if ((g1.equals("..")) ||
|
|
(g2.equals("..")))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (g1.charAt(0) == ref) { n_ref += 1; } else { n_alt += 1; }
|
|
if (g1.charAt(1) == ref) { n_ref += 1; } else { n_alt += 1; }
|
|
|
|
if (! individual.containsKey(sample_names1[i])) { individual.put(sample_names1[i], new GenotypeConcordance(sample_names1[i])); }
|
|
if (! Qual.containsKey(gq1)) { Qual.put(gq1, new GenotypeConcordance(Long.toString(gq1))); }
|
|
|
|
individual.get(sample_names1[i]).add(ref, g1, g2);
|
|
Qual.get(gq1).add(ref, g1, g2);
|
|
SNP.add(ref, g1, g2);
|
|
|
|
}
|
|
|
|
if (verbose)
|
|
{
|
|
//output.printf("SNP " + SNP.toString());
|
|
output.printf("SNP " + SNP.toLine());
|
|
}
|
|
|
|
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
|
|
AAF.get(n_alt).add(SNP);
|
|
|
|
long r2_index = (long)(r2_1 / r2_bin_size);
|
|
if (! R2.containsKey(r2_index)) { R2.put(r2_index, new GenotypeConcordance(Double.toString(r2_1))); }
|
|
R2.get(r2_index).add(SNP);
|
|
|
|
//System.out.printf("DBG: %f %f\n", r2_1, r2_2);
|
|
//System.out.printf("DBG: %f %d %s\n", r2_1, r2_index, SNP.toString());
|
|
|
|
record1 = reader1.next();
|
|
record2 = reader2.next();
|
|
}
|
|
else if (comparison > 0)
|
|
{
|
|
if (record2.isFiltered()) { record2 = reader2.next(); continue; }
|
|
|
|
// interval1 is later than interval2.
|
|
Map<String,String> info2 = record2.getInfoValues();
|
|
number_sites_unique_to_file2 += 1;
|
|
if (VCFTool.isTransition(record2)) { unique2_ts += 1; }
|
|
else { unique2_tv += 1; }
|
|
if ((info2.get("DB") != null) && (Integer.parseInt(info2.get("DB")) == 1)) { unique2_dbsnp += 1; }
|
|
unique2_total += 1;
|
|
|
|
//if (verbose) { output.printf("DBG: skipping %s\n", record2.toStringEncoding(header2)); }
|
|
|
|
record2 = reader2.next();
|
|
}
|
|
else if (comparison < 0)
|
|
{
|
|
if (record1.isFiltered()) { record1 = reader1.next(); continue; }
|
|
|
|
// interval2 is later than interval1.
|
|
Map<String,String> info1 = record1.getInfoValues();
|
|
number_sites_unique_to_file1 += 1;
|
|
if (VCFTool.isTransition(record1)) { unique1_ts += 1; }
|
|
else { unique1_tv += 1; }
|
|
if ((info1.get("DB") != null) && (Integer.parseInt(info1.get("DB")) == 1)) { unique1_dbsnp += 1; }
|
|
unique1_total += 1;
|
|
|
|
//if (verbose) { output.printf("DBG: skipping %s\n", record1.toStringEncoding(header1)); }
|
|
|
|
record1 = reader1.next();
|
|
}
|
|
}
|
|
|
|
|
|
// Now output the statistics.
|
|
if (verbose)
|
|
{
|
|
output.printf("\n");
|
|
Object[] individuals = individual.keySet().toArray();
|
|
for (int i = 0; i < individuals.length; i++)
|
|
{
|
|
String ind = (String)individuals[i];
|
|
output.print("INDIVIDUAL " + individual.get(ind).toString());
|
|
}
|
|
|
|
output.printf("\n");
|
|
Object[] AAFs = AAF.keySet().toArray();
|
|
for (int i = 0; i < AAFs.length; i++)
|
|
{
|
|
Long aaf = (Long)AAFs[i];
|
|
output.print("AAF " + AAF.get(aaf).toString());
|
|
}
|
|
|
|
output.printf("\n");
|
|
Object[] quals = Qual.keySet().toArray();
|
|
for (int i = 0; i < quals.length; i++)
|
|
{
|
|
Long qual = (Long)quals[i];
|
|
output.print("QUAL " + Qual.get(qual).toString());
|
|
}
|
|
output.printf("\n");
|
|
|
|
output.printf("\n");
|
|
Object[] R2s = R2.keySet().toArray();
|
|
for (int i = 0; i < AAFs.length; i++)
|
|
{
|
|
Long r2 = (Long)R2s[i];
|
|
output.print("R2 " + R2.get(r2).toString());
|
|
}
|
|
}
|
|
|
|
output.printf("Number of sites shared : %d %f %f\n", number_sites_shared,
|
|
(double)shared_ts/(double)shared_tv,
|
|
(double)shared_dbsnp/(double)(shared_ts+shared_tv));
|
|
|
|
output.printf("Number of sites unique to %s: %d %f %f\n", filename1, number_sites_unique_to_file1,
|
|
(double)unique1_ts/(double)unique1_tv,
|
|
(double)unique1_dbsnp/(double)(unique1_ts+unique1_tv));
|
|
|
|
output.printf("Number of sites unique to %s: %d %f %f\n", filename2, number_sites_unique_to_file2,
|
|
(double)unique2_ts/(double)unique2_tv,
|
|
(double)unique2_dbsnp/(double)(unique2_ts+unique2_tv));
|
|
|
|
output.printf("\n");
|
|
Object[] individuals = individual.keySet().toArray();
|
|
for (int i = 0; i < individuals.length; i++)
|
|
{
|
|
String ind = (String)individuals[i];
|
|
output.printf("INDIVIDUAL %s %f %d %d\n", ind, individual.get(ind).errorRate(), individual.get(ind).total(), individual.get(ind).totalNonHomRef());
|
|
}
|
|
|
|
output.printf("\n");
|
|
Object[] AAFs = AAF.keySet().toArray();
|
|
for (int i = 0; i < AAFs.length; i++)
|
|
{
|
|
Long aaf = (Long)AAFs[i];
|
|
output.printf("AAF %d %f %d %d %f\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef(), AAF.get(aaf).hetErrorRate());
|
|
}
|
|
|
|
output.printf("\n");
|
|
Object[] quals = Qual.keySet().toArray();
|
|
for (int i = 0; i < quals.length; i++)
|
|
{
|
|
Long qual = (Long)quals[i];
|
|
output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef());
|
|
}
|
|
|
|
output.printf("\n");
|
|
Object[] R2s = R2.keySet().toArray();
|
|
for (int i = 0; i < R2s.length; i++)
|
|
{
|
|
Long r2 = (Long)R2s[i];
|
|
output.printf("R2 %f %f %d %d\n", (double)r2 * r2_bin_size, R2.get(r2).errorRate(), R2.get(r2).total(), R2.get(r2).totalNonHomRef());
|
|
}
|
|
|
|
output.flush();
|
|
output.close();
|
|
|
|
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
public class VCFTool
|
|
{
|
|
public static void main(String args[])
|
|
{
|
|
// silence log4j messages.
|
|
//appender = new FileAppender(layout, clp.toFile, false);
|
|
//logger.addAppender(appender);
|
|
|
|
SetupSequenceDictionary();
|
|
|
|
String mode = args[0];
|
|
String[] realArgs = Arrays.copyOfRange(args, 1, args.length);
|
|
|
|
if (mode.equals("validate"))
|
|
{
|
|
VCFValidate cm = new VCFValidate();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("grep"))
|
|
{
|
|
VCFGrep cm = new VCFGrep();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("concordance"))
|
|
{
|
|
VCFConcordance cm = new VCFConcordance();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("simple_stats"))
|
|
{
|
|
VCFSimpleStats cm = new VCFSimpleStats();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("printGQ"))
|
|
{
|
|
PrintGQ cm = new PrintGQ();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("fix_ref_fields"))
|
|
{
|
|
FixRefFields cm = new FixRefFields();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("check_ref_fields"))
|
|
{
|
|
CheckRefFields cm = new CheckRefFields();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("stats"))
|
|
{
|
|
VCFStats cm = new VCFStats();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("sequenom"))
|
|
{
|
|
VCFSequenomAnalysis cm = new VCFSequenomAnalysis();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("sequenom2"))
|
|
{
|
|
VCFSequenomAnalysis2 cm = new VCFSequenomAnalysis2();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("call_rates"))
|
|
{
|
|
VCFCallRates cm = new VCFCallRates();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("optimize"))
|
|
{
|
|
VCFOptimize cm = new VCFOptimize();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("apply_cuts"))
|
|
{
|
|
VCFApplyCuts cm = new VCFApplyCuts();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
if (mode.equals("merge"))
|
|
{
|
|
VCFMerge cm = new VCFMerge();
|
|
CommandLineProgram.start(cm,realArgs);
|
|
System.exit(0);
|
|
}
|
|
|
|
System.out.printf("ERROR: mode %s not defined.\n", mode);
|
|
System.exit(-1);
|
|
|
|
}
|
|
|
|
|
|
/////////////////////////
|
|
// Some helpful utilities.
|
|
|
|
// Total hack to set up a sequence dictionary for 1kG hg18/build36 without needing to load a fasta.
|
|
public static SAMSequenceDictionary dict;
|
|
public static void SetupSequenceDictionary()
|
|
{
|
|
dict = new SAMSequenceDictionary();
|
|
for (int i = 1; i <= 22; i++)
|
|
{
|
|
dict.addSequence(new SAMSequenceRecord(String.format("%d", i)));
|
|
}
|
|
dict.addSequence(new SAMSequenceRecord("X"));
|
|
dict.addSequence(new SAMSequenceRecord("Y"));
|
|
dict.addSequence(new SAMSequenceRecord("M"));
|
|
GenomeLocParser.setupRefContigOrdering(dict);
|
|
}
|
|
|
|
public static Interval getIntervalFromRecord(VCFRecord record)
|
|
{
|
|
String chr = record.getChr();
|
|
long off = record.getStart();
|
|
return new Interval(chr, (int)off, (int)off);
|
|
}
|
|
|
|
public static char getAlt(VCFRecord record)
|
|
{
|
|
List<VCFGenotypeEncoding> alleles = record.getAlternateAlleles();
|
|
char alt = alleles.get(0).getBases().charAt(0);
|
|
return alt;
|
|
}
|
|
|
|
public static boolean isTransition(VCFRecord record)
|
|
{
|
|
char ref = record.getReference().charAt(0);
|
|
List<VCFGenotypeEncoding> alleles = record.getAlternateAlleles();
|
|
char alt = alleles.get(0).getBases().charAt(0);
|
|
|
|
if (((ref == 'A') && (alt == 'G')) ||
|
|
((ref == 'G') && (alt == 'A')) ||
|
|
((ref == 'C') && (alt == 'T')) ||
|
|
((ref == 'T') && (alt == 'C')))
|
|
{
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
|
|
public static int Compute_n_total(VCFRecord record)
|
|
{
|
|
return VCFTool.Compute_n_total(record, (String[])null);
|
|
}
|
|
|
|
public static int Compute_n_total(VCFRecord record, String[] sample_names)
|
|
{
|
|
HashSet<String> set = null;
|
|
if (sample_names != null)
|
|
{
|
|
set = new HashSet<String>();
|
|
for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); }
|
|
}
|
|
return VCFTool.Compute_n_total(record, set);
|
|
}
|
|
|
|
public static int Compute_n_total(VCFRecord record, Set<String> sample_mask)
|
|
{
|
|
String[] sample_names = record.getSampleNames();
|
|
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
|
int n_ref = 0;
|
|
int n_alt = 0;
|
|
for (int i = 0; i < sample_names.length; i++)
|
|
{
|
|
if ((sample_mask != null) && (! sample_mask.contains(sample_names[i])))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
VCFGenotypeRecord rec = genotypes.get(i);
|
|
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
|
String g = "";
|
|
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
|
char[] c = g.toCharArray();
|
|
Arrays.sort(c);
|
|
g = new String(c);
|
|
if (g.equals("..")) { continue; }
|
|
if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
}
|
|
return n_alt + n_ref;
|
|
}
|
|
|
|
public static int Compute_n_alt(VCFRecord record)
|
|
{
|
|
return VCFTool.Compute_n_alt(record, (String[])null);
|
|
}
|
|
|
|
public static int Compute_n_alt(VCFRecord record, String[] sample_names)
|
|
{
|
|
HashSet<String> set = null;
|
|
if (sample_names != null)
|
|
{
|
|
set = new HashSet<String>();
|
|
for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); }
|
|
}
|
|
return VCFTool.Compute_n_alt(record, set);
|
|
}
|
|
|
|
public static int Compute_n_alt(VCFRecord record, Set<String> sample_mask)
|
|
{
|
|
String[] sample_names = record.getSampleNames();
|
|
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
|
int n_ref = 0;
|
|
int n_alt = 0;
|
|
for (int i = 0; i < sample_names.length; i++)
|
|
{
|
|
// Skip samples we should skip.
|
|
if ((sample_mask != null) && (! sample_mask.contains(sample_names[i])))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
VCFGenotypeRecord rec = genotypes.get(i);
|
|
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
|
String g = "";
|
|
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
|
char[] c = g.toCharArray();
|
|
Arrays.sort(c);
|
|
g = new String(c);
|
|
if (g.equals("..")) { continue; }
|
|
if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
}
|
|
return n_alt;
|
|
}
|
|
|
|
|
|
public static int Compute_n_het(VCFRecord record)
|
|
{
|
|
return VCFTool.Compute_n_het(record, (String[])null);
|
|
}
|
|
|
|
public static int Compute_n_het(VCFRecord record, String[] sample_names)
|
|
{
|
|
HashSet<String> set = null;
|
|
if (sample_names != null)
|
|
{
|
|
set = new HashSet<String>();
|
|
for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); }
|
|
}
|
|
return VCFTool.Compute_n_het(record, set);
|
|
}
|
|
|
|
public static int Compute_n_het(VCFRecord record, Set<String> sample_mask)
|
|
{
|
|
String[] sample_names = record.getSampleNames();
|
|
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
|
int n_het = 0;
|
|
for (int i = 0; i < sample_names.length; i++)
|
|
{
|
|
// Skip samples we should skip.
|
|
if ((sample_mask != null) && (! sample_mask.contains(sample_names[i])))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
int n_ref = 0;
|
|
int n_alt = 0;
|
|
|
|
VCFGenotypeRecord rec = genotypes.get(i);
|
|
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
|
String g = "";
|
|
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
|
char[] c = g.toCharArray();
|
|
Arrays.sort(c);
|
|
g = new String(c);
|
|
if (g.equals("..")) { continue; }
|
|
if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
if (n_alt == 1) { n_het += 1; }
|
|
}
|
|
return n_het;
|
|
}
|
|
|
|
public static double Compute_failure_rate(VCFRecord record)
|
|
{
|
|
String[] sample_names = record.getSampleNames();
|
|
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
|
double failure_rate = 0.0;
|
|
for (int i = 0; i < sample_names.length; i++)
|
|
{
|
|
VCFGenotypeRecord rec = genotypes.get(i);
|
|
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
|
String g = "";
|
|
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
|
char[] c = g.toCharArray();
|
|
Arrays.sort(c);
|
|
g = new String(c);
|
|
if (g.equals("..")) { failure_rate += 1; continue; }
|
|
}
|
|
return failure_rate / (double)sample_names.length;
|
|
}
|
|
|
|
public static double Compute_HWE(VCFRecord record)
|
|
{
|
|
return VCFTool.Compute_HWE(record, (String[])null);
|
|
}
|
|
|
|
public static double Compute_HWE(VCFRecord record, String[] sample_names)
|
|
{
|
|
HashSet<String> set = null;
|
|
if (sample_names != null)
|
|
{
|
|
set = new HashSet<String>();
|
|
for (int i = 0; i < sample_names.length; i++) { set.add(sample_names[i]); }
|
|
}
|
|
return VCFTool.Compute_HWE(record, set);
|
|
}
|
|
|
|
public static double Compute_HWE(VCFRecord record, Set<String> sample_mask)
|
|
{
|
|
int ref = 0;
|
|
int het = 0;
|
|
int hom = 0;
|
|
int N = 0;
|
|
|
|
String[] sample_names = record.getSampleNames();
|
|
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
|
for (int i = 0; i < sample_names.length; i++)
|
|
{
|
|
// Skip samples we should skip.
|
|
if ((sample_mask != null) && (! sample_mask.contains(sample_names[i])))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
int n_ref = 0;
|
|
int n_alt = 0;
|
|
|
|
VCFGenotypeRecord rec = genotypes.get(i);
|
|
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
|
String g = "";
|
|
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
|
char[] c = g.toCharArray();
|
|
Arrays.sort(c);
|
|
g = new String(c);
|
|
if (g.equals("..")) { continue; }
|
|
if (g.charAt(0) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
if (g.charAt(1) == record.getReference().charAt(0)) { n_ref += 1; } else { n_alt += 1; }
|
|
|
|
if (n_ref == 2) { ref += 1; }
|
|
else if (n_ref == 1 && n_alt == 1) { het += 1; }
|
|
else if (n_alt == 2) { hom += 1; }
|
|
|
|
N += 1;
|
|
}
|
|
|
|
double p = (2.0 * ref + het) / (2.0 * (ref + het + hom));
|
|
double q = 1.0 - p;
|
|
|
|
//System.out.printf("DBG: p=%f q=%f ref=%d het=%d hom=%d\n", p, q, ref, het, hom);
|
|
|
|
double expected_ref = p * p * N;
|
|
double expected_het = 2.0 * p * q * N;
|
|
double expected_hom = q * q * N;
|
|
|
|
double chi_squared = (Math.pow(ref - expected_ref,2)/expected_ref) + (Math.pow(het - expected_het,2)/expected_het) + (Math.pow(hom - expected_hom,2)/expected_hom);
|
|
|
|
return chi_squared;
|
|
}
|
|
|
|
// This function assumes a 1-degree of freedom chi-squared.
|
|
public static double P_from_Chi(double chi)
|
|
{
|
|
double gamma = 1.772454;
|
|
double a = Math.pow(2,0.5) * gamma;
|
|
double b = Math.pow(chi, 0.5-1.0) * Math.exp((-1.0 * chi)/2.0);
|
|
double ans = (1.0/a) * b;
|
|
return ans;
|
|
}
|
|
|
|
public static int compareIntervals(Interval a, Interval b)
|
|
{
|
|
int chr_a;
|
|
int chr_b;
|
|
|
|
if (a.getSequence().equals("X")) { chr_a = 23; }
|
|
else if (a.getSequence().equals("Y")) { chr_a = 24; }
|
|
else if (a.getSequence().equals("M")) { chr_a = 25; }
|
|
else { chr_a = Integer.parseInt(a.getSequence()); }
|
|
|
|
if (b.getSequence().equals("X")) { chr_b = 23; }
|
|
else if (b.getSequence().equals("Y")) { chr_b = 24; }
|
|
else if (b.getSequence().equals("M")) { chr_b = 25; }
|
|
else { chr_b = Integer.parseInt(b.getSequence()); }
|
|
|
|
int start_a = a.getStart();
|
|
int start_b = b.getStart();
|
|
|
|
int end_a = a.getEnd();
|
|
int end_b = b.getEnd();
|
|
|
|
if (chr_a < chr_b) { return -1; }
|
|
else if (chr_a > chr_b) { return 1; }
|
|
else if (start_a < start_b) { return -1; }
|
|
else if (start_a > start_b) { return 1; }
|
|
else if (end_a < end_b) { return -1; }
|
|
else if (end_a > end_b) { return 1; }
|
|
else { return 0; }
|
|
}
|
|
|
|
}
|
|
|