VCFHomogenizer: A class that extends InputStream and dynamically re-writes pilot1 VCF's to be on-spec.

VCFTool: A command-line tool with various useful VCF functions (validate, grep, concordance).




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2202 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
jmaguire 2009-12-01 17:55:42 +00:00
parent adf8f1f8b3
commit 74f6526e09
2 changed files with 735 additions and 0 deletions

View File

@ -0,0 +1,217 @@
package org.broadinstitute.sting.playground.tools.vcf;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import edu.mit.broad.picard.util.Interval;
import java.io.*;
import java.util.*;
import java.util.zip.*;
// Edit a VCF on the fly to be on-spec.
/**
* @author jmaguire
*/
class VCFHomogenizer extends InputStream
{
private BufferedReader in = null;
private String currentLine;
private ByteArrayInputStream stream;
public VCFHomogenizer(Reader in)
{
this.in = (BufferedReader)in;
currentLine = this.readLine();
stream = new ByteArrayInputStream(currentLine.getBytes());
}
////////////////////////////////////////
// InputStream methods
public int available()
{
return 1;
}
public void close() throws java.io.IOException
{
this.in.close();
}
public void mark(int readlimit)
{
System.err.println("not implemented");
System.exit(-1);
}
public void reset()
{
System.err.println("not implemented");
System.exit(-1);
}
public boolean markSupported()
{
return false;
}
public int read()
{
if ((stream == null) || (stream.available() == 0))
{
currentLine = this.readLine();
if (currentLine == null) { return -1; }
stream = new ByteArrayInputStream(currentLine.getBytes());
}
return stream.read();
}
// END InputStream methods
/////////////////////////////////////
public static VCFHomogenizer create(String filename)
{
try
{
if (filename.endsWith(".gz"))
{
return new VCFHomogenizer(new BufferedReader(
new InputStreamReader(
new GZIPInputStream(
new FileInputStream(filename)))));
}
else
{
return new VCFHomogenizer(new BufferedReader(
new InputStreamReader(
new FileInputStream(filename))));
}
}
catch (Exception e)
{
e.printStackTrace();
System.exit(-1);
}
return null;
}
//my ($chr, $off, $id, $ref, $alt, $qual, $filter, $info, $format, @genotypes) = @tokens;
private String editLine(String input)
{
if (input == null) { return null; }
//System.out.println("input : " + input);
/////////
// Header corrections
if (input.startsWith("##format=VCFv3.2")) { return "##format=VCRv3.2\n"; }
if (input.startsWith("#CHROM")) { return input.replaceAll("PROB", "QUAL"); }
if (input.startsWith("#")) { return input; }
/////////
// Line-level corrections
// make "nan" into "NaN"
input = input.replaceAll("nan", "NaN");
input = input.replaceAll("DB(\\;|\\s)", "DB=1$1");
input = input.replaceAll("HM2(\\;|\\s)", "HM2=1$1");
input = input.replaceAll("HM3(\\;|\\s)", "HM3=1$1");
String[] tokens = input.split("\\s+");
/////////
// Token-level corrections
// if alt is "N", make it "."
if (tokens[4].equals("N")) { tokens[4] = "."; }
String ref = tokens[3];
String alt = tokens[4];
String[] alts = alt.split(",");
for (int i = 9; i < tokens.length; i++)
{
if (tokens[i].equals(".")) { tokens[i] = "./.:0"; }
tokens[i] = tokens[i].replaceAll(ref, "0");
if (! alt.equals("."))
{
if (alts.length == 1)
{
tokens[i] = tokens[i].replaceAll(alt, "1");
}
else
{
for (int j = 0; j < alts.length; j++)
{
tokens[i] = tokens[i].replaceAll(alts[j], "1");
}
}
}
}
/////////
// Now put it back together and emit.
String output = tokens[0];
for (int i = 1; i < tokens.length; i++)
{
output = output + "\t" + tokens[i];
}
output = output + "\n";
//System.out.println("output: " + output);
return output;
}
public String readLine()
{
try
{
String line = in.readLine();
if (line == null) { return null; }
else { return editLine(line + "\n"); }
}
catch (Exception e)
{
e.printStackTrace();
System.exit(-1);
}
return null;
}
/*
public int read()
{
throw new RuntimeException("VCFHomogenizer.read() not implemented.");
}
public byte read(byte[] b)
{
throw new RuntimeException("VCFHomogenizer.read(byte[]) not implemented.");
}
public int read(byte[] b, int off, int len)
{
throw new RuntimeException("VCFHomogenizer.read(byte[], int, int) not implemented.");
}
public long skip(long n)
{
throw new RuntimeException("VCFHomogenizer.skip(long) not implemented.");
}
public boolean markSupported() { return false; }
*/
}

View File

@ -0,0 +1,518 @@
package org.broadinstitute.sting.playground.tools.vcf;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import edu.mit.broad.picard.util.Interval;
import java.io.*;
import java.util.*;
// First draft of a program for working with VCF files in various ways.
/**
* @author jmaguire
*/
class VCFValidate extends CommandLineProgram
{
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "print", shortName = "print", doc = "print the vcf records to stdout", required = false) public Boolean print = false;
@Argument(fullName = "profile", shortName = "profile", doc = "print performance information", required = false) public Boolean profile = false;
@Override
protected int execute()
{
System.out.println("Validating " + filename + "...");
VCFReader reader = null;
if (autocorrect) { reader = new VCFReader(VCFHomogenizer.create(filename)); }
else { reader = new VCFReader(new File(filename)); }
VCFHeader header = reader.getHeader();
Date start_time = new Date();
int n_records_processed = 0;
while(reader.hasNext())
{
VCFRecord record = reader.next();
if (print) { System.out.println(record.toStringEncoding(header)); }
if ((profile) && (n_records_processed % 10000 == 0))
{
Date current_time = new Date();
long elapsed = current_time.getTime() - start_time.getTime();
System.out.printf("RUNTIME: %d records processed in %f seconds; %f seconds per record.\n",
n_records_processed,
(double)elapsed/1000.0,
((double)elapsed/1000.0)/(double)n_records_processed);
}
n_records_processed += 1;
}
if (autocorrect) { System.out.println(filename + " is VALID (after auto-correction)."); }
else { System.out.println(filename + " is VALID."); }
return 0;
}
}
class VCFGrep extends CommandLineProgram
{
@Argument(fullName = "input", shortName = "input", doc = "file to read", required = true) public String in_filename;
@Argument(fullName = "output", shortName = "output", doc = "file to write", required = true) public String out_filename;
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "loci", shortName = "loci", doc = "file listing loci to extract", required = true) public String loci_filename;
@Override
protected int execute()
{
VCFReader reader = null;
VCFWriter writer = null;
HashSet<Interval> loci = new HashSet<Interval>();
try
{
Scanner loci_reader = new Scanner(new File(loci_filename));
while(loci_reader.hasNextLine())
{
String line = loci_reader.nextLine();
String[] tokens = line.split("\\:");
String chr = tokens[0];
String off = tokens[1];
loci.add(new Interval(chr, Integer.parseInt(off), Integer.parseInt(off)));
}
}
catch (Exception e)
{
throw new RuntimeException(e);
}
if (autocorrect) { reader = new VCFReader(VCFHomogenizer.create(in_filename)); }
else { reader = new VCFReader(new File(in_filename)); }
VCFHeader header = reader.getHeader();
writer = new VCFWriter(header, new File(out_filename));
while(reader.hasNext())
{
VCFRecord record = reader.next();
Interval locus = VCFTool.getIntervalFromRecord(record);
if (loci.contains(locus)) { writer.addRecord(record); }
}
writer.close();
return 0;
}
}
class VCFConcordance extends CommandLineProgram
{
@Argument(fullName = "vcf1", shortName = "vcf1", doc = "file to open", required = true) public String filename1;
@Argument(fullName = "vcf2", shortName = "vcf2", doc = "file to open", required = true) public String filename2;
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
class GenotypeConcordance
{
String name;
protected int[][] counts = {{0,0,0},
{0,0,0},
{0,0,0}};
public GenotypeConcordance(String name)
{
this.name = name;
}
public void add(char ref, String g1, String g2)
{
int g1_dosage = 0;
int g2_dosage = 0;
if (g1.charAt(0) != ref) { g1_dosage += 1; }
if (g1.charAt(1) != ref) { g1_dosage += 1; }
if (g2.charAt(0) != ref) { g2_dosage += 1; }
if (g2.charAt(1) != ref) { g2_dosage += 1; }
counts[g1_dosage][g2_dosage] += 1;
}
public void add(GenotypeConcordance G)
{
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
counts[i][j] += G.counts[i][j];
}
}
}
public String toString()
{
String s = this.name + "\n";
int on_diag = 0;
int on_diag_not_homref = 0;
int off_diag = 0;
int total = 0;
int total_not_homref = 0;
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
s += counts[i][j] + "\t";
if (i == j) { on_diag += counts[i][j]; }
if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; }
if (i != j) { off_diag += counts[i][j]; }
if (i != 0 || j != 0) { total_not_homref += counts[i][j]; }
total += counts[i][j];
}
s += "\n";
}
s += String.format("On-Diagonal = %.02f\n", 100.0 * (double)on_diag / (double)total);
s += String.format("On-Diagonal (not hom-ref) = %.02f\n", 100.0 * (double)on_diag_not_homref / (double)total_not_homref);
s += String.format("Off-Diagonal = %.02f\n", 100.0 * (double)off_diag / (double)total_not_homref);
s += String.format("Total = %d\n", total);
s += String.format("Total (not hom-ref) = %d\n", total_not_homref);
s += "\n";
return s;
}
public double errorRate()
{
int off_diag = 0;
int total_not_homref = 0;
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
if (i != j) { off_diag += counts[i][j]; }
if (i != 0 || j != 0) { total_not_homref += counts[i][j]; }
}
}
double error_rate = (double)off_diag / (double)total_not_homref;
return error_rate;
}
public int total()
{
int total = 0;
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
total += counts[i][j];
}
}
return total;
}
public int totalNonHomRef()
{
int total = 0;
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
if (i != 0 || j != 0) { total += counts[i][j]; }
}
}
return total;
}
}
@Override
protected int execute()
{
//System.out.println("Loading " + filename + "...");
/////////////////////////////////
// All the various concordance counters
HashMap<String,GenotypeConcordance> individual = new HashMap<String,GenotypeConcordance>();
HashMap<Long,GenotypeConcordance> AAF = new HashMap<Long,GenotypeConcordance>();
HashMap<Long,GenotypeConcordance> Qual = new HashMap<Long,GenotypeConcordance>();
//
/////////////////////////////////
PrintStream output = null;
try
{
output = new PrintStream(new FileOutputStream(output_filename));
}
catch (Exception e)
{
throw new RuntimeException(e);
}
VCFReader reader1;
VCFReader reader2;
if (autocorrect)
{
reader1 = new VCFReader(VCFHomogenizer.create(filename1));
reader2 = new VCFReader(VCFHomogenizer.create(filename2));
}
else
{
reader1 = new VCFReader(new File(filename1));
reader2 = new VCFReader(new File(filename2));
}
VCFHeader header1 = reader1.getHeader();
VCFHeader header2 = reader2.getHeader();
VCFRecord record1 = reader1.next();
VCFRecord record2 = reader2.next();
while(true)
{
if (record1 == null) { break; }
if (record2 == null) { break; }
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
int comparison = interval1.compareTo(interval2);
if (comparison == 0)
{
// records match! compute concordance.
// (unless one of them is "filtered")
if (record1.isFiltered() || record2.isFiltered())
{
record1 = reader1.next();
record2 = reader2.next();
}
char ref = record1.getReferenceBase();
String[] sample_names1 = record1.getSampleNames();
String[] sample_names2 = record2.getSampleNames();
List<VCFGenotypeRecord> genotypes1 = record1.getVCFGenotypeRecords();
List<VCFGenotypeRecord> genotypes2 = record2.getVCFGenotypeRecords();
Map<String, VCFGenotypeRecord> map2 = new HashMap<String, VCFGenotypeRecord>();
for (int i = 0; i < genotypes2.size(); i++)
{
map2.put(genotypes2.get(i).getSampleName(), genotypes2.get(i));
}
GenotypeConcordance SNP = new GenotypeConcordance(interval1.toString());
long n_ref = 0;
long n_alt = 0;
for (int i = 0; i < sample_names1.length; i++)
{
VCFGenotypeRecord rec1 = genotypes1.get(i);
VCFGenotypeRecord rec2 = map2.get(sample_names1[i]);
if (rec2 == null) { continue; }
Long gq1;
if (rec1.getFields().get("GQ") != null)
{
Double gq1_double = Double.parseDouble(rec1.getFields().get("GQ"));
gq1 = gq1_double.longValue();
}
else
{
gq1 = 0L;
}
List<VCFGenotypeEncoding> alleles1 = rec1.getAlleles();
List<VCFGenotypeEncoding> alleles2 = rec2.getAlleles();
String g1 = "";
String g2 = "";
for (int j = 0; j < alleles1.size(); j++) { g1 += alleles1.get(j).getBases(); }
for (int j = 0; j < alleles2.size(); j++) { g2 += alleles2.get(j).getBases(); }
char[] c1 = g1.toCharArray();
char[] c2 = g2.toCharArray();
Arrays.sort(c1);
Arrays.sort(c2);
g1 = new String(c1);
g2 = new String(c2);
if ((g1.equals("..")) ||
(g2.equals("..")))
{
continue;
}
if (g1.charAt(0) == ref) { n_ref += 1; } else { n_alt += 1; }
if (g1.charAt(1) == ref) { n_ref += 1; } else { n_alt += 1; }
if (! individual.containsKey(sample_names1[i])) { individual.put(sample_names1[i], new GenotypeConcordance(sample_names1[i])); }
if (! Qual.containsKey(gq1)) { Qual.put(gq1, new GenotypeConcordance(Long.toString(gq1))); }
individual.get(sample_names1[i]).add(ref, g1, g2);
Qual.get(gq1).add(ref, g1, g2);
SNP.add(ref, g1, g2);
}
if (verbose) { output.printf("SNP " + SNP.toString()); }
if (! AAF.containsKey(n_alt)) { AAF.put(n_alt, new GenotypeConcordance(Long.toString(n_alt))); }
AAF.get(n_alt).add(SNP);
record1 = reader1.next();
record2 = reader2.next();
}
else if (comparison > 0)
{
// interval1 is later than interval2.
record2 = reader2.next();
}
else if (comparison < 0)
{
// interval2 is later than interval1.
record1 = reader1.next();
}
}
// Now output the statistics.
{
if (verbose)
{
output.printf("\n");
Object[] individuals = individual.keySet().toArray();
for (int i = 0; i < individuals.length; i++)
{
String ind = (String)individuals[i];
output.print("INDIVIDUAL " + individual.get(ind).toString());
}
output.printf("\n");
Object[] AAFs = AAF.keySet().toArray();
for (int i = 0; i < AAFs.length; i++)
{
Long aaf = (Long)AAFs[i];
output.print("AAF " + AAF.get(aaf).toString());
}
output.printf("\n");
Object[] quals = Qual.keySet().toArray();
for (int i = 0; i < quals.length; i++)
{
Long qual = (Long)quals[i];
output.print("QUAL " + Qual.get(qual).toString());
}
output.printf("\n");
}
output.printf("\n");
Object[] individuals = individual.keySet().toArray();
for (int i = 0; i < individuals.length; i++)
{
String ind = (String)individuals[i];
output.printf("INDIVIDUAL %s %f %d %d\n", ind, individual.get(ind).errorRate(), individual.get(ind).total(), individual.get(ind).totalNonHomRef());
}
output.printf("\n");
Object[] AAFs = AAF.keySet().toArray();
for (int i = 0; i < AAFs.length; i++)
{
Long aaf = (Long)AAFs[i];
output.printf("AAF %d %f %d %d\n", aaf, AAF.get(aaf).errorRate(), AAF.get(aaf).total(), AAF.get(aaf).totalNonHomRef());
}
output.printf("\n");
Object[] quals = Qual.keySet().toArray();
for (int i = 0; i < quals.length; i++)
{
Long qual = (Long)quals[i];
output.printf("QUAL %d %f %d %d\n", qual, Qual.get(qual).errorRate(), Qual.get(qual).total(), Qual.get(qual).totalNonHomRef());
}
}
output.flush();
output.close();
return 0;
}
}
public class VCFTool
{
public static void main(String args[])
{
String mode = args[0];
String[] realArgs = Arrays.copyOfRange(args, 1, args.length);
if (mode.equals("validate"))
{
VCFValidate cm = new VCFValidate();
CommandLineProgram.start(cm,realArgs);
System.exit(0);
}
if (mode.equals("grep"))
{
VCFGrep cm = new VCFGrep();
CommandLineProgram.start(cm,realArgs);
System.exit(0);
}
if (mode.equals("concordance"))
{
VCFConcordance cm = new VCFConcordance();
CommandLineProgram.start(cm,realArgs);
System.exit(0);
}
System.out.printf("ERROR: mode %s not defined.\n", mode);
System.exit(-1);
}
/////////////////////////
// Some helpful utilities.
public static Interval getIntervalFromRecord(VCFRecord record)
{
String chr = record.getChromosome();
long off = record.getPosition();
return new Interval(chr, (int)off, (int)off);
}
}