Various and sundry additions to VCF tools. Some useful to the general public, some one-offs.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2396 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
6fbf77be95
commit
8787dd4c5e
|
|
@ -0,0 +1,137 @@
|
|||
package org.broadinstitute.sting.playground.tools.vcf;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.picard.io.IoUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileWalker;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
class GenotypeConcordance
|
||||
{
|
||||
String name;
|
||||
|
||||
protected int[][] counts = {{0,0,0},
|
||||
{0,0,0},
|
||||
{0,0,0}};
|
||||
|
||||
public GenotypeConcordance(String name)
|
||||
{
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public void add(char ref, String g1, String g2)
|
||||
{
|
||||
int g1_dosage = 0;
|
||||
int g2_dosage = 0;
|
||||
|
||||
if (g1.charAt(0) != ref) { g1_dosage += 1; }
|
||||
if (g1.charAt(1) != ref) { g1_dosage += 1; }
|
||||
|
||||
if (g2.charAt(0) != ref) { g2_dosage += 1; }
|
||||
if (g2.charAt(1) != ref) { g2_dosage += 1; }
|
||||
|
||||
counts[g1_dosage][g2_dosage] += 1;
|
||||
}
|
||||
|
||||
public void add(GenotypeConcordance G)
|
||||
{
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
counts[i][j] += G.counts[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
String s = this.name + "\n";
|
||||
|
||||
int on_diag = 0;
|
||||
int on_diag_not_homref = 0;
|
||||
int off_diag = 0;
|
||||
int total = 0;
|
||||
int total_not_homref = 0;
|
||||
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
s += counts[i][j] + "\t";
|
||||
|
||||
if (i == j) { on_diag += counts[i][j]; }
|
||||
if (i == j && i != 0) { on_diag_not_homref += counts[i][j]; }
|
||||
if (i != j) { off_diag += counts[i][j]; }
|
||||
if (i != 0 || j != 0) { total_not_homref += counts[i][j]; }
|
||||
total += counts[i][j];
|
||||
}
|
||||
s += "\n";
|
||||
}
|
||||
|
||||
//s += String.format("On-Diagonal = %.02f\n", 100.0 * (double)on_diag / (double)total);
|
||||
//s += String.format("On-Diagonal (not hom-ref) = %.02f\n", 100.0 * (double)on_diag_not_homref / (double)total_not_homref);
|
||||
//s += String.format("Off-Diagonal = %.02f\n", 100.0 * (double)off_diag / (double)total_not_homref);
|
||||
s += String.format("Total = %d\n", total);
|
||||
s += String.format("Total (not hom-ref) = %d\n", total_not_homref);
|
||||
s += String.format("Error Rate = %f\n", this.errorRate());
|
||||
|
||||
s += "\n";
|
||||
return s;
|
||||
}
|
||||
|
||||
public double errorRate()
|
||||
{
|
||||
int off_diag = 0;
|
||||
int total_not_homref = 0;
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
if (i != j) { off_diag += counts[i][j]; }
|
||||
if (i != 0 || j != 0) { total_not_homref += counts[i][j]; }
|
||||
}
|
||||
}
|
||||
double error_rate = (double)off_diag / (double)total_not_homref;
|
||||
return error_rate;
|
||||
}
|
||||
|
||||
public int total()
|
||||
{
|
||||
int total = 0;
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
total += counts[i][j];
|
||||
}
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
public int totalNonHomRef()
|
||||
{
|
||||
int total = 0;
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
if (i != 0 || j != 0) { total += counts[i][j]; }
|
||||
}
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
package org.broadinstitute.sting.playground.tools.vcf;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.picard.io.IoUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileWalker;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
|
||||
|
||||
class VCFCallRates extends CommandLineProgram
|
||||
{
|
||||
@Argument(fullName = "vcf", shortName = "vcf", doc = "file to open", required = true) public String filename;
|
||||
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = true) public String output_filename;
|
||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = false;
|
||||
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
||||
@Argument(fullName = "min_call_rate", shortName = "min_call_rate", doc = "what fraction of samples must have a call", required = false) public double min_call_rate = 0.9;
|
||||
|
||||
@Override
|
||||
protected int execute()
|
||||
{
|
||||
//System.out.println("Loading " + filename + "...");
|
||||
|
||||
PrintStream output = null;
|
||||
try
|
||||
{
|
||||
output = new PrintStream(new FileOutputStream(output_filename));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
VCFReader reader;
|
||||
|
||||
if (autocorrect)
|
||||
{
|
||||
reader = new VCFReader(VCFHomogenizer.create(filename));
|
||||
}
|
||||
else
|
||||
{
|
||||
reader = new VCFReader(new File(filename));
|
||||
}
|
||||
|
||||
VCFHeader header = reader.getHeader();
|
||||
VCFRecord record = reader.next();
|
||||
|
||||
String[] sample_names = record.getSampleNames();
|
||||
int[] individual_counts = new int[sample_names.length];
|
||||
int[] individual_drops = new int[sample_names.length];
|
||||
|
||||
while(true)
|
||||
{
|
||||
if (record == null) { break; }
|
||||
|
||||
Interval interval = VCFTool.getIntervalFromRecord(record);
|
||||
|
||||
// (unless it is "filtered")
|
||||
if (record.isFiltered())
|
||||
{
|
||||
record = reader.next();
|
||||
}
|
||||
|
||||
char ref = record.getReferenceBase();
|
||||
|
||||
String[] new_sample_names = record.getSampleNames();
|
||||
if (new_sample_names.length != sample_names.length) { throw new RuntimeException(); }
|
||||
for (int i = 0; i < new_sample_names.length; i++) { if (! sample_names[i].equals(new_sample_names[i])) { throw new RuntimeException(); } }
|
||||
|
||||
List<VCFGenotypeRecord> genotypes = record.getVCFGenotypeRecords();
|
||||
|
||||
long n_ref = 0;
|
||||
long n_alt = 0;
|
||||
long n_total = 0;
|
||||
long n_calls = 0;
|
||||
long n_dropped = 0;
|
||||
|
||||
for (int i = 0; i < sample_names.length; i++)
|
||||
{
|
||||
VCFGenotypeRecord rec = genotypes.get(i);
|
||||
|
||||
Long gq;
|
||||
|
||||
if (rec.getFields().get("GQ") != null)
|
||||
{
|
||||
Double gq_double = Double.parseDouble(rec.getFields().get("GQ"));
|
||||
gq = gq_double.longValue();
|
||||
}
|
||||
else
|
||||
{
|
||||
gq = 0L;
|
||||
}
|
||||
|
||||
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
||||
|
||||
String g = "";
|
||||
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
||||
char[] c = g.toCharArray();
|
||||
Arrays.sort(c);
|
||||
g = new String(c);
|
||||
n_total += 1;
|
||||
|
||||
individual_counts[i] += 1;
|
||||
if (g.equals(".."))
|
||||
{
|
||||
n_dropped += 1;
|
||||
individual_drops[i] += 1;
|
||||
continue;
|
||||
}
|
||||
n_calls += 1;
|
||||
if (g.charAt(0) == ref) { n_ref += 1; } else { n_alt += 1; }
|
||||
if (g.charAt(1) == ref) { n_ref += 1; } else { n_alt += 1; }
|
||||
}
|
||||
|
||||
output.printf("SNP %s %d %d %f\n", interval, n_total, n_dropped, (double)n_dropped / (double)n_total);
|
||||
|
||||
record = reader.next();
|
||||
}
|
||||
|
||||
|
||||
// Now output the statistics.
|
||||
|
||||
for (int i = 0; i < sample_names.length; i++)
|
||||
{
|
||||
int n_total = individual_counts[i];
|
||||
int n_dropped = individual_drops[i];
|
||||
output.printf("INDIVIDUAL %s %d %d %f\n", sample_names[i], n_total, n_dropped, (double)n_dropped / (double)n_total);
|
||||
}
|
||||
|
||||
output.flush();
|
||||
output.close();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,158 @@
|
|||
package org.broadinstitute.sting.playground.tools.vcf;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.picard.io.IoUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileWalker;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
|
||||
class VCFSequenomAnalysis extends CommandLineProgram
|
||||
{
|
||||
@Argument(fullName = "sequenom", shortName = "sequenom", doc = "file to open", required = true) public String filename1;
|
||||
@Argument(fullName = "sequencing", shortName = "sequencing", doc = "file to open", required = true) public String filename2;
|
||||
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = false) public String output_filename = "/dev/stdout";
|
||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = true;
|
||||
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
||||
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
||||
|
||||
|
||||
@Override
|
||||
protected int execute()
|
||||
{
|
||||
//System.out.println("Loading " + filename + "...");
|
||||
|
||||
PrintStream output = null;
|
||||
try
|
||||
{
|
||||
output = new PrintStream(new FileOutputStream(output_filename));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
output.printf("interval flag ref alt missing_base n_total_sequenom failure_rate_sequenom n_alt_sequencing HWE_sequencing_chi HWE_sequenom_chi HWE_sequencing_p HWE_sequenom_p\n");
|
||||
|
||||
VCFReader reader1;
|
||||
VCFReader reader2;
|
||||
|
||||
if (autocorrect)
|
||||
{
|
||||
reader1 = new VCFReader(VCFHomogenizer.create(filename1));
|
||||
reader2 = new VCFReader(VCFHomogenizer.create(filename2));
|
||||
}
|
||||
else
|
||||
{
|
||||
reader1 = new VCFReader(new File(filename1));
|
||||
reader2 = new VCFReader(new File(filename2));
|
||||
}
|
||||
|
||||
VCFHeader header1 = reader1.getHeader();
|
||||
VCFHeader header2 = reader2.getHeader();
|
||||
|
||||
VCFRecord record1 = reader1.next();
|
||||
VCFRecord record2 = reader2.next();
|
||||
|
||||
while(true)
|
||||
{
|
||||
if (record1 == null) { break; }
|
||||
if (record2 == null) { break; }
|
||||
|
||||
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
||||
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
||||
|
||||
int comparison = interval1.compareTo(interval2);
|
||||
|
||||
if (comparison == 0)
|
||||
{
|
||||
// records match! compute concordance.
|
||||
|
||||
// (unless one of them is "filtered")
|
||||
if (record1.isFiltered() || record2.isFiltered())
|
||||
{
|
||||
record1 = reader1.next();
|
||||
record2 = reader2.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
char ref = record1.getReferenceBase();
|
||||
char alt = VCFTool.getAlt(record2);
|
||||
|
||||
int n_total_sequenom = VCFTool.Compute_n_total(record1);
|
||||
double failure_rate_sequenom = VCFTool.Compute_failure_rate(record1);
|
||||
|
||||
int n_alt_sequenom = VCFTool.Compute_n_alt(record1);
|
||||
int n_alt_sequencing = VCFTool.Compute_n_alt(record2);
|
||||
|
||||
double HWE_sequenom = VCFTool.Compute_HWE(record1);
|
||||
double HWE_sequencing = VCFTool.Compute_HWE(record2);
|
||||
|
||||
boolean isPolymorphic_sequenom = (n_alt_sequenom > 0) ? true : false;
|
||||
boolean isPolymorphic_sequencing = (n_alt_sequencing > 0) ? true : false;
|
||||
|
||||
String flag = null;
|
||||
char missing_base = '.';
|
||||
|
||||
if (isPolymorphic_sequenom)
|
||||
{
|
||||
flag = "TP";
|
||||
if (n_alt_sequenom == n_total_sequenom) { missing_base = ref; }
|
||||
}
|
||||
else
|
||||
{
|
||||
flag = "FP";
|
||||
missing_base = alt;
|
||||
}
|
||||
|
||||
output.printf("%s %s %c %c %c %d %f %d %f %f %f %f\n",
|
||||
interval1,
|
||||
flag,
|
||||
ref,
|
||||
alt,
|
||||
missing_base,
|
||||
n_total_sequenom,
|
||||
failure_rate_sequenom,
|
||||
n_alt_sequencing,
|
||||
HWE_sequencing,
|
||||
HWE_sequenom,
|
||||
VCFTool.P_from_Chi(HWE_sequencing),
|
||||
VCFTool.P_from_Chi(HWE_sequenom));
|
||||
|
||||
record1 = reader1.next();
|
||||
record2 = reader2.next();
|
||||
}
|
||||
else if (comparison > 0)
|
||||
{
|
||||
// interval1 is later than interval2.
|
||||
//System.err.printf("Skipping (2): %s\n", VCFTool.getIntervalFromRecord(record2));
|
||||
record2 = reader2.next();
|
||||
}
|
||||
else if (comparison < 0)
|
||||
{
|
||||
// interval2 is later than interval1.
|
||||
//System.err.printf("Skipping (1): %s\n", VCFTool.getIntervalFromRecord(record1));
|
||||
record1 = reader1.next();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
output.flush();
|
||||
output.close();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,317 @@
|
|||
package org.broadinstitute.sting.playground.tools.vcf;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.*;
|
||||
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.lang.*;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.picard.io.IoUtil;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileWalker;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
|
||||
|
||||
class VCFSequenomAnalysis2 extends CommandLineProgram
|
||||
{
|
||||
@Argument(fullName = "sequenom", shortName = "sequenom", doc = "file to open", required = true) public String filename1;
|
||||
@Argument(fullName = "sequencing", shortName = "sequencing", doc = "file to open", required = true) public String filename2;
|
||||
@Argument(fullName = "out", shortName = "out", doc = "file to write results to", required = false) public String output_filename = "/dev/stdout";
|
||||
@Argument(fullName = "auto_correct", shortName = "auto_correct", doc = "auto-correct the VCF file if it's off-spec", required = false) public Boolean autocorrect = true;
|
||||
@Argument(fullName = "verbose", shortName = "verbose", doc = "print extremely detailed stats", required = false) public Boolean verbose = false;
|
||||
@Argument(fullName = "qual_threshold", shortName = "qual_threshold", doc = "minimum genotype quality to consider", required = false) public long qual_threshold = 1;
|
||||
|
||||
|
||||
@Override
|
||||
protected int execute()
|
||||
{
|
||||
//System.out.println("Loading " + filename + "...");
|
||||
|
||||
PrintStream output = null;
|
||||
try
|
||||
{
|
||||
output = new PrintStream(new FileOutputStream(output_filename));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
output.printf("PROBE interval flag ref alt n_total_sequenom failure_rate_sequenom n_alt_sequencing n_alt_sequenom p_alt_sequencing p_alt_sequenom HWE_sequencing_chi HWE_sequenom_chi HWE_sequencing_p HWE_sequenom_p is_singleton_in_sequencing singleton_matched_in_sequenom n_sequencing_hets n_sequenom_hets num_hets_also_het_in_sequenom num_hets_dropped_in_sequenom\n");
|
||||
|
||||
VCFReader reader1;
|
||||
VCFReader reader2;
|
||||
|
||||
if (autocorrect)
|
||||
{
|
||||
reader1 = new VCFReader(VCFHomogenizer.create(filename1));
|
||||
reader2 = new VCFReader(VCFHomogenizer.create(filename2));
|
||||
}
|
||||
else
|
||||
{
|
||||
reader1 = new VCFReader(new File(filename1));
|
||||
reader2 = new VCFReader(new File(filename2));
|
||||
}
|
||||
|
||||
VCFHeader header1 = reader1.getHeader();
|
||||
VCFHeader header2 = reader2.getHeader();
|
||||
|
||||
VCFRecord record1 = reader1.next();
|
||||
VCFRecord record2 = reader2.next();
|
||||
|
||||
int[] sequenom_aaf_counts = new int[1024];
|
||||
int[] sequencing_aaf_counts = new int[1024];
|
||||
int max_aaf = 0;
|
||||
|
||||
while(true)
|
||||
{
|
||||
if (record1 == null) { break; }
|
||||
if (record2 == null) { break; }
|
||||
|
||||
Interval interval1 = VCFTool.getIntervalFromRecord(record1);
|
||||
Interval interval2 = VCFTool.getIntervalFromRecord(record2);
|
||||
|
||||
int comparison = interval1.compareTo(interval2);
|
||||
|
||||
if (comparison == 0)
|
||||
{
|
||||
// records match! compute concordance.
|
||||
|
||||
// (unless one of them is "filtered")
|
||||
if (record1.isFiltered() || record2.isFiltered())
|
||||
{
|
||||
record1 = reader1.next();
|
||||
record2 = reader2.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
String[] sample_names = record2.getSampleNames();
|
||||
|
||||
char ref = record1.getReferenceBase();
|
||||
char alt = VCFTool.getAlt(record2);
|
||||
|
||||
int n_total_sequenom = VCFTool.Compute_n_total(record1, sample_names);
|
||||
int n_total_sequencing = VCFTool.Compute_n_total(record2, sample_names);
|
||||
double failure_rate_sequenom = VCFTool.Compute_failure_rate(record1);
|
||||
|
||||
int n_alt_sequenom = VCFTool.Compute_n_alt(record1, sample_names);
|
||||
int n_alt_sequencing = VCFTool.Compute_n_alt(record2, sample_names);
|
||||
|
||||
double p_alt_sequenom = (double)n_alt_sequenom / (double)n_total_sequenom;
|
||||
double p_alt_sequencing = (double)n_alt_sequencing / (double)n_total_sequencing;
|
||||
|
||||
int n_het_sequenom = VCFTool.Compute_n_het(record1, sample_names);
|
||||
int n_het_sequencing = VCFTool.Compute_n_het(record2, sample_names);
|
||||
|
||||
sequenom_aaf_counts[n_alt_sequenom] += 1;
|
||||
sequencing_aaf_counts[n_alt_sequencing] += 1;
|
||||
if (n_alt_sequenom > max_aaf) { max_aaf = n_alt_sequenom; }
|
||||
if (n_alt_sequencing > max_aaf) { max_aaf = n_alt_sequencing; }
|
||||
|
||||
double HWE_sequenom = VCFTool.Compute_HWE(record1, sample_names);
|
||||
double HWE_sequencing = VCFTool.Compute_HWE(record2, sample_names);
|
||||
|
||||
boolean isPolymorphic_sequenom = (n_alt_sequenom > 0) ? true : false;
|
||||
boolean isPolymorphic_sequencing = (n_alt_sequencing > 0) ? true : false;
|
||||
|
||||
int is_singleton_in_sequencing = 0;
|
||||
int singleton_matched_in_sequenom = 0;
|
||||
|
||||
if ((n_alt_sequencing == 1) && (VCFTool.Compute_n_alt(record1) > 0))
|
||||
{
|
||||
is_singleton_in_sequencing = 1;
|
||||
singleton_matched_in_sequenom = CheckSingletonMatch(record2, record1);
|
||||
}
|
||||
|
||||
int[] het_match_ans = ComputeHetMatches(record2, record1);
|
||||
int num_hets_also_het_in_sequenom = het_match_ans[0];
|
||||
int num_hets_dropped_in_sequenom = het_match_ans[1];
|
||||
|
||||
String flag = null;
|
||||
if (isPolymorphic_sequenom) { flag = "TP"; }
|
||||
else { flag = "FP"; }
|
||||
|
||||
output.printf("PROBE %s %s %c %c %d %f %d %d %f %f %f %f %f %f %d %d %d %d %d %d\n",
|
||||
interval1,
|
||||
flag,
|
||||
ref,
|
||||
alt,
|
||||
n_total_sequenom,
|
||||
failure_rate_sequenom,
|
||||
n_alt_sequencing,
|
||||
n_alt_sequenom,
|
||||
p_alt_sequencing,
|
||||
p_alt_sequenom,
|
||||
HWE_sequencing,
|
||||
HWE_sequenom,
|
||||
VCFTool.P_from_Chi(HWE_sequencing),
|
||||
VCFTool.P_from_Chi(HWE_sequenom),
|
||||
is_singleton_in_sequencing,
|
||||
singleton_matched_in_sequenom,
|
||||
n_het_sequencing,
|
||||
n_het_sequenom,
|
||||
num_hets_also_het_in_sequenom,
|
||||
num_hets_dropped_in_sequenom);
|
||||
|
||||
record1 = reader1.next();
|
||||
record2 = reader2.next();
|
||||
}
|
||||
else if (comparison > 0)
|
||||
{
|
||||
// interval1 is later than interval2.
|
||||
//System.err.printf("Skipping (2): %s\n", VCFTool.getIntervalFromRecord(record2));
|
||||
record2 = reader2.next();
|
||||
}
|
||||
else if (comparison < 0)
|
||||
{
|
||||
// interval2 is later than interval1.
|
||||
//System.err.printf("Skipping (1): %s\n", VCFTool.getIntervalFromRecord(record1));
|
||||
record1 = reader1.next();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (int i = 0; i < max_aaf; i++)
|
||||
{
|
||||
output.printf("AAF %d %d %d\n", i, sequenom_aaf_counts[i], sequencing_aaf_counts[i]);
|
||||
}
|
||||
|
||||
|
||||
output.flush();
|
||||
output.close();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CheckSingletonMatch(VCFRecord sequencing, VCFRecord sequenom)
|
||||
{
|
||||
String singleton_name = "";
|
||||
|
||||
// first, check sequencing
|
||||
String[] sample_names = sequencing.getSampleNames();
|
||||
List<VCFGenotypeRecord> genotypes = sequencing.getVCFGenotypeRecords();
|
||||
int n_ref = 0;
|
||||
int n_alt = 0;
|
||||
for (int i = 0; i < sample_names.length; i++)
|
||||
{
|
||||
VCFGenotypeRecord rec = genotypes.get(i);
|
||||
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
||||
String g = "";
|
||||
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
||||
char[] c = g.toCharArray();
|
||||
Arrays.sort(c);
|
||||
g = new String(c);
|
||||
if (g.equals("..")) { continue; }
|
||||
if (g.charAt(0) == sequencing.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; }
|
||||
if (g.charAt(1) == sequencing.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; }
|
||||
}
|
||||
if (n_alt != 1) { throw new RuntimeException(); }
|
||||
if (singleton_name.equals("")) { throw new RuntimeException(); }
|
||||
|
||||
// now, check sequenom
|
||||
sample_names = sequenom.getSampleNames();
|
||||
genotypes = sequenom.getVCFGenotypeRecords();
|
||||
n_ref = 0;
|
||||
n_alt = 0;
|
||||
for (int i = 0; i < sample_names.length; i++)
|
||||
{
|
||||
if (sample_names[i].equals(singleton_name))
|
||||
{
|
||||
VCFGenotypeRecord rec = genotypes.get(i);
|
||||
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
||||
String g = "";
|
||||
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
||||
char[] c = g.toCharArray();
|
||||
Arrays.sort(c);
|
||||
g = new String(c);
|
||||
if (g.equals("..")) { continue; }
|
||||
if (g.charAt(0) == sequenom.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; }
|
||||
if (g.charAt(1) == sequenom.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; singleton_name = sample_names[i]; }
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (n_alt > 0) { return 1; }
|
||||
else if (n_ref != 0) { return 0; }
|
||||
else { return -1; }
|
||||
}
|
||||
|
||||
int[] ComputeHetMatches(VCFRecord sequencing, VCFRecord sequenom)
|
||||
{
|
||||
// first, check sequencing
|
||||
String[] sample_names = sequencing.getSampleNames();
|
||||
List<VCFGenotypeRecord> genotypes = sequencing.getVCFGenotypeRecords();
|
||||
ArrayList<String> het_samples = new ArrayList<String>();
|
||||
for (int i = 0; i < sample_names.length; i++)
|
||||
{
|
||||
int n_ref = 0;
|
||||
int n_alt = 0;
|
||||
|
||||
VCFGenotypeRecord rec = genotypes.get(i);
|
||||
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
||||
String g = "";
|
||||
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
||||
char[] c = g.toCharArray();
|
||||
Arrays.sort(c);
|
||||
g = new String(c);
|
||||
if (g.equals("..")) { continue; }
|
||||
if (g.charAt(0) == sequencing.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; }
|
||||
if (g.charAt(1) == sequencing.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; }
|
||||
if (n_alt == 1) { het_samples.add(sample_names[i]); }
|
||||
}
|
||||
|
||||
// now, check sequenom
|
||||
sample_names = sequenom.getSampleNames();
|
||||
genotypes = sequenom.getVCFGenotypeRecords();
|
||||
int matched_hets = 0;
|
||||
int dropped_hets = 0;
|
||||
int mismatched_hets = 0;
|
||||
int num_hets = het_samples.size();
|
||||
for (int i = 0; i < sample_names.length; i++)
|
||||
{
|
||||
if (het_samples.contains(sample_names[i]))
|
||||
{
|
||||
het_samples.remove(sample_names[i]);
|
||||
|
||||
int n_ref = 0;
|
||||
int n_alt = 0;
|
||||
VCFGenotypeRecord rec = genotypes.get(i);
|
||||
List<VCFGenotypeEncoding> alleles = rec.getAlleles();
|
||||
String g = "";
|
||||
for (int j = 0; j < alleles.size(); j++) { g += alleles.get(j).getBases(); }
|
||||
char[] c = g.toCharArray();
|
||||
Arrays.sort(c);
|
||||
g = new String(c);
|
||||
if (g.equals("..")) { dropped_hets += 1; continue; }
|
||||
if (g.charAt(0) == sequenom.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; }
|
||||
if (g.charAt(1) == sequenom.getReferenceBase()) { n_ref += 1; } else { n_alt += 1; }
|
||||
if (n_alt == 1) { matched_hets += 1; }
|
||||
else { mismatched_hets += 1; }
|
||||
}
|
||||
}
|
||||
|
||||
if ((matched_hets + dropped_hets + mismatched_hets) != num_hets)
|
||||
{
|
||||
String warning = String.format("WARNING: %d + %d + %d != %d ",
|
||||
matched_hets,
|
||||
dropped_hets,
|
||||
mismatched_hets,
|
||||
num_hets);
|
||||
for (int i = 0; i < het_samples.size(); i++) { warning += het_samples.get(i) + " "; }
|
||||
System.out.println(warning);
|
||||
}
|
||||
|
||||
int[] ans = new int[2];
|
||||
ans[0] = matched_hets;
|
||||
ans[1] = dropped_hets;
|
||||
return ans;
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue