1) Initial complete version of VCF4 writer. There are still issues (see below) but at least this version is fully functional. It incorporates getting rid of intermediate VCFRecord so we now operate from VariantContext objects directly to VCF 4.0 output.
See VCF4WriterTestWalker for usage example: it just amounts to adding vcfWriter.add(vc,ref.getBases()) in walker. add() method in VCFWriter is polymorphic and can also take a VCFRecord, although eventually this should become obsolete. addRecord is still supported so all backward compatibility is maintained. Resulting VCF 4.0 files are still not perfect, so additional changes are in progress. Specifically: a) INFO codes of length 0 (e.g. HM, DB) are not emitted correctly (they should emit just "HM" but they currently emit "HM=1"). b) Genotype values that are specified as Integer in the header are ignored in type and are printed out as Doubles. Both issues should be corrected with better header parsing. 2) Check in ability of Beagle to mask an additional percentage of genotype likelihoods (0 by default), for testing purposes. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3664 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4a451949ba
commit
ed71e53dd4
|
|
@ -24,7 +24,10 @@ public class VCFGenotypeEncoding {
|
|||
private final TYPE mType;
|
||||
|
||||
// public constructor, that parses out the base string
|
||||
public VCFGenotypeEncoding(String baseString) {
|
||||
public VCFGenotypeEncoding(String baseString){
|
||||
this(baseString, false);
|
||||
}
|
||||
public VCFGenotypeEncoding(String baseString, boolean allowMultipleBaseReference) {
|
||||
if ((baseString.length() == 1)) {
|
||||
// are we an empty (no-call) genotype?
|
||||
if (baseString.equals(VCFGenotypeRecord.EMPTY_ALLELE)) {
|
||||
|
|
@ -39,19 +42,23 @@ public class VCFGenotypeEncoding {
|
|||
mType = TYPE.SINGLE_BASE;
|
||||
}
|
||||
} else { // deletion or insertion
|
||||
if (baseString.length() < 1 || (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I')) {
|
||||
if (baseString.length() < 1 ||(!allowMultipleBaseReference && (baseString.toUpperCase().charAt(0) != 'D' && baseString.toUpperCase().charAt(0) != 'I'))) {
|
||||
throw new IllegalArgumentException("Genotype encoding of " + baseString + " was passed in, but is not a valid deletion, insertion, base, or no call (.)");
|
||||
}
|
||||
if (baseString.toUpperCase().charAt(0) == 'D') {
|
||||
mLength = Integer.valueOf(baseString.substring(1, baseString.length()));
|
||||
mBases = "";
|
||||
mType = TYPE.DELETION;
|
||||
} else { // we're an I
|
||||
} else if (baseString.toUpperCase().charAt(0) == 'I') { // we're an I
|
||||
mBases = baseString.substring(1, baseString.length()).toUpperCase();
|
||||
if (!validBases(mBases))
|
||||
throw new IllegalArgumentException("The insertion base string contained invalid bases -> " + baseString);
|
||||
mLength = mBases.length();
|
||||
mType = TYPE.INSERTION;
|
||||
} else{
|
||||
mBases = baseString;
|
||||
mType = TYPE.MIXED;
|
||||
mLength = mBases.length();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -97,6 +104,7 @@ public class VCFGenotypeEncoding {
|
|||
switch (mType) {
|
||||
case SINGLE_BASE:
|
||||
case UNCALLED:
|
||||
case MIXED:
|
||||
builder.append(mBases);
|
||||
break;
|
||||
case INSERTION:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,149 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers;
|
||||
|
||||
import org.broad.tribble.util.AsciiLineReader;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFReader;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * Test walker that reads VCF records as VariantContexts from the "vcf" input
 * track and writes them back out directly in VCF 4.0 format via
 * VCFWriter.add(vc, ref.getBases()).
 */
|
||||
public class VCF4WriterTestWalker extends RodWalker<Integer, Integer> {
|
||||
private VCFWriter vcfWriter;
|
||||
|
||||
@Argument(fullName="output_file", shortName="output", doc="VCF file to which output should be written", required=true)
|
||||
private String OUTPUT_FILE = null;
|
||||
|
||||
|
||||
public static final String INPUT_ROD_NAME = "vcf";
|
||||
|
||||
protected static String line = null;
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
VCF4Codec vcf4codec = new VCF4Codec();
|
||||
|
||||
/**
|
||||
* Initialize the number of loci processed to zero.
|
||||
*
|
||||
* @return 0
|
||||
*/
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public void initialize() {
|
||||
final List<ReferenceOrderedDataSource> dataSources = this.getToolkit().getRodDataSources();
|
||||
|
||||
// Open output file specified by output VCF ROD
|
||||
vcfWriter = new VCFWriter(new File(OUTPUT_FILE));
|
||||
|
||||
for( final ReferenceOrderedDataSource source : dataSources ) {
|
||||
final RMDTrack rod = source.getReferenceOrderedData();
|
||||
Object p = rod.getRecordType();
|
||||
if(rod.getType().equals(vcf4codec.getClass()) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) {
|
||||
try {
|
||||
AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath()));
|
||||
int lineNumber = vcf4codec.readHeader(lineReader);
|
||||
out.printf("Read %d header lines%n", lineNumber);
|
||||
|
||||
VCFHeader header = vcf4codec.getHeader(VCFHeader.class);
|
||||
vcfWriter.writeHeader(header);
|
||||
}
|
||||
catch (FileNotFoundException e ) {
|
||||
throw new StingException(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param tracker the meta-data tracker
|
||||
* @param ref the reference base
|
||||
* @param context the context for the given locus
|
||||
* @return 1 if the locus was successfully processed, 0 if otherwise
|
||||
*/
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
GenomeLoc loc = context.getLocation();
|
||||
VariantContext vc = tracker.getVariantContext(ref,"vcf", null, loc, true);
|
||||
|
||||
|
||||
if (vc == null)
|
||||
return 0;
|
||||
|
||||
// Write directly variant context to VCF4.0 format.
|
||||
vcfWriter.add(vc, ref.getBases());
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the number of rods processed.
|
||||
*
|
||||
* @param value result of the map.
|
||||
* @param sum accumulator for the reduce.
|
||||
* @return the new number of rods processed.
|
||||
*/
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {}
|
||||
}
|
||||
|
|
@ -54,9 +54,14 @@ import java.util.*;
|
|||
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||
|
||||
@Argument(fullName = "beagle_file", shortName = "beagle", doc = "File to print BEAGLE-specific data for use with imputation", required = true)
|
||||
public PrintStream beagleWriter = null;
|
||||
public PrintStream beagleWriter = null;
|
||||
@Argument(fullName = "genotypes_file", shortName = "genotypes", doc = "File to print reference genotypes for error analysis", required = false)
|
||||
public PrintStream beagleGenotypesWriter = null;
|
||||
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
|
||||
public double insertedNoCallRate = 0;
|
||||
|
||||
final TreeSet<String> samples = new TreeSet<String>();
|
||||
Random generator;
|
||||
|
||||
public void initialize() {
|
||||
|
||||
|
|
@ -71,11 +76,14 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
}
|
||||
}
|
||||
|
||||
generator = new Random();
|
||||
beagleWriter.print("marker alleleA alleleB");
|
||||
for ( String sample : samples )
|
||||
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
|
||||
|
||||
beagleWriter.println();
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.println("dummy header");
|
||||
}
|
||||
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
|
|
@ -90,6 +98,9 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
// output marker ID to Beagle input file
|
||||
beagleWriter.print(String.format("%s ",vc_eval.getLocation().toString()));
|
||||
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.print(String.format("%s ",vc_eval.getLocation().toString()));
|
||||
|
||||
for (Allele allele: vc_eval.getAlleles()) {
|
||||
// TODO -- check whether this is really needed by Beagle
|
||||
String bglPrintString;
|
||||
|
|
@ -120,20 +131,39 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
|||
Double dg = Double.valueOf(gl);
|
||||
if (dg> maxLikelihood)
|
||||
maxLikelihood = dg;
|
||||
|
||||
|
||||
likeArray.add(dg);
|
||||
}
|
||||
}
|
||||
|
||||
for (Double likeVal: likeArray)
|
||||
beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood)));
|
||||
// see if we need to randomly mask out genotype in this position.
|
||||
Double d = generator.nextDouble();
|
||||
if (d > insertedNoCallRate ) {
|
||||
// System.out.format("%5.4f ", d);
|
||||
for (Double likeVal: likeArray)
|
||||
beagleWriter.print(String.format("%5.4f ",Math.pow(10, likeVal-maxLikelihood)));
|
||||
}
|
||||
else {
|
||||
// we are masking out this genotype
|
||||
beagleWriter.print("0.33 0.33 0.33 ");
|
||||
}
|
||||
|
||||
if (beagleGenotypesWriter != null) {
|
||||
char a = genotype.getAllele(0).toString().charAt(0);
|
||||
char b = genotype.getAllele(0).toString().charAt(0);
|
||||
|
||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
||||
}
|
||||
}
|
||||
else
|
||||
else {
|
||||
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
|
||||
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.print(". . ");
|
||||
}
|
||||
}
|
||||
|
||||
beagleWriter.println();
|
||||
|
||||
if (beagleGenotypesWriter != null)
|
||||
beagleGenotypesWriter.println();
|
||||
}
|
||||
return 1;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,26 +1,50 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.VCFHeaderLine;
|
||||
import org.broad.tribble.vcf.VCFHeaderVersion;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.CalledGenotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.TreeSet;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
 * this class writes VCF files
|
||||
*/
|
||||
public class VCFWriter {
|
||||
|
||||
|
||||
|
||||
// the VCF header we're storing
|
||||
private VCFHeader mHeader = null;
|
||||
|
||||
// the print stream we're writing to
|
||||
BufferedWriter mWriter;
|
||||
private final String FIELD_SEPERATOR = "\t";
|
||||
|
||||
// our genotype sample fields
|
||||
private static final List<VCFGenotypeRecord> mGenotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||
|
||||
// commonly used strings that are in the standard
|
||||
private final String FORMAT_FIELD_SEPARATOR = ":";
|
||||
private static final String GENOTYPE_FIELD_SEPARATOR = ":";
|
||||
private static final String FIELD_SEPARATOR = "\t";
|
||||
private static final String FILTER_CODE_SEPARATOR = ";";
|
||||
private static final String INFO_FIELD_SEPARATOR = ";";
|
||||
|
||||
// default values
|
||||
private static final String UNFILTERED = ".";
|
||||
private static final String PASSES_FILTERS = "0";
|
||||
private static final String PASSES_FILTERS_VCF_4_0 = "PASS";
|
||||
private static final String EMPTY_INFO_FIELD = ".";
|
||||
private static final String EMPTY_ID_FIELD = ".";
|
||||
private static final String EMPTY_ALLELE_FIELD = ".";
|
||||
private static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
|
||||
private static final String MISSING_GENOTYPE_QUALITY = ".";
|
||||
|
||||
/**
|
||||
* create a VCF writer, given a file to write to
|
||||
|
|
@ -48,18 +72,32 @@ public class VCFWriter {
|
|||
}
|
||||
|
||||
public void writeHeader(VCFHeader header) {
|
||||
// use 3.3 format by default
|
||||
// TODO - update to true once 4.0 is on by default
|
||||
writeHeader(header, false);
|
||||
}
|
||||
public void writeHeader(VCFHeader header, boolean useVCF4Format) {
|
||||
this.mHeader = header;
|
||||
try {
|
||||
// the fileformat field needs to be written first
|
||||
// the file format field needs to be written first
|
||||
TreeSet<VCFHeaderLine> nonFormatMetaData = new TreeSet<VCFHeaderLine>();
|
||||
for ( VCFHeaderLine line : header.getMetaData() ) {
|
||||
if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
|
||||
if (useVCF4Format) {
|
||||
if ( line.getKey().equals(VCFHeaderVersion.VCF4_0.getFormatString()) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_0.getFormatString() + "=" + VCFHeaderVersion.VCF4_0.getVersionString() + "\n");
|
||||
} else {
|
||||
nonFormatMetaData.add(line);
|
||||
}
|
||||
}
|
||||
else if ( line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF3_2.getFormatString() + "=" + VCFHeaderVersion.VCF3_2.getVersionString() + "\n");
|
||||
} else {
|
||||
nonFormatMetaData.add(line);
|
||||
else {
|
||||
if ( line.getKey().equals(VCFHeaderVersion.VCF3_3.getFormatString()) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
|
||||
}
|
||||
else if ( line.getKey().equals(VCFHeaderVersion.VCF3_2.getFormatString()) ) {
|
||||
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF3_2.getFormatString() + "=" + VCFHeaderVersion.VCF3_2.getVersionString() + "\n");
|
||||
} else {
|
||||
nonFormatMetaData.add(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -70,10 +108,10 @@ public class VCFWriter {
|
|||
// write out the column line
|
||||
StringBuilder b = new StringBuilder();
|
||||
b.append(VCFHeader.HEADER_INDICATOR);
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPERATOR);
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPARATOR);
|
||||
if (header.hasGenotypingData()) {
|
||||
b.append("FORMAT" + FIELD_SEPERATOR);
|
||||
for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPERATOR);
|
||||
b.append("FORMAT" + FIELD_SEPARATOR);
|
||||
for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPARATOR);
|
||||
}
|
||||
mWriter.write(b.toString() + "\n");
|
||||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
|
|
@ -88,10 +126,27 @@ public class VCFWriter {
|
|||
*
|
||||
* @param record the record to output
|
||||
*/
|
||||
/**
 * Output a VCF record to the file; convenience overload that delegates to
 * {@link #addRecord(VCFRecord)} with the default (strict) validation.
 *
 * @param record the record to output
 */
public void add(VCFRecord record) {
    addRecord(record);
}
|
||||
|
||||
/**
 * Output a VCF record to the file using STRICT validation stringency.
 *
 * @param record the record to output
 */
public void addRecord(VCFRecord record) {
    addRecord(record, VCFGenotypeWriter.VALIDATION_STRINGENCY.STRICT);
}
|
||||
|
||||
public void add(VariantContext vc, byte[] refBases) {
|
||||
if ( mHeader == null )
|
||||
throw new IllegalStateException("The VCF Header must be written before records can be added");
|
||||
|
||||
String vcfString = toStringEncoding(vc, mHeader, refBases);
|
||||
try {
|
||||
mWriter.write(vcfString + "\n");
|
||||
mWriter.flush(); // necessary so that writing to an output stream will work
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to write the VCF object to a file");
|
||||
}
|
||||
|
||||
}
|
||||
/**
|
||||
* output a record to the VCF file
|
||||
*
|
||||
|
|
@ -124,4 +179,308 @@ public class VCFWriter {
|
|||
}
|
||||
}
|
||||
|
||||
private String toStringEncoding(VariantContext vc, VCFHeader header, byte[] refBases) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// CHROM \t POS \t ID \t REF \t ALT \t QUAL \t FILTER \t INFO
|
||||
GenomeLoc loc = vc.getLocation();
|
||||
//String referenceBases = new String(vc.getReference().getBases());
|
||||
|
||||
String contig = loc.getContig();
|
||||
long position = loc.getStart();
|
||||
String ID = vc.hasAttribute("ID") ? vc.getAttributeAsString("ID") : EMPTY_ID_FIELD;
|
||||
|
||||
builder.append(contig);
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
builder.append(position);
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
builder.append(ID);
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
|
||||
// deal with the reference
|
||||
String referenceBases = new String(vc.getReference().getBases());
|
||||
|
||||
double qual = vc.hasNegLog10PError() ? vc.getPhredScaledQual() : -1;
|
||||
// TODO- clean up these flags and associated code
|
||||
boolean filtersWereAppliedToContext = true;
|
||||
List<String> allowedGenotypeAttributeKeys = null;
|
||||
boolean filtersWereAppliedToGenotypes = false;
|
||||
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : (filtersWereAppliedToContext ? PASSES_FILTERS_VCF_4_0 : UNFILTERED);
|
||||
|
||||
String referenceString = new String(vc.getReference().getBases());
|
||||
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
|
||||
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
|
||||
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
|
||||
boolean isComplexEvent = false;
|
||||
if (referenceBases.length()>1) {
|
||||
// complex event: by VCF 4.0, reference from previous position is included.
|
||||
position--;
|
||||
isComplexEvent = true;
|
||||
}
|
||||
for ( Allele a : vc.getAlleles() ) {
|
||||
|
||||
VCFGenotypeEncoding encoding;
|
||||
|
||||
|
||||
String alleleString = new String(a.getBases());
|
||||
if (isComplexEvent) {// vc.getType() == VariantContext.Type.MIXED ) {
|
||||
// Mixed variants (e.g. microsatellites) have the reference before the event included
|
||||
String s = new String(refBases)+alleleString;
|
||||
encoding = new VCFGenotypeEncoding(s, true);
|
||||
if (a.isReference())
|
||||
referenceString = s;
|
||||
|
||||
|
||||
} else if ( vc.getType() == VariantContext.Type.INDEL ) {
|
||||
if ( a.isNull() ) {
|
||||
if ( a.isReference() ) {
|
||||
// ref, where alt is insertion
|
||||
encoding = new VCFGenotypeEncoding(new String(refBases));
|
||||
} else {
|
||||
// non-ref deletion
|
||||
encoding = new VCFGenotypeEncoding(".");
|
||||
}
|
||||
} else if ( a.isReference() ) {
|
||||
// ref, where alt is deletion
|
||||
encoding = new VCFGenotypeEncoding(new String(refBases));
|
||||
} else {
|
||||
// non-ref insertion
|
||||
referenceString = new String(refBases)+alleleString;
|
||||
encoding = new VCFGenotypeEncoding(referenceString, true);
|
||||
}
|
||||
} else {
|
||||
// no variation, ref or alt for snp
|
||||
encoding = new VCFGenotypeEncoding(alleleString);
|
||||
}
|
||||
|
||||
if ( a.isNonReference() ) {
|
||||
vcfAltAlleles.add(encoding);
|
||||
}
|
||||
|
||||
alleleMap.put(a, encoding);
|
||||
}
|
||||
|
||||
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>();
|
||||
if ( vc.hasGenotypes() ) {
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_KEY);
|
||||
for ( String key : calcVCFGenotypeKeys(vc) ) {
|
||||
if ( allowedGenotypeAttributeKeys == null || allowedGenotypeAttributeKeys.contains(key) )
|
||||
vcfGenotypeAttributeKeys.add(key);
|
||||
}
|
||||
if ( filtersWereAppliedToGenotypes )
|
||||
vcfGenotypeAttributeKeys.add(VCFGenotypeRecord.GENOTYPE_FILTER_KEY);
|
||||
}
|
||||
String genotypeFormatString = Utils.join(GENOTYPE_FIELD_SEPARATOR, vcfGenotypeAttributeKeys);
|
||||
|
||||
List<VCFGenotypeRecord> genotypeObjects = new ArrayList<VCFGenotypeRecord>(vc.getGenotypes().size());
|
||||
for ( Genotype g : vc.getGenotypesSortedByName() ) {
|
||||
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
|
||||
|
||||
for ( Allele a : g.getAlleles() ) {
|
||||
encodings.add(alleleMap.get(a));
|
||||
}
|
||||
|
||||
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
|
||||
VCFGenotypeRecord vcfG = new VCFGenotypeRecord(g.getSampleName(), encodings, phasing);
|
||||
|
||||
for ( String key : vcfGenotypeAttributeKeys ) {
|
||||
if ( key.equals(VCFGenotypeRecord.GENOTYPE_KEY) )
|
||||
continue;
|
||||
|
||||
Object val = g.getAttribute(key);
|
||||
// some exceptions
|
||||
if ( key.equals(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY) ) {
|
||||
if ( MathUtils.compareDoubles(g.getNegLog10PError(), Genotype.NO_NEG_LOG_10PERROR) == 0 )
|
||||
val = VCFGenotypeRecord.MISSING_GENOTYPE_QUALITY;
|
||||
else
|
||||
val = Math.min(g.getPhredScaledQual(), VCFGenotypeRecord.MAX_QUAL_VALUE);
|
||||
} else if ( key.equals(VCFGenotypeRecord.DEPTH_KEY) && val == null ) {
|
||||
ReadBackedPileup pileup = (ReadBackedPileup)g.getAttribute(CalledGenotype.READBACKEDPILEUP_ATTRIBUTE_KEY);
|
||||
if ( pileup != null )
|
||||
val = pileup.size();
|
||||
} else if ( key.equals(VCFGenotypeRecord.GENOTYPE_FILTER_KEY) ) {
|
||||
val = g.isFiltered() ? Utils.join(";", Utils.sorted(g.getFilters())) : PASSES_FILTERS;
|
||||
}
|
||||
|
||||
String outputValue = formatVCFField(key, val);
|
||||
if ( outputValue != null )
|
||||
vcfG.setField(key, outputValue);
|
||||
}
|
||||
|
||||
genotypeObjects.add(vcfG);
|
||||
}
|
||||
|
||||
mGenotypeRecords.clear();
|
||||
mGenotypeRecords.addAll(genotypeObjects);
|
||||
// info fields
|
||||
Map<String, String> infoFields = new HashMap<String, String>();
|
||||
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
|
||||
String key = elt.getKey();
|
||||
if ( key.equals("ID") )
|
||||
continue;
|
||||
|
||||
String outputValue = formatVCFField(key, elt.getValue());
|
||||
if ( outputValue != null )
|
||||
infoFields.put(key, outputValue);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
builder.append(referenceString);
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
|
||||
if ( vcfAltAlleles.size() > 0 ) {
|
||||
builder.append(vcfAltAlleles.get(0));
|
||||
for ( int i = 1; i < vcfAltAlleles.size(); i++ ) {
|
||||
builder.append(",");
|
||||
builder.append(vcfAltAlleles.get(i));
|
||||
}
|
||||
} else {
|
||||
builder.append(EMPTY_ALLELE_FIELD);
|
||||
}
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
|
||||
|
||||
if ( qual == -1 )
|
||||
builder.append(MISSING_GENOTYPE_QUALITY);
|
||||
else
|
||||
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, qual));
|
||||
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
|
||||
|
||||
builder.append(filters);
|
||||
builder.append(FIELD_SEPARATOR);
|
||||
builder.append(createInfoString(infoFields));
|
||||
|
||||
if ( genotypeFormatString != null && genotypeFormatString.length() > 0 ) {
|
||||
addGenotypeData(builder, header, genotypeFormatString, vcfAltAlleles);
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* add the genotype data
|
||||
*
|
||||
* @param builder the string builder
|
||||
* @param header the header object
|
||||
*/
|
||||
/**
 * Append the FORMAT column and one genotype column per header sample to the record
 * being built. Genotype data is taken from the static {@code mGenotypeRecords} list
 * populated by {@code toStringEncoding} (NOTE(review): static shared state — this
 * makes the writer non-reentrant; confirm single-threaded use).
 *
 * @param builder              the string builder accumulating the VCF line
 * @param header               the header object; its sample list drives column order
 * @param genotypeFormatString the FORMAT string, e.g. "GT:GQ:DP"
 * @param vcfAltAlleles        the alternate allele encodings, for GT index resolution
 * @throws IllegalStateException if the genotype records and the header samples disagree
 */
private static void addGenotypeData(StringBuilder builder, VCFHeader header,
                                    String genotypeFormatString, List<VCFGenotypeEncoding>vcfAltAlleles) {
    // sample name -> genotype record, so we can emit columns in header order
    Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(mGenotypeRecords);

    StringBuffer tempStr = new StringBuffer();
    if ( header.getGenotypeSamples().size() < mGenotypeRecords.size() ) {
        // more genotypes than header samples: report each offender, then fail
        // (note this mutates the header's sample list on the way to the throw)
        for ( String sample : gMap.keySet() ) {
            if ( !header.getGenotypeSamples().contains(sample) )
                System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
            else
                header.getGenotypeSamples().remove(sample);
        }
        throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
    }
    tempStr.append(FIELD_SEPARATOR + genotypeFormatString);

    // per-key format names, used to order each genotype's sub-fields
    String[] genotypeFormatStrings = genotypeFormatString.split(":");

    for ( String genotype : header.getGenotypeSamples() ) {
        tempStr.append(FIELD_SEPARATOR);
        if ( gMap.containsKey(genotype) ) {
            VCFGenotypeRecord rec = gMap.get(genotype);
            tempStr.append(rec.toStringEncoding(vcfAltAlleles, genotypeFormatStrings));
            // consume the record so leftovers can be detected below
            gMap.remove(genotype);
        } else {
            // header sample with no genotype call: emit the empty-genotype encoding
            tempStr.append(VCFGenotypeRecord.stringEncodingForEmptyGenotype(genotypeFormatStrings));
        }
    }
    if ( gMap.size() != 0 ) {
        // genotypes remained unconsumed: they reference samples absent from the header
        for ( String sample : gMap.keySet() )
            System.err.println("Sample " + sample + " is being genotyped but isn't in the header.");
        throw new IllegalStateException("We failed to use all the genotype samples; there must be an inconsistancy between the header and records");
    }

    builder.append(tempStr);
}
|
||||
/**
|
||||
* create a genotype mapping from a list and their sample names
|
||||
*
|
||||
* @param list a list of genotype samples
|
||||
* @return a mapping of the sample name to VCF genotype record
|
||||
*/
|
||||
private static Map<String, VCFGenotypeRecord> genotypeListToMap(List<VCFGenotypeRecord> list) {
|
||||
Map<String, VCFGenotypeRecord> map = new HashMap<String, VCFGenotypeRecord>();
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
VCFGenotypeRecord rec = list.get(i);
|
||||
map.put(rec.getSampleName(), rec);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the info string
|
||||
*
|
||||
* @return a string representing the infomation fields
|
||||
*/
|
||||
static protected String createInfoString(Map<String,String> infoFields) {
|
||||
StringBuffer info = new StringBuffer();
|
||||
boolean isFirst = true;
|
||||
for (Map.Entry<String, String> entry : infoFields.entrySet()) {
|
||||
if ( isFirst )
|
||||
isFirst = false;
|
||||
else
|
||||
info.append(INFO_FIELD_SEPARATOR);
|
||||
info.append(entry.getKey());
|
||||
if ( entry.getValue() != null && !entry.getValue().equals("") ) {
|
||||
info.append("=");
|
||||
info.append(entry.getValue());
|
||||
}
|
||||
}
|
||||
return info.length() == 0 ? EMPTY_INFO_FIELD : info.toString();
|
||||
}
|
||||
|
||||
private static String formatVCFField(String key, Object val) {
|
||||
String result;
|
||||
if ( val == null )
|
||||
result = VCFGenotypeRecord.getMissingFieldValue(key);
|
||||
else if ( val instanceof Double )
|
||||
result = String.format("%.2f", (Double)val);
|
||||
else if ( val instanceof Boolean )
|
||||
result = (Boolean)val ? "" : null; // empty string for true, null for false
|
||||
else if ( val instanceof List ) {
|
||||
List list = (List)val;
|
||||
if ( list.size() == 0 )
|
||||
return formatVCFField(key, null);
|
||||
StringBuffer sb = new StringBuffer(formatVCFField(key, list.get(0)));
|
||||
for ( int i = 1; i < list.size(); i++) {
|
||||
sb.append(",");
|
||||
sb.append(formatVCFField(key, list.get(i)));
|
||||
}
|
||||
result = sb.toString();
|
||||
} else
|
||||
result = val.toString();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
|
||||
Set<String> keys = new HashSet<String>();
|
||||
|
||||
boolean sawGoodQual = false;
|
||||
for ( Genotype g : vc.getGenotypes().values() ) {
|
||||
keys.addAll(g.getAttributes().keySet());
|
||||
if ( g.hasNegLog10PError() )
|
||||
sawGoodQual = true;
|
||||
}
|
||||
|
||||
if ( sawGoodQual )
|
||||
keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
|
||||
return Utils.sorted(new ArrayList<String>(keys));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue