Refactored and generalized the VCF header info code.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2346 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-12-13 21:02:45 +00:00
parent 05b8782d5f
commit 97618663ef
35 changed files with 373 additions and 136 deletions

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.List;
import java.util.Map;
@ -61,5 +62,5 @@ public class AlleleBalance extends StandardVariantAnnotation {
public String getKeyName() { return "AB"; }
public String getDescription() { return "AB,1,Float,\"Allele Balance for hets (ref/(ref+alt))\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("AB", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Allele Balance for hets (ref/(ref+alt))"); }
}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
@ -19,5 +20,5 @@ public class DepthOfCoverage extends StandardVariantAnnotation {
public String getKeyName() { return VCFRecord.DEPTH_KEY; }
public String getDescription() { return getKeyName() + ",1,Integer,\"Total Depth (including MQ0 reads)\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine(getKeyName(), 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total Depth (including MQ0 reads)"); }
}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
@ -21,7 +22,7 @@ public class HomopolymerRun extends StandardVariantAnnotation {
public String getKeyName() { return "HRun"; }
public String getDescription() { return "HRun,1,Integer,\"Largest Contiguous Homopolymer Run of Variant Allele In Either Direction\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("HRun", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Largest Contiguous Homopolymer Run of Variant Allele In Either Direction"); }
public boolean useZeroQualityReads() { return false; }

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.*;
import java.util.Map;
@ -30,5 +31,5 @@ public class MismatchRate implements VariantAnnotation {
public String getKeyName() { return "MR"; }
public String getDescription() { return "MR,1,Float,\"Mismatch Rate of Reads Spanning This Position\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("MR", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Mismatch Rate of Reads Spanning This Position"); }
}

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
import java.util.ArrayList;
@ -31,5 +32,5 @@ public class RMSMappingQuality extends StandardVariantAnnotation {
public String getKeyName() { return VCFRecord.RMS_MAPPING_QUALITY_KEY; }
public String getDescription() { return getKeyName() + ",1,Float,\"RMS Mapping Quality\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine(getKeyName(), 1, VCFInfoHeaderLine.INFO_TYPE.Float, "RMS Mapping Quality"); }
}

View File

@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.List;
import java.util.ArrayList;
@ -67,7 +68,7 @@ public class RankSumTest implements VariantAnnotation {
public String getKeyName() { return "RankSum"; }
public String getDescription() { return "RankSum,1,Float,\"Phred-scaled p-value From Wilcoxon Rank Sum Test of Het Vs. Ref Base Qualities\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("RankSum", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Phred-scaled p-value From Wilcoxon Rank Sum Test of Het Vs. Ref Base Qualities"); }
private void fillQualsFromPileup(char ref, char alt, ReadBackedPileup pileup, List<Integer> refQuals, List<Integer> altQuals) {
for ( PileupElement p : pileup ) {

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
@ -26,7 +27,7 @@ public class SecondBaseSkew implements VariantAnnotation {
public String getKeyName() { return KEY_NAME; }
public String getDescription() { return KEY_NAME + ",1,Float,\"Chi-square Secondary Base Skew\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine(KEY_NAME, 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Chi-square Secondary Base Skew"); }
public String annotate(ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, Variation variation) {
if ( !variation.isBiallelic() || !variation.isSNP() )

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
@ -23,5 +24,5 @@ public class SpanningDeletions extends StandardVariantAnnotation {
public String getKeyName() { return "Dels"; }
public String getDescription() { return "Dels,1,Float,\"Fraction of Reads Containing Spanning Deletions\""; }
public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("Dels", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Fraction of Reads Containing Spanning Deletions"); }
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
@ -16,6 +17,6 @@ public interface VariantAnnotation {
public String getKeyName();
// return the description used for the VCF INFO meta field
public String getDescription();
public VCFInfoHeaderLine getDescription();
}

View File

@ -116,10 +116,10 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
}
// setup the header fields
Set<String> hInfo = new HashSet<String>();
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add("source=VariantAnnotator");
hInfo.add("annotatorReference=" + getToolkit().getArguments().referenceFile.getName());
hInfo.add(new VCFHeaderLine("source", "VariantAnnotator"));
hInfo.add(new VCFHeaderLine("annotatorReference", getToolkit().getArguments().referenceFile.getName()));
hInfo.addAll(getVCFAnnotationDescriptions(requestedAnnotations));
vcfHeader = new VCFHeader(hInfo, samples);
@ -175,35 +175,35 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
}
// option #1: don't specify annotations to be used: standard annotations are used by default
public static Set<String> getVCFAnnotationDescriptions() {
public static Set<VCFHeaderLine> getVCFAnnotationDescriptions() {
if ( standardAnnotations == null )
determineAllAnnotations();
TreeSet<String> descriptions = new TreeSet<String>();
TreeSet<VCFHeaderLine> descriptions = new TreeSet<VCFHeaderLine>();
for ( VariantAnnotation annotation : standardAnnotations.values() )
descriptions.add("INFO=" + annotation.getDescription());
descriptions.add(annotation.getDescription());
return descriptions;
}
// option #2: specify that all possible annotations be used
public static Set<String> getAllVCFAnnotationDescriptions() {
public static Set<VCFHeaderLine> getAllVCFAnnotationDescriptions() {
if ( standardAnnotations == null )
determineAllAnnotations();
TreeSet<String> descriptions = new TreeSet<String>();
TreeSet<VCFHeaderLine> descriptions = new TreeSet<VCFHeaderLine>();
for ( VariantAnnotation annotation : allAnnotations.values() )
descriptions.add("INFO=" + annotation.getDescription());
descriptions.add(annotation.getDescription());
return descriptions;
}
// option #3: specify the exact annotations to be used
public static Set<String> getVCFAnnotationDescriptions(Collection<VariantAnnotation> annotations) {
public static Set<VCFHeaderLine> getVCFAnnotationDescriptions(Collection<VariantAnnotation> annotations) {
TreeSet<String> descriptions = new TreeSet<String>();
TreeSet<VCFHeaderLine> descriptions = new TreeSet<VCFHeaderLine>();
for ( VariantAnnotation annotation : annotations )
descriptions.add("INFO=" + annotation.getDescription());
descriptions.add(annotation.getDescription());
return descriptions;
}

View File

@ -103,21 +103,21 @@ public class CallsetConcordanceWalker extends RodWalker<Integer, Integer> {
}
// set up the header fields
Set<String> hInfo = new HashSet<String>();
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add("source=CallsetConcordance");
hInfo.add("note=\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\"");
hInfo.add(new VCFHeaderLine("source", "CallsetConcordance"));
hInfo.add(new VCFHeaderLine("note", "\"This file represents a concordance test of various call sets - NOT the output from a multi-sample caller\""));
hInfo.addAll(getVCFAnnotationDescriptions(requestedTypes));
VCFHeader header = new VCFHeader(hInfo, samples);
vcfWriter = new VCFWriter(header, OUTPUT);
}
public static Set<String> getVCFAnnotationDescriptions(Collection<ConcordanceType> types) {
public static Set<VCFHeaderLine> getVCFAnnotationDescriptions(Collection<ConcordanceType> types) {
TreeSet<String> descriptions = new TreeSet<String>();
TreeSet<VCFHeaderLine> descriptions = new TreeSet<VCFHeaderLine>();
for ( ConcordanceType type : types )
descriptions.add("INFO=" + type.getInfoDescription());
descriptions.add(type.getInfoDescription());
return descriptions;
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.Map;
import java.util.Set;
@ -11,5 +12,5 @@ public interface ConcordanceType {
public void initialize(Map<String,String> args, Set<String> samples);
public String computeConcordance(Map<String, Genotype> samplesToRecords, ReferenceContext ref);
public String getInfoName();
public String getInfoDescription();
public VCFInfoHeaderLine getInfoDescription();
}

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.*;
@ -102,5 +103,5 @@ public class IndelSubsets implements ConcordanceType {
}
public String getInfoName() { return "IndelSubsets"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"Indel-related subsets\""; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFInfoHeaderLine.INFO_TYPE.String, "Indel-related subsets"); }
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.*;
import java.util.Map.Entry;
@ -39,5 +40,5 @@ public class NWayVenn implements ConcordanceType {
}
public String getInfoName() { return "NwayVenn"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"N-way Venn split\""; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFInfoHeaderLine.INFO_TYPE.String, "N-way Venn split"); }
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import java.util.*;
@ -111,5 +112,5 @@ public class SNPGenotypeConcordance implements ConcordanceType {
}
public String getInfoName() { return "SnpConcordance"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"SNP concordance test\""; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFInfoHeaderLine.INFO_TYPE.String, "SNP concordance test"); }
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.genotype.Variation;
import org.broadinstitute.sting.utils.genotype.Genotype;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.StingException;
import java.util.*;
@ -59,5 +60,5 @@ public class SimpleVenn implements ConcordanceType {
}
public String getInfoName() { return "Venn"; }
public String getInfoDescription() { return getInfoName() + ",1,String,\"2-way Venn split\""; }
public VCFInfoHeaderLine getInfoDescription() { return new VCFInfoHeaderLine(getInfoName(), 1, VCFInfoHeaderLine.INFO_TYPE.String, "2-way Venn split"); }
}

View File

@ -47,18 +47,19 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
private void initializeVcfWriter(RodVCF rod) {
// setup the header fields
Set<String> hInfo = new HashSet<String>();
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add("source=" + "VariantFiltration");
hInfo.add("reference=" + getToolkit().getArguments().referenceFile.getName());
hInfo.add(new VCFHeaderLine("source", "VariantFiltration"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
if ( clusterWindow > 0 )
hInfo.add("FILTER=" + CLUSTERED_SNP_FILTER_NAME + ",\"SNPs found in clusters\"");
hInfo.add(new VCFFilterHeaderLine(CLUSTERED_SNP_FILTER_NAME, "SNPs found in clusters"));
if ( filterExpression != null )
hInfo.add("FILTER=" + FILTER_NAME + ",\"" + FILTER_STRING + "\"");
hInfo.add(new VCFFilterHeaderLine(FILTER_NAME, FILTER_STRING));
List<ReferenceOrderedDataSource> dataSources = getToolkit().getRodDataSources();
for ( ReferenceOrderedDataSource source : dataSources ) {
if ( source.getReferenceOrderedData().getName().equals("mask") ) {
hInfo.add("FILTER=" + MASK_NAME + ",\"Overlaps a user-input mask\"");
hInfo.add(new VCFFilterHeaderLine(MASK_NAME, "Overlaps a user-input mask"));
break;
}
}

View File

@ -37,6 +37,8 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.cmdLine.*;
import org.broadinstitute.sting.utils.genotype.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine;
import net.sf.samtools.SAMReadGroupRecord;
@ -139,7 +141,7 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
samples.clear();
// get the optional header fields
Set<String> headerInfo = getHeaderInfo();
Set<VCFHeaderLine> headerInfo = getHeaderInfo();
// create the output writer stream
if ( VARIANTS_FILE != null )
@ -154,16 +156,16 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
callsMetrics = new CallMetrics();
}
private Set<String> getHeaderInfo() {
Set<String> headerInfo = new HashSet<String>();
private Set<VCFHeaderLine> getHeaderInfo() {
Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();
// this is only applicable to VCF
if ( UAC.VAR_FORMAT != GenotypeWriterFactory.GENOTYPE_FORMAT.VCF )
return headerInfo;
// first, the basic info
headerInfo.add("source=UnifiedGenotyper");
headerInfo.add("reference=" + getToolkit().getArguments().referenceFile.getName());
headerInfo.add(new VCFHeaderLine("source", "UnifiedGenotyper"));
headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
// annotation (INFO) fields from VariantAnnotator
if ( UAC.ALL_ANNOTATIONS )
@ -172,10 +174,10 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
headerInfo.addAll(VariantAnnotator.getVCFAnnotationDescriptions());
// annotation (INFO) fields from UnifiedGenotyper
headerInfo.add("INFO=AF,1,Float,\"Allele Frequency\"");
headerInfo.add("INFO=NS,1,Integer,\"Number of Samples With Data\"");
headerInfo.add(new VCFInfoHeaderLine("AF", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Allele Frequency"));
headerInfo.add(new VCFInfoHeaderLine("NS", 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Number of Samples With Data"));
if ( !UAC.NO_SLOD )
headerInfo.add("INFO=SB,1,Float,\"Strand Bias\"");
headerInfo.add(new VCFInfoHeaderLine("SB", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Strand Bias"));
// FORMAT fields if not in POOLED mode
if ( UAC.genotypeModel != GenotypeCalculationModel.Model.POOLED )
@ -184,7 +186,7 @@ public class UnifiedGenotyper extends LocusWalker<Pair<VariationCall, List<Genot
// all of the arguments from the argument collection
Map<String,String> commandLineArgs = CommandLineUtils.getApproximateCommandLineArguments(Collections.<Object>singleton(UAC));
for ( Map.Entry<String, String> commandLineArg : commandLineArgs.entrySet() )
headerInfo.add(String.format("UG_%s=%s", commandLineArg.getKey(), commandLineArg.getValue()));
headerInfo.add(new VCFHeaderLine(String.format("UG_%s", commandLineArg.getKey()), commandLineArg.getValue()));
return headerInfo;
}

View File

@ -59,9 +59,9 @@ public class VariantsToVCF extends RefWalker<Integer, Integer> {
//Calendar cal = Calendar.getInstance();
//metaData.put("fileDate", String.format("%d%02d%02d", cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH)));
Set<String> metaData = new HashSet<String>();
metaData.add("source=VariantsToVCF");
metaData.add("reference=" + args.referenceFile.getAbsolutePath());
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
metaData.add(new VCFHeaderLine("reference", args.referenceFile.getAbsolutePath()));
Set<String> additionalColumns = new HashSet<String>();
additionalColumns.add("FORMAT");

View File

@ -33,9 +33,9 @@ public class VCFSubsetWalker extends RefWalker<ArrayList<VCFRecord>, VCFWriter>
public void initializeWriter() {
Set<String> metaData = new HashSet<String>();
metaData.add("source=VariantsToVCF");
metaData.add("reference=" + this.getToolkit().getArguments().referenceFile.getAbsolutePath());
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine("source", "VariantsToVCF"));
metaData.add(new VCFHeaderLine("reference", this.getToolkit().getArguments().referenceFile.getAbsolutePath()));
Set<String> additionalColumns = new HashSet<String>();
additionalColumns.add("FORMAT");

View File

@ -38,7 +38,7 @@ public class GenotypeWriterFactory {
SAMFileHeader header,
File destination,
Set<String> sampleNames,
Set<String> headerInfo) {
Set<VCFHeaderLine> headerInfo) {
switch (format) {
case GLF:
return new GLFWriter(header.toString(), destination);
@ -57,7 +57,7 @@ public class GenotypeWriterFactory {
SAMFileHeader header,
PrintStream destination,
Set<String> sampleNames,
Set<String> headerInfo) {
Set<VCFHeaderLine> headerInfo) {
switch (format) {
case GELI:
return new GeliTextWriter(destination);

View File

@ -0,0 +1,40 @@
package org.broadinstitute.sting.utils.genotype.vcf;
/**
 * @author ebanks
 * <p/>
 * Class VCFFilterHeaderLine
 * <p/>
 * A class representing a key=value entry for FILTER fields in the VCF header.
 * <p/>
 * The generic key passed to the superclass is always "FILTER" and the generic
 * value is empty; the textual form is built from the filter name and
 * description instead (see {@link #makeStringRep()}).
 */
public class VCFFilterHeaderLine extends VCFHeaderLine {

    private final String mName;
    private final String mDescription;

    /**
     * create a VCF filter header line
     *
     * @param name         the name for this header line
     * @param description  the description for this header line
     */
    public VCFFilterHeaderLine(String name, String description) {
        super("FILTER", "");
        mName = name;
        mDescription = description;
    }

    /**
     * Build the textual form of this line: FILTER=name,"description"
     *
     * @return the string representation of this header line
     */
    protected String makeStringRep() {
        return String.format("FILTER=%s,\"%s\"", mName, mDescription);
    }

    /**
     * Two filter lines are equal when both their name and description match.
     *
     * @param o the object to compare against
     * @return true if o is a VCFFilterHeaderLine with the same name and description
     */
    public boolean equals(Object o) {
        if ( !(o instanceof VCFFilterHeaderLine) )
            return false;
        VCFFilterHeaderLine other = (VCFFilterHeaderLine)o;
        return mName.equals(other.mName) &&
               mDescription.equals(other.mDescription);
    }

    /**
     * Hash over the same fields that equals() compares, so that equal lines
     * land in the same bucket of a hash-based collection.
     *
     * @return a hash code consistent with equals()
     */
    public int hashCode() {
        int result = (mName == null) ? 0 : mName.hashCode();
        result = 31 * result + ((mDescription == null) ? 0 : mDescription.hashCode());
        return result;
    }
}

View File

@ -0,0 +1,53 @@
package org.broadinstitute.sting.utils.genotype.vcf;
/**
 * @author ebanks
 * <p/>
 * Class VCFFormatHeaderLine
 * <p/>
 * A class representing a key=value entry for genotype FORMAT fields in the VCF header.
 * <p/>
 * The generic key passed to the superclass is always "FORMAT" and the generic
 * value is empty; the textual form is built from the name, count, type and
 * description instead (see {@link #makeStringRep()}).
 */
public class VCFFormatHeaderLine extends VCFHeaderLine {

    // the format field types
    public enum INFO_TYPE {
        Integer, Float, String
    }

    private final String mName;
    private final int mCount;
    private final String mDescription;
    private final INFO_TYPE mType;

    /**
     * create a VCF format header line
     *
     * @param name         the name for this header line
     * @param count        the count for this header line
     * @param type         the type for this header line
     * @param description  the description for this header line
     */
    public VCFFormatHeaderLine(String name, int count, INFO_TYPE type, String description) {
        super("FORMAT", "");
        mName = name;
        mCount = count;
        mType = type;
        mDescription = description;
    }

    /**
     * Build the textual form of this line: FORMAT=name,count,type,"description"
     *
     * @return the string representation of this header line
     */
    protected String makeStringRep() {
        return String.format("FORMAT=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
    }

    /**
     * Two format lines are equal when name, count, description and type all match.
     *
     * @param o the object to compare against
     * @return true if o is a VCFFormatHeaderLine with identical fields
     */
    public boolean equals(Object o) {
        if ( !(o instanceof VCFFormatHeaderLine) )
            return false;
        VCFFormatHeaderLine other = (VCFFormatHeaderLine)o;
        return mName.equals(other.mName) &&
               mCount == other.mCount &&
               mDescription.equals(other.mDescription) &&
               mType == other.mType;
    }

    /**
     * Hash over the same fields that equals() compares, so that equal lines
     * land in the same bucket of a hash-based collection.
     *
     * @return a hash code consistent with equals()
     */
    public int hashCode() {
        int result = (mName == null) ? 0 : mName.hashCode();
        result = 31 * result + mCount;
        result = 31 * result + ((mDescription == null) ? 0 : mDescription.hashCode());
        result = 31 * result + ((mType == null) ? 0 : mType.hashCode());
        return result;
    }
}

View File

@ -262,12 +262,12 @@ public class VCFGenotypeRecord implements Genotype, SampleBacked {
return result;
}
public static Set<String> getSupportedHeaderStrings() {
Set<String> result = new HashSet<String>();
result.add("FORMAT=" + GENOTYPE_KEY + ",1,String,\"Genotype\"");
result.add("FORMAT=" + GENOTYPE_QUALITY_KEY + ",1,Integer,\"Genotype Quality\"");
result.add("FORMAT=" + DEPTH_KEY + ",1,Integer,\"Read Depth (without MQ0 reads)\"");
//result.add("FORMAT=" + HAPLOTYPE_QUALITY_KEY + ",1,Integer,\"Haplotype Quality\"");
public static Set<VCFFormatHeaderLine> getSupportedHeaderStrings() {
Set<VCFFormatHeaderLine> result = new HashSet<VCFFormatHeaderLine>();
result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype"));
result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Genotype Quality"));
result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (without MQ0 reads)"));
//result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality"));
return result;
}
}

View File

@ -25,7 +25,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
public VCFGenotypeWriterAdapter(File writeTo, Set<String> sampleNames, Set<String> headerInfo) {
public VCFGenotypeWriterAdapter(File writeTo, Set<String> sampleNames, Set<VCFHeaderLine> headerInfo) {
mSampleNames.addAll(sampleNames);
initializeHeader(headerInfo);
@ -34,7 +34,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
mWriter = new VCFWriter(mHeader, writeTo);
}
public VCFGenotypeWriterAdapter(OutputStream writeTo, Set<String> sampleNames, Set<String> headerInfo) {
public VCFGenotypeWriterAdapter(OutputStream writeTo, Set<String> sampleNames, Set<VCFHeaderLine> headerInfo) {
mSampleNames.addAll(sampleNames);
initializeHeader(headerInfo);
@ -48,11 +48,11 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
*
* @param optionalHeaderInfo the optional header fields
*/
private void initializeHeader(Set<String> optionalHeaderInfo) {
Set<String> hInfo = new TreeSet<String>();
private void initializeHeader(Set<VCFHeaderLine> optionalHeaderInfo) {
Set<VCFHeaderLine> hInfo = new TreeSet<VCFHeaderLine>();
// setup the header fields
hInfo.add(VCFHeader.FULL_FORMAT_LINE);
hInfo.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION));
hInfo.addAll(optionalHeaderInfo);
// setup the sample names

View File

@ -8,12 +8,12 @@ import java.util.*;
* <p/>
* Class VCFHeader
* <p/>
* A descriptions should go here. Blame aaron if it's missing.
* A class representing the VCF header
*/
public class VCFHeader {
public static final String FILE_FORMAT_KEY = "fileformat=";
public static final String OLD_FILE_FORMAT_KEY = "format="; // from version 3.2
public static final String FILE_FORMAT_KEY = "fileformat";
public static final String OLD_FILE_FORMAT_KEY = "format"; // from version 3.2
/** the current vcf version we support. */
@ -22,7 +22,6 @@ public class VCFHeader {
public static final double VCF_VERSION_NUMBER = 3.3;
public static final String VCF_VERSION = VCF_VERSION_HEADER + VCF_VERSION_NUMBER;
public static final String FULL_FORMAT_LINE = FILE_FORMAT_KEY + VCF_VERSION;
// the mandatory header fields
public enum HEADER_FIELDS {
@ -30,7 +29,7 @@ public class VCFHeader {
}
// the associated meta data
private final Set<String> mMetaData;
private final Set<VCFHeaderLine> mMetaData;
// the list of auxiliary tags
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
@ -50,8 +49,8 @@ public class VCFHeader {
*
* @param metaData the meta data associated with this header
*/
public VCFHeader(Set<String> metaData) {
mMetaData = new TreeSet<String>(metaData);
public VCFHeader(Set<VCFHeaderLine> metaData) {
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
checkVCFVersion();
}
@ -61,8 +60,8 @@ public class VCFHeader {
* @param metaData the meta data associated with this header
* @param genotypeSampleNames the genotype format field, and the sample names
*/
public VCFHeader(Set<String> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<String>(metaData);
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
for (String col : genotypeSampleNames) {
if (!col.equals("FORMAT"))
mGenotypeSampleNames.add(col);
@ -77,19 +76,15 @@ public class VCFHeader {
*/
public void checkVCFVersion() {
String version = null;
for ( String field : mMetaData ) {
if ( field.startsWith(FILE_FORMAT_KEY) ) {
version = field.substring(FILE_FORMAT_KEY.length());
break;
}
else if ( field.startsWith(OLD_FILE_FORMAT_KEY) ) {
version = field.substring(OLD_FILE_FORMAT_KEY.length());
for ( VCFHeaderLine line : mMetaData ) {
if ( line.getKey().equals(FILE_FORMAT_KEY) || line.getKey().equals(OLD_FILE_FORMAT_KEY) ) {
version = line.getValue();
break;
}
}
if ( version == null )
mMetaData.add(FULL_FORMAT_LINE);
mMetaData.add(new VCFHeaderLine(FILE_FORMAT_KEY, VCF_VERSION));
else if ( !isSupportedVersion(version) )
throw new RuntimeException("VCF version " + version +
" is not yet supported; only version " + VCF_VERSION + " and earlier can be used");
@ -124,7 +119,7 @@ public class VCFHeader {
*
* @return a set of the meta data
*/
public Set<String> getMetaData() {
public Set<VCFHeaderLine> getMetaData() {
return mMetaData;
}

View File

@ -0,0 +1,86 @@
package org.broadinstitute.sting.utils.genotype.vcf;
/**
* @author ebanks
* <p/>
* Class VCFHeaderLine
* <p/>
* A class representing a key=value entry in the VCF header
*/
public class VCFHeaderLine implements Comparable {
private String stringRep = null;
private String mKey = null;
private String mValue = null;
/**
* create a VCF header line
*
* @param key the key for this header line
* @param value the value for this header line
*/
public VCFHeaderLine(String key, String value) {
mKey = key;
mValue = value;
}
/**
* Get the key
*
* @return the key
*/
public String getKey() {
return mKey;
}
/**
* Set the key
*
* @param key the key for this header line
*/
public void setKey(String key) {
mKey = key;
stringRep = null;
}
/**
* Get the value
*
* @return the value
*/
public String getValue() {
return mValue;
}
/**
* Set the value
*
* @param value the value for this header line
*/
public void setValue(String value) {
mValue = value;
stringRep = null;
}
public String toString() {
if ( stringRep == null )
stringRep = makeStringRep();
return stringRep;
}
protected String makeStringRep() {
return mKey + "=" + mValue;
}
public boolean equals(Object o) {
if ( !(o instanceof VCFHeaderLine) )
return false;
return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue());
}
public int compareTo(Object other) {
return toString().compareTo(other.toString());
}
}

View File

@ -0,0 +1,53 @@
package org.broadinstitute.sting.utils.genotype.vcf;
/**
 * @author ebanks
 * <p/>
 * Class VCFInfoHeaderLine
 * <p/>
 * A class representing a key=value entry for INFO fields in the VCF header.
 * <p/>
 * The generic key passed to the superclass is always "INFO" and the generic
 * value is empty; the textual form is built from the name, count, type and
 * description instead (see {@link #makeStringRep()}).
 */
public class VCFInfoHeaderLine extends VCFHeaderLine {

    // the info field types
    public enum INFO_TYPE {
        Integer, Float, String
    }

    private final String mName;
    private final int mCount;
    private final String mDescription;
    private final INFO_TYPE mType;

    /**
     * create a VCF info header line
     *
     * @param name         the name for this header line
     * @param count        the count for this header line
     * @param type         the type for this header line
     * @param description  the description for this header line
     */
    public VCFInfoHeaderLine(String name, int count, INFO_TYPE type, String description) {
        super("INFO", "");
        mName = name;
        mCount = count;
        mType = type;
        mDescription = description;
    }

    /**
     * Build the textual form of this line: INFO=name,count,type,"description"
     *
     * @return the string representation of this header line
     */
    protected String makeStringRep() {
        return String.format("INFO=%s,%d,%s,\"%s\"", mName, mCount, mType.toString(), mDescription);
    }

    /**
     * Two info lines are equal when name, count, description and type all match.
     *
     * @param o the object to compare against
     * @return true if o is a VCFInfoHeaderLine with identical fields
     */
    public boolean equals(Object o) {
        if ( !(o instanceof VCFInfoHeaderLine) )
            return false;
        VCFInfoHeaderLine other = (VCFInfoHeaderLine)o;
        return mName.equals(other.mName) &&
               mCount == other.mCount &&
               mDescription.equals(other.mDescription) &&
               mType == other.mType;
    }

    /**
     * Hash over the same fields that equals() compares, so that equal lines
     * land in the same bucket of a hash-based collection.
     *
     * @return a hash code consistent with equals()
     */
    public int hashCode() {
        int result = (mName == null) ? 0 : mName.hashCode();
        result = 31 * result + mCount;
        result = 31 * result + ((mDescription == null) ? 0 : mDescription.hashCode());
        result = 31 * result + ((mType == null) ? 0 : mType.hashCode());
        return result;
    }
}

View File

@ -146,7 +146,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
* @return a VCF Header created from the list of strings
*/
protected VCFHeader createHeader(List<String> headerStrings) {
Set<String> metaData = new TreeSet<String>();
Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
Set<String> auxTags = new LinkedHashSet<String>();
// iterate over all the passed in strings
for ( String str : headerStrings ) {
@ -169,7 +169,9 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
arrayIndex++;
}
} else {
metaData.add(str.substring(2));
int equals = str.indexOf("=");
if ( equals != -1 )
metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
}
}

View File

@ -27,10 +27,10 @@ public class VCFUtils {
*
* @return a set of all fields
*/
public static Set<String> getHeaderFields(GenomeAnalysisEngine toolkit) {
public static Set<VCFHeaderLine> getHeaderFields(GenomeAnalysisEngine toolkit) {
// keep a map of sample name to occurrences encountered
TreeSet<String> fields = new TreeSet<String>();
TreeSet<VCFHeaderLine> fields = new TreeSet<VCFHeaderLine>();
// iterate to get all of the sample names
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();

View File

@ -50,24 +50,22 @@ public class VCFWriter {
new OutputStreamWriter(location));
try {
// the fileformat field needs to be written first
TreeSet<String> allMetaData = new TreeSet<String>(header.getMetaData());
for ( String metadata : allMetaData ) {
if ( metadata.startsWith(VCFHeader.FILE_FORMAT_KEY) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "\n");
break;
TreeSet<VCFHeaderLine> nonFormatMetaData = new TreeSet<VCFHeaderLine>();
for ( VCFHeaderLine line : header.getMetaData() ) {
if ( line.getKey().equals(VCFHeader.FILE_FORMAT_KEY) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + line.toString() + "\n");
}
else if ( metadata.startsWith(VCFHeader.OLD_FILE_FORMAT_KEY) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeader.FILE_FORMAT_KEY + metadata.substring(VCFHeader.OLD_FILE_FORMAT_KEY.length()) + "\n");
break;
else if ( line.getKey().equals(VCFHeader.OLD_FILE_FORMAT_KEY) ) {
mWriter.write(VCFHeader.METADATA_INDICATOR + VCFHeader.FILE_FORMAT_KEY + line.toString().substring(VCFHeader.OLD_FILE_FORMAT_KEY.length()) + "\n");
} else {
nonFormatMetaData.add(line);
}
}
// write the rest of the header meta-data out
for ( String metadata : header.getMetaData() ) {
if ( !metadata.startsWith(VCFHeader.FILE_FORMAT_KEY) && !metadata.startsWith(VCFHeader.OLD_FILE_FORMAT_KEY) )
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "\n");
}
for ( VCFHeaderLine line : nonFormatMetaData )
mWriter.write(VCFHeader.METADATA_INDICATOR + line + "\n");
// write out the column line
StringBuilder b = new StringBuilder();
b.append(VCFHeader.HEADER_INDICATOR);

View File

@ -17,8 +17,7 @@ import java.util.*;
*/
public class VCFHeaderTest extends BaseTest {
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
private Set<String> metaData = new HashSet();
private Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
private Set<String> additionalColumns = new HashSet<String>();
/**
@ -26,8 +25,8 @@ public class VCFHeaderTest extends BaseTest {
*/
@Test
public void testHeaderConstructor() {
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
metaData.add("two=2");
metaData.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION));
metaData.add(new VCFHeaderLine("two", "2"));
additionalColumns.add("extra1");
additionalColumns.add("extra2");
// this should create a header that is valid

View File

@ -24,17 +24,16 @@ public class VCFReaderTest extends BaseTest {
private static final File complexFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/complexExample.vcf");
private static final File headerNoRecordsFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/headerNoRecords.vcf");
private static IndexedFastaSequenceFile seq;
/**
 * One-time setup for this test class: opens the indexed reference FASTA at a
 * hard-coded path, stores it in the static seq field, and passes it to
 * GenomeLocParser.setupRefContigOrdering().
 *
 * @throws StingException if the reference FASTA file cannot be found
 */
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
} catch (FileNotFoundException e) {
// NOTE(review): the original FileNotFoundException is not attached to the rethrown exception
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/**
 * One-time setup for this test class: opens the indexed reference FASTA at a
 * hard-coded path into a local variable and passes it to
 * GenomeLocParser.setupRefContigOrdering().
 *
 * @throws StingException if the reference FASTA file cannot be found
 */
@BeforeClass
public static void beforeTests() {
try {
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File("/broad/1KG/reference/human_b36_both.fasta"));
// the contig ordering is set up inside the try, so it only runs when the file was opened
GenomeLocParser.setupRefContigOrdering(seq);
} catch (FileNotFoundException e) {
// NOTE(review): the original FileNotFoundException is not attached to the rethrown exception
throw new StingException("unable to load the sequence dictionary");
}
}
@Test
public void testVCFInput() {
@ -330,7 +329,6 @@ public class VCFReaderTest extends BaseTest {
public void testHeaderNoRecords() {
VCFReader reader = new VCFReader(headerNoRecordsFile);
Assert.assertTrue(reader.getHeader().getMetaData() != null);
Iterator<VCFRecord> iter = reader.iterator();
Assert.assertTrue(!reader.iterator().hasNext());
}

View File

@ -22,21 +22,20 @@ import java.io.FileNotFoundException;
*/
public class VCFRecordTest extends BaseTest {
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq);
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/**
* create a fake VCF record
*
* @param infoFields the info fields
* @return a VCFRecord
*/
private static VCFRecord makeFakeVCFRecord(Map<String, String> infoFields) {
@ -140,9 +139,9 @@ public class VCFRecordTest extends BaseTest {
* @return a fake VCF header
*/
public static VCFHeader createFakeHeader() {
Set<String> metaData = new HashSet();
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
metaData.add("two=2");
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION));
metaData.add(new VCFHeaderLine("two", "2"));
Set<String> additionalColumns = new HashSet<String>();
additionalColumns.add("FORMAT");
additionalColumns.add("sample1");
@ -158,8 +157,6 @@ public class VCFRecordTest extends BaseTest {
Map<String, String> infoFields = new HashMap<String, String>();
infoFields.put("DP", "50");
VCFRecord rec = makeFakeVCFRecord(infoFields);
Map<String, String> metaData = new HashMap<String, String>();
List<String> additionalColumns = new ArrayList<String>();
String rep = rec.toStringEncoding(createFakeHeader());
Assert.assertTrue(stringRep.equals(rep));
rec.addInfoField("AB", "CD");

View File

@ -21,21 +21,18 @@ import java.util.*;
* This class tests out the ability of the VCF writer to correctly write VCF files
*/
public class VCFWriterTest extends BaseTest {
private Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
private Set<String> metaData = new HashSet();
private Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
private Set<String> additionalColumns = new HashSet<String>();
private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf");
private static IndexedFastaSequenceFile seq;
@BeforeClass
public static void beforeTests() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq);
} catch (FileNotFoundException e) {
throw new StingException("unable to load the sequence dictionary");
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/** test, using the writer and reader, that we can output and input a VCF file without problems */
@ -50,21 +47,23 @@ public class VCFWriterTest extends BaseTest {
int counter = 0;
// validate what we're reading in
validateHeader(reader.getHeader());
for(VCFRecord rec :reader) {
for (VCFRecord rec : reader) {
counter++;
}
Assert.assertEquals(2,counter);
Assert.assertEquals(2,counter);
reader.close();
fakeVCFFile.delete();
}
/**
* create a fake header of known quantity
* @param metaData the header lines
* @param additionalColumns the additional column names
* @return a fake VCF header
*/
public static VCFHeader createFakeHeader(Set<String> metaData, Set<String> additionalColumns) {
metaData.add(VCFHeader.FULL_FORMAT_LINE); // required
metaData.add("two=2");
public static VCFHeader createFakeHeader(Set<VCFHeaderLine> metaData, Set<String> additionalColumns) {
metaData.add(new VCFHeaderLine(VCFHeader.FILE_FORMAT_KEY, VCFHeader.VCF_VERSION));
metaData.add(new VCFHeaderLine("two", "2"));
additionalColumns.add("FORMAT");
additionalColumns.add("extra1");
additionalColumns.add("extra2");