Expanded, contrastive VariantContextBenchmark
-- Compares performance across a bunch of common operations with GATK 1.3 version of VariantContext and GATK 1.4 -- 1.3 VC and associated utilities copied wholesale into test directory under v13
This commit is contained in:
parent
e56d52006a
commit
101ffc4dfd
|
|
@ -27,8 +27,14 @@ package org.broadinstitute.sting.utils.variantcontext;
|
|||
import com.google.caliper.Param;
|
||||
import com.google.caliper.SimpleBenchmark;
|
||||
import com.google.caliper.runner.CaliperMain;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
|
@ -46,24 +52,39 @@ public class VariantContextBenchmark extends SimpleBenchmark {
|
|||
@Param({"100"})
|
||||
int nSamplesToTake; // set automatically by framework
|
||||
|
||||
@Param({"READ", "READ_SUBSET"})
|
||||
Operation operation; // set automatically by framework
|
||||
@Param({"10"})
|
||||
int dupsToMerge; // set automatically by framework
|
||||
|
||||
@Param({"OF_SAMPLES"})
|
||||
SubContextOp subContextOp; // set automatically by framework
|
||||
@Param
|
||||
Operation operation; // set automatically by framework
|
||||
|
||||
private String INPUT_STRING;
|
||||
|
||||
public enum Operation {
|
||||
READ,
|
||||
READ_SUBSET
|
||||
SUBSET_TO_SAMPLES,
|
||||
GET_TYPE,
|
||||
GET_ID,
|
||||
GET_GENOTYPES,
|
||||
GET_ATTRIBUTE_STRING,
|
||||
GET_ATTRIBUTE_INT,
|
||||
GET_N_SAMPLES,
|
||||
GET_GENOTYPES_FOR_SAMPLES,
|
||||
GET_GENOTYPES_IN_ORDER_OF_NAME,
|
||||
CALC_GENOTYPE_COUNTS,
|
||||
MERGE
|
||||
}
|
||||
|
||||
public enum SubContextOp {
|
||||
OF_SAMPLES
|
||||
}
|
||||
private GenomeLocParser b37GenomeLocParser;
|
||||
|
||||
@Override protected void setUp() {
|
||||
try {
|
||||
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.b37KGReference));
|
||||
b37GenomeLocParser = new GenomeLocParser(seq);
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
// read it into a String so that we don't try to benchmark IO issues
|
||||
try {
|
||||
FileInputStream s = new FileInputStream(new File(vcfFile));
|
||||
|
|
@ -83,52 +104,266 @@ public class VariantContextBenchmark extends SimpleBenchmark {
|
|||
}
|
||||
}
|
||||
|
||||
private void parseGenotypes(VCFCodec codec, Operation op, SubContextOp subop ) {
|
||||
private interface FunctionToBenchmark<T extends Feature> {
|
||||
public void run(T vc);
|
||||
}
|
||||
|
||||
private <T extends Feature> void runBenchmark(FeatureCodec<T> codec, FunctionToBenchmark<T> func) {
|
||||
try {
|
||||
InputStream is = new ByteArrayInputStream(INPUT_STRING.getBytes());
|
||||
AsciiLineReader lineReader = new AsciiLineReader(is);
|
||||
codec.readHeader(lineReader);
|
||||
|
||||
int counter = 0;
|
||||
Set<String> samples = null;
|
||||
while (counter++ < linesToRead ) {
|
||||
String line = lineReader.readLine();
|
||||
if ( line == null )
|
||||
break;
|
||||
|
||||
VariantContext vc = (VariantContext)codec.decode(line);
|
||||
if ( samples == null ) {
|
||||
samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
|
||||
}
|
||||
|
||||
if ( op == Operation.READ_SUBSET)
|
||||
processOneVC(vc, samples, subop);
|
||||
T vc = codec.decode(line);
|
||||
func.run(vc);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println("Benchmarking run failure because of " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public void timeMe(int rep) {
|
||||
for ( int i = 0; i < rep; i++ )
|
||||
parseGenotypes(new VCFCodec(), operation, subContextOp);
|
||||
/**
 * Caliper benchmark entry point for the current (v1.4) VariantContext.
 * Caliper invokes time*(int reps) methods with the repetition count.
 *
 * @param rep number of repetitions requested by the Caliper framework
 */
public void timeV14(int rep) {
    for ( int i = 0; i < rep; i++ ) {
        // resolve the operation under test once per repetition, then stream the VCF through it
        FunctionToBenchmark<VariantContext> func = getV14FunctionToBenchmark();
        FeatureCodec<VariantContext> codec = new VCFCodec();
        runBenchmark(codec, func);
    }
}
|
||||
|
||||
public FunctionToBenchmark<VariantContext> getV14FunctionToBenchmark() {
|
||||
switch ( operation ) {
|
||||
case READ:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
; // empty operation
|
||||
}
|
||||
};
|
||||
case SUBSET_TO_SAMPLES:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
Set<String> samples;
|
||||
public void run(final VariantContext vc) {
|
||||
if ( samples == null )
|
||||
samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
|
||||
VariantContext sub = vc.subContextFromSamples(samples);
|
||||
sub.getNSamples();
|
||||
}
|
||||
};
|
||||
case GET_TYPE:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getType();
|
||||
}
|
||||
};
|
||||
case GET_ID:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getID();
|
||||
}
|
||||
};
|
||||
case GET_GENOTYPES:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getGenotypes().size();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_GENOTYPES_FOR_SAMPLES:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
Set<String> samples;
|
||||
public void run(final VariantContext vc) {
|
||||
if ( samples == null )
|
||||
samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
|
||||
vc.getGenotypes(samples).size();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_ATTRIBUTE_STRING:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getAttribute("AN", null);
|
||||
}
|
||||
};
|
||||
|
||||
case GET_ATTRIBUTE_INT:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getAttributeAsInt("AC", 0);
|
||||
}
|
||||
};
|
||||
|
||||
case GET_N_SAMPLES:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getNSamples();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_GENOTYPES_IN_ORDER_OF_NAME:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
; // TODO - TEST IS BROKEN
|
||||
// int n = 0;
|
||||
// for ( final Genotype g: vc.getGenotypesSortedByName() ) n++;
|
||||
}
|
||||
};
|
||||
|
||||
case CALC_GENOTYPE_COUNTS:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
vc.getHetCount();
|
||||
}
|
||||
};
|
||||
|
||||
case MERGE:
|
||||
return new FunctionToBenchmark<VariantContext>() {
|
||||
public void run(final VariantContext vc) {
|
||||
List<VariantContext> toMerge = new ArrayList<VariantContext>();
|
||||
|
||||
for ( int i = 0; i < dupsToMerge; i++ ) {
|
||||
GenotypeCollection gc = GenotypeCollection.create(vc.getNSamples());
|
||||
for ( final Genotype g : vc.getGenotypes() ) {
|
||||
gc.add(new Genotype(g.getSampleName()+"_"+i, g));
|
||||
}
|
||||
toMerge.add(VariantContext.modifyGenotypes(vc, gc));
|
||||
}
|
||||
|
||||
VariantContextUtils.simpleMerge(b37GenomeLocParser, toMerge, null,
|
||||
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||
VariantContextUtils.GenotypeMergeType.UNSORTED,
|
||||
true, false, "set", false, true);
|
||||
}
|
||||
};
|
||||
|
||||
default: throw new IllegalArgumentException("Unexpected operation " + operation);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Caliper benchmark entry point for the legacy (GATK 1.3) VariantContext,
 * whose classes were copied wholesale into the v13 test package so the two
 * implementations can be compared head-to-head.
 *
 * @param rep number of repetitions requested by the Caliper framework
 */
public void timeV13(int rep) {
    for ( int i = 0; i < rep; i++ ) {
        // fully-qualified names are used throughout to keep the v13 copies distinct from the current classes
        FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> func = getV13FunctionToBenchmark();
        FeatureCodec<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> codec = new org.broadinstitute.sting.utils.variantcontext.v13.VCFCodec();
        runBenchmark(codec, func);
    }
}
|
||||
|
||||
public FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> getV13FunctionToBenchmark() {
|
||||
switch ( operation ) {
|
||||
case READ:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
; // empty operation
|
||||
}
|
||||
};
|
||||
case SUBSET_TO_SAMPLES:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
List<String> samples;
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
if ( samples == null )
|
||||
samples = new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake);
|
||||
org.broadinstitute.sting.utils.variantcontext.v13.VariantContext sub = vc.subContextFromGenotypes(vc.getGenotypes(samples).values());
|
||||
sub.getNSamples();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_TYPE:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getType();
|
||||
}
|
||||
};
|
||||
case GET_ID:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getID();
|
||||
}
|
||||
};
|
||||
case GET_GENOTYPES:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getGenotypes().size();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_GENOTYPES_FOR_SAMPLES:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
Set<String> samples;
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
if ( samples == null )
|
||||
samples = new HashSet<String>(new ArrayList<String>(vc.getSampleNames()).subList(0, nSamplesToTake));
|
||||
vc.getGenotypes(samples).size();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_ATTRIBUTE_STRING:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getAttribute("AN", null);
|
||||
}
|
||||
};
|
||||
|
||||
case GET_ATTRIBUTE_INT:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getAttributeAsInt("AC", 0);
|
||||
}
|
||||
};
|
||||
|
||||
case GET_N_SAMPLES:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getNSamples();
|
||||
}
|
||||
};
|
||||
|
||||
case GET_GENOTYPES_IN_ORDER_OF_NAME:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
; // TODO - TEST IS BROKEN
|
||||
//vc.getGenotypesSortedByName();
|
||||
}
|
||||
};
|
||||
|
||||
case CALC_GENOTYPE_COUNTS:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
vc.getHetCount();
|
||||
}
|
||||
};
|
||||
|
||||
case MERGE:
|
||||
return new FunctionToBenchmark<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>() {
|
||||
public void run(final org.broadinstitute.sting.utils.variantcontext.v13.VariantContext vc) {
|
||||
List<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext> toMerge = new ArrayList<org.broadinstitute.sting.utils.variantcontext.v13.VariantContext>();
|
||||
|
||||
for ( int i = 0; i < dupsToMerge; i++ ) {
|
||||
Map<String, org.broadinstitute.sting.utils.variantcontext.v13.Genotype> gc = new HashMap<String, org.broadinstitute.sting.utils.variantcontext.v13.Genotype>();
|
||||
for ( final org.broadinstitute.sting.utils.variantcontext.v13.Genotype g : vc.getGenotypes().values() ) {
|
||||
String name = g.getSampleName()+"_"+i;
|
||||
gc.put(name, new org.broadinstitute.sting.utils.variantcontext.v13.Genotype(name,
|
||||
g.getAlleles(), g.getNegLog10PError(), g.getFilters(), g.getAttributes(), g.isPhased(), g.getLikelihoods().getAsVector()));
|
||||
toMerge.add(org.broadinstitute.sting.utils.variantcontext.v13.VariantContext.modifyGenotypes(vc, gc));
|
||||
}
|
||||
}
|
||||
|
||||
org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.simpleMerge(b37GenomeLocParser,
|
||||
toMerge, null,
|
||||
org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||
org.broadinstitute.sting.utils.variantcontext.v13.VariantContextUtils.GenotypeMergeType.UNSORTED,
|
||||
true, false, "set", false, true);
|
||||
}
|
||||
};
|
||||
|
||||
default: throw new IllegalArgumentException("Unexpected operation " + operation);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Command-line entry point: hands this benchmark class to the Caliper runner,
 * which selects the time*() methods and @Param combinations to execute.
 */
public static void main(String[] args) {
    CaliperMain.main(VariantContextBenchmark.class, args);
}
|
||||
|
||||
/**
 * Apply one sub-context operation to a single VariantContext and touch the
 * result (getNSamples) so the work cannot be optimized away.
 *
 * @param vc      the record to subset
 * @param samples the sample names to keep
 * @param subop   which sub-context strategy to exercise
 */
private static final void processOneVC(VariantContext vc, Set<String> samples, SubContextOp subop) {
    VariantContext sub;

    switch ( subop ) {
        case OF_SAMPLES:
            sub = vc.subContextFromSamples(samples, vc.getAlleles());
            break;
        default:
            throw new RuntimeException("Unexpected op: " + subop);
    }

    // force evaluation of the subset so the benchmark measures real work
    sub.getNSamples();
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,635 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.NameAwareCodec;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.BlockCompressedInputStream;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
|
||||
abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, VCFParser {
|
||||
|
||||
protected final static Logger log = Logger.getLogger(VCFCodec.class);
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column

// VCF spec version declared by the file; presumably set while parsing the header -- confirm in subclasses
protected VCFHeaderVersion version;

// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;

// a mapping of the allele; presumably the GT-string -> alleles cache handed to parseGenotypeAlleles -- confirm against createGenotypeMap
protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3);

// reusable scratch buffers for ParsingUtils.split, so each record does not allocate fresh arrays
protected String[] GTValueArray = new String[100];
protected String[] genotypeKeyArray = new String[100];
protected String[] infoFieldArray = new String[1000];
protected String[] infoValueArray = new String[1000];

// for performance testing purposes
public static boolean validate = true;

// a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over
// todo: make this thread safe?  NOTE(review): these shared scratch buffers make this codec non-thread-safe as written
protected String[] parts = null;
protected String[] genotypeParts = null;

// for performance we cache the hashmap of filter encodings for quick lookup
protected HashMap<String,LinkedHashSet<String>> filterHash = new HashMap<String,LinkedHashSet<String>>();

// a mapping of the VCF fields to their type, filter fields, and format fields, for quick lookup to validate against
TreeMap<String, VCFHeaderLineType> infoFields = new TreeMap<String, VCFHeaderLineType>();
TreeMap<String, VCFHeaderLineType> formatFields = new TreeMap<String, VCFHeaderLineType>();
Set<String> filterFields = new HashSet<String>();

// we store a name to give to each of the variant contexts we emit
protected String name = "Unknown";

// 1-based-ish record counter used only for error messages (incremented in decodeLoc and parseVCFLine)
protected int lineNo = 0;

// interning cache for frequently-repeated strings (contigs, REF/ALT, FILTER); see getCachedString
protected Map<String, String> stringCache = new HashMap<String, String>();
|
||||
|
||||
|
||||
/**
 * Parse the VCF header from the supplied reader.
 *
 * @param reader the line reader to take header lines from
 * @return the parsed header object (despite older docs claiming a line count,
 *         createHeader returns the VCFHeader itself -- TODO confirm against concrete subclasses)
 */
public abstract Object readHeader(LineReader reader);

/**
 * create a genotype map
 * @param str the string
 * @param alleles the list of alleles
 * @param chr chrom
 * @param pos position
 * @return a mapping of sample name to genotype object
 */
public abstract Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);


/**
 * parse the filter string, first checking to see if we already have parsed it in a previous attempt
 * @param filterString the string to parse
 * @return a set of the filters applied
 */
protected abstract Set<String> parseFilters(String filterString);
|
||||
|
||||
/**
 * create a VCF header
 * @param headerStrings a list of strings that represent all the ## entries
 * @param line the single # line (column names)
 * @return the constructed VCFHeader (also stored in the {@code header} field)
 */
protected Object createHeader(List<String> headerStrings, String line) {

    // treat the column-name line as just another header string for the loop below
    headerStrings.add(line);

    Set<VCFHeaderLine> metaData = new TreeSet<VCFHeaderLine>();
    Set<String> auxTags = new LinkedHashSet<String>();  // LinkedHashSet: sample names keep file order
    // iterate over all the passed in strings
    for ( String str : headerStrings ) {
        if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) {
            // this is the single '#CHROM ...' column-name line: validate the fixed columns
            String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR);
            if ( strings.length < VCFHeader.HEADER_FIELDS.values().length )
                throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str);

            // each mandatory column must appear, in the canonical order
            int arrayIndex = 0;
            for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
                try {
                    if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex]))
                        throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'");
                } catch (IllegalArgumentException e) {
                    // valueOf throws when the token is not a known column name at all
                    throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name.");
                }
                arrayIndex++;
            }

            // an optional FORMAT column must be followed by at least one sample column
            boolean sawFormatTag = false;
            if ( arrayIndex < strings.length ) {
                if ( !strings[arrayIndex].equals("FORMAT") )
                    throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
                sawFormatTag = true;
                arrayIndex++;
            }

            // everything after FORMAT is a sample name
            while ( arrayIndex < strings.length )
                auxTags.add(strings[arrayIndex++]);

            if ( sawFormatTag && auxTags.size() == 0 )
                throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");

        } else {
            // '##'-prefixed metadata lines; typed lines also populate the lookup tables used during validation
            if ( str.startsWith("##INFO=") ) {
                VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
                metaData.add(info);
                infoFields.put(info.getName(), info.getType());
            } else if ( str.startsWith("##FILTER=") ) {
                VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9),version);
                metaData.add(filter);
                filterFields.add(filter.getName());
            } else if ( str.startsWith("##FORMAT=") ) {
                VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9),version);
                metaData.add(format);
                formatFields.put(format.getName(), format.getType());
            } else {
                // generic '##key=value' line; lines with no '=' are silently dropped
                int equals = str.indexOf("=");
                if ( equals != -1 )
                    metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1)));
            }
        }
    }

    header = new VCFHeader(metaData, auxTags);
    return header;
}
|
||||
|
||||
/**
 * the fast decode function: parse only enough of the line to produce a
 * located Feature (chr/start/stop), skipping INFO and genotype parsing.
 *
 * @param line the line of text for the record
 * @return a feature, (not guaranteed complete) that has the correct start and stop;
 *         null for header lines
 */
public Feature decodeLoc(String line) {
    lineNo++;

    // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
    if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;

    // our header cannot be null, we need the genotype sample names and counts
    if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");

    // only the first 6 columns (CHROM..QUAL) are needed to locate the record
    final String[] locParts = new String[6];
    int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);

    if ( nParts != 6 )
        throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);

    // get our alleles (because the end position depends on them)
    final String ref = getCachedString(locParts[3].toUpperCase());
    final String alts = getCachedString(locParts[4].toUpperCase());
    final List<Allele> alleles = parseAlleles(ref, alts, lineNo);

    // find out our location
    final int start = Integer.valueOf(locParts[1]);
    int stop = start;

    // ref alleles don't need to be single bases for monomorphic sites
    if ( alleles.size() == 1 ) {
        stop = start + alleles.get(0).length() - 1;
    } else if ( !isSingleNucleotideEvent(alleles) ) {
        // indel-like records: the stop comes from allele clipping (null = we don't need the clipped alleles here)
        stop = clipAlleles(start, ref, alleles, null, lineNo);
    }

    return new VCFLocFeature(locParts[0], start, stop);
}
|
||||
|
||||
private final static class VCFLocFeature implements Feature {
|
||||
|
||||
final String chr;
|
||||
final int start, stop;
|
||||
|
||||
private VCFLocFeature(String chr, int start, int stop) {
|
||||
this.chr = chr;
|
||||
this.start = start;
|
||||
this.stop = stop;
|
||||
}
|
||||
|
||||
public String getChr() { return chr; }
|
||||
public int getStart() { return start; }
|
||||
public int getEnd() { return stop; }
|
||||
}
|
||||
|
||||
|
||||
/**
 * decode the line into a feature (VariantContext)
 * @param line the line
 * @return a VariantContext, or null for header lines
 */
public Feature decode(String line) {
    // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
    if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;

    // our header cannot be null, we need the genotype sample names and counts
    if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");

    // lazily allocate the reusable split buffer: at most the 8 fixed columns plus one genotype blob
    if (parts == null)
        parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];

    int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);

    // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns. Otherwise check that we have nine (normal colummns + genotyping data)
    if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
            (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
        throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
                " tokens, and saw " + nParts + " )", lineNo);

    return parseVCFLine(parts);
}
|
||||
|
||||
protected void generateException(String message) {
|
||||
throw new UserException.MalformedVCF(message, lineNo);
|
||||
}
|
||||
|
||||
protected static void generateException(String message, int lineNo) {
|
||||
throw new UserException.MalformedVCF(message, lineNo);
|
||||
}
|
||||
|
||||
/**
 * parse out the VCF line
 *
 * @param parts the parts split up (the 8 fixed columns, plus the unsplit genotype blob if present)
 * @return a variant context object
 */
private VariantContext parseVCFLine(String[] parts) {
    // increment the line count
    lineNo++;

    // parse out the required fields
    String contig = getCachedString(parts[0]);
    int pos = Integer.valueOf(parts[1]);
    String id = null;
    if ( parts[2].length() == 0 )
        generateException("The VCF specification requires a valid ID field");
    else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) )
        id = VCFConstants.EMPTY_ID_FIELD;
    else
        id = new String(parts[2]);  // explicit copy, so we don't pin the split buffer's backing data
    String ref = getCachedString(parts[3].toUpperCase());
    String alts = getCachedString(parts[4].toUpperCase());
    Double qual = parseQual(parts[5]);
    String filter = getCachedString(parts[6]);
    String info = new String(parts[7]);

    // get our alleles, filters, and setup an attribute map
    List<Allele> alleles = parseAlleles(ref, alts, lineNo);
    Set<String> filters = parseFilters(filter);
    Map<String, Object> attributes = parseInfo(info, id);

    // find out our current location, and clip the alleles down to their minimum length
    int loc = pos;
    // ref alleles don't need to be single bases for monomorphic sites
    if ( alleles.size() == 1 ) {
        loc = pos + alleles.get(0).length() - 1;
    } else if ( !isSingleNucleotideEvent(alleles) ) {
        ArrayList<Allele> newAlleles = new ArrayList<Allele>();
        loc = clipAlleles(pos, ref, alleles, newAlleles, lineNo);
        alleles = newAlleles;
    }

    // do we have genotyping data: stash the raw genotype string and this parser for lazy decoding later
    if (parts.length > NUM_STANDARD_FIELDS) {
        attributes.put(VariantContext.UNPARSED_GENOTYPE_MAP_KEY, new String(parts[8]));
        attributes.put(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY, this);
    }

    VariantContext vc = null;
    try {
        vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]);
    } catch (Exception e) {
        // generateException always throws, so vc cannot be null past this point
        generateException(e.getMessage());
    }

    // did we resort the sample names? If so, we need to load the genotype data
    if ( !header.samplesWereAlreadySorted() )
        vc.getGenotypes();

    return vc;
}
|
||||
|
||||
/**
 * Report the feature type this codec produces.
 *
 * @return the type of record, always {@code VariantContext.class}
 */
public Class<VariantContext> getFeatureType() {
    return VariantContext.class;
}
|
||||
|
||||
/**
 * get the name of this codec (attached to each emitted VariantContext)
 * @return our set name
 */
public String getName() {
    return name;
}
|
||||
|
||||
/**
 * set the name of this codec (attached to each subsequently emitted VariantContext)
 * @param name new name
 */
public void setName(String name) {
    this.name = name;
}
|
||||
|
||||
/**
|
||||
* Return a cached copy of the supplied string.
|
||||
*
|
||||
* @param str string
|
||||
* @return interned string
|
||||
*/
|
||||
protected String getCachedString(String str) {
|
||||
String internedString = stringCache.get(str);
|
||||
if ( internedString == null ) {
|
||||
internedString = new String(str);
|
||||
stringCache.put(internedString, internedString);
|
||||
}
|
||||
return internedString;
|
||||
}
|
||||
|
||||
/**
 * parse out the info fields
 * @param infoField the fields
 * @param id the identifier
 * @return a mapping of keys to objects
 */
private Map<String, Object> parseInfo(String infoField, String id) {
    Map<String, Object> attributes = new HashMap<String, Object>();

    if ( infoField.length() == 0 )
        generateException("The VCF specification requires a valid info field");

    if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) {
        if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 )
            generateException("The VCF specification does not allow for whitespace in the INFO field");

        // split 'K=V;K2;K3=a,b' entries into the shared scratch array (infoFieldArray)
        int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false);
        for (int i = 0; i < infoFieldSplitSize; i++) {
            String key;
            Object value;

            int eqI = infoFieldArray[i].indexOf("=");
            if ( eqI != -1 ) {
                key = infoFieldArray[i].substring(0, eqI);
                String str = infoFieldArray[i].substring(eqI+1);

                // split on the INFO field separator; single values stay Strings, lists become ArrayList<String>
                int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false);
                if ( infoValueSplitSize == 1 ) {
                    value = infoValueArray[0];
                } else {
                    ArrayList<String> valueList = new ArrayList<String>(infoValueSplitSize);
                    for ( int j = 0; j < infoValueSplitSize; j++ )
                        valueList.add(infoValueArray[j]);
                    value = valueList;
                }
            } else {
                // a bare key is a flag: store Boolean TRUE
                key = infoFieldArray[i];
                value = true;
            }

            attributes.put(key, value);
        }
    }

    // a real ID rides along in the attribute map rather than as a separate field
    if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) )
        attributes.put(VariantContext.ID_KEY, id);
    return attributes;
}
|
||||
|
||||
/**
|
||||
* create a an allele from an index and an array of alleles
|
||||
* @param index the index
|
||||
* @param alleles the alleles
|
||||
* @return an Allele
|
||||
*/
|
||||
protected static Allele oneAllele(String index, List<Allele> alleles) {
|
||||
if ( index.equals(VCFConstants.EMPTY_ALLELE) )
|
||||
return Allele.NO_CALL;
|
||||
int i = Integer.valueOf(index);
|
||||
if ( i >= alleles.size() )
|
||||
throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record");
|
||||
return alleles.get(i);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* parse genotype alleles from the genotype string
|
||||
* @param GT GT string
|
||||
* @param alleles list of possible alleles
|
||||
* @param cache cache of alleles for GT
|
||||
* @return the allele list for the GT string
|
||||
*/
|
||||
protected static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) {
|
||||
// cache results [since they are immutable] and return a single object for each genotype
|
||||
List<Allele> GTAlleles = cache.get(GT);
|
||||
|
||||
if ( GTAlleles == null ) {
|
||||
StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS);
|
||||
GTAlleles = new ArrayList<Allele>(st.countTokens());
|
||||
while ( st.hasMoreTokens() ) {
|
||||
String genotype = st.nextToken();
|
||||
GTAlleles.add(oneAllele(genotype, alleles));
|
||||
}
|
||||
cache.put(GT, GTAlleles);
|
||||
}
|
||||
|
||||
return GTAlleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the qual value
|
||||
* @param qualString the quality string
|
||||
* @return return a double
|
||||
*/
|
||||
protected static Double parseQual(String qualString) {
|
||||
// if we're the VCF 4 missing char, return immediately
|
||||
if ( qualString.equals(VCFConstants.MISSING_VALUE_v4))
|
||||
return VariantContext.NO_NEG_LOG_10PERROR;
|
||||
|
||||
Double val = Double.valueOf(qualString);
|
||||
|
||||
// check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles
|
||||
if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON))
|
||||
return VariantContext.NO_NEG_LOG_10PERROR;
|
||||
|
||||
// scale and return the value
|
||||
return val / 10.0;
|
||||
}
|
||||
|
||||
/**
|
||||
* parse out the alleles
|
||||
* @param ref the reference base
|
||||
* @param alts a string of alternates to break into alleles
|
||||
* @param lineNo the line number for this record
|
||||
* @return a list of alleles, and a pair of the shortest and longest sequence
|
||||
*/
|
||||
protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) {
|
||||
List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic
|
||||
// ref
|
||||
checkAllele(ref, true, lineNo);
|
||||
Allele refAllele = Allele.create(ref, true);
|
||||
alleles.add(refAllele);
|
||||
|
||||
if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split
|
||||
parseSingleAltAllele(alleles, alts, lineNo);
|
||||
else
|
||||
for ( String alt : alts.split(",") )
|
||||
parseSingleAltAllele(alleles, alt, lineNo);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* check to make sure the allele is an acceptable allele
|
||||
* @param allele the allele to check
|
||||
* @param isRef are we the reference allele?
|
||||
* @param lineNo the line number for this record
|
||||
*/
|
||||
private static void checkAllele(String allele, boolean isRef, int lineNo) {
|
||||
if ( allele == null || allele.length() == 0 )
|
||||
generateException("Empty alleles are not permitted in VCF records", lineNo);
|
||||
|
||||
if ( isSymbolicAllele(allele) ) {
|
||||
if ( isRef ) {
|
||||
generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo);
|
||||
}
|
||||
} else {
|
||||
// check for VCF3 insertions or deletions
|
||||
if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) )
|
||||
generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" +
|
||||
" convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo);
|
||||
|
||||
if (!Allele.acceptableAlleleBases(allele))
|
||||
generateException("Unparsable vcf record with allele " + allele, lineNo);
|
||||
|
||||
if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) )
|
||||
generateException("The reference allele cannot be missing", lineNo);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* return true if this is a symbolic allele (e.g. <SOMETAG>) otherwise false
|
||||
* @param allele the allele to check
|
||||
* @return true if the allele is a symbolic allele, otherwise false
|
||||
*/
|
||||
private static boolean isSymbolicAllele(String allele) {
|
||||
return (allele != null && allele.startsWith("<") && allele.endsWith(">") && allele.length() > 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* parse a single allele, given the allele list
|
||||
* @param alleles the alleles available
|
||||
* @param alt the allele to parse
|
||||
* @param lineNo the line number for this record
|
||||
*/
|
||||
private static void parseSingleAltAllele(List<Allele> alleles, String alt, int lineNo) {
|
||||
checkAllele(alt, false, lineNo);
|
||||
|
||||
Allele allele = Allele.create(alt, false);
|
||||
if ( ! allele.isNoCall() )
|
||||
alleles.add(allele);
|
||||
}
|
||||
|
||||
protected static boolean isSingleNucleotideEvent(List<Allele> alleles) {
|
||||
for ( Allele a : alleles ) {
|
||||
if ( a.length() != 1 )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
|
||||
boolean clipping = true;
|
||||
|
||||
for ( Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() )
|
||||
continue;
|
||||
|
||||
if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) {
|
||||
clipping = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (clipping) ? 1 : 0;
|
||||
}
|
||||
|
||||
    /**
     * Computes how many trailing bases can be clipped from every allele because they
     * match the reference suffix, without colliding with the forward-clipped prefix.
     *
     * NOTE(review): the three stop conditions below are order-sensitive, and
     * ref.getBytes() is re-evaluated per allele per round; left untouched.
     *
     * @param unclippedAlleles the alleles to inspect; symbolic alleles are skipped
     * @param ref              the reference allele string
     * @param forwardClipping  number of bases already clipped from the front
     * @param lineNo           the line number for this record (for error reporting)
     * @return the number of trailing bases to clip from every allele
     */
    protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
        int clipping = 0;
        boolean stillClipping = true;

        while ( stillClipping ) {
            for ( Allele a : unclippedAlleles ) {
                if ( a.isSymbolic() )
                    continue;

                // stop if further clipping would reach into the forward-clipped
                // prefix, or if forward clipping already consumed the whole allele
                if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
                    stillClipping = false;
                else if ( ref.length() == clipping )
                    // ran out of reference sequence while alleles still have bases
                    generateException("bad alleles encountered", lineNo);
                else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] )
                    stillClipping = false;
            }
            if ( stillClipping )
                clipping++;
        }

        return clipping;
    }
|
||||
/**
|
||||
* clip the alleles, based on the reference
|
||||
*
|
||||
* @param position the unadjusted start position (pre-clipping)
|
||||
* @param ref the reference string
|
||||
* @param unclippedAlleles the list of unclipped alleles
|
||||
* @param clippedAlleles output list of clipped alleles
|
||||
* @param lineNo the current line number in the file
|
||||
* @return the new reference end position of this event
|
||||
*/
|
||||
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
|
||||
|
||||
int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
|
||||
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
|
||||
|
||||
if ( clippedAlleles != null ) {
|
||||
for ( Allele a : unclippedAlleles ) {
|
||||
if ( a.isSymbolic() ) {
|
||||
clippedAlleles.add(a);
|
||||
} else {
|
||||
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the new reference length
|
||||
int refLength = ref.length() - reverseClipping;
|
||||
|
||||
return position+Math.max(refLength - 1,0);
|
||||
}
|
||||
|
||||
public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
|
||||
try {
|
||||
return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
|
||||
isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) ||
|
||||
isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
return false;
|
||||
} catch ( IOException e ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) {
|
||||
try {
|
||||
byte[] buff = new byte[MAGIC_HEADER_LINE.length()];
|
||||
int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length());
|
||||
boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes());
|
||||
return eq;
|
||||
// String firstLine = new String(buff);
|
||||
// return firstLine.startsWith(MAGIC_HEADER_LINE);
|
||||
} catch ( IOException e ) {
|
||||
return false;
|
||||
} catch ( RuntimeException e ) {
|
||||
return false;
|
||||
} finally {
|
||||
try { stream.close(); } catch ( IOException e ) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,456 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Immutable representation of an allele.
 *
 * NOTE(review): this is the GATK 1.3 implementation copied wholesale into the v13
 * package so benchmarks can compare it against the current classes; its behavior is
 * intentionally frozen -- document, don't modify.
 *
 * Types of alleles:
 *
 * Ref: a t C g a // C is the reference base
 *
 *    : a t G g a // C base is a G in some individuals
 *
 *    : a t - g a // C base is deleted w.r.t. the reference
 *
 *    : a t CAg a // A base is inserted w.r.t. the reference sequence
 *
 * In these cases, where are the alleles?
 *
 * SNP polymorphism of C/G  -> { C , G } -> C is the reference allele
 * 1 base deletion of C     -> { C , - } -> C is the reference allele
 * 1 base insertion of A    -> { - ; A } -> Null is the reference allele
 *
 * Suppose I see the following in the population:
 *
 * Ref: a t C g a // C is the reference base
 *    : a t G g a // C base is a G in some individuals
 *    : a t - g a // C base is deleted w.r.t. the reference
 *
 * How do I represent this?  There are three segregating alleles:
 *
 *  { C , G , - }
 *
 * Now suppose I have this more complex example:
 *
 * Ref: a t C g a // C is the reference base
 *    : a t - g a
 *    : a t - - a
 *    : a t CAg a
 *
 * There are actually four segregating alleles:
 *
 *   { C g , - g, - -, and CAg } over bases 2-4
 *
 * However, the molecular equivalence explicitly listed above is usually discarded, so the actual
 * segregating alleles are:
 *
 *   { C g, g, -, C a g }
 *
 * Critically, it should be possible to apply an allele to a reference sequence to create the
 * correct haplotype sequence:
 *
 * Allele + reference => haplotype
 *
 * For convenience, we are going to create Alleles where the GenomeLoc of the allele is stored outside of the
 * Allele object itself.  So there's an idea of an A/C polymorphism independent of its surrounding context.
 *
 * Given a list of alleles it's possible to determine the "type" of the variation
 *
 *      A / C @ loc => SNP
 *      - / A       => INDEL
 *
 * If you know which allele is the reference, you can determine whether the variant is an insertion or deletion.
 *
 * Allele also supports the concept of a NO_CALL allele.  This Allele represents a haplotype that couldn't be
 * determined.  This is usually represented by a '.' allele.
 *
 * Note that the class docs claim bases are stored in UPPER CASE; see the NOTE in the
 * constructor -- the upper-casing step is actually disabled in this version.
 *
 * @author ebanks, depristo
 */
class Allele implements Comparable<Allele> {
    private static final byte[] EMPTY_ALLELE_BASES = new byte[0];

    private boolean isRef = false;
    private boolean isNull = false;
    private boolean isNoCall = false;
    private boolean isSymbolic = false;

    private byte[] bases = null;

    public final static String NULL_ALLELE_STRING = "-";
    public final static String NO_CALL_STRING = ".";

    // no public way to create an allele
    private Allele(byte[] bases, boolean isRef) {
        // standardize our representation of null allele and bases
        if ( wouldBeNullAllele(bases) ) {
            bases = EMPTY_ALLELE_BASES;
            isNull = true;
        } else if ( wouldBeNoCallAllele(bases) ) {
            bases = EMPTY_ALLELE_BASES;
            isNoCall = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
        } else if ( wouldBeSymbolicAllele(bases) ) {
            isSymbolic = true;
            if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele");
        }
//        else
//            bases = new String(bases).toUpperCase().getBytes(); // todo -- slow performance
        // NOTE(review): the upper-casing above is disabled, so bases are stored
        // exactly as given; basesMatch(String) compensates by upper-casing its input.

        this.isRef = isRef;
        this.bases = bases;

        if ( ! acceptableAlleleBases(bases) )
            throw new IllegalArgumentException("Unexpected base in allele bases \'" + new String(bases)+"\'");
    }

    private Allele(String bases, boolean isRef) {
        this(bases.getBytes(), isRef);
    }

    // shared single-base constants so create() can avoid allocation in the common case
    private final static Allele REF_A = new Allele("A", true);
    private final static Allele ALT_A = new Allele("A", false);
    private final static Allele REF_C = new Allele("C", true);
    private final static Allele ALT_C = new Allele("C", false);
    private final static Allele REF_G = new Allele("G", true);
    private final static Allele ALT_G = new Allele("G", false);
    private final static Allele REF_T = new Allele("T", true);
    private final static Allele ALT_T = new Allele("T", false);
    private final static Allele REF_N = new Allele("N", true);
    private final static Allele ALT_N = new Allele("N", false);
    private final static Allele REF_NULL = new Allele(NULL_ALLELE_STRING, true);
    private final static Allele ALT_NULL = new Allele(NULL_ALLELE_STRING, false);
    /** A generic static NO_CALL allele for use */
    public final static Allele NO_CALL = new Allele(NO_CALL_STRING, false);

    // ---------------------------------------------------------------------------------------------------------
    //
    // creation routines
    //
    // ---------------------------------------------------------------------------------------------------------

    /**
     * Create a new Allele that includes bases and is tagged as the reference allele if isRef == true.  If bases
     * == '-', a Null allele is created.  If bases == '.', a no call Allele is created.
     *
     * @param bases the DNA sequence of this variation, '-', or '.'
     * @param isRef should we make this a reference allele?
     * @throws IllegalArgumentException if bases contains illegal characters or is otherwise malformatted
     */
    public static Allele create(byte[] bases, boolean isRef) {
        if ( bases == null )
            throw new IllegalArgumentException("create: the Allele base string cannot be null; use new Allele() or new Allele(\"\") to create a Null allele");

        if ( bases.length == 1 ) {
            // optimization to return a static constant Allele for each single base object
            switch (bases[0]) {
                case '.':
                    if ( isRef ) throw new IllegalArgumentException("Cannot tag a NoCall allele as the reference allele");
                    return NO_CALL;
                case '-': return isRef ? REF_NULL : ALT_NULL;
                case 'A': case 'a' : return isRef ? REF_A : ALT_A;
                case 'C': case 'c' : return isRef ? REF_C : ALT_C;
                case 'G': case 'g' : return isRef ? REF_G : ALT_G;
                case 'T': case 't' : return isRef ? REF_T : ALT_T;
                case 'N': case 'n' : return isRef ? REF_N : ALT_N;
                default: throw new IllegalArgumentException("Illegal base: " + (char)bases[0]);
            }
        } else {
            return new Allele(bases, isRef);
        }
    }

    /** @see #create(byte[], boolean) -- single-base convenience overload */
    public static Allele create(byte base, boolean isRef) {
        return create( new byte[]{ base }, isRef);
    }

    /** Creates a non-Ref single-base allele. */
    public static Allele create(byte base) {
        return create( base, false );
    }

    /**
     * Returns a new allele consisting of left's bases followed by right.
     *
     * @param left  the allele to extend (must not be symbolic)
     * @param right the bases to append
     * @return a new Allele with the concatenated bases and left's reference status
     */
    public static Allele extend(Allele left, byte[] right) {
        if (left.isSymbolic())
            throw new IllegalArgumentException("Cannot extend a symbolic allele");
        byte[] bases = null;
        if ( left.length() == 0 )
            bases = right;
        else {
            bases = new byte[left.length() + right.length];
            System.arraycopy(left.getBases(), 0, bases, 0, left.length());
            System.arraycopy(right, 0, bases, left.length(), right.length);
        }

        return create(bases, left.isReference());
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent the null allele ('-' or empty)
     */
    public static boolean wouldBeNullAllele(byte[] bases) {
        return (bases.length == 1 && bases[0] == '-') || bases.length == 0;
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent the NO_CALL allele ('.')
     */
    public static boolean wouldBeNoCallAllele(byte[] bases) {
        return bases.length == 1 && bases[0] == '.';
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent a symbolic allele (&lt;TAG&gt;)
     */
    public static boolean wouldBeSymbolicAllele(byte[] bases) {
        return bases.length > 2 && bases[0] == '<' && bases[bases.length-1] == '>';
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent a well formatted allele
     */
    public static boolean acceptableAlleleBases(String bases) {
        return acceptableAlleleBases(bases.getBytes());
    }

    /**
     * @param bases bases representing an allele
     * @return true if the bases represent a well formatted allele (null/no-call/symbolic, or only ACGTN in either case)
     */
    public static boolean acceptableAlleleBases(byte[] bases) {
        if ( wouldBeNullAllele(bases) || wouldBeNoCallAllele(bases) || wouldBeSymbolicAllele(bases) )
            return true;

        for ( int i = 0; i < bases.length; i++ ) {
            switch (bases[i]) {
                case 'A': case 'C': case 'G': case 'T': case 'N' : case 'a': case 'c': case 'g': case 't': case 'n' :
                    break;
                default:
                    return false;
            }
        }

        return true;
    }

    /**
     * @see Allele(byte[], boolean)
     *
     * @param bases bases representing an allele
     * @param isRef is this the reference allele?
     */
    public static Allele create(String bases, boolean isRef) {
        return create(bases.getBytes(), isRef);
    }


    /**
     * Creates a non-Ref allele.  @see Allele(byte[], boolean) for full information
     *
     * @param bases bases representing an allele
     */
    public static Allele create(String bases) {
        return create(bases, false);
    }

    /**
     * Creates a non-Ref allele.  @see Allele(byte[], boolean) for full information
     *
     * @param bases bases representing an allele
     */
    public static Allele create(byte[] bases) {
        return create(bases, false);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // accessor routines
    //
    // ---------------------------------------------------------------------------------------------------------

    // Returns true if this is the null allele
    public boolean isNull()             { return isNull; }
    // Returns true if this is not the null allele
    public boolean isNonNull()          { return ! isNull(); }

    // Returns true if this is the NO_CALL allele
    public boolean isNoCall()           { return isNoCall; }
    // Returns true if this is not the NO_CALL allele
    public boolean isCalled()           { return ! isNoCall(); }

    // Returns true if this Allele is the reference allele
    public boolean isReference()        { return isRef; }
    // Returns true if this Allele is not the reference allele
    public boolean isNonReference()     { return ! isReference(); }

    // Returns true if this Allele is symbolic (i.e. no well-defined base sequence)
    public boolean isSymbolic()         { return isSymbolic; }

    // Returns a nice string representation of this object; reference alleles get a '*' suffix
    public String toString() {
        return (isNull() ? NULL_ALLELE_STRING : ( isNoCall() ? NO_CALL_STRING : getDisplayString() )) + (isReference() ? "*" : "");
    }

    /**
     * Return the DNA bases segregating in this allele.  Note this isn't reference polarized,
     * so the Null allele is represented by a vector of length 0
     *
     * @return the segregating bases
     */
    public byte[] getBases() { return isSymbolic ? EMPTY_ALLELE_BASES : bases; }

    /**
     * Return the DNA bases segregating in this allele in String format.
     * This is useful, because toString() adds a '*' to reference alleles and getBases() returns garbage when you call toString() on it.
     *
     * @return the segregating bases
     */
    public String getBaseString() { return new String(getBases()); }

    /**
     * Return the printed representation of this allele.
     * Same as getBaseString(), except for symbolic alleles.
     * For symbolic alleles, the base string is empty while the display string contains &lt;TAG&gt;.
     *
     * @return the allele string representation
     */
    public String getDisplayString() { return new String(bases); }

    /**
     * @param other  the other allele
     *
     * @return true if these alleles are equal
     */
    public boolean equals(Object other) {
        return ( ! (other instanceof Allele) ? false : equals((Allele)other, false) );
    }

    /**
     * @return hash code derived from the bases (position-weighted sum)
     */
    public int hashCode() {
        int hash = 1;
        for (int i = 0; i < bases.length; i++)
            hash += (i+1) * bases[i];
        return hash;
    }

    /**
     * Returns true if this and other are equal.  If ignoreRefState is true, then doesn't require both alleles
     * to have the same ref tag.
     *
     * NOTE(review): isSymbolic is not compared directly; symbolic alleles keep their
     * tag text in bases, so the bases comparison distinguishes them in practice.
     *
     * @param other            allele to compare to
     * @param ignoreRefState   if true, ignore ref state in comparison
     * @return true if this and other are equal
     */
    public boolean equals(Allele other, boolean ignoreRefState) {
        return this == other || (isRef == other.isRef || ignoreRefState) && isNull == other.isNull && isNoCall == other.isNoCall && (bases == other.bases || Arrays.equals(bases, other.bases));
    }

    /**
     * @param test  bases to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(byte[] test) { return !isSymbolic && (bases == test || Arrays.equals(bases, test)); }

    /**
     * @param test  bases to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(String test) { return basesMatch(test.toUpperCase().getBytes()); }

    /**
     * @param test  allele to test against
     *
     * @return true if this Allele contains the same bases as test, regardless of its reference status; handles Null and NO_CALL alleles
     */
    public boolean basesMatch(Allele test) { return basesMatch(test.getBases()); }

    /**
     * @return the length of this allele.  Null and NO_CALL alleles have 0 length.
     */
    public int length() {
        return isSymbolic ? 0 : bases.length;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // useful static functions
    //
    // ---------------------------------------------------------------------------------------------------------

    /** @see #getMatchingAllele(Collection, byte[]) */
    public static Allele getMatchingAllele(Collection<Allele> allAlleles, String alleleBases) {
        return getMatchingAllele(allAlleles, alleleBases.getBytes());
    }

    /**
     * Finds the allele in allAlleles whose bases match alleleBases.
     *
     * @param allAlleles  alleles to search
     * @param alleleBases bases to look for
     * @return the matching allele, NO_CALL for no-call bases, or null if not found
     */
    public static Allele getMatchingAllele(Collection<Allele> allAlleles, byte[] alleleBases) {
        for ( Allele a : allAlleles ) {
            if ( a.basesMatch(alleleBases) ) {
                return a;
            }
        }

        if ( wouldBeNoCallAllele(alleleBases) )
            return NO_CALL;
        else
            return null;    // couldn't find anything
    }

    /**
     * Maps each allele string to its Allele object drawn from possibleAlleles.
     *
     * @param possibleAlleles the candidate alleles
     * @param alleleStrings   the allele strings to resolve
     * @return the resolved alleles, in the order of alleleStrings
     * @throws IllegalArgumentException if a non-no-call string has no match
     */
    public static List<Allele> resolveAlleles(List<Allele> possibleAlleles, List<String> alleleStrings) {
        List<Allele> myAlleles = new ArrayList<Allele>(alleleStrings.size());

        for ( String alleleString : alleleStrings ) {
            Allele allele = getMatchingAllele(possibleAlleles, alleleString);

            if ( allele == null ) {
                if ( Allele.wouldBeNoCallAllele(alleleString.getBytes()) ) {
                    allele = create(alleleString);
                } else {
                    throw new IllegalArgumentException("Allele " + alleleString + " not present in the list of alleles " + possibleAlleles);
                }
            }

            myAlleles.add(allele);
        }

        return myAlleles;
    }

    // reference alleles sort before alternates; ties broken by base string
    public int compareTo(Allele other) {
        if ( isReference() && other.isNonReference() )
            return -1;
        else if ( isNonReference() && other.isReference() )
            return 1;
        else
            return getBaseString().compareTo(other.getBaseString()); // todo -- potential performance issue
    }

    /**
     * @return true if either allele is null, or the shorter allele's bases are a
     *         prefix of the longer allele's bases
     */
    public static boolean oneIsPrefixOfOther(Allele a1, Allele a2) {
        if ( a1.isNull() || a2.isNull() )
            return true;

        if ( a2.length() >= a1.length() )
            return firstIsPrefixOfSecond(a1, a2);
        else
            return firstIsPrefixOfSecond(a2, a1);
    }

    // precondition (enforced by the caller above): a1 is no longer than a2
    private static boolean firstIsPrefixOfSecond(Allele a1, Allele a2) {
        String a1String = a1.getBaseString();
        return a2.getBaseString().substring(0, a1String.length()).equals(a1String);
    }
}
|
||||
|
|
@ -0,0 +1,349 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This class encompasses all the basic information about a genotype. It is immutable.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
*/
|
||||
public class Genotype {
|
||||
|
||||
    // VCF genotype separators: '|' marks a phased genotype, '/' an unphased one
    public final static String PHASED_ALLELE_SEPARATOR = "|";
    public final static String UNPHASED_ALLELE_SEPARATOR = "/";

    // per-sample metadata: name, -log10(P(error)), filters, and attributes
    protected InferredGeneticContext commonInfo;
    public final static double NO_NEG_LOG_10PERROR = InferredGeneticContext.NO_NEG_LOG_10PERROR;
    // the called alleles; null means the genotype is UNAVAILABLE
    protected List<Allele> alleles = null; // new ArrayList<Allele>();
    // lazily computed by determineType() and cached by getType()
    protected Type type = null;

    protected boolean isPhased = false;
    // distinguishes "no filters applied" (constructor filters == null) from "filters applied but empty"
    protected boolean filtersWereAppliedToContext;
|
||||
|
||||
    /** Creates a Genotype without explicit likelihoods (delegates with log10Likelihoods == null). */
    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased) {
        this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null);
    }
|
||||
|
||||
    /**
     * Fully-specified constructor.
     *
     * @param sampleName       the sample this genotype belongs to
     * @param alleles          called alleles; null creates an UNAVAILABLE genotype
     * @param negLog10PError   confidence in the call, as -log10(P(error))
     * @param filters          applied filters; null means filters were never applied
     * @param attributes       extra key/value annotations; may be null
     * @param isPhased         whether the alleles are phased
     * @param log10Likelihoods log10 likelihoods in standard VCF order, or null if unknown
     */
    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean isPhased, double[] log10Likelihoods) {
        if ( alleles != null )
            this.alleles = Collections.unmodifiableList(alleles);
        commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes);
        if ( log10Likelihoods != null )
            // likelihoods are stored as a PL-style attribute rather than a dedicated field
            commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods));
        filtersWereAppliedToContext = filters != null;
        this.isPhased = isPhased;
        validate();
    }
|
||||
|
||||
    /**
     * Creates a new Genotype for sampleName with genotype according to alleles.
     * @param sampleName the sample this genotype belongs to
     * @param alleles the called alleles
     * @param negLog10PError the confidence in these alleles
     * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known
     */
    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError, double[] log10Likelihoods) {
        this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods);
    }
|
||||
|
||||
    /** Creates an unfiltered, unphased, attribute-free Genotype with the given confidence. */
    public Genotype(String sampleName, List<Allele> alleles, double negLog10PError) {
        this(sampleName, alleles, negLog10PError, null, null, false);
    }
|
||||
|
||||
    /** Creates a Genotype with no error estimate (NO_NEG_LOG_10PERROR). */
    public Genotype(String sampleName, List<Allele> alleles) {
        this(sampleName, alleles, NO_NEG_LOG_10PERROR, null, null, false);
    }
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Partial-cloning routines (because Genotype is immutable).
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
    /** Returns a copy of g with the sample name replaced (Genotype is immutable). */
    public static Genotype modifyName(Genotype g, String name) {
        return new Genotype(name, g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
    }
|
||||
|
||||
    /** Returns a copy of g with the attribute map replaced (Genotype is immutable). */
    public static Genotype modifyAttributes(Genotype g, Map<String, Object> attributes) {
        return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased());
    }
|
||||
|
||||
    /** Returns a copy of g with the allele list replaced (Genotype is immutable). */
    public static Genotype modifyAlleles(Genotype g, List<Allele> alleles) {
        return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, g.getAttributes(), g.isPhased());
    }
|
||||
|
||||
    /**
     * @return the alleles for this genotype (an unmodifiable list; null if UNAVAILABLE)
     */
    public List<Allele> getAlleles() {
        return alleles;
    }
|
||||
|
||||
    /**
     * Returns all of this genotype's alleles that equal the given allele.
     *
     * @param allele the allele to match
     * @return an unmodifiable list of the matching alleles (possibly empty)
     * @throws ReviewedStingException if the genotype is UNAVAILABLE
     */
    public List<Allele> getAlleles(Allele allele) {
        if ( getType() == Type.UNAVAILABLE )
            throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");

        List<Allele> al = new ArrayList<Allele>();
        for ( Allele a : alleles )
            if ( a.equals(allele) )
                al.add(a);

        return Collections.unmodifiableList(al);
    }
|
||||
|
||||
    /**
     * @param i index of the allele within this genotype
     * @return the i-th called allele
     * @throws ReviewedStingException if the genotype is UNAVAILABLE
     */
    public Allele getAllele(int i) {
        if ( getType() == Type.UNAVAILABLE )
            throw new ReviewedStingException("Requesting alleles for an UNAVAILABLE genotype");
        return alleles.get(i);
    }
|
||||
|
||||
    /** @return true if this genotype's alleles are phased */
    public boolean isPhased() { return isPhased; }
|
||||
|
||||
    /**
     * @return the ploidy of this genotype (the number of called alleles)
     * @throws ReviewedStingException if the genotype is UNAVAILABLE (alleles == null)
     */
    public int getPloidy() {
        if ( alleles == null )
            throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype");
        return alleles.size();
    }
|
||||
|
||||
    /** The overall call category of a genotype, derived from its alleles by determineType(). */
    public enum Type {
        NO_CALL,     // every allele is a no-call
        HOM_REF,     // all alleles called and identical to the reference allele
        HET,         // at least two distinct called alleles
        HOM_VAR,     // all alleles called, identical, and non-reference
        UNAVAILABLE, // the allele list itself is unknown (null)
        MIXED // no-call and call in the same genotype
    }
|
||||
|
||||
    /** @return the Type of this genotype, computing and caching it on first use */
    public Type getType() {
        if ( type == null ) {
            type = determineType();
        }
        return type;
    }
|
||||
|
||||
    /**
     * Classifies this genotype by a single pass over its alleles.
     *
     * @return UNAVAILABLE if alleles is null; NO_CALL if every allele is a no-call;
     *         MIXED if both no-calls and calls are present; otherwise HET when two
     *         distinct alleles were seen, else HOM_REF/HOM_VAR by reference status
     * @throws ReviewedStingException if the allele list is non-null but empty
     */
    protected Type determineType() {
        if ( alleles == null )
            return Type.UNAVAILABLE;

        boolean sawNoCall = false, sawMultipleAlleles = false;
        // the first called (non-no-call) allele seen, used as the comparison baseline
        Allele observedAllele = null;

        for ( Allele allele : alleles ) {
            if ( allele.isNoCall() )
                sawNoCall = true;
            else if ( observedAllele == null )
                observedAllele = allele;
            else if ( !allele.equals(observedAllele) )
                sawMultipleAlleles = true;
        }

        if ( sawNoCall ) {
            if ( observedAllele == null )
                return Type.NO_CALL;
            return Type.MIXED;
        }

        if ( observedAllele == null )
            throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null");

        return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR;
    }
|
||||
|
||||
    /**
     * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false.
     */
    public boolean isHom()    { return isHomRef() || isHomVar(); }

    /**
     * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false.
     */
    public boolean isHomRef() { return getType() == Type.HOM_REF; }

    /**
     * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false.
     */
    public boolean isHomVar() { return getType() == Type.HOM_VAR; }

    /**
     * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false.
     */
    public boolean isHet() { return getType() == Type.HET; }

    /**
     * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false.
     */
    public boolean isNoCall() { return getType() == Type.NO_CALL; }

    /**
     * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are).
     */
    public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }

    /**
     * @return true if this genotype is comprised of both calls and no-calls.
     */
    public boolean isMixed() { return getType() == Type.MIXED; }

    /**
     * @return true if the type of this genotype is set (i.e. not UNAVAILABLE).
     */
    public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }
|
||||
|
||||
//
|
||||
// Useful methods for getting genotype likelihoods for a genotype object, if present
|
||||
//
|
||||
public boolean hasLikelihoods() {
|
||||
return (hasAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4)) ||
|
||||
(hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !getAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).equals(VCFConstants.MISSING_VALUE_v4));
|
||||
}
|
||||
|
||||
public GenotypeLikelihoods getLikelihoods() {
|
||||
GenotypeLikelihoods x = getLikelihoods(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, true);
|
||||
if ( x != null )
|
||||
return x;
|
||||
else {
|
||||
x = getLikelihoods(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, false);
|
||||
if ( x != null ) return x;
|
||||
else
|
||||
throw new IllegalStateException("BUG: genotype likelihood field in " + this.getSampleName() + " sample are not either a string or a genotype likelihood class!");
|
||||
}
|
||||
}
|
||||
|
||||
private GenotypeLikelihoods getLikelihoods(String key, boolean asPL) {
|
||||
Object x = getAttribute(key);
|
||||
if ( x instanceof String ) {
|
||||
if ( asPL )
|
||||
return GenotypeLikelihoods.fromPLField((String)x);
|
||||
else
|
||||
return GenotypeLikelihoods.fromGLField((String)x);
|
||||
}
|
||||
else if ( x instanceof GenotypeLikelihoods ) return (GenotypeLikelihoods)x;
|
||||
else return null;
|
||||
}
|
||||
|
||||
public void validate() {
|
||||
if ( alleles == null ) return;
|
||||
if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");
|
||||
|
||||
// int nNoCalls = 0;
|
||||
for ( Allele allele : alleles ) {
|
||||
if ( allele == null )
|
||||
throw new IllegalArgumentException("BUG: allele cannot be null in Genotype");
|
||||
// nNoCalls += allele.isNoCall() ? 1 : 0;
|
||||
}
|
||||
|
||||
// Technically, the spec does allow for the below case so this is not an illegal state
|
||||
//if ( nNoCalls > 0 && nNoCalls != alleles.size() )
|
||||
// throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
|
||||
}
|
||||
|
||||
/**
 * @return the string rendering of this genotype's alleles using base strings only
 *         (ref-state markers ignored), or null when alleles are unavailable
 */
public String getGenotypeString() {
    return getGenotypeString(true);
}
|
||||
|
||||
/**
 * Renders this genotype's alleles as a single separator-joined string.
 *
 * @param ignoreRefState if true, emit only the bases of each allele (drop the ref marker);
 *                       if false, emit full Allele renderings
 * @return the joined allele string, or null if no allele list is available
 */
public String getGenotypeString(boolean ignoreRefState) {
    if ( alleles == null )
        return null;

    // Notes:
    // 1. Make sure to use the appropriate separator depending on whether the genotype is phased
    // 2. If ignoreRefState is true, then we want just the bases of the Alleles (ignoring the '*' indicating a ref Allele)
    // 3. So that everything is deterministic with regards to integration tests, we sort Alleles (when the genotype isn't phased, of course)
    return ParsingUtils.join(isPhased() ? PHASED_ALLELE_SEPARATOR : UNPHASED_ALLELE_SEPARATOR,
            ignoreRefState ? getAlleleStrings() : (isPhased() ? getAlleles() : ParsingUtils.sortList(getAlleles())));
}
|
||||
|
||||
private List<String> getAlleleStrings() {
|
||||
List<String> al = new ArrayList<String>();
|
||||
for ( Allele a : alleles )
|
||||
al.add(a.getBaseString());
|
||||
|
||||
return al;
|
||||
}
|
||||
|
||||
/**
 * Debug rendering: "[sample alleles Qqual {attrs}]".
 * Q == -10 is the phred-scaled form of the "no error estimate" sentinel
 * (negLog10PError == -1.0 × 10), rendered as "." (missing).
 */
public String toString() {
    int Q = (int)Math.round(getPhredScaledQual());
    return String.format("[%s %s Q%s %s]", getSampleName(), getGenotypeString(false),
            Q == -10 ? "." : String.format("%2d",Q), sortedString(getAttributes()));
}
|
||||
|
||||
/** @return a compact "alleles:Qqual" rendering of this genotype */
public String toBriefString() {
    return String.format("%s:Q%.2f", getGenotypeString(false), getPhredScaledQual());
}
|
||||
|
||||
/**
 * Compares this genotype's alleles to another's, ignoring phase (allele order).
 *
 * @param other the genotype to compare against
 * @return true if both have the same ploidy and the same alleles
 */
public boolean sameGenotype(Genotype other) {
    return sameGenotype(other, true);
}
|
||||
|
||||
public boolean sameGenotype(Genotype other, boolean ignorePhase) {
|
||||
if (getPloidy() != other.getPloidy())
|
||||
return false; // gotta have the same number of allele to be equal
|
||||
|
||||
// By default, compare the elements in the lists of alleles, element-by-element
|
||||
Collection<Allele> thisAlleles = this.getAlleles();
|
||||
Collection<Allele> otherAlleles = other.getAlleles();
|
||||
|
||||
if (ignorePhase) { // do not care about order, only identity of Alleles
|
||||
thisAlleles = new TreeSet<Allele>(thisAlleles); //implemented Allele.compareTo()
|
||||
otherAlleles = new TreeSet<Allele>(otherAlleles);
|
||||
}
|
||||
|
||||
return thisAlleles.equals(otherAlleles);
|
||||
}
|
||||
|
||||
/**
 * A utility method for generating a sorted string from a map's key set.
 *
 * @param c   the map
 * @param <T> the key type
 * @param <V> the value type
 * @return a string, enclosed in {}, with comma-separated key=value pairs in order of the keys
 */
private static <T extends Comparable<T>, V> String sortedString(Map<T, V> c) {
    // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS
    List<T> t = new ArrayList<T>(c.keySet());
    Collections.sort(t);

    List<String> pairs = new ArrayList<String>();
    for (T k : t) {
        pairs.add(k + "=" + c.get(k));
    }

    return "{" + ParsingUtils.join(", ", pairs.toArray(new String[pairs.size()])) + "}";
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
//
// get routines to access context info fields
//
// All of these delegate to commonInfo, except filtersWereApplied() which reads a local flag.
//
// ---------------------------------------------------------------------------------------------------------
public String getSampleName() { return commonInfo.getName(); }
public Set<String> getFilters() { return commonInfo.getFilters(); }
public boolean isFiltered() { return commonInfo.isFiltered(); }
public boolean isNotFiltered() { return commonInfo.isNotFiltered(); }
public boolean filtersWereApplied() { return filtersWereAppliedToContext; }
public boolean hasNegLog10PError() { return commonInfo.hasNegLog10PError(); }
public double getNegLog10PError() { return commonInfo.getNegLog10PError(); }
public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); }

// Attribute accessors; typed variants fall back to defaultValue when the key is absent.
public Map<String, Object> getAttributes() { return commonInfo.getAttributes(); }
public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); }
public Object getAttribute(String key) { return commonInfo.getAttribute(key); }

public Object getAttribute(String key, Object defaultValue) {
    return commonInfo.getAttribute(key, defaultValue);
}

public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
|
||||
}
|
||||
|
|
@ -0,0 +1,196 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Holder for a genotype's likelihoods, kept lazily in one or both of two
 * representations: a vector of log10 probabilities, and the phred-scaled PL
 * string used in VCF. Either may be supplied at construction; the other is
 * derived on demand.
 *
 * NOTE(review): this is the frozen v1.3 copy (package ...variantcontext.v13)
 * kept for benchmarking — behavior should not be changed.
 */
public class GenotypeLikelihoods {
    // When true, PL values are capped at PL_CAP during string conversion (currently disabled).
    public static final boolean CAP_PLS = false;
    public static final int PL_CAP = 255;

    //
    // There are two objects here because we are lazy in creating both representations
    // for this object: a vector of log10 Probs and the PL phred-scaled string. Supports
    // having one set during initializating, and dynamic creation of the other, if needed
    //
    private double[] log10Likelihoods = null;
    private String likelihoodsAsString_PLs = null;

    /** Factory: wrap a VCF PL field (phred-scaled integer string, e.g. "0,30,300"). */
    public final static GenotypeLikelihoods fromPLField(String PLs) {
        return new GenotypeLikelihoods(PLs);
    }

    /** Factory: parse a deprecated VCF GL field (comma-separated log10 doubles). */
    public final static GenotypeLikelihoods fromGLField(String GLs) {
        return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
    }

    /** Factory: wrap an already-computed vector of log10 likelihoods. */
    public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) {
        return new GenotypeLikelihoods(log10Likelihoods);
    }

    //
    // You must use the factory methods now
    //
    protected GenotypeLikelihoods(String asString) {
        likelihoodsAsString_PLs = asString;
    }

    protected GenotypeLikelihoods(double[] asVector) {
        log10Likelihoods = asVector;
    }

    /**
     * Returns the genotypes likelihoods in negative log10 vector format. pr{AA} = x, this
     * vector returns math.log10(x) for each of the genotypes. Can return null if the
     * genotype likelihoods are "missing".
     *
     * @return the log10 likelihood vector, or null when the PL string was the missing marker
     */
    public double[] getAsVector() {
        // assumes one of the likelihoods vector or the string isn't null
        if ( log10Likelihoods == null ) {
            // make sure we create the GL string if it doesn't already exist
            log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs);
        }

        return log10Likelihoods;
    }

    public String toString() {
        return getAsString();
    }

    /**
     * Returns (lazily computing) the PL-string representation.
     * @throws TribbleException when neither representation was ever set
     */
    public String getAsString() {
        if ( likelihoodsAsString_PLs == null ) {
            // todo -- should we accept null log10Likelihoods and set PLs as MISSING?
            if ( log10Likelihoods == null )
                throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!");
            likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods);
        }

        return likelihoodsAsString_PLs;
    }

    //Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
    //Returns null in case of missing likelihoods
    // NOTE(review): indexing uses Genotype.Type.ordinal()-1, i.e. it relies on HOM_REF/HET/HOM_VAR
    // being the enum constants immediately after NO_CALL and on the vector being in AA,AB,BB order
    // — assumes a diploid, biallelic likelihood vector; confirm before reuse elsewhere.
    public EnumMap<Genotype.Type,Double> getAsMap(boolean normalizeFromLog10){
        //Make sure that the log10likelihoods are set
        double[] likelihoods = normalizeFromLog10 ? MathUtils.normalizeFromLog10(getAsVector()) : getAsVector();
        if(likelihoods == null)
            return null;
        EnumMap<Genotype.Type,Double> likelihoodsMap = new EnumMap<Genotype.Type, Double>(Genotype.Type.class);
        likelihoodsMap.put(Genotype.Type.HOM_REF,likelihoods[Genotype.Type.HOM_REF.ordinal()-1]);
        likelihoodsMap.put(Genotype.Type.HET,likelihoods[Genotype.Type.HET.ordinal()-1]);
        likelihoodsMap.put(Genotype.Type.HOM_VAR, likelihoods[Genotype.Type.HOM_VAR.ordinal() - 1]);
        return likelihoodsMap;
    }

    //Return the neg log10 Genotype Quality (GQ) for the given genotype
    //Returns Double.NEGATIVE_INFINITY in case of missing genotype
    public double getNegLog10GQ(Genotype.Type genotype){

        double qual = Double.NEGATIVE_INFINITY;
        EnumMap<Genotype.Type,Double> likelihoods = getAsMap(false);
        if(likelihoods == null)
            return qual;
        // Find the best likelihood among the OTHER genotypes
        for(Map.Entry<Genotype.Type,Double> likelihood : likelihoods.entrySet()){
            if(likelihood.getKey() == genotype)
                continue;
            if(likelihood.getValue() > qual)
                qual = likelihood.getValue();

        }

        //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best)
        qual = likelihoods.get(genotype) - qual;

        //Quality of other genotypes 1-P(G)
        // (negative difference means the requested genotype is not the most likely one)
        if (qual < 0) {
            double[] normalized = MathUtils.normalizeFromLog10(getAsVector());
            double chosenGenotype = normalized[genotype.ordinal()-1];
            qual = -1.0 * Math.log10(1.0 - chosenGenotype);
        }
        return qual;
    }

    // Parses a PL string ("0,30,300") into log10 likelihoods (each value / -10.0);
    // returns null for the VCF missing marker.
    private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
        if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
            String[] strings = likelihoodsAsString_PLs.split(",");
            double[] likelihoodsAsVector = new double[strings.length];
            for ( int i = 0; i < strings.length; i++ ) {
                likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0;
            }
            return likelihoodsAsVector;
        } else
            return null;
    }

    /**
     * Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format
     * @param GLString comma-separated log10 doubles, or the VCF missing marker
     * @return parsed vector, or null when missing
     */
    private final static double[] parseDeprecatedGLString(String GLString) {
        if ( !GLString.equals(VCFConstants.MISSING_VALUE_v4) ) {
            String[] strings = GLString.split(",");
            double[] likelihoodsAsVector = new double[strings.length];
            for ( int i = 0; i < strings.length; i++ ) {
                likelihoodsAsVector[i] = Double.parseDouble(strings[i]);
            }
            return likelihoodsAsVector;
        }

        return null;
    }

    // Converts log10 likelihoods to a PL string: shift so the best genotype gets PL 0,
    // then phred-scale and round each entry. A null vector renders as the missing marker.
    private final static String convertLikelihoodsToPLString(double[] GLs) {
        if ( GLs == null )
            return VCFConstants.MISSING_VALUE_v4;

        StringBuilder s = new StringBuilder();

        // adjust = max log10 likelihood, so the best genotype normalizes to PL 0
        double adjust = Double.NEGATIVE_INFINITY;
        for ( double l : GLs ) adjust = Math.max(adjust, l);

        boolean first = true;
        for ( double l : GLs ) {
            if ( ! first )
                s.append(",");
            else
                first = false;

            long PL = Math.round(-10 * (l - adjust));
            if ( CAP_PLS )
                PL = Math.min(PL, PL_CAP);
            s.append(Long.toString(PL));
        }

        return s.toString();
    }
}
|
||||
|
|
@ -0,0 +1,143 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broad.tribble.Tribble;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.index.DynamicIndexCreator;
|
||||
import org.broad.tribble.index.Index;
|
||||
import org.broad.tribble.index.IndexFactory;
|
||||
import org.broad.tribble.util.LittleEndianOutputStream;
|
||||
import org.broad.tribble.util.PositionalStream;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
 * this class writes VCF files
 *
 * Base class that layers optional on-the-fly Tribble index creation over an output
 * stream; subclasses are responsible for actually serializing headers and records.
 * NOTE(review): frozen v1.3 copy kept for benchmarking — do not modify behavior.
 */
abstract class IndexingVCFWriter implements VCFWriter {
    final private String name;                       // human-readable stream name for error messages
    private final SAMSequenceDictionary refDict;     // sequence dictionary embedded into the index

    // When indexing is enabled, outputStream is wrapped in positionalStream so the
    // indexer can record byte offsets; otherwise it is the raw output stream.
    private OutputStream outputStream;
    private PositionalStream positionalStream = null;
    private DynamicIndexCreator indexer = null;
    private LittleEndianOutputStream idxStream = null;

    @Requires({"name != null",
            "! ( location == null && output == null )",
            "! ( enableOnTheFlyIndexing && location == null )"})
    protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
        outputStream = output;
        this.name = name;
        this.refDict = refDict;

        if ( enableOnTheFlyIndexing ) {
            try {
                idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
                //System.out.println("Creating index on the fly for " + location);
                indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
                indexer.initialize(location, indexer.defaultBinSize());
                positionalStream = new PositionalStream(output);
                outputStream = positionalStream;
            } catch ( IOException ex ) {
                // No matter what we keep going, since we don't care if we can't create the index file
                idxStream = null;
                indexer = null;
                positionalStream = null;
            }
        }
    }

    @Ensures("result != null")
    public OutputStream getOutputStream() {
        return outputStream;
    }

    @Ensures("result != null")
    public String getStreamName() {
        return name;
    }

    public abstract void writeHeader(VCFHeader header);

    /**
     * attempt to close the VCF file
     *
     * NOTE(review): only the index stream is finalized/closed here; outputStream itself is
     * never closed, and idxStream.close() is not in a finally block — presumably subclasses
     * or callers own the data stream's lifetime; confirm before relying on this.
     */
    public void close() {
        // try to close the index stream (keep it separate to help debugging efforts)
        if ( indexer != null ) {
            try {
                Index index = indexer.finalizeIndex(positionalStream.getPosition());
                IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
                index.write(idxStream);
                idxStream.close();
            } catch (IOException e) {
                throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
            }
        }
    }

    /**
     * add a record to the file
     *
     * Only feeds the indexer with the record's current byte offset; the record's bytes
     * themselves are not written here (presumably done by subclass overrides — verify).
     *
     * @param vc      the Variant Context object
     */
    public void add(VariantContext vc) {
        // if we are doing on the fly indexing, add the record ***before*** we write any bytes
        if ( indexer != null )
            indexer.addFeature(vc, positionalStream.getPosition());
    }

    /**
     * Returns a reasonable "name" for this writer, to display to the user if something goes wrong
     *
     * @param location the output file, may be null
     * @param stream   the output stream, used when location is null
     * @return the file's absolute path, or the stream's toString()
     */
    protected static final String writerName(final File location, final OutputStream stream) {
        return location == null ? stream.toString() : location.getAbsolutePath();
    }

    /**
     * Returns a output stream writing to location, or throws a UserException if this fails
     * @param location the file to open for writing
     * @return a fresh FileOutputStream on location
     */
    protected static OutputStream openOutputStream(final File location) {
        try {
            return new FileOutputStream(location);
        } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
        }
    }
}
|
||||
|
|
@ -0,0 +1,243 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
 * Common utility routines for VariantContext and Genotype
 *
 * Mutable holder for the fields shared by both: a name, a -log10 error estimate,
 * a filter set, and an attribute map. The empty filter/attribute collections are
 * shared immutable singletons swapped for mutable copies on first write
 * (copy-on-write). NOTE(review): frozen v1.3 copy kept for benchmarking.
 *
 * @author depristo
 */
final class InferredGeneticContext {
    // Sentinel meaning "no error estimate available"
    public static final double NO_NEG_LOG_10PERROR = -1.0;

    // Shared immutable empties; identity-compared below to trigger copy-on-write
    private static Set<String> NO_FILTERS = Collections.unmodifiableSet(new HashSet<String>());
    private static Map<String, Object> NO_ATTRIBUTES = Collections.unmodifiableMap(new HashMap<String, Object>());

    private double negLog10PError = NO_NEG_LOG_10PERROR;
    private String name = null;
    private Set<String> filters = NO_FILTERS;
    private Map<String, Object> attributes = NO_ATTRIBUTES;

//    public InferredGeneticContext(String name) {
//        this.name = name;
//    }
//
//    public InferredGeneticContext(String name, double negLog10PError) {
//        this(name);
//        setNegLog10PError(negLog10PError);
//    }

    /**
     * @param name            name associated with this information
     * @param negLog10PError  -log10 error estimate (or NO_NEG_LOG_10PERROR)
     * @param filters         initial filters, may be null
     * @param attributes      initial attributes, may be null
     */
    public InferredGeneticContext(String name, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
        this.name = name;
        setNegLog10PError(negLog10PError);
        if ( filters != null )
            setFilters(filters);
        if ( attributes != null )
            setAttributes(attributes);
    }

    /**
     * @return the name
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the name
     *
     * @param name    the name associatedated with this information
     * @throws IllegalArgumentException if name is null
     */
    public void setName(String name) {
        if ( name == null ) throw new IllegalArgumentException("Name cannot be null " + this);
        this.name = name;
    }


    // ---------------------------------------------------------------------------------------------------------
    //
    // Filter
    //
    // ---------------------------------------------------------------------------------------------------------

    /** @return an unmodifiable view of the filter set */
    public Set<String> getFilters() {
        return Collections.unmodifiableSet(filters);
    }

    public boolean isFiltered() {
        return filters.size() > 0;
    }

    public boolean isNotFiltered() {
        return ! isFiltered();
    }

    /**
     * Adds a single filter; rejects null and duplicate filters.
     */
    public void addFilter(String filter) {
        if ( filters == NO_FILTERS ) // immutable -> mutable
            filters = new HashSet<String>(filters);

        if ( filter == null ) throw new IllegalArgumentException("BUG: Attempting to add null filter " + this);
        if ( getFilters().contains(filter) ) throw new IllegalArgumentException("BUG: Attempting to add duplicate filter " + filter + " at " + this);
        filters.add(filter);
    }

    /** Adds each filter in turn via addFilter(); rejects a null collection. */
    public void addFilters(Collection<String> filters) {
        if ( filters == null ) throw new IllegalArgumentException("BUG: Attempting to add null filters at" + this);
        for ( String f : filters )
            addFilter(f);
    }

    /** Replaces the filter set with a fresh empty (mutable) set. */
    public void clearFilters() {
        filters = new HashSet<String>();
    }

    /** Clears existing filters and adds all of the given ones. */
    public void setFilters(Collection<String> filters) {
        clearFilters();
        addFilters(filters);
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // Working with log error rates
    //
    // ---------------------------------------------------------------------------------------------------------

    public boolean hasNegLog10PError() {
        return getNegLog10PError() != NO_NEG_LOG_10PERROR;
    }

    /**
     * @return the -1 * log10-based error estimate
     */
    public double getNegLog10PError() { return negLog10PError; }
    public double getPhredScaledQual() { return getNegLog10PError() * 10; }

    /**
     * Sets the error estimate; must be >= 0, finite and non-NaN, or the sentinel.
     */
    public void setNegLog10PError(double negLog10PError) {
        if ( negLog10PError < 0 && negLog10PError != NO_NEG_LOG_10PERROR ) throw new IllegalArgumentException("BUG: negLog10PError cannot be < than 0 : " + negLog10PError);
        if ( Double.isInfinite(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be Infinity");
        if ( Double.isNaN(negLog10PError) ) throw new IllegalArgumentException("BUG: negLog10PError should not be NaN");

        this.negLog10PError = negLog10PError;
    }

    // ---------------------------------------------------------------------------------------------------------
    //
    // Working with attributes
    //
    // ---------------------------------------------------------------------------------------------------------
    public void clearAttributes() {
        attributes = new HashMap<String, Object>();
    }

    /**
     * @return the attribute map (unmodifiable view)
     */
    public Map<String, Object> getAttributes() {
        return Collections.unmodifiableMap(attributes);
    }

    // todo -- define common attributes as enum

    public void setAttributes(Map<String, ?> map) {
        clearAttributes();
        putAttributes(map);
    }

    public void putAttribute(String key, Object value) {
        putAttribute(key, value, false);
    }

    /**
     * Stores key -> value; unless allowOverwrites is set, an existing binding is an error.
     */
    public void putAttribute(String key, Object value, boolean allowOverwrites) {
        if ( ! allowOverwrites && hasAttribute(key) )
            throw new IllegalStateException("Attempting to overwrite key->value binding: key = " + key + " this = " + this);

        if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
            attributes = new HashMap<String, Object>();

        attributes.put(key, value);
    }

    public void removeAttribute(String key) {
        if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
            attributes = new HashMap<String, Object>();
        attributes.remove(key);
    }

    /** Bulk insert; duplicates trigger the putAttribute overwrite check (unless map is the only content). */
    public void putAttributes(Map<String, ?> map) {
        if ( map != null ) {
            // for efficiency, we can skip the validation if the map is empty
            if ( attributes.size() == 0 ) {
                if ( attributes == NO_ATTRIBUTES ) // immutable -> mutable
                    attributes = new HashMap<String, Object>();
                attributes.putAll(map);
            } else {
                for ( Map.Entry<String, ?> elt : map.entrySet() ) {
                    putAttribute(elt.getKey(), elt.getValue(), false);
                }
            }
        }
    }

    public boolean hasAttribute(String key) {
        return attributes.containsKey(key);
    }

    public int getNumAttributes() {
        return attributes.size();
    }

    /**
     * @param key    the attribute key
     *
     * @return the attribute value for the given key (or null if not set)
     */
    public Object getAttribute(String key) {
        return attributes.get(key);
    }

    public Object getAttribute(String key, Object defaultValue) {
        if ( hasAttribute(key) )
            return attributes.get(key);
        else
            return defaultValue;
    }

    public String getAttributeAsString(String key, String defaultValue) {
        Object x = getAttribute(key);
        if ( x == null ) return defaultValue;
        if ( x instanceof String ) return (String)x;
        return String.valueOf(x); // throws an exception if this isn't a string
    }

    // NOTE(review): the missing-value test below uses reference equality (==) on a String
    // constant rather than .equals() — it only catches the interned constant itself; verify
    // this matches the intent before reuse.
    public int getAttributeAsInt(String key, int defaultValue) {
        Object x = getAttribute(key);
        if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue;
        if ( x instanceof Integer ) return (Integer)x;
        return Integer.valueOf((String)x); // throws an exception if this isn't a string
    }

    public double getAttributeAsDouble(String key, double defaultValue) {
        Object x = getAttribute(key);
        if ( x == null ) return defaultValue;
        if ( x instanceof Double ) return (Double)x;
        return Double.valueOf((String)x); // throws an exception if this isn't a string
    }

    public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
        Object x = getAttribute(key);
        if ( x == null ) return defaultValue;
        if ( x instanceof Boolean ) return (Boolean)x;
        return Boolean.valueOf((String)x); // throws an exception if this isn't a string
    }

    // public String getAttributeAsString(String key)      { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
    // public int getAttributeAsInt(String key)            { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
    // public double getAttributeAsDouble(String key)      { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
    // public boolean getAttributeAsBoolean(String key)    { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
    // public Integer getAttributeAsIntegerNoException(String key)  { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
    // public Double getAttributeAsDoubleNoException(String key)    { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
    // public String getAttributeAsStringNoException(String key)    { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
    // public Boolean getAttributeAsBooleanNoException(String key)  { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* This class emcompasses all the basic information about a genotype. It is immutable.
|
||||
*
|
||||
* @author Mark DePristo
|
||||
*/
|
||||
class MutableGenotype extends Genotype {
|
||||
public MutableGenotype(Genotype parent) {
|
||||
super(parent.getSampleName(), parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
|
||||
}
|
||||
|
||||
public MutableGenotype(String sampleName, Genotype parent) {
|
||||
super(sampleName, parent.getAlleles(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.isPhased());
|
||||
}
|
||||
|
||||
|
||||
public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, boolean genotypesArePhased) {
|
||||
super(sampleName, alleles, negLog10PError, filters, attributes, genotypesArePhased);
|
||||
}
|
||||
|
||||
public MutableGenotype(String sampleName, List<Allele> alleles, double negLog10PError) {
|
||||
super(sampleName, alleles, negLog10PError);
|
||||
}
|
||||
|
||||
public MutableGenotype(String sampleName, List<Allele> alleles) {
|
||||
super(sampleName, alleles);
|
||||
}
|
||||
|
||||
public Genotype unmodifiableGenotype() {
|
||||
return new Genotype(getSampleName(), getAlleles(), getNegLog10PError(), getFilters(), getAttributes(), isPhased());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param alleles list of alleles
|
||||
*/
|
||||
public void setAlleles(List<Allele> alleles) {
|
||||
this.alleles = new ArrayList<Allele>(alleles);
|
||||
validate();
|
||||
}
|
||||
|
||||
public void setPhase(boolean isPhased) {
|
||||
super.isPhased = isPhased;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// InferredGeneticContext mutation operators
|
||||
//
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
public void setName(String name) { commonInfo.setName(name); }
|
||||
public void addFilter(String filter) { commonInfo.addFilter(filter); }
|
||||
public void addFilters(Collection<String> filters) { commonInfo.addFilters(filters); }
|
||||
public void clearFilters() { commonInfo.clearFilters(); }
|
||||
public void setFilters(Collection<String> filters) { commonInfo.setFilters(filters); }
|
||||
public void setAttributes(Map<String, ?> map) { commonInfo.setAttributes(map); }
|
||||
public void clearAttributes() { commonInfo.clearAttributes(); }
|
||||
public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); }
|
||||
public void removeAttribute(String key) { commonInfo.removeAttribute(key); }
|
||||
public void putAttributes(Map<String, ?> map) { commonInfo.putAttributes(map); }
|
||||
public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); }
|
||||
public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); }
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,213 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Mutable version of VariantContext
|
||||
*
|
||||
* @author depristo
|
||||
*/
|
||||
class MutableVariantContext extends VariantContext {
    // ---------------------------------------------------------------------------------------------------------
    //
    // constructors
    //
    // ---------------------------------------------------------------------------------------------------------

    /** Fully-specified constructor taking genotypes as a Collection; mirrors the parent constructor. */
    public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, Collection<Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
        super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes);
    }

    /** Fully-specified constructor taking genotypes as a sample-name -> Genotype map. */
    public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, Map<String, Genotype> genotypes, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
        super(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes);
    }

    /** Sites-only constructor: no genotypes, no error estimate, no filters, no attributes. */
    public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles) {
        super(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null);
    }

    /** Alleles + genotypes constructor with no error estimate, filters, or attributes. */
    public MutableVariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, Collection<Genotype> genotypes) {
        super(source, contig, start, stop, alleles, genotypes, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null);
    }

    /** Builds a mutable copy of every field of {@code parent}, including the indel reference base. */
    public MutableVariantContext(VariantContext parent) {
        super(parent.getSource(), parent.contig, parent.start, parent.stop, parent.getAlleles(), parent.getGenotypes(), parent.getNegLog10PError(), parent.getFilters(), parent.getAttributes(), parent.getReferenceBaseForIndel());
    }

    /**
     * Sets the alleles segregating in this context to the collection of alleles.  Each of which must be unique
     * according to equals() in Allele.  Validate() should be called when you are done modifying the context.
     *
     * @param alleles the replacement alleles
     */
    public void setAlleles(Collection<Allele> alleles) {
        // clear then re-add one at a time so addAllele() can enforce uniqueness
        this.alleles.clear();
        for ( Allele a : alleles )
            addAllele(a);
    }

    /**
     * Adds allele to the segregating allele list in this context.  The new allele must be unique according
     * to equals() in Allele.  Validate() should be called when you are done modifying the context.
     *
     * @param allele the allele to add
     * @throws IllegalArgumentException if an allele with the same bases is already present
     */
    public void addAllele(Allele allele) {
        final boolean allowDuplicates = false; // used to be a parameter

        // invalidate cached type; presumably recomputed lazily by the parent -- confirm in VariantContext
        type = null;

        for ( Allele a : alleles ) {
            if ( a.basesMatch(allele) && ! allowDuplicates )
                throw new IllegalArgumentException("Duplicate allele added to VariantContext" + this);
        }

        // we are a novel allele
        alleles.add(allele);
    }

    /**
     * Discards all sample->genotype bindings.
     * NOTE(review): replaces the map with a TreeMap (sorted by sample name), which may differ from
     * the map type the parent constructor installed -- kept as-is for 1.3 parity.
     */
    public void clearGenotypes() {
        genotypes = new TreeMap<String, Genotype>();
    }

    /**
     * Adds this single genotype to the context, not allowing duplicate genotypes to be added
     * @param genotype the genotype to add; its sample name must not already be bound
     */
    public void addGenotypes(Genotype genotype) {
        putGenotype(genotype.getSampleName(), genotype, false);
    }

    /**
     * Adds these genotypes to the context, not allowing duplicate genotypes to be added
     * @param genotypes genotypes to add, one by one
     */
    public void addGenotypes(Collection<Genotype> genotypes) {
        for ( Genotype g : genotypes ) {
            addGenotype(g);
        }
    }

    /**
     * Adds these genotypes to the context, not allowing duplicate genotypes to be added.
     * Only the map values are used; binding keys come from each genotype's own sample name.
     * @param genotypes sample-name -> genotype map
     */
    public void addGenotypes(Map<String, Genotype> genotypes) {

        for ( Map.Entry<String, Genotype> elt : genotypes.entrySet() ) {
            addGenotype(elt.getValue());
        }
    }

    /**
     * Adds these genotypes to the context, overwriting any existing bindings with the same sample name.
     *
     * @param genotypes sample-name -> genotype map
     */
    public void putGenotypes(Map<String, Genotype> genotypes) {
        for ( Map.Entry<String, Genotype> g : genotypes.entrySet() )
            putGenotype(g.getKey(), g.getValue());
    }

    /**
     * Adds these genotypes to the context, overwriting any existing bindings with the same sample name.
     *
     * @param genotypes genotypes to put, one by one
     */
    public void putGenotypes(Collection<Genotype> genotypes) {
        for ( Genotype g : genotypes )
            putGenotype(g);
    }

    /**
     * Adds this genotype to the context, throwing an error if it's already bound.
     *
     * @param genotype the genotype to add
     */
    public void addGenotype(Genotype genotype) {
        addGenotype(genotype.getSampleName(), genotype);
    }

    /**
     * Adds this genotype to the context, throwing an error if it's already bound.
     *
     * @param sampleName must equal genotype.getSampleName()
     * @param genotype the genotype to add
     */
    public void addGenotype(String sampleName, Genotype genotype) {
        putGenotype(sampleName, genotype, false);
    }

    /**
     * Adds this genotype to the context, overwriting any existing binding with the same sample name.
     *
     * @param genotype the genotype to put
     */
    public void putGenotype(Genotype genotype) {
        putGenotype(genotype.getSampleName(), genotype);
    }

    /**
     * Adds this genotype to the context, overwriting any existing binding with the same sample name.
     *
     * @param sampleName must equal genotype.getSampleName()
     * @param genotype the genotype to put
     */
    public void putGenotype(String sampleName, Genotype genotype) {
        putGenotype(sampleName, genotype, true);
    }

    /**
     * Shared implementation behind add/put: validates the name binding and writes into the map.
     *
     * @param sampleName      key for the binding; must equal genotype.getSampleName()
     * @param genotype        the genotype to store
     * @param allowOverwrites when false, an existing binding for sampleName is an error
     */
    private void putGenotype(String sampleName, Genotype genotype, boolean allowOverwrites) {
        if ( hasGenotype(sampleName) && ! allowOverwrites )
            throw new IllegalStateException("Attempting to overwrite sample->genotype binding: " + sampleName + " this=" + this);

        if ( ! sampleName.equals(genotype.getSampleName()) )
            throw new IllegalStateException("Sample name doesn't equal genotype.getSample(): " + sampleName + " genotype=" + genotype);

        this.genotypes.put(sampleName, genotype);
    }

    /**
     * Removes the binding from sampleName to genotype.  If this doesn't exist, throws an IllegalArgumentException
     * @param sampleName the binding to remove
     */
    public void removeGenotype(String sampleName) {
        if ( ! this.genotypes.containsKey(sampleName) )
            throw new IllegalArgumentException("Sample name isn't contained in genotypes " + sampleName + " genotypes =" + genotypes);

        this.genotypes.remove(sampleName);
    }

    /**
     * Removes genotype from the context.  If this doesn't exist, throws an IllegalArgumentException
     * @param genotype the genotype whose sample-name binding is removed
     */
    public void removeGenotype(Genotype genotype) {
        removeGenotype(genotype.getSampleName());
    }

    // todo -- add replace genotype routine

    // ---------------------------------------------------------------------------------------------------------
    //
    // InferredGeneticContext mutation operators -- each method below simply forwards to commonInfo
    //
    // ---------------------------------------------------------------------------------------------------------

    public void setSource(String source) { commonInfo.setName(source); }
    public void addFilter(String filter) { commonInfo.addFilter(filter); }
    public void addFilters(Collection<String> filters) { commonInfo.addFilters(filters); }
    public void clearFilters() { commonInfo.clearFilters(); }
    public void setFilters(Collection<String> filters) { commonInfo.setFilters(filters); }
    public void setAttributes(Map<String, ?> map) { commonInfo.setAttributes(map); }
    public void clearAttributes() { commonInfo.clearAttributes(); }
    public void putAttribute(String key, Object value) { commonInfo.putAttribute(key, value); }
    public void removeAttribute(String key) { commonInfo.removeAttribute(key); }
    public void putAttributes(Map<String, ?> map) { commonInfo.putAttributes(map); }
    public void setNegLog10PError(double negLog10PError) { commonInfo.setNegLog10PError(negLog10PError); }
    public void putAttribute(String key, Object value, boolean allowOverwrites) { commonInfo.putAttribute(key, value, allowOverwrites); }
    // the variant ID is stored as a regular attribute under ID_KEY
    public void setID(String id) { putAttribute(ID_KEY, id, true); }
}
|
||||
|
|
@ -0,0 +1,198 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
 * A feature codec for the VCF3 specification, to read older VCF files.  VCF3 has been
 * deprecated in favor of VCF4 (See VCF codec for the latest information)
 *
 * <p>
 * Reads historical VCF3 encoded files (1000 Genomes Pilot results, for example)
 * </p>
 *
 * <p>
 * See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
 * See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
 * </p>
 *
 * @author Mark DePristo
 * @since 2010
 */
class VCF3Codec extends AbstractVCFCodec {
    /** Every decodable VCF3 file must begin with this meta line (the minor-version digit follows it). */
    public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3";


    /**
     * Reads the "##" meta-information lines and the single "#CHROM..." column line, validating that
     * the declared format version is VCF 3.2 or 3.3, and builds the header object.
     *
     * @param reader the line reader to take header lines from
     * @return the header object produced by createHeader() (declared Object to fit the codec interface)
     * @throws TribbleException.InvalidHeader if the version line is missing/unsupported, the version
     *         is not VCFv3, or no #CHROM line is ever seen
     */
    public Object readHeader(LineReader reader) {
        List<String> headerStrings = new ArrayList<String>();

        String line;
        try {
            boolean foundHeaderVersion = false;
            while ((line = reader.readLine()) != null) {
                lineNo++; // shared parser state used for error reporting
                if (line.startsWith(VCFHeader.METADATA_INDICATOR)) {
                    // "##key=value" -- only the fileformat line is validated here; all lines are accumulated
                    String[] lineFields = line.substring(2).split("=");
                    if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) {
                        if ( !VCFHeaderVersion.isVersionString(lineFields[1]) )
                            throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
                        foundHeaderVersion = true;
                        version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
                        if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 )
                            throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]);
                    }
                    headerStrings.add(line);
                }
                else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
                    // the single "#CHROM ..." column line terminates the header
                    if (!foundHeaderVersion) {
                        throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
                    }
                    return createHeader(headerStrings, line);
                }
                else {
                    // any non-"#" line before #CHROM means the header is malformed
                    throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
                }

            }
        } catch (IOException e) {
            throw new RuntimeException("IO Exception ", e);
        }
        // EOF reached without ever seeing the #CHROM column line
        throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
    }


    /**
     * parse the filter string, first checking to see if we already have parsed it in a previous attempt
     * @param filterString the string to parse
     * @return a set of the filters applied, the empty set for PASS ("0"), or null for unfiltered (".")
     */
    protected Set<String> parseFilters(String filterString) {

        // null for unfiltered
        if ( filterString.equals(VCFConstants.UNFILTERED) )
            return null;

        // empty set for passes filters
        LinkedHashSet<String> fFields = new LinkedHashSet<String>();

        if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
            return fFields;

        if ( filterString.length() == 0 )
            generateException("The VCF specification requires a valid filter status");

        // do we have the filter string cached?
        // NOTE(review): unlike VCFCodec.parseFilters, the cached set is handed back without an
        // unmodifiable wrapper, so a caller could mutate the shared cache entry -- kept for 1.3 parity
        if ( filterHash.containsKey(filterString) )
            return filterHash.get(filterString);

        // otherwise we have to parse and cache the value
        if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
            fFields.add(filterString);
        else
            fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));

        filterHash.put(filterString, fFields);

        return fFields;
    }

    /**
     * create a genotype map
     * @param str the string containing the FORMAT column plus one column per sample
     * @param alleles the list of alleles
     * @param chr chrom (used only for error messages)
     * @param pos position (used only for error messages)
     * @return a mapping of sample name to genotype object, in sample-column order
     */
    public Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
        // lazily sized scratch array, reused across records (shared parser state -- not thread-safe)
        if (genotypeParts == null)
            genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];

        int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);

        Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(nParts);

        // get the format keys
        int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);

        // cycle through the sample names
        Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();

        // clear out our allele mapping
        alleleMap.clear();

        // cycle through the genotype strings
        for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
            int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);

            double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
            Set<String> genotypeFilters = null;
            Map<String, String> gtAttributes = null;
            String sampleName = sampleNameIterator.next();

            // check to see if the value list is longer than the key list, which is a problem
            if (nGTKeys < GTValueSplitSize)
                generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);

            int genotypeAlleleLocation = -1;
            if (nGTKeys >= 1) {
                gtAttributes = new HashMap<String, String>(nGTKeys - 1);

                for (int i = 0; i < nGTKeys; i++) {
                    // new String(...) detaches the token from any shared backing array -- presumably a
                    // memory optimization for pre-Java-7 substring behavior; confirm before removing
                    final String gtKey = new String(genotypeKeyArray[i]);
                    // trailing keys with no corresponding value are treated as missing
                    boolean missing = i >= GTValueSplitSize;

                    if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
                        genotypeAlleleLocation = i;
                    } else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
                        GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
                    } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
                        genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
                    } else if ( missing || GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) {
                        // v3 missing marker is normalized to the v4 missing value in the attribute map
                        gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
                    } else {
                        gtAttributes.put(gtKey, new String(GTValueArray[i]));
                    }
                }
            }

            // check to make sure we found a genotype field
            if ( genotypeAlleleLocation < 0 )
                generateException("Unable to find the GT field for the record; the GT field is required");
            if ( genotypeAlleleLocation > 0 )
                generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes");

            // per the spec, '|' anywhere in the GT token marks the genotype as phased
            boolean phased = GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;

            // add it to the list
            try {
                genotypes.put(sampleName, new Genotype(sampleName,
                        parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap),
                        GTQual,
                        genotypeFilters,
                        gtAttributes,
                        phased));
            } catch (TribbleException e) {
                // re-wrap with positional context so the user can locate the bad record
                throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
            }
        }

        return genotypes;
    }

    /** A file is decodable by this codec exactly when it starts with the VCFv3 magic header. */
    @Override
    public boolean canDecode(final File potentialInput) {
        return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER);
    }
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
/**
 * A class representing a key=value entry for ALT fields in the VCF header.
 *
 * @author ebanks
 */
class VCFAltHeaderLine extends VCFSimpleHeaderLine {

    /**
     * create a VCF ALT header line
     *
     * @param name        the name for this header line
     * @param description the description for this header line
     */
    public VCFAltHeaderLine(String name, String description) {
        super(name, description, SupportedHeaderLineType.ALT);
    }

    /**
     * create a VCF ALT header line by parsing a raw header-line string
     *
     * @param line    the header line
     * @param version the vcf header version
     */
    protected VCFAltHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.ALT);
    }
}
|
||||
|
|
@ -0,0 +1,228 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.readers.LineReader;
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A feature codec for the VCF 4 specification
|
||||
*
|
||||
* <p>
|
||||
* VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a
|
||||
* header line, and then data lines each containing information about a position in the genome.
|
||||
* </p>
|
||||
* <p>One of the main uses of next-generation sequencing is to discover variation amongst large populations
|
||||
* of related samples. Recently the format for storing next-generation read alignments has been
|
||||
* standardised by the SAM/BAM file format specification. This has significantly improved the
|
||||
* interoperability of next-generation tools for alignment, visualisation, and variant calling.
|
||||
* We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent
|
||||
* types of sequence variation, including SNPs, indels and larger structural variants, together
|
||||
* with rich annotations. VCF is usually stored in a compressed manner and can be indexed for
|
||||
* fast data retrieval of variants from a range of positions on the reference genome.
|
||||
* The format was developed for the 1000 Genomes Project, and has also been adopted by other projects
|
||||
* such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements
|
||||
* various utilities for processing VCF files, including validation, merging and comparing,
|
||||
* and also provides a general Perl and Python API.
|
||||
* The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.</p>
|
||||
*
|
||||
* <p>
|
||||
* See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br>
|
||||
* See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a>
|
||||
* </p>
|
||||
*
|
||||
* <h2>File format example</h2>
|
||||
* <pre>
|
||||
* ##fileformat=VCFv4.0
|
||||
* #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
|
||||
* chr1 109 . A T 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99
|
||||
* chr1 147 . C A 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:294,49:118:-57.87,-34.96,-338.46:99
|
||||
* </pre>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 2010
|
||||
*/
|
||||
public class VCFCodec extends AbstractVCFCodec {
|
||||
// Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters.
|
||||
|
||||
public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4";
|
||||
|
||||
/**
|
||||
* @param reader the line reader to take header lines from
|
||||
* @return the number of header lines
|
||||
*/
|
||||
public Object readHeader(LineReader reader) {
|
||||
List<String> headerStrings = new ArrayList<String>();
|
||||
|
||||
String line;
|
||||
try {
|
||||
boolean foundHeaderVersion = false;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
lineNo++;
|
||||
if (line.startsWith(VCFHeader.METADATA_INDICATOR)) {
|
||||
String[] lineFields = line.substring(2).split("=");
|
||||
if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) {
|
||||
if ( !VCFHeaderVersion.isVersionString(lineFields[1]) )
|
||||
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
|
||||
foundHeaderVersion = true;
|
||||
version = VCFHeaderVersion.toHeaderVersion(lineFields[1]);
|
||||
if ( version == VCFHeaderVersion.VCF3_3 || version == VCFHeaderVersion.VCF3_2 )
|
||||
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]);
|
||||
if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 )
|
||||
throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]);
|
||||
}
|
||||
headerStrings.add(line);
|
||||
}
|
||||
else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) {
|
||||
if (!foundHeaderVersion) {
|
||||
throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version");
|
||||
}
|
||||
return createHeader(headerStrings, line);
|
||||
}
|
||||
else {
|
||||
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("IO Exception ", e);
|
||||
}
|
||||
throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* parse the filter string, first checking to see if we already have parsed it in a previous attempt
|
||||
*
|
||||
* @param filterString the string to parse
|
||||
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
|
||||
*/
|
||||
protected Set<String> parseFilters(String filterString) {
|
||||
return parseFilters(filterHash, lineNo, filterString);
|
||||
}
|
||||
|
||||
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
|
||||
// null for unfiltered
|
||||
if ( filterString.equals(VCFConstants.UNFILTERED) )
|
||||
return null;
|
||||
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
|
||||
return Collections.emptySet();
|
||||
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
|
||||
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
|
||||
if ( filterString.length() == 0 )
|
||||
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
|
||||
|
||||
// do we have the filter string cached?
|
||||
if ( cache != null && cache.containsKey(filterString) )
|
||||
return Collections.unmodifiableSet(cache.get(filterString));
|
||||
|
||||
// empty set for passes filters
|
||||
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
|
||||
// otherwise we have to parse and cache the value
|
||||
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
|
||||
fFields.add(filterString);
|
||||
else
|
||||
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
|
||||
|
||||
fFields = fFields;
|
||||
if ( cache != null ) cache.put(filterString, fFields);
|
||||
|
||||
return Collections.unmodifiableSet(fFields);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* create a genotype map
|
||||
* @param str the string
|
||||
* @param alleles the list of alleles
|
||||
* @return a mapping of sample name to genotype object
|
||||
*/
|
||||
public Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos) {
|
||||
if (genotypeParts == null)
|
||||
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
|
||||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
|
||||
Map<String, Genotype> genotypes = new LinkedHashMap<String, Genotype>(nParts);
|
||||
|
||||
// get the format keys
|
||||
int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
|
||||
|
||||
// cycle through the sample names
|
||||
Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator();
|
||||
|
||||
// clear out our allele mapping
|
||||
alleleMap.clear();
|
||||
|
||||
// cycle through the genotype strings
|
||||
for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) {
|
||||
int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR);
|
||||
|
||||
double GTQual = VariantContext.NO_NEG_LOG_10PERROR;
|
||||
Set<String> genotypeFilters = null;
|
||||
Map<String, String> gtAttributes = null;
|
||||
String sampleName = sampleNameIterator.next();
|
||||
|
||||
// check to see if the value list is longer than the key list, which is a problem
|
||||
if (nGTKeys < GTValueSplitSize)
|
||||
generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]);
|
||||
|
||||
int genotypeAlleleLocation = -1;
|
||||
if (nGTKeys >= 1) {
|
||||
gtAttributes = new HashMap<String, String>(nGTKeys - 1);
|
||||
|
||||
for (int i = 0; i < nGTKeys; i++) {
|
||||
final String gtKey = new String(genotypeKeyArray[i]);
|
||||
boolean missing = i >= GTValueSplitSize;
|
||||
|
||||
// todo -- all of these on the fly parsing of the missing value should be static constants
|
||||
if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) {
|
||||
genotypeAlleleLocation = i;
|
||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) {
|
||||
GTQual = missing ? parseQual(VCFConstants.MISSING_VALUE_v4) : parseQual(GTValueArray[i]);
|
||||
} else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) {
|
||||
genotypeFilters = missing ? parseFilters(VCFConstants.MISSING_VALUE_v4) : parseFilters(getCachedString(GTValueArray[i]));
|
||||
} else if ( missing ) {
|
||||
gtAttributes.put(gtKey, VCFConstants.MISSING_VALUE_v4);
|
||||
} else {
|
||||
gtAttributes.put(gtKey, new String(GTValueArray[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check to make sure we found a genotype field if we are a VCF4.0 file
|
||||
if ( version == VCFHeaderVersion.VCF4_0 && genotypeAlleleLocation == -1 )
|
||||
generateException("Unable to find the GT field for the record; the GT field is required in VCF4.0");
|
||||
if ( genotypeAlleleLocation > 0 )
|
||||
generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present");
|
||||
|
||||
List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? null : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap));
|
||||
boolean phased = genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1;
|
||||
|
||||
// add it to the list
|
||||
try {
|
||||
genotypes.put(sampleName,
|
||||
new Genotype(sampleName,
|
||||
GTalleles,
|
||||
GTQual,
|
||||
genotypeFilters,
|
||||
gtAttributes,
|
||||
phased));
|
||||
} catch (TribbleException e) {
|
||||
throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos);
|
||||
}
|
||||
}
|
||||
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
/**
 * Tests whether this codec can decode the given file.
 *
 * Delegates to canDecodeFile, checking the file against the VCF4_MAGIC_HEADER
 * constant (declared elsewhere in this class).
 *
 * @param potentialInput the file to test
 * @return true if the file starts with the expected VCF v4 magic header
 */
@Override
public boolean canDecode(final File potentialInput) {
    return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER);
}
|
||||
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* a base class for compound header lines, which include info lines and format lines (so far)
|
||||
*/
|
||||
abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
||||
public enum SupportedHeaderLineType {
|
||||
INFO(true), FORMAT(false);
|
||||
|
||||
public final boolean allowFlagValues;
|
||||
SupportedHeaderLineType(boolean flagValues) {
|
||||
allowFlagValues = flagValues;
|
||||
}
|
||||
}
|
||||
|
||||
// the field types
|
||||
private String name;
|
||||
private int count = -1;
|
||||
private VCFHeaderLineCount countType;
|
||||
private String description;
|
||||
private VCFHeaderLineType type;
|
||||
|
||||
// access methods
|
||||
public String getName() { return name; }
|
||||
public String getDescription() { return description; }
|
||||
public VCFHeaderLineType getType() { return type; }
|
||||
public VCFHeaderLineCount getCountType() { return countType; }
|
||||
public int getCount() {
|
||||
if ( countType != VCFHeaderLineCount.INTEGER )
|
||||
throw new ReviewedStingException("Asking for header line count when type is not an integer");
|
||||
return count;
|
||||
}
|
||||
|
||||
// utility method
|
||||
public int getCount(int numAltAlleles) {
|
||||
int myCount;
|
||||
switch ( countType ) {
|
||||
case INTEGER: myCount = count; break;
|
||||
case UNBOUNDED: myCount = -1; break;
|
||||
case A: myCount = numAltAlleles; break;
|
||||
case G: myCount = ((numAltAlleles + 1) * (numAltAlleles + 2) / 2); break;
|
||||
default: throw new ReviewedStingException("Unknown count type: " + countType);
|
||||
}
|
||||
return myCount;
|
||||
}
|
||||
|
||||
public void setNumberToUnbounded() {
|
||||
countType = VCFHeaderLineCount.UNBOUNDED;
|
||||
count = -1;
|
||||
}
|
||||
|
||||
// our type of line, i.e. format, info, etc
|
||||
private final SupportedHeaderLineType lineType;
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param count the count for this header line
|
||||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
* @param lineType the header line type
|
||||
*/
|
||||
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.name = name;
|
||||
this.countType = VCFHeaderLineCount.INTEGER;
|
||||
this.count = count;
|
||||
this.type = type;
|
||||
this.description = description;
|
||||
this.lineType = lineType;
|
||||
validate();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param count the count type for this header line
|
||||
* @param type the type for this header line
|
||||
* @param description the description for this header line
|
||||
* @param lineType the header line type
|
||||
*/
|
||||
protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.name = name;
|
||||
this.countType = count;
|
||||
this.type = type;
|
||||
this.description = description;
|
||||
this.lineType = lineType;
|
||||
validate();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF format header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the VCF header version
|
||||
* @param lineType the header line type
|
||||
*
|
||||
*/
|
||||
protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Number","Type","Description"));
|
||||
name = mapping.get("ID");
|
||||
count = -1;
|
||||
final String numberStr = mapping.get("Number");
|
||||
if ( numberStr.equals(VCFConstants.PER_ALLELE_COUNT) ) {
|
||||
countType = VCFHeaderLineCount.A;
|
||||
} else if ( numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT) ) {
|
||||
countType = VCFHeaderLineCount.G;
|
||||
} else if ( ((version == VCFHeaderVersion.VCF4_0 || version == VCFHeaderVersion.VCF4_1) &&
|
||||
numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) ||
|
||||
((version == VCFHeaderVersion.VCF3_2 || version == VCFHeaderVersion.VCF3_3) &&
|
||||
numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3)) ) {
|
||||
countType = VCFHeaderLineCount.UNBOUNDED;
|
||||
} else {
|
||||
countType = VCFHeaderLineCount.INTEGER;
|
||||
count = Integer.valueOf(numberStr);
|
||||
|
||||
}
|
||||
type = VCFHeaderLineType.valueOf(mapping.get("Type"));
|
||||
if (type == VCFHeaderLineType.Flag && !allowFlagValues())
|
||||
throw new IllegalArgumentException("Flag is an unsupported type for this kind of field");
|
||||
|
||||
description = mapping.get("Description");
|
||||
if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
|
||||
description = UNBOUND_DESCRIPTION;
|
||||
|
||||
this.lineType = lineType;
|
||||
|
||||
validate();
|
||||
}
|
||||
|
||||
private void validate() {
|
||||
if ( name == null || type == null || description == null || lineType == null )
|
||||
throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s",
|
||||
super.getKey(), name, type, description, lineType ));
|
||||
}
|
||||
|
||||
/**
|
||||
* make a string representation of this header line
|
||||
* @return a string representation
|
||||
*/
|
||||
protected String toStringEncoding() {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
map.put("ID", name);
|
||||
Object number;
|
||||
switch ( countType ) {
|
||||
case A: number = VCFConstants.PER_ALLELE_COUNT; break;
|
||||
case G: number = VCFConstants.PER_GENOTYPE_COUNT; break;
|
||||
case UNBOUNDED: number = VCFConstants.UNBOUNDED_ENCODING_v4; break;
|
||||
case INTEGER:
|
||||
default: number = count;
|
||||
}
|
||||
map.put("Number", number);
|
||||
map.put("Type", type);
|
||||
map.put("Description", description);
|
||||
return lineType.toString() + "=" + toStringEncoding(map);
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true if we're equal to another compounder header line
|
||||
* @param o a compound header line
|
||||
* @return true if equal
|
||||
*/
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFCompoundHeaderLine) )
|
||||
return false;
|
||||
VCFCompoundHeaderLine other = (VCFCompoundHeaderLine)o;
|
||||
return equalsExcludingDescription(other) &&
|
||||
description.equals(other.description);
|
||||
}
|
||||
|
||||
public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) {
|
||||
return count == other.count &&
|
||||
countType == other.countType &&
|
||||
type == other.type &&
|
||||
lineType == other.lineType &&
|
||||
name.equals(other.name);
|
||||
}
|
||||
|
||||
public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) {
|
||||
return lineType == other.lineType &&
|
||||
name.equals(other.name);
|
||||
}
|
||||
|
||||
/**
|
||||
* do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true)
|
||||
* @return true if we do, false otherwise
|
||||
*/
|
||||
abstract boolean allowFlagValues();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Shared string/character constants used throughout the VCF parsing and writing code:
 * reserved INFO/FORMAT keys, field separators, and the missing/default value encodings
 * for the v3 and v4 flavors of the format.
 */
final class VCFConstants {
    public static final Locale VCF_LOCALE = Locale.US;

    // reserved INFO/FORMAT field keys
    public static final String ANCESTRAL_ALLELE_KEY = "AA";
    public static final String ALLELE_COUNT_KEY = "AC";
    public static final String ALLELE_FREQUENCY_KEY = "AF";
    public static final String ALLELE_NUMBER_KEY = "AN";
    public static final String RMS_BASE_QUALITY_KEY = "BQ";
    public static final String CIGAR_KEY = "CIGAR";
    public static final String DBSNP_KEY = "DB";
    public static final String DEPTH_KEY = "DP";
    public static final String DOWNSAMPLED_KEY = "DS";
    public static final String EXPECTED_ALLELE_COUNT_KEY = "EC";
    public static final String END_KEY = "END";
    public static final String GENOTYPE_FILTER_KEY = "FT";
    public static final String GENOTYPE_KEY = "GT";
    @Deprecated
    public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods
    public static final String GENOTYPE_POSTERIORS_KEY = "GP";
    public static final String GENOTYPE_QUALITY_KEY = "GQ";
    public static final String HAPMAP2_KEY = "H2";
    public static final String HAPMAP3_KEY = "H3";
    public static final String HAPLOTYPE_QUALITY_KEY = "HQ";
    public static final String RMS_MAPPING_QUALITY_KEY = "MQ";
    public static final String MAPPING_QUALITY_ZERO_KEY = "MQ0";
    public static final String SAMPLE_NUMBER_KEY = "NS";
    public static final String PHRED_GENOTYPE_LIKELIHOODS_KEY = "PL"; // phred-scaled genotype likelihoods
    public static final String PHASE_QUALITY_KEY = "PQ";
    public static final String PHASE_SET_KEY = "PS";
    public static final String OLD_DEPTH_KEY = "RD";
    public static final String STRAND_BIAS_KEY = "SB";
    public static final String SOMATIC_KEY = "SOMATIC";
    public static final String VALIDATED_KEY = "VALIDATED";
    public static final String THOUSAND_GENOMES_KEY = "1000G";

    // separators
    public static final String FORMAT_FIELD_SEPARATOR = ":";
    public static final String GENOTYPE_FIELD_SEPARATOR = ":";
    public static final char GENOTYPE_FIELD_SEPARATOR_CHAR = ':';
    public static final String FIELD_SEPARATOR = "\t";
    public static final char FIELD_SEPARATOR_CHAR = '\t';
    public static final String FILTER_CODE_SEPARATOR = ";";
    public static final String INFO_FIELD_ARRAY_SEPARATOR = ",";
    public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ',';
    public static final String ID_FIELD_SEPARATOR = ";";
    public static final String INFO_FIELD_SEPARATOR = ";";
    public static final char INFO_FIELD_SEPARATOR_CHAR = ';';
    public static final String UNPHASED = "/";
    public static final String PHASED = "|";
    public static final String PHASED_SWITCH_PROB_v3 = "\\";
    public static final String PHASING_TOKENS = "/|\\";

    // old indel alleles
    public static final char DELETION_ALLELE_v3 = 'D';
    public static final char INSERTION_ALLELE_v3 = 'I';

    // missing/default values
    public static final String UNFILTERED = ".";
    public static final String PASSES_FILTERS_v3 = "0";
    public static final String PASSES_FILTERS_v4 = "PASS";
    public static final String EMPTY_ID_FIELD = ".";
    public static final String EMPTY_INFO_FIELD = ".";
    public static final String EMPTY_ALTERNATE_ALLELE_FIELD = ".";
    public static final String MISSING_VALUE_v4 = ".";
    public static final String MISSING_QUALITY_v3 = "-1";
    public static final Double MISSING_QUALITY_v3_DOUBLE = Double.valueOf(MISSING_QUALITY_v3);

    public static final String MISSING_GENOTYPE_QUALITY_v3 = "-1";
    public static final String MISSING_HAPLOTYPE_QUALITY_v3 = "-1";
    public static final String MISSING_DEPTH_v3 = "-1";
    public static final String UNBOUNDED_ENCODING_v4 = ".";
    public static final String UNBOUNDED_ENCODING_v3 = "-1";
    public static final String PER_ALLELE_COUNT = "A";
    public static final String PER_GENOTYPE_COUNT = "G";
    public static final String EMPTY_ALLELE = ".";
    public static final String EMPTY_GENOTYPE = "./.";
    public static final double MAX_GENOTYPE_QUAL = 99.0;

    public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
    public static final String DOUBLE_PRECISION_INT_SUFFIX = ".00";
    public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare

    // pure constant holder: prevent instantiation (was missing a private constructor)
    private VCFConstants() { }
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* A class representing a key=value entry for FILTER fields in the VCF header
|
||||
*/
|
||||
/**
 * @author ebanks
 * A class representing a key=value entry for FILTER fields in the VCF header
 */
class VCFFilterHeaderLine extends VCFSimpleHeaderLine {

    /**
     * create a VCF filter header line
     *
     * @param name the name for this header line
     * @param description the description for this header line
     */
    public VCFFilterHeaderLine(String name, String description) {
        super(name, description, SupportedHeaderLineType.FILTER);
    }

    /**
     * create a VCF filter header line from a raw header-line string
     * (original comment said "info header line" — this is the FILTER variant)
     *
     * @param line the header line
     * @param version the vcf header version
     */
    protected VCFFilterHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.FILTER);
    }
}
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFFormatHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry for genotype FORMAT fields in the VCF header
|
||||
*/
|
||||
class VCFFormatHeaderLine extends VCFCompoundHeaderLine {
|
||||
|
||||
public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
|
||||
super(name, count, type, description, SupportedHeaderLineType.FORMAT);
|
||||
if (type == VCFHeaderLineType.Flag)
|
||||
throw new IllegalArgumentException("Flag is an unsupported type for format fields");
|
||||
}
|
||||
|
||||
public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) {
|
||||
super(name, count, type, description, SupportedHeaderLineType.FORMAT);
|
||||
}
|
||||
|
||||
protected VCFFormatHeaderLine(String line, VCFHeaderVersion version) {
|
||||
super(line, version, SupportedHeaderLineType.FORMAT);
|
||||
}
|
||||
|
||||
// format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype).
|
||||
@Override
|
||||
boolean allowFlagValues() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,198 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
|
||||
import org.broad.tribble.util.ParsingUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFHeader
|
||||
* <p/>
|
||||
* A class representing the VCF header
|
||||
*/
|
||||
class VCFHeader {
|
||||
|
||||
// the mandatory header fields
|
||||
public enum HEADER_FIELDS {
|
||||
CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
|
||||
}
|
||||
|
||||
// the associated meta data
|
||||
private final Set<VCFHeaderLine> mMetaData;
|
||||
private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new HashMap<String, VCFInfoHeaderLine>();
|
||||
private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new HashMap<String, VCFFormatHeaderLine>();
|
||||
private final Map<String, VCFHeaderLine> mOtherMetaData = new HashMap<String, VCFHeaderLine>();
|
||||
|
||||
// the list of auxillary tags
|
||||
private final Set<String> mGenotypeSampleNames = new LinkedHashSet<String>();
|
||||
|
||||
// the character string that indicates meta data
|
||||
public static final String METADATA_INDICATOR = "##";
|
||||
|
||||
// the header string indicator
|
||||
public static final String HEADER_INDICATOR = "#";
|
||||
|
||||
// were the input samples sorted originally (or are we sorting them)?
|
||||
private boolean samplesWereAlreadySorted = true;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param metaData the meta data associated with this header
|
||||
*/
|
||||
public VCFHeader(Set<VCFHeaderLine> metaData) {
|
||||
mMetaData = new TreeSet<VCFHeaderLine>(metaData);
|
||||
loadVCFVersion();
|
||||
loadMetaDataMaps();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param metaData the meta data associated with this header
|
||||
* @param genotypeSampleNames the sample names
|
||||
*/
|
||||
public VCFHeader(Set<VCFHeaderLine> metaData, Set<String> genotypeSampleNames) {
|
||||
mMetaData = new TreeSet<VCFHeaderLine>();
|
||||
if ( metaData != null )
|
||||
mMetaData.addAll(metaData);
|
||||
|
||||
mGenotypeSampleNames.addAll(genotypeSampleNames);
|
||||
|
||||
loadVCFVersion();
|
||||
loadMetaDataMaps();
|
||||
|
||||
samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a header line to the header metadata.
|
||||
*
|
||||
* @param headerLine Line to add to the existing metadata component.
|
||||
*/
|
||||
public void addMetaDataLine(VCFHeaderLine headerLine) {
|
||||
mMetaData.add(headerLine);
|
||||
}
|
||||
|
||||
/**
|
||||
* check our metadata for a VCF version tag, and throw an exception if the version is out of date
|
||||
* or the version is not present
|
||||
*/
|
||||
public void loadVCFVersion() {
|
||||
List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
|
||||
for ( VCFHeaderLine line : mMetaData )
|
||||
if ( VCFHeaderVersion.isFormatString(line.getKey())) {
|
||||
toRemove.add(line);
|
||||
}
|
||||
// remove old header lines for now,
|
||||
mMetaData.removeAll(toRemove);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* load the format/info meta data maps (these are used for quick lookup by key name)
|
||||
*/
|
||||
private void loadMetaDataMaps() {
|
||||
for ( VCFHeaderLine line : mMetaData ) {
|
||||
if ( line instanceof VCFInfoHeaderLine ) {
|
||||
VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line;
|
||||
mInfoMetaData.put(infoLine.getName(), infoLine);
|
||||
}
|
||||
else if ( line instanceof VCFFormatHeaderLine ) {
|
||||
VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line;
|
||||
mFormatMetaData.put(formatLine.getName(), formatLine);
|
||||
}
|
||||
else {
|
||||
mOtherMetaData.put(line.getKey(), line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header fields in order they're presented in the input file (which is now required to be
|
||||
* the order presented in the spec).
|
||||
*
|
||||
* @return a set of the header fields, in order
|
||||
*/
|
||||
public Set<HEADER_FIELDS> getHeaderFields() {
|
||||
Set<HEADER_FIELDS> fields = new LinkedHashSet<HEADER_FIELDS>();
|
||||
for (HEADER_FIELDS field : HEADER_FIELDS.values())
|
||||
fields.add(field);
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the meta data, associated with this header
|
||||
*
|
||||
* @return a set of the meta data
|
||||
*/
|
||||
public Set<VCFHeaderLine> getMetaData() {
|
||||
Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
|
||||
lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString()));
|
||||
lines.addAll(mMetaData);
|
||||
return Collections.unmodifiableSet(lines);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the genotyping sample names
|
||||
*
|
||||
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
|
||||
*/
|
||||
public Set<String> getGenotypeSamples() {
|
||||
return mGenotypeSampleNames;
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have genotyping data?
|
||||
*
|
||||
* @return true if we have genotyping columns, false otherwise
|
||||
*/
|
||||
public boolean hasGenotypingData() {
|
||||
return mGenotypeSampleNames.size() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* were the input samples sorted originally?
|
||||
*
|
||||
* @return true if the input samples were sorted originally, false otherwise
|
||||
*/
|
||||
public boolean samplesWereAlreadySorted() {
|
||||
return samplesWereAlreadySorted;
|
||||
}
|
||||
|
||||
/** @return the column count */
|
||||
public int getColumnCount() {
|
||||
return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFInfoHeaderLine getInfoHeaderLine(String key) {
|
||||
return mInfoMetaData.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFFormatHeaderLine getFormatHeaderLine(String key) {
|
||||
return mFormatMetaData.get(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key the header key name
|
||||
* @return the meta data line, or null if there is none
|
||||
*/
|
||||
public VCFHeaderLine getOtherHeaderLine(String key) {
|
||||
return mOtherMetaData.get(key);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* <p/>
|
||||
* Class VCFHeaderLine
|
||||
* <p/>
|
||||
* A class representing a key=value entry in the VCF header
|
||||
*/
|
||||
class VCFHeaderLine implements Comparable {
|
||||
protected static boolean ALLOW_UNBOUND_DESCRIPTIONS = true;
|
||||
protected static String UNBOUND_DESCRIPTION = "Not provided in original VCF header";
|
||||
|
||||
private String mKey = null;
|
||||
private String mValue = null;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF header line
|
||||
*
|
||||
* @param key the key for this header line
|
||||
* @param value the value for this header line
|
||||
*/
|
||||
public VCFHeaderLine(String key, String value) {
|
||||
if ( key == null )
|
||||
throw new IllegalArgumentException("VCFHeaderLine: key cannot be null: key = " + key);
|
||||
mKey = key;
|
||||
mValue = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the key
|
||||
*
|
||||
* @return the key
|
||||
*/
|
||||
public String getKey() {
|
||||
return mKey;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value
|
||||
*
|
||||
* @return the value
|
||||
*/
|
||||
public String getValue() {
|
||||
return mValue;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return toStringEncoding();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should be overloaded in sub classes to do subclass specific
|
||||
*
|
||||
* @return the string encoding
|
||||
*/
|
||||
protected String toStringEncoding() {
|
||||
return mKey + "=" + mValue;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFHeaderLine) )
|
||||
return false;
|
||||
return mKey.equals(((VCFHeaderLine)o).getKey()) && mValue.equals(((VCFHeaderLine)o).getValue());
|
||||
}
|
||||
|
||||
public int compareTo(Object other) {
|
||||
return toString().compareTo(other.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param line the line
|
||||
* @return true if the line is a VCF meta data line, or false if it is not
|
||||
*/
|
||||
public static boolean isHeaderLine(String line) {
|
||||
return line != null && line.length() > 0 && VCFHeader.HEADER_INDICATOR.equals(line.substring(0,1));
|
||||
}
|
||||
|
||||
/**
|
||||
* create a string of a mapping pair for the target VCF version
|
||||
* @param keyValues a mapping of the key->value pairs to output
|
||||
* @return a string, correctly formatted
|
||||
*/
|
||||
public static String toStringEncoding(Map<String, ? extends Object> keyValues) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("<");
|
||||
boolean start = true;
|
||||
for (Map.Entry<String,?> entry : keyValues.entrySet()) {
|
||||
if (start) start = false;
|
||||
else builder.append(",");
|
||||
|
||||
if ( entry.getValue() == null ) throw new TribbleException.InternalCodecException("Header problem: unbound value at " + entry + " from " + keyValues);
|
||||
|
||||
builder.append(entry.getKey());
|
||||
builder.append("=");
|
||||
builder.append(entry.getValue().toString().contains(",") ||
|
||||
entry.getValue().toString().contains(" ") ||
|
||||
entry.getKey().equals("Description") ? "\""+ entry.getValue() + "\"" : entry.getValue());
|
||||
}
|
||||
builder.append(">");
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
/**
|
||||
* the count encodings we use for fields in VCF header lines
|
||||
*/
|
||||
/**
 * The count encodings used for the Number= attribute of VCF header lines.
 */
public enum VCFHeaderLineCount {
    /** a fixed integer count */
    INTEGER,
    /** one value per alternate allele */
    A,
    /** one value per genotype */
    G,
    /** unknown / unbounded number of values */
    UNBOUNDED
}
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A class for translating between vcf header versions
|
||||
*/
|
||||
public class VCFHeaderLineTranslator {
|
||||
private static Map<VCFHeaderVersion,VCFLineParser> mapping;
|
||||
|
||||
static {
|
||||
mapping = new HashMap<VCFHeaderVersion,VCFLineParser>();
|
||||
mapping.put(VCFHeaderVersion.VCF4_0,new VCF4Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF4_1,new VCF4Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF3_3,new VCF3Parser());
|
||||
mapping.put(VCFHeaderVersion.VCF3_2,new VCF3Parser());
|
||||
}
|
||||
|
||||
public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
|
||||
return mapping.get(version).parseLine(valueLine,expectedTagOrder);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Strategy interface for parsing the value portion of a VCF header line
 * into its key/value tags.
 */
interface VCFLineParser {
    /**
     * @param valueLine the value portion of the header line
     * @param expectedTagOrder the tags expected, in order
     * @return a mapping of tag name to value
     */
    Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder);
}
|
||||
|
||||
|
||||
/**
|
||||
* a class that handles the to and from disk for VCF 4 lines
|
||||
*/
|
||||
class VCF4Parser implements VCFLineParser {
|
||||
Set<String> bracketed = new HashSet<String>();
|
||||
|
||||
/**
|
||||
* parse a VCF4 line
|
||||
* @param valueLine the line
|
||||
* @return a mapping of the tags parsed out
|
||||
*/
|
||||
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
|
||||
// our return map
|
||||
Map<String, String> ret = new LinkedHashMap<String, String>();
|
||||
|
||||
// a builder to store up characters as we go
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// store the key when we're parsing out the values
|
||||
String key = "";
|
||||
|
||||
// where are we in the stream of characters?
|
||||
int index = 0;
|
||||
|
||||
// are we inside a quotation? we don't special case ',' then
|
||||
boolean inQuote = false;
|
||||
|
||||
// a little switch machine to parse out the tags. Regex ended up being really complicated and ugly [yes, but this machine is getting ugly now... MAD]
|
||||
for (char c: valueLine.toCharArray()) {
|
||||
if ( c == '\"' ) {
|
||||
inQuote = ! inQuote;
|
||||
} else if ( inQuote ) {
|
||||
builder.append(c);
|
||||
} else {
|
||||
switch (c) {
|
||||
case ('<') : if (index == 0) break; // if we see a open bracket at the beginning, ignore it
|
||||
case ('>') : if (index == valueLine.length()-1) ret.put(key,builder.toString().trim()); break; // if we see a close bracket, and we're at the end, add an entry to our list
|
||||
case ('=') : key = builder.toString().trim(); builder = new StringBuilder(); break; // at an equals, copy the key and reset the builder
|
||||
case (',') : ret.put(key,builder.toString().trim()); builder = new StringBuilder(); break; // drop the current key value to the return map
|
||||
default: builder.append(c); // otherwise simply append to the current string
|
||||
}
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
// validate the tags against the expected list
|
||||
index = 0;
|
||||
if (ret.size() > expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + ret.size() + " in string " + expectedTagOrder.size());
|
||||
for (String str : ret.keySet()) {
|
||||
if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
|
||||
index++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * A VCFLineParser for VCF 3.x structured header-line values: a plain
 * comma-separated list whose keys are implied by position (taken from
 * expectedTagOrder) rather than written as key=value pairs.
 */
class VCF3Parser implements VCFLineParser {

    /**
     * Parse a VCF3 header-line value into a tag -> value mapping.
     *
     * @param valueLine        the comma-separated value line
     * @param expectedTagOrder the tag names expected, in order; values are assigned positionally
     * @return a mapping of the tags parsed out, in order
     * @throws IllegalArgumentException if the number of values differs from the expected tag count
     */
    public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
        // our return map
        Map<String, String> ret = new LinkedHashMap<String, String>();

        // a builder to store up characters as we go
        StringBuilder builder = new StringBuilder();

        // where are we in the stream of characters?
        int index = 0;
        // where in the expected tag order are we?
        int tagIndex = 0;

        // are we inside a quotation? we don't special case ',' then
        boolean inQuote = false;

        // a little switch machine to parse out the tags. Regex ended up being really complicated and ugly.
        // NOTE: the ',' case only breaks when we are NOT inside quotes; a quoted comma
        // deliberately FALLS THROUGH to the default case and is appended literally.
        for (char c: valueLine.toCharArray()) {
            switch (c) {
                case ('\"') : inQuote = !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it
                case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map
                default: builder.append(c); // otherwise simply append to the current string
            }
            index++;
        }
        // the final value (after the last comma) still needs to be stored
        ret.put(expectedTagOrder.get(tagIndex++),builder.toString());

        // validate the tags against the expected list
        index = 0;
        if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size());
        for (String str : ret.keySet()){
            if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine);
            index++;
        }
        return ret;
    }
}
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
/**
 * the type encodings we use for fields in VCF header lines
 */
enum VCFHeaderLineType {
    // NOTE: these constants intentionally shadow the java.lang type names;
    // they mirror the Type= strings written in VCF INFO/FORMAT header lines.
    Integer, Float, String, Character, Flag;
}
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.broad.tribble.TribbleException;
|
||||
|
||||
/**
|
||||
* information that identifies each header version
|
||||
*/
|
||||
enum VCFHeaderVersion {
|
||||
VCF3_2("VCRv3.2","format"),
|
||||
VCF3_3("VCFv3.3","fileformat"),
|
||||
VCF4_0("VCFv4.0","fileformat"),
|
||||
VCF4_1("VCFv4.1","fileformat");
|
||||
|
||||
private final String versionString;
|
||||
private final String formatString;
|
||||
|
||||
/**
|
||||
* create the enum, privately, using:
|
||||
* @param vString the version string
|
||||
* @param fString the format string
|
||||
*/
|
||||
VCFHeaderVersion(String vString, String fString) {
|
||||
this.versionString = vString;
|
||||
this.formatString = fString;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the header version
|
||||
* @param version the version string
|
||||
* @return a VCFHeaderVersion object
|
||||
*/
|
||||
public static VCFHeaderVersion toHeaderVersion(String version) {
|
||||
version = clean(version);
|
||||
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||
if (hv.versionString.equals(version))
|
||||
return hv;
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* are we a valid version string of some type
|
||||
* @param version the version string
|
||||
* @return true if we're valid of some type, false otherwise
|
||||
*/
|
||||
public static boolean isVersionString(String version){
|
||||
return toHeaderVersion(version) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* are we a valid format string for some type
|
||||
* @param format the format string
|
||||
* @return true if we're valid of some type, false otherwise
|
||||
*/
|
||||
public static boolean isFormatString(String format){
|
||||
format = clean(format);
|
||||
for (VCFHeaderVersion hv : VCFHeaderVersion.values())
|
||||
if (hv.formatString.equals(format))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public static VCFHeaderVersion getHeaderVersion(String versionLine) {
|
||||
String[] lineFields = versionLine.split("=");
|
||||
if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) )
|
||||
throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line");
|
||||
|
||||
if ( !isVersionString(lineFields[1]) )
|
||||
throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version");
|
||||
|
||||
return toHeaderVersion(lineFields[1]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility function to clean up a VCF header string
|
||||
*
|
||||
* @param s string
|
||||
* @return trimmed version of s
|
||||
*/
|
||||
private static String clean(String s) {
|
||||
return s.trim();
|
||||
}
|
||||
|
||||
|
||||
public String getVersionString() {
|
||||
return versionString;
|
||||
}
|
||||
|
||||
public String getFormatString() {
|
||||
return formatString;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
|
||||
/**
 * @author ebanks
 * <p/>
 * Class VCFInfoHeaderLine
 * <p/>
 * A class representing a key=value entry for INFO fields in the VCF header
 */
class VCFInfoHeaderLine extends VCFCompoundHeaderLine {
    // INFO line with an explicit integer value count (the Number= field)
    public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.INFO);
    }

    // INFO line with a symbolic count (e.g. per-allele / unbounded)
    public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) {
        super(name, count, type, description, SupportedHeaderLineType.INFO);
    }

    // parse an INFO line directly from its textual header-line representation
    protected VCFInfoHeaderLine(String line, VCFHeaderVersion version) {
        super(line, version, SupportedHeaderLineType.INFO);
    }

    // info fields allow flag values
    @Override
    boolean allowFlagValues() {
        return true;
    }
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
/** An interface for VCF header lines that carry a name (ID), e.g. FILTER/INFO/FORMAT lines. **/
interface VCFNamedHeaderLine {
    /** @return the name (ID) of this header line */
    String getName();
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
 * All VCF codecs need to implement this interface so that we can perform lazy loading:
 * the genotype-fields portion of a record can be held unparsed until requested.
 */
interface VCFParser {

    /**
     * create a genotype map
     * @param str the (unparsed) genotype-fields string of a VCF record
     * @param alleles the list of alleles
     * @param chr chrom
     * @param pos position
     * @return a mapping of sample name to genotype object
     */
    public Map<String, Genotype> createGenotypeMap(String str, List<Allele> alleles, String chr, int pos);

}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* A class representing a key=value entry for simple VCF header types
|
||||
*/
|
||||
abstract class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFNamedHeaderLine {
|
||||
|
||||
public enum SupportedHeaderLineType {
|
||||
FILTER, ALT;
|
||||
}
|
||||
|
||||
private String name;
|
||||
private String description;
|
||||
|
||||
// our type of line, i.e. filter, alt, etc
|
||||
private final SupportedHeaderLineType lineType;
|
||||
|
||||
|
||||
/**
|
||||
* create a VCF filter header line
|
||||
*
|
||||
* @param name the name for this header line
|
||||
* @param description the description for this header line
|
||||
* @param lineType the header line type
|
||||
*/
|
||||
public VCFSimpleHeaderLine(String name, String description, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.lineType = lineType;
|
||||
this.name = name;
|
||||
this.description = description;
|
||||
|
||||
if ( name == null || description == null )
|
||||
throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s desc=%s", super.getKey(), name, description ));
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF info header line
|
||||
*
|
||||
* @param line the header line
|
||||
* @param version the vcf header version
|
||||
* @param lineType the header line type
|
||||
*/
|
||||
protected VCFSimpleHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) {
|
||||
super(lineType.toString(), "");
|
||||
this.lineType = lineType;
|
||||
Map<String,String> mapping = VCFHeaderLineTranslator.parseLine(version,line, Arrays.asList("ID","Description"));
|
||||
name = mapping.get("ID");
|
||||
description = mapping.get("Description");
|
||||
if ( description == null && ALLOW_UNBOUND_DESCRIPTIONS ) // handle the case where there's no description provided
|
||||
description = UNBOUND_DESCRIPTION;
|
||||
}
|
||||
|
||||
protected String toStringEncoding() {
|
||||
Map<String,Object> map = new LinkedHashMap<String,Object>();
|
||||
map.put("ID", name);
|
||||
map.put("Description", description);
|
||||
return lineType.toString() + "=" + toStringEncoding(map);
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if ( !(o instanceof VCFSimpleHeaderLine) )
|
||||
return false;
|
||||
VCFSimpleHeaderLine other = (VCFSimpleHeaderLine)o;
|
||||
return name.equals(other.name) &&
|
||||
description.equals(other.description);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,227 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* A set of static utility methods for common operations on VCF files/records.
|
||||
*/
|
||||
class VCFUtils {
|
||||
/**
|
||||
* Constructor access disallowed...static utility methods only!
|
||||
*/
|
||||
private VCFUtils() { }
|
||||
|
||||
public static <T extends Feature> Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List<RodBinding<T>> rodBindings) {
|
||||
// Collect the eval rod names
|
||||
final Set<String> names = new TreeSet<String>();
|
||||
for ( final RodBinding<T> evalRod : rodBindings )
|
||||
names.add(evalRod.getName());
|
||||
return getVCFHeadersFromRods(toolkit, names);
|
||||
}
|
||||
|
||||
public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) {
|
||||
return getVCFHeadersFromRods(toolkit, (Collection<String>)null);
|
||||
}
|
||||
|
||||
public static Map<String, VCFHeader> getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
|
||||
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
// ignore the rod if it's not in our list
|
||||
if ( rodNames != null && !rodNames.contains(source.getName()) )
|
||||
continue;
|
||||
|
||||
if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader )
|
||||
data.put(source.getName(), (VCFHeader)source.getHeader());
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
public static Map<String,VCFHeader> getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) {
|
||||
Map<String, VCFHeader> data = new HashMap<String, VCFHeader>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
// ignore the rod if lacks the prefix
|
||||
if ( ! source.getName().startsWith(prefix) )
|
||||
continue;
|
||||
|
||||
if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader )
|
||||
data.put(source.getName(), (VCFHeader)source.getHeader());
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the header fields from all VCF rods input by the user
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
*
|
||||
* @return a set of all fields
|
||||
*/
|
||||
public static Set<VCFHeaderLine> getHeaderFields(GenomeAnalysisEngine toolkit) {
|
||||
return getHeaderFields(toolkit, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the header fields from all VCF rods input by the user
|
||||
*
|
||||
* @param toolkit GATK engine
|
||||
* @param rodNames names of rods to use, or null if we should use all possible ones
|
||||
*
|
||||
* @return a set of all fields
|
||||
*/
|
||||
public static Set<VCFHeaderLine> getHeaderFields(GenomeAnalysisEngine toolkit, Collection<String> rodNames) {
|
||||
|
||||
// keep a map of sample name to occurrences encountered
|
||||
TreeSet<VCFHeaderLine> fields = new TreeSet<VCFHeaderLine>();
|
||||
|
||||
// iterate to get all of the sample names
|
||||
List<ReferenceOrderedDataSource> dataSources = toolkit.getRodDataSources();
|
||||
for ( ReferenceOrderedDataSource source : dataSources ) {
|
||||
// ignore the rod if it's not in our list
|
||||
if ( rodNames != null && !rodNames.contains(source.getName()) )
|
||||
continue;
|
||||
|
||||
if ( source.getRecordType().equals(VariantContext.class)) {
|
||||
VCFHeader header = (VCFHeader)source.getHeader();
|
||||
if ( header != null )
|
||||
fields.addAll(header.getMetaData());
|
||||
}
|
||||
}
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
/** Only displays a warning if a logger is provided and an identical warning hasn't been already issued */
|
||||
private static final class HeaderConflictWarner {
|
||||
Logger logger;
|
||||
Set<String> alreadyIssued = new HashSet<String>();
|
||||
|
||||
private HeaderConflictWarner(final Logger logger) {
|
||||
this.logger = logger;
|
||||
}
|
||||
|
||||
public void warn(final VCFHeaderLine line, final String msg) {
|
||||
if ( logger != null && ! alreadyIssued.contains(line.getKey()) ) {
|
||||
alreadyIssued.add(line.getKey());
|
||||
logger.warn(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> smartMergeHeaders(Collection<VCFHeader> headers, Logger logger) throws IllegalStateException {
|
||||
HashMap<String, VCFHeaderLine> map = new HashMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
|
||||
HeaderConflictWarner conflictWarner = new HeaderConflictWarner(logger);
|
||||
|
||||
// todo -- needs to remove all version headers from sources and add its own VCF version line
|
||||
for ( VCFHeader source : headers ) {
|
||||
//System.out.printf("Merging in header %s%n", source);
|
||||
for ( VCFHeaderLine line : source.getMetaData()) {
|
||||
String key = line.getKey();
|
||||
|
||||
if ( line instanceof VCFNamedHeaderLine)
|
||||
key = key + "" + ((VCFNamedHeaderLine) line).getName();
|
||||
|
||||
if ( map.containsKey(key) ) {
|
||||
VCFHeaderLine other = map.get(key);
|
||||
if ( line.equals(other) )
|
||||
continue;
|
||||
else if ( ! line.getClass().equals(other.getClass()) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
else if ( line instanceof VCFFilterHeaderLine) {
|
||||
String lineName = ((VCFFilterHeaderLine) line).getName(); String otherName = ((VCFFilterHeaderLine) other).getName();
|
||||
if ( ! lineName.equals(otherName) )
|
||||
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
|
||||
} else if ( line instanceof VCFCompoundHeaderLine ) {
|
||||
VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
|
||||
VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
|
||||
|
||||
// if the names are the same, but the values are different, we need to quit
|
||||
if (! (compLine).equalsExcludingDescription(compOther) ) {
|
||||
if ( compLine.getType().equals(compOther.getType()) ) {
|
||||
// The Number entry is an Integer that describes the number of values that can be
|
||||
// included with the INFO field. For example, if the INFO field contains a single
|
||||
// number, then this value should be 1. However, if the INFO field describes a pair
|
||||
// of numbers, then this value should be 2 and so on. If the number of possible
|
||||
// values varies, is unknown, or is unbounded, then this value should be '.'.
|
||||
conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other);
|
||||
compOther.setNumberToUnbounded();
|
||||
} else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) {
|
||||
// promote key to Float
|
||||
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
|
||||
map.put(key, compOther);
|
||||
} else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) {
|
||||
// promote key to Float
|
||||
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
|
||||
} else {
|
||||
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
|
||||
}
|
||||
}
|
||||
if ( ! compLine.getDescription().equals(compOther) )
|
||||
conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine);
|
||||
} else {
|
||||
// we are not equal, but we're not anything special either
|
||||
conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other);
|
||||
}
|
||||
} else {
|
||||
map.put(key, line);
|
||||
//System.out.printf("Adding header line %s%n", line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new HashSet<VCFHeaderLine>(map.values());
|
||||
}
|
||||
|
||||
public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
|
||||
if ( VCs == null )
|
||||
return null;
|
||||
|
||||
String rsID = null;
|
||||
for ( VariantContext vc : VCs ) {
|
||||
if ( vc.getType() == type ) {
|
||||
rsID = vc.getID();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rsID;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
/**
 * this class writes VCF files
 */
public interface VCFWriter {

    /**
     * Write the VCF header to the underlying output.
     * @param header the header to write
     */
    public void writeHeader(VCFHeader header);

    /**
     * attempt to close the VCF file
     */
    public void close();

    /**
     * Add a variant record to the output.
     * @param vc the variant context to write
     */
    public void add(VariantContext vc);
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,315 @@
|
|||
/*
|
||||
* Copyright (c) 2010. The Broad Institute
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.variantcontext.v13;
|
||||
|
||||
import org.apache.commons.jexl2.JexlContext;
|
||||
import org.apache.commons.jexl2.MapContext;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
* @author depristo
|
||||
*
|
||||
* Class VariantJEXLContext
|
||||
*
|
||||
* implements the JEXML context for VariantContext; this saves us from
|
||||
* having to generate a JEXML context lookup map everytime we want to evaluate an expression.
|
||||
*
|
||||
* This is package protected, only classes in variantcontext should have access to it.
|
||||
*
|
||||
* // todo -- clean up to remove or better support genotype filtering
|
||||
*/
|
||||
|
||||
class VariantJEXLContext implements JexlContext {
|
||||
// our stored variant context
|
||||
private VariantContext vc;
|
||||
|
||||
private interface AttributeGetter {
|
||||
public Object get(VariantContext vc);
|
||||
}
|
||||
|
||||
private static Map<String, AttributeGetter> x = new HashMap<String, AttributeGetter>();
|
||||
|
||||
static {
|
||||
x.put("vc", new AttributeGetter() { public Object get(VariantContext vc) { return vc; }});
|
||||
x.put("CHROM", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getChr(); }});
|
||||
x.put("POS", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getStart(); }});
|
||||
x.put("TYPE", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getType().toString(); }});
|
||||
x.put("QUAL", new AttributeGetter() { public Object get(VariantContext vc) { return 10 * vc.getNegLog10PError(); }});
|
||||
x.put("ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getAlleles(); }});
|
||||
x.put("N_ALLELES", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getNAlleles(); }});
|
||||
x.put("FILTER", new AttributeGetter() { public Object get(VariantContext vc) { return vc.isFiltered() ? "1" : "0"; }});
|
||||
|
||||
// x.put("GT", new AttributeGetter() { public Object get(VariantContext vc) { return g.getGenotypeString(); }});
|
||||
x.put("homRefCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomRefCount(); }});
|
||||
x.put("hetCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHetCount(); }});
|
||||
x.put("homVarCount", new AttributeGetter() { public Object get(VariantContext vc) { return vc.getHomVarCount(); }});
|
||||
}
|
||||
|
||||
public VariantJEXLContext(VariantContext vc) {
|
||||
this.vc = vc;
|
||||
}
|
||||
|
||||
public Object get(String name) {
|
||||
Object result = null;
|
||||
if ( x.containsKey(name) ) { // dynamic resolution of name -> value via map
|
||||
result = x.get(name).get(vc);
|
||||
} else if ( vc.hasAttribute(name)) {
|
||||
result = vc.getAttribute(name);
|
||||
} else if ( vc.getFilters().contains(name) ) {
|
||||
result = "1";
|
||||
}
|
||||
|
||||
//System.out.printf("dynamic lookup %s => %s%n", name, result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean has(String name) {
|
||||
return get(name) != null;
|
||||
}
|
||||
|
||||
public void set(String name, Object value) {
|
||||
throw new UnsupportedOperationException("remove() not supported on a VariantJEXLContext");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* this is an implementation of a Map of JexlVCMatchExp to true or false values. It lazy initializes each value
|
||||
* as requested to save as much processing time as possible.
|
||||
*
|
||||
* Compatible with JEXL 1.1 (this code will be easier if we move to 2.0, all of the functionality can go into the
|
||||
* JexlContext's get()
|
||||
*
|
||||
*/
|
||||
|
||||
class JEXLMap implements Map<VariantContextUtils.JexlVCMatchExp, Boolean> {
|
||||
// our variant context and/or Genotype
|
||||
private final VariantContext vc;
|
||||
private final Genotype g;
|
||||
|
||||
// our context
|
||||
private JexlContext jContext = null;
|
||||
|
||||
// our mapping from JEXLVCMatchExp to Booleans, which will be set to NULL for previously uncached JexlVCMatchExp
|
||||
private Map<VariantContextUtils.JexlVCMatchExp,Boolean> jexl;
|
||||
|
||||
|
||||
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc, Genotype g) {
|
||||
this.vc = vc;
|
||||
this.g = g;
|
||||
initialize(jexlCollection);
|
||||
}
|
||||
|
||||
public JEXLMap(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection, VariantContext vc) {
|
||||
this(jexlCollection, vc, null);
|
||||
}
|
||||
|
||||
private void initialize(Collection<VariantContextUtils.JexlVCMatchExp> jexlCollection) {
|
||||
jexl = new HashMap<VariantContextUtils.JexlVCMatchExp,Boolean>();
|
||||
for (VariantContextUtils.JexlVCMatchExp exp: jexlCollection) {
|
||||
jexl.put(exp, null);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* create the internal JexlContext, only when required. This code is where new JEXL context variables
|
||||
* should get added.
|
||||
*
|
||||
*/
|
||||
private void createContext() {
|
||||
if ( g == null ) {
|
||||
// todo -- remove dependancy on g to the entire system
|
||||
jContext = new VariantJEXLContext(vc);
|
||||
} else {
|
||||
//
|
||||
// this whole branch is here just to support G jexl operations
|
||||
//
|
||||
Map<String, Object> infoMap = new HashMap<String, Object>();
|
||||
|
||||
if ( vc != null ) {
|
||||
// create a mapping of what we know about the variant context, its Chromosome, positions, etc.
|
||||
infoMap.put("CHROM", vc.getChr());
|
||||
infoMap.put("POS", vc.getStart());
|
||||
infoMap.put("TYPE", vc.getType().toString());
|
||||
infoMap.put("QUAL", String.valueOf(vc.getPhredScaledQual()));
|
||||
|
||||
// add alleles
|
||||
infoMap.put("ALLELES", Utils.join(";", vc.getAlleles()));
|
||||
infoMap.put("N_ALLELES", String.valueOf(vc.getNAlleles()));
|
||||
|
||||
// add attributes
|
||||
addAttributesToMap(infoMap, vc.getAttributes());
|
||||
|
||||
// add filter fields
|
||||
infoMap.put("FILTER", vc.isFiltered() ? "1" : "0");
|
||||
for ( Object filterCode : vc.getFilters() ) {
|
||||
infoMap.put(String.valueOf(filterCode), "1");
|
||||
}
|
||||
|
||||
// add genotype-specific fields
|
||||
// TODO -- implement me when we figure out a good way to represent this
|
||||
// for ( Genotype g : vc.getGenotypes().values() ) {
|
||||
// String prefix = g.getSampleName() + ".";
|
||||
// addAttributesToMap(infoMap, g.getAttributes(), prefix);
|
||||
// infoMap.put(prefix + "GT", g.getGenotypeString());
|
||||
// }
|
||||
|
||||
// add specific genotype if one is provided
|
||||
infoMap.put(VCFConstants.GENOTYPE_KEY, g.getGenotypeString());
|
||||
infoMap.put("isHomRef", g.isHomRef() ? "1" : "0");
|
||||
infoMap.put("isHet", g.isHet() ? "1" : "0");
|
||||
infoMap.put("isHomVar", g.isHomVar() ? "1" : "0");
|
||||
infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, new Double(g.getPhredScaledQual()));
|
||||
for ( Map.Entry<String, Object> e : g.getAttributes().entrySet() ) {
|
||||
if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) )
|
||||
infoMap.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
// create the internal context that we can evaluate expressions against
|
||||
|
||||
jContext = new MapContext(infoMap);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the size of the internal data structure
|
||||
*/
|
||||
public int size() {
|
||||
return jexl.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if we're empty
|
||||
*/
|
||||
public boolean isEmpty() { return this.jexl.isEmpty(); }
|
||||
|
||||
/**
|
||||
* do we contain the specified key
|
||||
* @param o the key
|
||||
* @return true if we have a value for that key
|
||||
*/
|
||||
public boolean containsKey(Object o) { return jexl.containsKey(o); }
|
||||
|
||||
public Boolean get(Object o) {
|
||||
// if we've already determined the value, return it
|
||||
if (jexl.containsKey(o) && jexl.get(o) != null) return jexl.get(o);
|
||||
|
||||
// try and cast the expression
|
||||
VariantContextUtils.JexlVCMatchExp e = (VariantContextUtils.JexlVCMatchExp) o;
|
||||
evaluateExpression(e);
|
||||
return jexl.get(e);
|
||||
}
|
||||
|
||||
/**
|
||||
* get the keyset of map
|
||||
* @return a set of keys of type JexlVCMatchExp
|
||||
*/
|
||||
public Set<VariantContextUtils.JexlVCMatchExp> keySet() {
|
||||
return jexl.keySet();
|
||||
}
|
||||
|
||||
/**
|
||||
* get all the values of the map. This is an expensive call, since it evaluates all keys that haven't
|
||||
* been evaluated yet. This is fine if you truely want all the keys, but if you only want a portion, or know
|
||||
* the keys you want, you would be better off using get() to get them by name.
|
||||
* @return a collection of boolean values, representing the results of all the variants evaluated
|
||||
*/
|
||||
public Collection<Boolean> values() {
|
||||
// this is an expensive call
|
||||
for (VariantContextUtils.JexlVCMatchExp exp : jexl.keySet())
|
||||
if (jexl.get(exp) == null)
|
||||
evaluateExpression(exp);
|
||||
return jexl.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* evaulate a JexlVCMatchExp's expression, given the current context (and setup the context if it's null)
|
||||
* @param exp the JexlVCMatchExp to evaluate
|
||||
*/
|
||||
private void evaluateExpression(VariantContextUtils.JexlVCMatchExp exp) {
|
||||
// if the context is null, we need to create it to evaluate the JEXL expression
|
||||
if (this.jContext == null) createContext();
|
||||
try {
|
||||
jexl.put (exp, (Boolean) exp.exp.evaluate(jContext));
|
||||
} catch (Exception e) {
|
||||
throw new UserException.CommandLineException(String.format("Invalid JEXL expression detected for %s with message %s", exp.name, e.getMessage()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* helper function: adds the list of attributes to the information map we're building
|
||||
* @param infoMap the map
|
||||
* @param attributes the attributes
|
||||
*/
|
||||
private static void addAttributesToMap(Map<String, Object> infoMap, Map<String, ?> attributes ) {
|
||||
for (Map.Entry<String, ?> e : attributes.entrySet()) {
|
||||
infoMap.put(e.getKey(), String.valueOf(e.getValue()));
|
||||
}
|
||||
}
|
||||
|
||||
public Boolean put(VariantContextUtils.JexlVCMatchExp jexlVCMatchExp, Boolean aBoolean) {
|
||||
return jexl.put(jexlVCMatchExp,aBoolean);
|
||||
}
|
||||
|
||||
public void putAll(Map<? extends VariantContextUtils.JexlVCMatchExp, ? extends Boolean> map) {
|
||||
jexl.putAll(map);
|
||||
}
|
||||
|
||||
// //////////////////////////////////////////////////////////////////////////////////////
|
||||
// The Following are unsupported at the moment
|
||||
// //////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// this doesn't make much sense to implement, boolean doesn't offer too much variety to deal
|
||||
// with evaluating every key in the internal map.
|
||||
public boolean containsValue(Object o) {
|
||||
throw new UnsupportedOperationException("containsValue() not supported on a JEXLMap");
|
||||
}
|
||||
|
||||
// this doesn't make much sense
|
||||
public Boolean remove(Object o) {
|
||||
throw new UnsupportedOperationException("remove() not supported on a JEXLMap");
|
||||
}
|
||||
|
||||
|
||||
public Set<Entry<VariantContextUtils.JexlVCMatchExp, Boolean>> entrySet() {
|
||||
throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
|
||||
}
|
||||
|
||||
// nope
|
||||
public void clear() {
|
||||
throw new UnsupportedOperationException("clear() not supported on a JEXLMap");
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue