Adding a lot of changes to the VCF code, plus a new basic validator. Also removing an extra copy of the Artificial SAM generator that got checked in at some point.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1437 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
b3fe566c0c
commit
4cf9110468
|
|
@ -0,0 +1,136 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
/**
 * @author aaron
 * <p/>
 * Class VCFGenotypeRecord
 * <p/>
 * A single sample's genotype column from a VCF record, parsed into its phasing,
 * its allele bases, and a map from each non-GT format key to that sample's value.
 * Genotype records carry a considerable amount of information, so they live in
 * their own class.
 */
public class VCFGenotypeRecord {
    // what kind of phasing this genotype has
    enum GT_GENOTYPE {
        UNPHASED, PHASED, PHASED_SWITCH_PROB
    }

    // our phasing
    private final GT_GENOTYPE phaseType;

    // our reference base
    private final char reference;

    // our base(s), in the order they appear in the GT field
    private final List<String> bases = new ArrayList<String>();

    // our mapping of the format fields (everything after GT) to values
    private final Map<String, String> fields = new HashMap<String, String>();

    // our pattern matching for the genotype fields:
    //   group 1 = first allele index, group 2 = phase separator (\ | /),
    //   group 3 = optional second allele index,
    //   group 4 = optional colon-separated values for the remaining format keys
    private static final Pattern basicSplit = Pattern.compile("([0-9]*)([\\\\|/])([0-9]*)(?::(\\S*))?");

    /**
     * generate a VCF genotype record, given its format string, the genotype string, and allele info
     *
     * @param formatString   the format string for this record, which contains the keys for the genotype parameters;
     *                       the first key must be GT
     * @param genotypeString contains the phasing information, allele information, and values for genotype parameters
     * @param altAlleles     the alternate allele string array, which we index into based on the field parameters
     * @param referenceBase  the reference base
     * @throws IllegalArgumentException if the format string doesn't start with GT, the genotype string can't be
     *                                  parsed, an allele index is out of range, or more values than format keys
     *                                  are supplied
     */
    protected VCFGenotypeRecord(String formatString, String genotypeString, String altAlleles[], char referenceBase) {
        reference = referenceBase;

        // check that the first format field is GT, which is required. Note that split() can return an
        // empty array (e.g. for ":"), so the length has to be checked before touching keys[0]
        String keys[] = formatString.split(":");
        if (keys.length < 1 || !keys[0].equals("GT")) {
            throw new IllegalArgumentException("The format string must have fields, and the first must be GT (genotype)");
        }

        // find the values for each of the keys, of which the GT field should be the first
        Matcher match = basicSplit.matcher(genotypeString);
        if (!match.matches()) {
            throw new IllegalArgumentException("Unable to match genotype string to expected regex");
        }

        // add the first allele (which can be ref by specifying 0)
        addAllele(match.group(1), altAlleles, referenceBase);

        phaseType = determinePhase(match.group(2));

        // do we have a second allele?
        if (match.group(3).length() > 0) {
            addAllele(match.group(3), altAlleles, referenceBase);
        }

        // record whatever values follow the GT field
        parseFields(keys, match.group(4));
    }

    /**
     * map every format key after GT to its value; keys whose value is blank or was left off
     * the end of the genotype string are mapped to the empty string
     *
     * @param keys   the format keys, with GT at index 0
     * @param values the colon-separated values following the GT field; may be null or empty
     */
    private void parseFields(String[] keys, String values) {
        // a limit of -1 keeps trailing empty strings, which the single-argument split() discards
        String tokens[] = (values == null || values.length() == 0) ? new String[0] : values.split(":", -1);
        if (tokens.length > keys.length - 1) {
            throw new IllegalArgumentException("The genotype string contains " + tokens.length +
                    " values, but the format string only declares " + (keys.length - 1) + " keys after GT");
        }
        for (int keyIndex = 1; keyIndex < keys.length; keyIndex++) {
            int tokenIndex = keyIndex - 1;
            this.fields.put(keys[keyIndex], (tokenIndex < tokens.length) ? tokens[tokenIndex] : "");
        }
    }

    /**
     * add an allele to the list of alleles we have
     *
     * @param alleleNumber  the allele number, as a string; 0 means the reference base, N > 0 means altAlleles[N-1]
     * @param altAlleles    the list of alternate alleles
     * @param referenceBase the reference base
     * @throws IllegalArgumentException if the allele number isn't a number or doesn't index an alternate allele
     */
    private void addAllele(String alleleNumber, String[] altAlleles, char referenceBase) {
        int index;
        try {
            index = Integer.parseInt(alleleNumber);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Invalid allele number '" + alleleNumber + "' in genotype", e);
        }
        if (index == 0) {
            bases.add(String.valueOf(referenceBase));
            return;
        }
        if (altAlleles == null || index > altAlleles.length) {
            throw new IllegalArgumentException("Allele number " + index + " exceeds the " +
                    ((altAlleles == null) ? 0 : altAlleles.length) + " alternate alleles available");
        }
        bases.add(altAlleles[index - 1]);
    }

    /**
     * determine the phase of the genotype
     *
     * @param phase the string that contains the phase character
     * @return the phasing type that the character stands for
     * @throws IllegalArgumentException if the character isn't one of /, | or \
     */
    private static GT_GENOTYPE determinePhase(String phase) {
        if (phase.equals("/")) {
            return GT_GENOTYPE.UNPHASED;
        }
        if (phase.equals("|")) {
            return GT_GENOTYPE.PHASED;
        }
        if (phase.equals("\\")) {
            return GT_GENOTYPE.PHASED_SWITCH_PROB;
        }
        throw new IllegalArgumentException("Unknown genotype phasing parameter");
    }

    /** getter methods */

    /** @return the phasing of this genotype */
    public GT_GENOTYPE getPhaseType() {
        return phaseType;
    }

    /** @return the reference base this record was created with */
    public char getReference() {
        return reference;
    }

    /** @return the allele bases, in the order they appeared in the GT field (the backing list, not a copy) */
    public List<String> getAllele() {
        return bases;
    }

    /** @return the mapping of each format key after GT to its value (the backing map, not a copy) */
    public Map<String, String> getFields() {
        return fields;
    }
}
|
||||
|
|
@ -1,8 +1,11 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -26,7 +29,7 @@ public class VCFHeader {
|
|||
private final Map<String, String> mMetaData = new HashMap<String, String>();
|
||||
|
||||
// the list of auxillary tags
|
||||
private final List<String> auxillaryTags = new ArrayList<String>();
|
||||
private final List<String> mGenotypeSampleNames = new ArrayList<String>();
|
||||
|
||||
// the character string that indicates meta data
|
||||
public static final String METADATA_INDICATOR = "##";
|
||||
|
|
@ -34,19 +37,60 @@ public class VCFHeader {
|
|||
// the header string indicator
|
||||
public static final String HEADER_INDICATOR = "#";
|
||||
|
||||
/** our log, which we use to capture anything from this class */
|
||||
/**
|
||||
* our log, which we use to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(VCFHeader.class);
|
||||
|
||||
/**
|
||||
* do we have genotyping data?
|
||||
*/
|
||||
private boolean hasGenotypingData = false;
|
||||
|
||||
/**
|
||||
* the current vcf version we support.
|
||||
*/
|
||||
private static final String VCF_VERSION = "VCFv3.2";
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param metaData
|
||||
* @param additionalColumns
|
||||
* @param headerFields the required header fields, in order they're presented
|
||||
* @param metaData the meta data associated with this header
|
||||
*/
|
||||
public VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData, List<String> additionalColumns) {
|
||||
protected VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData) {
|
||||
for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field);
|
||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||
for (String col : additionalColumns) auxillaryTags.add(col);
|
||||
checkVCFVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
* create a VCF header, given a list of meta data and auxillary tags
|
||||
*
|
||||
* @param headerFields the required header fields, in order they're presented
|
||||
* @param metaData the meta data associated with this header
|
||||
* @param genotypeSampleNames the genotype format field, and the sample names
|
||||
*/
|
||||
protected VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData, List<String> genotypeSampleNames) {
|
||||
for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field);
|
||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||
for (String col : genotypeSampleNames) mGenotypeSampleNames.add(col);
|
||||
hasGenotypingData = true;
|
||||
checkVCFVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
* check our metadata for a VCF version tag, and throw an exception if the version is out of date
|
||||
* or the version is not present
|
||||
*/
|
||||
public void checkVCFVersion() {
|
||||
if (mMetaData.containsKey("format")) {
|
||||
if (mMetaData.get("format").equals(VCF_VERSION))
|
||||
return;
|
||||
throw new StingException("VCFHeader: VCF version of " + mMetaData.get("format") +
|
||||
" doesn't match the supported version of " + VCF_VERSION);
|
||||
}
|
||||
throw new StingException("VCFHeader: VCF version isn't present");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -68,12 +112,28 @@ public class VCFHeader {
|
|||
}
|
||||
|
||||
/**
|
||||
* get the auxillary tags
|
||||
* get the genotyping sample names
|
||||
*
|
||||
* @return a list of the extra column names, in order
|
||||
* @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false
|
||||
*/
|
||||
public List<String> getAuxillaryTags() {
|
||||
return auxillaryTags;
|
||||
public List<String> getGenotypeSamples() {
|
||||
return mGenotypeSampleNames;
|
||||
}
|
||||
|
||||
/**
|
||||
* do we have genotyping data?
|
||||
*
|
||||
* @return true if we have genotyping columns, false otherwise
|
||||
*/
|
||||
public boolean hasGenotypingData() {
|
||||
return hasGenotypingData;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the column count,
|
||||
*/
|
||||
public int getColumnCount() {
|
||||
return mHeaderFields.size() + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
new FileInputStream(vcfFile),
|
||||
utf8));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Unable to find VCF file: " + vcfFile, e);
|
||||
throw new StingException("VCFReader: Unable to find VCF file: " + vcfFile, e);
|
||||
}
|
||||
|
||||
String line = null;
|
||||
|
|
@ -51,7 +51,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
mHeader = this.createHeader(lines);
|
||||
mNextRecord = new VCFRecord(mHeader, line);
|
||||
} catch (IOException e) {
|
||||
throw new StingException("Failed to parse VCF File on line: " + line, e);
|
||||
throw new StingException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -112,17 +112,19 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
if (str.startsWith("#") && !str.startsWith("##")) {
|
||||
String[] strings = str.substring(1).split("\\s+");
|
||||
for (String s : strings) {
|
||||
if (headerFields.contains(s)) throw new StingException("Header field duplication is not allowed");
|
||||
if (headerFields.contains(s)) throw new StingException("VCFReader: Header field duplication is not allowed");
|
||||
try {
|
||||
headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s));
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (!s.equals("FORMAT"))
|
||||
auxTags.add(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (headerFields.size() != VCFHeader.HEADER_FIELDS.values().length) {
|
||||
throw new StingException("The VCF header is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size()) + " required fields");
|
||||
throw new StingException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size())
|
||||
+ " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields");
|
||||
}
|
||||
return new VCFHeader(headerFields,metaData,auxTags);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,73 +2,84 @@ package org.broadinstitute.sting.utils.genotype.vcf;
|
|||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* the basic VCF record type
|
||||
*/
|
||||
/** the basic VCF record type */
|
||||
public class VCFRecord {
|
||||
// required field values
|
||||
private Map<VCFHeader.HEADER_FIELDS, String> mValues = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
private final Map<VCFHeader.HEADER_FIELDS, String> mValues = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
|
||||
// our auxillary values
|
||||
private Map<String, String> mAuxValues = new HashMap<String, String>();
|
||||
// our genotype sample fields
|
||||
private final Map<String, String> mGenotypeFields = new HashMap<String, String>();
|
||||
|
||||
// the format String, which specifies what each genotype can contain for values
|
||||
private String formatString;
|
||||
|
||||
/**
|
||||
* create a VCFRecord, given a VCF header and the values in this field. This is protected, so that the reader is
|
||||
* the only accessing object
|
||||
* TODO: this seems like a bad design
|
||||
*
|
||||
* @param header the VCF header
|
||||
* @param line the line to parse into individual fields
|
||||
*/
|
||||
protected VCFRecord(VCFHeader header, String line) {
|
||||
String tokens[] = line.split("\\s+");
|
||||
if (tokens.length != (header.getAuxillaryTags().size() + header.getHeaderFields().size())) {
|
||||
throw new StingException("Line:" + line + " didn't parse into " + (header.getAuxillaryTags().size() + header.getHeaderFields().size()) + " fields");
|
||||
}
|
||||
|
||||
int tokenCount = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
|
||||
mValues.put(field, tokens[tokenCount]);
|
||||
tokenCount++;
|
||||
}
|
||||
for (String aux : header.getAuxillaryTags()) {
|
||||
mAuxValues.put(aux, tokens[tokenCount]);
|
||||
tokenCount++;
|
||||
}
|
||||
List<String> values = new ArrayList<String>();
|
||||
for (String str : tokens) values.add(str);
|
||||
initialize(header, values);
|
||||
}
|
||||
|
||||
/**
|
||||
* given a VCF header, and the values for each of the columns, create a VCF record
|
||||
*
|
||||
* @param header the VCF header
|
||||
* @param values the values, as a list, for each of the columns
|
||||
*/
|
||||
public VCFRecord(VCFHeader header, List<String> values) {
|
||||
if (values.size() != (header.getAuxillaryTags().size() + header.getHeaderFields().size())) {
|
||||
throw new StingException("The input list doesn't contain enough fields, it should have " + (header.getAuxillaryTags().size() + header.getHeaderFields().size()) + " fields");
|
||||
initialize(header, values);
|
||||
}
|
||||
|
||||
/**
|
||||
* create the VCFRecord
|
||||
*
|
||||
* @param header the VCF header
|
||||
* @param values the list of strings that make up the columns of the record
|
||||
*/
|
||||
private void initialize(VCFHeader header, List<String> values) {
|
||||
if (values.size() != header.getColumnCount()) {
|
||||
throw new StingException("The input list doesn't contain enough fields, it should have " + header.getColumnCount() + " fields");
|
||||
}
|
||||
int index = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field: header.getHeaderFields()) {
|
||||
mValues.put(field,values.get(index));
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
|
||||
mValues.put(field, values.get(index));
|
||||
index++;
|
||||
}
|
||||
for (String str: header.getAuxillaryTags()) {
|
||||
mAuxValues.put(str,values.get(index));
|
||||
if (header.hasGenotypingData()) {
|
||||
formatString = values.get(index);
|
||||
index++;
|
||||
for (String str : header.getGenotypeSamples()) {
|
||||
mGenotypeFields.put(str, values.get(index));
|
||||
index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* look up a value, given its column name
|
||||
*
|
||||
* @param key the column name, which is looked up in both the set columns and the auxillary columns
|
||||
*
|
||||
* @return a String representing the column values, or null if the field doesn't exist in this record
|
||||
*/
|
||||
public String getValue(String key) {
|
||||
try {
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.valueOf(key));
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (this.mAuxValues.containsKey(key)) {
|
||||
return mAuxValues.get(key);
|
||||
if (this.mGenotypeFields.containsKey(key)) {
|
||||
return mGenotypeFields.get(key);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
@ -77,30 +88,25 @@ public class VCFRecord {
|
|||
/**
|
||||
* get a required field, given the field tag
|
||||
*
|
||||
* @param field
|
||||
* @return
|
||||
* @param field the key for the field
|
||||
*
|
||||
* @return the field value
|
||||
*/
|
||||
public String getValue(VCFHeader.HEADER_FIELDS field) {
|
||||
return mValues.get(field);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the string for the chromosome that this VCF record is associated with
|
||||
*/
|
||||
/** @return the string for the chromosome that this VCF record is associated with */
|
||||
public String getChromosome() {
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.CHROM);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return this VCF records position on the specified chromosome
|
||||
*/
|
||||
/** @return this VCF records position on the specified chromosome */
|
||||
public long getPosition() {
|
||||
return Long.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.POS));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the ID value for this record
|
||||
*/
|
||||
/** @return the ID value for this record */
|
||||
public String getID() {
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.ID);
|
||||
}
|
||||
|
|
@ -131,9 +137,7 @@ public class VCFRecord {
|
|||
return getAlternateAlleles() != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the phred-scaled quality score
|
||||
*/
|
||||
/** @return the phred-scaled quality score */
|
||||
public int getQual() {
|
||||
return Integer.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.QUAL));
|
||||
}
|
||||
|
|
@ -156,24 +160,37 @@ public class VCFRecord {
|
|||
|
||||
/**
|
||||
* get the information key-value pairs as a Map<>
|
||||
*
|
||||
* @return a map, of the info key-value pairs
|
||||
*/
|
||||
public Map<String,String> getInfoValues() {
|
||||
Map<String,String> ret = new HashMap<String,String>();
|
||||
public Map<String, String> getInfoValues() {
|
||||
Map<String, String> ret = new HashMap<String, String>();
|
||||
String infoSplit[] = mValues.get(VCFHeader.HEADER_FIELDS.INFO).split(";");
|
||||
for (String s: infoSplit) {
|
||||
for (String s : infoSplit) {
|
||||
String keyValue[] = s.split("=");
|
||||
if (keyValue.length != 2) throw new StingException("Key value pairs must have both a key and a value; pair: " + s);
|
||||
ret.put(keyValue[0],keyValue[1]);
|
||||
if (keyValue.length != 2)
|
||||
throw new StingException("Key value pairs must have both a key and a value; pair: " + s);
|
||||
ret.put(keyValue[0], keyValue[1]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return the number of columnsof data we're storing
|
||||
*/
|
||||
/** @return the number of columns of data we're storing */
|
||||
public int getColumnCount() {
|
||||
return this.mAuxValues.size() + this.mValues.size();
|
||||
return this.mGenotypeFields.size() + this.mValues.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* return the mapping of the format tags to the specified sample's values
|
||||
* @param sampleName the sample name to get the genotyping tags for
|
||||
* @return a VCFGenotypeRecord
|
||||
*/
|
||||
public VCFGenotypeRecord getVCFGenotypeRecord(String sampleName) {
|
||||
if (!this.mGenotypeFields.containsKey(sampleName)) {
|
||||
throw new IllegalArgumentException("Sample Name: " + sampleName + " doesn't exist in this VCF record");
|
||||
}
|
||||
return new VCFGenotypeRecord(formatString,mGenotypeFields.get(sampleName),this.getAlternateAlleles(),this.getReferenceBase());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,57 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
|
||||
import java.io.File;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class VCFValidator
|
||||
* <p/>
|
||||
* validate a VCF file
|
||||
*/
|
||||
public class VCFValidator {
|
||||
|
||||
private static final String VCF_VERSION = "VCFv3.2";
|
||||
|
||||
/**
|
||||
* about as simple as things come right now. We open the file, process all the entries in the file,
|
||||
* and if no errors pop up in processing, well hey, looks good to us.
|
||||
* TODO: add validation to individual records fields as they make sense
|
||||
*
|
||||
* @param args the vcf file is the only parameter
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
if (args.length != 1) {
|
||||
printUsage();
|
||||
return;
|
||||
}
|
||||
File vcfFile = new File(args[0]);
|
||||
if (!vcfFile.exists()) {
|
||||
System.err.println("Specified VCF file doesn't exist, please check the input file\n");
|
||||
printUsage();
|
||||
return;
|
||||
}
|
||||
int counter = 0;
|
||||
try {
|
||||
VCFReader reader = new VCFReader(vcfFile);
|
||||
while (reader.hasNext()) {
|
||||
counter++;
|
||||
reader.next();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("VCF Validation failed, after parsing " + counter + " entries.");
|
||||
System.err.println("The reason given was: " + e.getMessage());
|
||||
}
|
||||
System.err.println("Viewed " + counter + " VCF record entries.");
|
||||
}
|
||||
|
||||
public static void printUsage() {
|
||||
System.err.println("VCF validator (VCF Version " + VCF_VERSION + ")");
|
||||
System.err.println("Usage:");
|
||||
System.err.println("vcfvalidator <fille.vcf>");
|
||||
System.err.println("");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -41,7 +41,7 @@ public class VCFWriter {
|
|||
StringBuilder b = new StringBuilder();
|
||||
b.append(VCFHeader.HEADER_INDICATOR);
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + "\t");
|
||||
for (String field : header.getAuxillaryTags()) b.append(field + "\t");
|
||||
for (String field : header.getGenotypeSamples()) b.append(field + "\t");
|
||||
mWriter.write(b.toString() + "\n");
|
||||
}
|
||||
catch (IOException e) {
|
||||
|
|
@ -54,10 +54,9 @@ public class VCFWriter {
|
|||
* @param record the record to output
|
||||
*/
|
||||
public void addRecord(VCFRecord record) {
|
||||
if (record.getColumnCount() != mHeader.getAuxillaryTags().size() + mHeader.getHeaderFields().size()) {
|
||||
if (record.getColumnCount() != mHeader.getGenotypeSamples().size() + mHeader.getHeaderFields().size()) {
|
||||
throw new StingException("Record has " + record.getColumnCount() +
|
||||
" columns, when is should have " + (mHeader.getAuxillaryTags().size() +
|
||||
mHeader.getHeaderFields().size()));
|
||||
" columns, when is should have " + mHeader.getColumnCount());
|
||||
}
|
||||
StringBuilder builder = new StringBuilder();
|
||||
// first output the required fields in order
|
||||
|
|
@ -66,7 +65,7 @@ public class VCFWriter {
|
|||
if (first) { first = false; builder.append(record.getValue(field)); }
|
||||
else builder.append("\t" + record.getValue(field));
|
||||
}
|
||||
for (String auxTag : mHeader.getAuxillaryTags()) {
|
||||
for (String auxTag : mHeader.getGenotypeSamples()) {
|
||||
builder.append("\t" + record.getValue(auxTag));
|
||||
}
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -1,28 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class ArtificialSAMGenerator
|
||||
*
|
||||
* This provides for an external utility, that creates sam files and associates fasta files
|
||||
*/
|
||||
public class ArtificialSAMGenerator {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class ArtificialFASTAUtils {
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,83 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class VCFGenotypeRecordTest
|
||||
*
|
||||
* A descriptions should go here. Blame aaron if it's missing.
|
||||
*/
|
||||
public class VCFGenotypeRecordTest extends BaseTest {
|
||||
|
||||
/**
|
||||
* test the basic parsing
|
||||
*/
|
||||
@Test
|
||||
public void testBasicParsing() {
|
||||
String formatString = "GT:B:C:D";
|
||||
String genotypeString = "0|1:2:3:4";
|
||||
String altAlleles[] = {"A","C","G","T"};
|
||||
char referenceBase = 'N';
|
||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(formatString,genotypeString,altAlleles,referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.GT_GENOTYPE.PHASED,rec.getPhaseType());
|
||||
Assert.assertEquals(referenceBase,rec.getReference());
|
||||
Assert.assertEquals("N",rec.getAllele().get(0));
|
||||
Assert.assertEquals("A",rec.getAllele().get(1));
|
||||
Map<String,String> values = rec.getFields();
|
||||
Assert.assertEquals(3,values.size());
|
||||
Assert.assertTrue(values.get("B").equals("2"));
|
||||
Assert.assertTrue(values.get("C").equals("3"));
|
||||
Assert.assertTrue(values.get("D").equals("4"));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* test the parsing of a genotype field with missing parameters
|
||||
*/
|
||||
@Test
|
||||
public void testMissingFieldParsing() {
|
||||
String formatString = "GT:B:C:D";
|
||||
String genotypeString = "0|1:::4";
|
||||
String altAlleles[] = {"A","C","G","T"};
|
||||
char referenceBase = 'N';
|
||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(formatString,genotypeString,altAlleles,referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.GT_GENOTYPE.PHASED,rec.getPhaseType());
|
||||
Assert.assertEquals(referenceBase,rec.getReference());
|
||||
Assert.assertEquals("N",rec.getAllele().get(0));
|
||||
Assert.assertEquals("A",rec.getAllele().get(1));
|
||||
Map<String,String> values = rec.getFields();
|
||||
Assert.assertEquals(3,values.size());
|
||||
Assert.assertTrue(values.get("B").equals(""));
|
||||
Assert.assertTrue(values.get("C").equals(""));
|
||||
Assert.assertTrue(values.get("D").equals("4"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test the parsing of a genotype field with different missing parameters
|
||||
*/
|
||||
@Test
|
||||
public void testMissingAllFields() {
|
||||
String formatString = "GT:B:C:D";
|
||||
String genotypeString = "0|1:::";
|
||||
String altAlleles[] = {"A","C","G","T"};
|
||||
char referenceBase = 'N';
|
||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(formatString,genotypeString,altAlleles,referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.GT_GENOTYPE.PHASED,rec.getPhaseType());
|
||||
Assert.assertEquals(referenceBase,rec.getReference());
|
||||
Assert.assertEquals("N",rec.getAllele().get(0));
|
||||
Assert.assertEquals("A",rec.getAllele().get(1));
|
||||
Map<String,String> values = rec.getFields();
|
||||
Assert.assertEquals(3,values.size());
|
||||
Assert.assertTrue(values.get("B").equals(""));
|
||||
Assert.assertTrue(values.get("C").equals(""));
|
||||
Assert.assertTrue(values.get("D").equals(""));
|
||||
}
|
||||
}
|
||||
|
|
@ -29,7 +29,7 @@ public class VCFHeaderTest extends BaseTest {
|
|||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||
headerFields.add(field);
|
||||
}
|
||||
metaData.put("one","1");
|
||||
metaData.put("format","VCFv3.2");
|
||||
metaData.put("two","2");
|
||||
additionalColumns.add("extra1");
|
||||
additionalColumns.add("extra2");
|
||||
|
|
@ -50,7 +50,7 @@ public class VCFHeaderTest extends BaseTest {
|
|||
}
|
||||
Assert.assertEquals(metaData.size(),index);
|
||||
index = 0;
|
||||
for (String key: header.getAuxillaryTags()) {
|
||||
for (String key: header.getGenotypeSamples()) {
|
||||
Assert.assertTrue(additionalColumns.contains(key));
|
||||
index++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,8 +20,9 @@ public class VCFReaderTest extends BaseTest {
|
|||
while (reader.hasNext()) {
|
||||
counter++;
|
||||
reader.next();
|
||||
System.err.println(counter);
|
||||
}
|
||||
Assert.assertEquals(5,counter);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,6 @@ public class VCFWriterTest extends BaseTest {
|
|||
Assert.assertEquals(2,counter);
|
||||
reader.close();
|
||||
fakeVCFFile.delete();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -50,8 +49,9 @@ public class VCFWriterTest extends BaseTest {
|
|||
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) {
|
||||
headerFields.add(field);
|
||||
}
|
||||
metaData.put("one", "1");
|
||||
metaData.put("format", "VCFv3.2"); // required
|
||||
metaData.put("two", "2");
|
||||
additionalColumns.add("FORMAT");
|
||||
additionalColumns.add("extra1");
|
||||
additionalColumns.add("extra2");
|
||||
// this should create a header that is valid
|
||||
|
|
@ -60,7 +60,7 @@ public class VCFWriterTest extends BaseTest {
|
|||
}
|
||||
|
||||
private VCFRecord createVCFRecord(VCFHeader header) {
|
||||
int totalVals = header.getHeaderFields().size() + header.getAuxillaryTags().size();
|
||||
int totalVals = header.getColumnCount();
|
||||
List<String> array = new ArrayList<String>();
|
||||
for (int x = 0; x < totalVals; x++) {
|
||||
array.add(String.valueOf(x));
|
||||
|
|
@ -87,10 +87,10 @@ public class VCFWriterTest extends BaseTest {
|
|||
}
|
||||
Assert.assertEquals(metaData.size(), index);
|
||||
index = 0;
|
||||
for (String key : header.getAuxillaryTags()) {
|
||||
for (String key : header.getGenotypeSamples()) {
|
||||
Assert.assertTrue(additionalColumns.contains(key));
|
||||
index++;
|
||||
}
|
||||
Assert.assertEquals(additionalColumns.size(), index);
|
||||
Assert.assertEquals(additionalColumns.size(), index+1 /* for the header field we don't see */);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue