created a better separation between instantiation of a VCF object and the object itself
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1440 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ed8c92a12a
commit
0b927f44fa
|
|
@ -4,8 +4,6 @@ import java.util.ArrayList;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -26,77 +24,28 @@ public class VCFGenotypeRecord {
|
|||
private GT_GENOTYPE phaseType;
|
||||
|
||||
// our reference bases(s)
|
||||
private final char reference;
|
||||
private final char mReferenceBase;
|
||||
|
||||
// our bases(s)
|
||||
private final List<String> bases = new ArrayList<String>();
|
||||
private final List<String> mAlleleBases = new ArrayList<String>();
|
||||
|
||||
// our mapping of the format fields to values
|
||||
private final Map<String, String> fields = new HashMap<String, String>();
|
||||
|
||||
// our pattern matching for the genotype fields
|
||||
private static final Pattern basicSplit = Pattern.compile("([0-9]*)([\\\\|\\/])([0-9]*):(\\S*)");
|
||||
// our mapping of the format mFields to values
|
||||
private final Map<String, String> mFields = new HashMap<String, String>();
|
||||
|
||||
/**
|
||||
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
|
||||
* create a VCF record
|
||||
*
|
||||
* @param formatString the format string for this record, which contains the keys for the genotype parameters
|
||||
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||
* @param referenceBase the reference base
|
||||
*/
|
||||
protected VCFGenotypeRecord(String formatString, String genotypeString, String altAlleles[], char referenceBase) {
|
||||
reference = referenceBase;
|
||||
// check that the first format field is GT, which is required
|
||||
String keys[] = formatString.split(":");
|
||||
if (keys.length < 0 || !keys[0].equals("GT"))
|
||||
throw new IllegalArgumentException("The format string must have fields, and the first must be GT (genotype)");
|
||||
|
||||
// find the values for each of the keys, of which the GT field should be the first
|
||||
Matcher match = basicSplit.matcher(genotypeString);
|
||||
if (!match.matches() || match.groupCount() < 3)
|
||||
throw new IllegalArgumentException("Unable to match genotype string to expected regex");
|
||||
|
||||
// add the alternate base (which can be ref by specifying 0)
|
||||
addAllele(match.group(1), altAlleles, referenceBase);
|
||||
|
||||
determinePhase(match.group(2));
|
||||
|
||||
// do we have a second alt allele?
|
||||
if (match.group(3).length() > 0) {
|
||||
addAllele(match.group(3), altAlleles, referenceBase);
|
||||
}
|
||||
|
||||
// check to see what other records we have
|
||||
if (match.groupCount() == 4) {
|
||||
// make sure we'll have enough occurances
|
||||
String tokens[] = match.group(4).split(":{1}"); // the {1} was required, since string.split does a greedy match of the specified regex, like :+
|
||||
int keyIndex = 1;
|
||||
for (String token: tokens) {
|
||||
this.fields.put(keys[keyIndex],token);
|
||||
keyIndex++;
|
||||
}
|
||||
if (keyIndex + 1 == tokens.length) fields.put(keys[++keyIndex],""); // if the last value is blank, split will leave it off
|
||||
if (keyIndex == 1 && match.group(4).contains(":")) {
|
||||
// there was a string of all semicolons, split doesn't handle this well (or at all)
|
||||
while(keyIndex < keys.length) this.fields.put(keys[keyIndex++],"");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* add an alternate allele to the list of alleles we have
|
||||
*
|
||||
* @param alleleNumber the allele number, as a string
|
||||
* @param altAlleles the list of alternate alleles
|
||||
* @param keyValues the key values
|
||||
* @param Alleles the alleles, one if we're halpoid, two if we're diploid
|
||||
* @param phasing the phasing of the the genotype
|
||||
* @param referenceBase the reference base
|
||||
*/
|
||||
private void addAllele(String alleleNumber, String[] altAlleles, char referenceBase) {
|
||||
if (Integer.valueOf(alleleNumber) == 0)
|
||||
bases.add(String.valueOf(referenceBase));
|
||||
else
|
||||
bases.add(altAlleles[Integer.valueOf(alleleNumber) - 1]);
|
||||
public VCFGenotypeRecord(Map<String, String> keyValues, List<String> Alleles, GT_GENOTYPE phasing, char referenceBase) {
|
||||
// validate
|
||||
this.mReferenceBase = referenceBase;
|
||||
this.mFields.putAll(keyValues);
|
||||
this.mAlleleBases.addAll(Alleles);
|
||||
this.phaseType = phasing;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -104,14 +53,14 @@ public class VCFGenotypeRecord {
|
|||
*
|
||||
* @param phase the string that contains the phase character
|
||||
*/
|
||||
private void determinePhase(String phase) {
|
||||
static GT_GENOTYPE determinePhase(String phase) {
|
||||
// find the phasing information
|
||||
if (phase.equals("/"))
|
||||
phaseType = GT_GENOTYPE.UNPHASED;
|
||||
return GT_GENOTYPE.UNPHASED;
|
||||
else if (phase.equals("|"))
|
||||
phaseType = GT_GENOTYPE.PHASED;
|
||||
return GT_GENOTYPE.PHASED;
|
||||
else if (phase.equals("\\"))
|
||||
phaseType = GT_GENOTYPE.PHASED_SWITCH_PROB;
|
||||
return GT_GENOTYPE.PHASED_SWITCH_PROB;
|
||||
else
|
||||
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
||||
}
|
||||
|
|
@ -123,14 +72,14 @@ public class VCFGenotypeRecord {
|
|||
}
|
||||
|
||||
public char getReference() {
|
||||
return reference;
|
||||
return mReferenceBase;
|
||||
}
|
||||
|
||||
public List<String> getAllele() {
|
||||
return bases;
|
||||
return mAlleleBases;
|
||||
}
|
||||
|
||||
public Map<String, String> getFields() {
|
||||
return fields;
|
||||
return mFields;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,10 +20,13 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
|
||||
// our next record
|
||||
private VCFRecord mNextRecord = null;
|
||||
|
||||
|
||||
// a pattern we use for detecting meta data and header lines
|
||||
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
|
||||
|
||||
// our pattern matching for the genotype mFields
|
||||
private static final Pattern basicSplit = Pattern.compile("([0-9]*)([\\\\|\\/])([0-9]*):(\\S*)");
|
||||
|
||||
/**
|
||||
* Create a VCF reader, given a VCF file
|
||||
*
|
||||
|
|
@ -49,7 +52,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
line = mReader.readLine();
|
||||
}
|
||||
mHeader = this.createHeader(lines);
|
||||
mNextRecord = new VCFRecord(mHeader, line);
|
||||
mNextRecord = createRecord(mReader.readLine());
|
||||
} catch (IOException e) {
|
||||
throw new StingException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
||||
}
|
||||
|
|
@ -71,7 +74,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
try {
|
||||
String line = mReader.readLine();
|
||||
if (line == null) mNextRecord = null;
|
||||
else mNextRecord = new VCFRecord(mHeader, line);
|
||||
else mNextRecord = createRecord(line);
|
||||
} catch (IOException e) {
|
||||
mNextRecord = null;
|
||||
}
|
||||
|
|
@ -91,7 +94,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
*/
|
||||
protected VCFHeader createHeader(List<String> headerStrings) {
|
||||
|
||||
Map<String,String> metaData = new HashMap<String,String>();
|
||||
Map<String, String> metaData = new HashMap<String, String>();
|
||||
Set<VCFHeader.HEADER_FIELDS> headerFields = new LinkedHashSet<VCFHeader.HEADER_FIELDS>();
|
||||
List<String> auxTags = new ArrayList<String>();
|
||||
// iterate over all the passed in strings
|
||||
|
|
@ -112,12 +115,13 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
if (str.startsWith("#") && !str.startsWith("##")) {
|
||||
String[] strings = str.substring(1).split("\\s+");
|
||||
for (String s : strings) {
|
||||
if (headerFields.contains(s)) throw new StingException("VCFReader: Header field duplication is not allowed");
|
||||
if (headerFields.contains(s))
|
||||
throw new StingException("VCFReader: Header field duplication is not allowed");
|
||||
try {
|
||||
headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s));
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (!s.equals("FORMAT"))
|
||||
auxTags.add(s);
|
||||
if (!s.equals("FORMAT"))
|
||||
auxTags.add(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -126,13 +130,117 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
throw new StingException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size())
|
||||
+ " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields");
|
||||
}
|
||||
return new VCFHeader(headerFields,metaData,auxTags);
|
||||
return new VCFHeader(headerFields, metaData, auxTags);
|
||||
}
|
||||
|
||||
/**
|
||||
* create the next VCFRecord, given the input line
|
||||
*
|
||||
* @return get the header associated with this reader
|
||||
* @param line the line from the file
|
||||
*
|
||||
* @return the VCFRecord
|
||||
*/
|
||||
public VCFRecord createRecord(String line) {
|
||||
// things we need to make a VCF record
|
||||
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
String tokens[] = line.split("\\s+");
|
||||
|
||||
// check to ensure that the column count of tokens is right
|
||||
if (tokens.length != mHeader.getColumnCount()) {
|
||||
throw new StingException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has" + values.size());
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields())
|
||||
values.put(field, tokens[index++]);
|
||||
// if we have genotyping data, we try and extract the genotype fields
|
||||
if (mHeader.hasGenotypingData()) {
|
||||
String mFormatString = tokens[index];
|
||||
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||
index++;
|
||||
for (String str : mHeader.getGenotypeSamples()) {
|
||||
genotypeRecords.add(getVCFGenotype(mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
||||
index++;
|
||||
}
|
||||
return new VCFRecord(mHeader,values,mFormatString,genotypeRecords);
|
||||
}
|
||||
return new VCFRecord(mHeader, values);
|
||||
}
|
||||
|
||||
/**
|
||||
* generate a VCF genotype record, given it's format string, the genotype string, and allele info
|
||||
*
|
||||
* @param formatString the format string for this record, which contains the keys for the genotype parameters
|
||||
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||
* @param referenceBase the reference base
|
||||
*/
|
||||
public VCFGenotypeRecord getVCFGenotype(String formatString, String genotypeString, String altAlleles[], char referenceBase) {
|
||||
// check that the first format field is GT, which is required
|
||||
String keys[] = formatString.split(":");
|
||||
List<String> alleles = new ArrayList<String>();
|
||||
if (keys.length < 0 || !keys[0].equals("GT"))
|
||||
throw new IllegalArgumentException("The format string must have fields, and the first must be GT (genotype)");
|
||||
|
||||
// find the values for each of the keys, of which the GT field should be the first
|
||||
Matcher match = basicSplit.matcher(genotypeString);
|
||||
if (!match.matches() || match.groupCount() < 3)
|
||||
throw new IllegalArgumentException("Unable to match genotype string to expected regex");
|
||||
|
||||
// add the alternate base (which can be ref by specifying 0)
|
||||
addAllele(match.group(1), altAlleles, referenceBase, alleles);
|
||||
|
||||
VCFGenotypeRecord.GT_GENOTYPE phase = VCFGenotypeRecord.determinePhase(match.group(2));
|
||||
|
||||
// do we have a second alt allele?
|
||||
if (match.group(3).length() > 0) {
|
||||
addAllele(match.group(3), altAlleles, referenceBase, alleles);
|
||||
}
|
||||
|
||||
Map<String, String> fields = new HashMap<String, String>();
|
||||
// check to see what other records we have
|
||||
if (match.groupCount() == 4) {
|
||||
// make sure we'll have enough occurances
|
||||
String tokens[] = match.group(4).split(":{1}"); // the {1} was required, since string.split does a greedy match of the specified regex, like :+
|
||||
int keyIndex = 1;
|
||||
try {
|
||||
for (String token : tokens) {
|
||||
fields.put(keys[keyIndex], token);
|
||||
keyIndex++;
|
||||
}
|
||||
}
|
||||
// we catch the follow exception. What this generally means is that the format string specified less mFields then the genotype string contains
|
||||
catch (ArrayIndexOutOfBoundsException e) {
|
||||
throw new StingException("VCFGenotypeRecord: ArrayIndexOutOfBoundsException, most likely the field list was less then the genotype " + "" +
|
||||
"values provided. Format String = " + formatString + ", genotype value string = " + genotypeString, e);
|
||||
}
|
||||
|
||||
// you're allowed to leave out mFields, if any field doesn't have a value fill it in
|
||||
if (keyIndex < tokens.length && match.group(4).contains(":")) {
|
||||
while (keyIndex < keys.length)
|
||||
if (!fields.containsKey(keys[keyIndex]))
|
||||
fields.put(keys[keyIndex++], "");
|
||||
}
|
||||
}
|
||||
return new VCFGenotypeRecord(fields, alleles, phase, referenceBase);
|
||||
}
|
||||
|
||||
/**
|
||||
* add an alternate allele to the list of alleles we have for a VCF genotype record
|
||||
*
|
||||
* @param alleleNumber the allele number, as a string
|
||||
* @param altAlleles the list of alternate alleles
|
||||
* @param referenceBase the reference base
|
||||
*/
|
||||
private void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<String> bases) {
|
||||
if (Integer.valueOf(alleleNumber) == 0)
|
||||
bases.add(String.valueOf(referenceBase));
|
||||
else
|
||||
bases.add(altAlleles[Integer.valueOf(alleleNumber) - 1]);
|
||||
}
|
||||
|
||||
|
||||
/** @return get the header associated with this reader */
|
||||
public VCFHeader getHeader() {
|
||||
return this.mHeader;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,76 +13,60 @@ public class VCFRecord {
|
|||
private final Map<VCFHeader.HEADER_FIELDS, String> mValues = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||
|
||||
// our genotype sample fields
|
||||
private final Map<String, String> mGenotypeFields = new HashMap<String, String>();
|
||||
private final List<VCFGenotypeRecord> mGenotypeFields;
|
||||
|
||||
// the format String, which specifies what each genotype can contain for values
|
||||
private String formatString;
|
||||
private final String mFormatString;
|
||||
|
||||
// the associated header
|
||||
private final VCFHeader mHeader;
|
||||
|
||||
/**
|
||||
* create a VCFRecord, given a VCF header and the the values in this field. THis is protected, so that the reader is
|
||||
* the only accessing object
|
||||
* given a VCF header, and the values for each of the columns, create a VCF record.
|
||||
*
|
||||
* @param header the VCF header
|
||||
* @param line the line to parse into individual fields
|
||||
* @param header the VCF header
|
||||
* @param columnValues a mapping of header strings to values
|
||||
* @param formatString the format string for the genotype records
|
||||
* @param genotypeRecords the genotype records
|
||||
*/
|
||||
protected VCFRecord(VCFHeader header, String line) {
|
||||
String tokens[] = line.split("\\s+");
|
||||
List<String> values = new ArrayList<String>();
|
||||
for (String str : tokens) values.add(str);
|
||||
initialize(header, values);
|
||||
public VCFRecord(VCFHeader header, Map<VCFHeader.HEADER_FIELDS, String> columnValues, String formatString, List<VCFGenotypeRecord> genotypeRecords) {
|
||||
mHeader = header;
|
||||
mValues.putAll(columnValues);
|
||||
mFormatString = formatString;
|
||||
mGenotypeFields = new ArrayList<VCFGenotypeRecord>();
|
||||
mGenotypeFields.addAll(genotypeRecords);
|
||||
}
|
||||
|
||||
/**
|
||||
* given a VCF header, and the values for each of the columns, create a VCF record
|
||||
* given a VCF header, and the values for each of the columns, create a VCF record.
|
||||
*
|
||||
* @param header the VCF header
|
||||
* @param values the values, as a list, for each of the columns
|
||||
* @param header the VCF header
|
||||
* @param columnValues a mapping of header strings to values
|
||||
*/
|
||||
public VCFRecord(VCFHeader header, List<String> values) {
|
||||
initialize(header, values);
|
||||
public VCFRecord(VCFHeader header, Map<VCFHeader.HEADER_FIELDS, String> columnValues) {
|
||||
mHeader = header;
|
||||
mValues.putAll(columnValues);
|
||||
mGenotypeFields = null;
|
||||
mFormatString = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the VCFRecord
|
||||
*
|
||||
* @param header the VCF header
|
||||
* @param values the list of strings that make up the columns of the record
|
||||
* do we have genotyping data
|
||||
* @return true if we have genotyping data, false otherwise
|
||||
*/
|
||||
private void initialize(VCFHeader header, List<String> values) {
|
||||
if (values.size() != header.getColumnCount()) {
|
||||
throw new StingException("The input list doesn't contain enough fields, it should have " + header.getColumnCount() + " fields");
|
||||
}
|
||||
int index = 0;
|
||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) {
|
||||
mValues.put(field, values.get(index));
|
||||
index++;
|
||||
}
|
||||
if (header.hasGenotypingData()) {
|
||||
formatString = values.get(index);
|
||||
index++;
|
||||
for (String str : header.getGenotypeSamples()) {
|
||||
mGenotypeFields.put(str, values.get(index));
|
||||
index++;
|
||||
}
|
||||
public boolean hasGenotypeData() {
|
||||
if (mGenotypeFields==null) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* lookup a value, given it's column name
|
||||
*
|
||||
* @param key the column name, which is looked up in both the set columns and the auxillary columns
|
||||
*
|
||||
* @return a String representing the column values, or null if the field doesn't exist in this record
|
||||
* get the format string
|
||||
* @return the format sting, null if it doesn't exist
|
||||
*/
|
||||
public String getValue(String key) {
|
||||
try {
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.valueOf(key));
|
||||
} catch (IllegalArgumentException e) {
|
||||
if (this.mGenotypeFields.containsKey(key)) {
|
||||
return mGenotypeFields.get(key);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
public String getFormatString() {
|
||||
return mFormatString;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -98,7 +82,7 @@ public class VCFRecord {
|
|||
|
||||
/** @return the string for the chromosome that this VCF record is associated with */
|
||||
public String getChromosome() {
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.CHROM);
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.CHROM);
|
||||
}
|
||||
|
||||
/** @return this VCF records position on the specified chromosome */
|
||||
|
|
@ -108,7 +92,7 @@ public class VCFRecord {
|
|||
|
||||
/** @return the ID value for this record */
|
||||
public String getID() {
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.ID);
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.ID);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -118,7 +102,7 @@ public class VCFRecord {
|
|||
*/
|
||||
public char getReferenceBase() {
|
||||
// TODO: this field isn't validated correctly
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0);
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.REF).charAt(0);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -127,10 +111,10 @@ public class VCFRecord {
|
|||
* @return an array of strings representing the alt alleles, or null if there are none
|
||||
*/
|
||||
public String[] getAlternateAlleles() {
|
||||
if (this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) {
|
||||
if (mValues.get(VCFHeader.HEADER_FIELDS.ALT).trim().equals(".")) {
|
||||
return null;
|
||||
}
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(",");
|
||||
}
|
||||
|
||||
public boolean hasAlternateAllele() {
|
||||
|
|
@ -139,7 +123,7 @@ public class VCFRecord {
|
|||
|
||||
/** @return the phred-scaled quality score */
|
||||
public int getQual() {
|
||||
return Integer.valueOf(this.mValues.get(VCFHeader.HEADER_FIELDS.QUAL));
|
||||
return Integer.valueOf(mValues.get(VCFHeader.HEADER_FIELDS.QUAL));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -148,10 +132,10 @@ public class VCFRecord {
|
|||
* @return an array of strings representing the filtering criteria, or null if none were applied
|
||||
*/
|
||||
public String[] getFilteringCodes() {
|
||||
if (this.mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) {
|
||||
if (mValues.get(VCFHeader.HEADER_FIELDS.FILTER).trim().equals("0")) {
|
||||
return null;
|
||||
}
|
||||
return this.mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(";");
|
||||
return mValues.get(VCFHeader.HEADER_FIELDS.ALT).split(";");
|
||||
}
|
||||
|
||||
public boolean hasFilteringCodes() {
|
||||
|
|
@ -177,20 +161,22 @@ public class VCFRecord {
|
|||
|
||||
/** @return the number of columnsof data we're storing */
|
||||
public int getColumnCount() {
|
||||
return this.mGenotypeFields.size() + this.mValues.size();
|
||||
return mGenotypeFields.size() + mValues.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* return the mapping of the format tags to the specified sample's values
|
||||
* @param sampleName the sample name to get the genotyping tags for
|
||||
* @return a VCFGenotypeRecord
|
||||
*/
|
||||
public VCFGenotypeRecord getVCFGenotypeRecord(String sampleName) {
|
||||
if (!this.mGenotypeFields.containsKey(sampleName)) {
|
||||
throw new IllegalArgumentException("Sample Name: " + sampleName + " doesn't exist in this VCF record");
|
||||
}
|
||||
return new VCFGenotypeRecord(formatString,mGenotypeFields.get(sampleName),this.getAlternateAlleles(),this.getReferenceBase());
|
||||
public List<VCFGenotypeRecord> getVCFGenotypeRecords() {
|
||||
return this.mGenotypeFields;
|
||||
}
|
||||
|
||||
/** @return a List of the sample names */
|
||||
public String[] getSampleNames() {
|
||||
String ret[] = new String[mHeader.getGenotypeSamples().size()];
|
||||
mHeader.getGenotypeSamples().toArray(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ package org.broadinstitute.sting.utils.genotype.vcf;
|
|||
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -23,34 +25,83 @@ public class VCFValidator {
|
|||
* @param args the vcf file is the only parameter
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
if (args.length != 1) {
|
||||
boolean catchAll = false;
|
||||
|
||||
if (args.length == 2 && args[0].equals("-A"))
|
||||
catchAll = true;
|
||||
else if (args.length == 1)
|
||||
catchAll = false;
|
||||
else {
|
||||
printUsage();
|
||||
return;
|
||||
}
|
||||
File vcfFile = new File(args[0]);
|
||||
File vcfFile = new File(args[(catchAll) ? 1 : 0]);
|
||||
if (!vcfFile.exists()) {
|
||||
System.err.println("Specified VCF file doesn't exist, please check the input file\n");
|
||||
printUsage();
|
||||
return;
|
||||
}
|
||||
int counter = 0;
|
||||
// count hom many records we see
|
||||
int recordCount = 0;
|
||||
Map<Integer,Exception> problems = new TreeMap<Integer,Exception>();
|
||||
|
||||
try {
|
||||
// open up our reader
|
||||
VCFReader reader = new VCFReader(vcfFile);
|
||||
|
||||
while (reader.hasNext()) {
|
||||
counter++;
|
||||
reader.next();
|
||||
try {
|
||||
recordCount++;
|
||||
VCFRecord rec = reader.next();
|
||||
// if the header indicates we have genotyping data, try to extract it for all samples
|
||||
if (reader.getHeader().hasGenotypingData()) {
|
||||
for (VCFGenotypeRecord genorec : rec.getVCFGenotypeRecords()) {
|
||||
// just cycle through them, more checks go here
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (catchAll)
|
||||
problems.put(recordCount,e);
|
||||
else {
|
||||
validationFailed(e, recordCount);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println("VCF Validation failed, after parsing " + counter + " entries.");
|
||||
System.err.println("The reason given was: " + e.getMessage());
|
||||
if (catchAll)
|
||||
problems.put(new Integer(0),e);
|
||||
else
|
||||
validationFailed(e, recordCount);
|
||||
}
|
||||
System.err.println("Viewed " + recordCount + " VCF record entries.");
|
||||
if (problems.size() > 0) {
|
||||
System.err.println("Encountered " + problems.size() + " number of issues. (record zero indicates a header problem)");
|
||||
for (Integer e : problems.keySet()) {
|
||||
System.err.println("\tProblem at record " + e + " : " + problems.get(e));
|
||||
}
|
||||
}
|
||||
System.err.println("Viewed " + counter + " VCF record entries.");
|
||||
}
|
||||
|
||||
/**
|
||||
* validation failed
|
||||
*
|
||||
* @param e the exception
|
||||
* @param count the current record count
|
||||
*/
|
||||
public static void validationFailed(Exception e, int count) {
|
||||
System.err.println("VCF Validation failed, after parsing " + count + " entries.");
|
||||
System.err.println("The reason given was: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
/** print the usage information for the VCF validator */
|
||||
public static void printUsage() {
|
||||
System.err.println("VCF validator (VCF Version " + VCF_VERSION + ")");
|
||||
System.err.println("Usage:");
|
||||
System.err.println("vcfvalidator <fille.vcf>");
|
||||
System.err.println("vcfvalidator <-A> <fille.vcf>");
|
||||
System.err.println("");
|
||||
System.err.println("\t-A\tTell the validator to attempt to catch all the problems, and not stop at the first. Some may be too fatal to continue.");
|
||||
System.err.println("");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ public class VCFWriter {
|
|||
|
||||
/**
|
||||
* output a record to the VCF file
|
||||
*
|
||||
* @param record the record to output
|
||||
*/
|
||||
public void addRecord(VCFRecord record) {
|
||||
|
|
@ -59,14 +60,19 @@ public class VCFWriter {
|
|||
" columns, when is should have " + mHeader.getColumnCount());
|
||||
}
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
// first output the required fields in order
|
||||
boolean first = true;
|
||||
for (VCFHeader.HEADER_FIELDS field : mHeader.getHeaderFields()) {
|
||||
if (first) { first = false; builder.append(record.getValue(field)); }
|
||||
else builder.append("\t" + record.getValue(field));
|
||||
if (first) {
|
||||
first = false;
|
||||
builder.append(record.getValue(field));
|
||||
} else builder.append("\t" + record.getValue(field));
|
||||
}
|
||||
for (String auxTag : mHeader.getGenotypeSamples()) {
|
||||
builder.append("\t" + record.getValue(auxTag));
|
||||
for (VCFGenotypeRecord rec : record.getVCFGenotypeRecords()) {
|
||||
builder.append("\t");
|
||||
for (String s : rec.getFields().keySet())
|
||||
builder.append(":" + rec.getFields().get(s));
|
||||
}
|
||||
try {
|
||||
mWriter.write(builder.toString() + "\n");
|
||||
|
|
@ -75,9 +81,7 @@ public class VCFWriter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* attempt to close the VCF file
|
||||
*/
|
||||
/** attempt to close the VCF file */
|
||||
public void close() {
|
||||
try {
|
||||
mWriter.close();
|
||||
|
|
|
|||
Loading…
Reference in New Issue