fixes in VCF, some changes to get it ready to move out of the GATK
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1441 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0b927f44fa
commit
5725de56dc
|
|
@ -16,12 +16,12 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class VCFGenotypeRecord {
|
public class VCFGenotypeRecord {
|
||||||
// what kind of phasing this genotype has
|
// what kind of phasing this genotype has
|
||||||
enum GT_GENOTYPE {
|
enum PHASE {
|
||||||
UNPHASED, PHASED, PHASED_SWITCH_PROB
|
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
||||||
}
|
}
|
||||||
|
|
||||||
// our phasing
|
// our phasing
|
||||||
private GT_GENOTYPE phaseType;
|
private PHASE phaseType;
|
||||||
|
|
||||||
// our reference base(s)
|
// our reference base(s)
|
||||||
private final char mReferenceBase;
|
private final char mReferenceBase;
|
||||||
|
|
@ -31,7 +31,9 @@ public class VCFGenotypeRecord {
|
||||||
|
|
||||||
// our mapping of the format mFields to values
|
// our mapping of the format mFields to values
|
||||||
private final Map<String, String> mFields = new HashMap<String, String>();
|
private final Map<String, String> mFields = new HashMap<String, String>();
|
||||||
|
|
||||||
|
// our sample name
|
||||||
|
private final String mSampleName;
|
||||||
/**
|
/**
|
||||||
* create a VCF record
|
* create a VCF record
|
||||||
*
|
*
|
||||||
|
|
@ -40,12 +42,12 @@ public class VCFGenotypeRecord {
|
||||||
* @param phasing the phasing of the genotype
|
* @param phasing the phasing of the genotype
|
||||||
* @param referenceBase the reference base
|
* @param referenceBase the reference base
|
||||||
*/
|
*/
|
||||||
public VCFGenotypeRecord(Map<String, String> keyValues, List<String> Alleles, GT_GENOTYPE phasing, char referenceBase) {
|
public VCFGenotypeRecord(String sampleName, Map<String, String> keyValues, List<String> Alleles, PHASE phasing, char referenceBase) {
|
||||||
// validate
|
mSampleName = sampleName;
|
||||||
this.mReferenceBase = referenceBase;
|
mReferenceBase = referenceBase;
|
||||||
this.mFields.putAll(keyValues);
|
mFields.putAll(keyValues);
|
||||||
this.mAlleleBases.addAll(Alleles);
|
mAlleleBases.addAll(Alleles);
|
||||||
this.phaseType = phasing;
|
phaseType = phasing;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -53,21 +55,21 @@ public class VCFGenotypeRecord {
|
||||||
*
|
*
|
||||||
* @param phase the string that contains the phase character
|
* @param phase the string that contains the phase character
|
||||||
*/
|
*/
|
||||||
static GT_GENOTYPE determinePhase(String phase) {
|
static PHASE determinePhase(String phase) {
|
||||||
// find the phasing information
|
// find the phasing information
|
||||||
if (phase.equals("/"))
|
if (phase.equals("/"))
|
||||||
return GT_GENOTYPE.UNPHASED;
|
return PHASE.UNPHASED;
|
||||||
else if (phase.equals("|"))
|
else if (phase.equals("|"))
|
||||||
return GT_GENOTYPE.PHASED;
|
return PHASE.PHASED;
|
||||||
else if (phase.equals("\\"))
|
else if (phase.equals("\\"))
|
||||||
return GT_GENOTYPE.PHASED_SWITCH_PROB;
|
return PHASE.PHASED_SWITCH_PROB;
|
||||||
else
|
else
|
||||||
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
throw new IllegalArgumentException("Unknown genotype phasing parameter");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** getter methods */
|
/** getter methods */
|
||||||
|
|
||||||
public GT_GENOTYPE getPhaseType() {
|
public PHASE getPhaseType() {
|
||||||
return phaseType;
|
return phaseType;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,8 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -37,19 +34,13 @@ public class VCFHeader {
|
||||||
// the header string indicator
|
// the header string indicator
|
||||||
public static final String HEADER_INDICATOR = "#";
|
public static final String HEADER_INDICATOR = "#";
|
||||||
|
|
||||||
/**
|
/** our log, which we use to capture anything from this class */
|
||||||
* our log, which we use to capture anything from this class
|
|
||||||
*/
|
|
||||||
private static Logger logger = Logger.getLogger(VCFHeader.class);
|
private static Logger logger = Logger.getLogger(VCFHeader.class);
|
||||||
|
|
||||||
/**
|
/** do we have genotyping data? */
|
||||||
* do we have genotyping data?
|
|
||||||
*/
|
|
||||||
private boolean hasGenotypingData = false;
|
private boolean hasGenotypingData = false;
|
||||||
|
|
||||||
/**
|
/** the current vcf version we support. */
|
||||||
* the current vcf version we support.
|
|
||||||
*/
|
|
||||||
private static final String VCF_VERSION = "VCFv3.2";
|
private static final String VCF_VERSION = "VCFv3.2";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -74,7 +65,10 @@ public class VCFHeader {
|
||||||
protected VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData, List<String> genotypeSampleNames) {
|
protected VCFHeader(Set<HEADER_FIELDS> headerFields, Map<String, String> metaData, List<String> genotypeSampleNames) {
|
||||||
for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field);
|
for (HEADER_FIELDS field : headerFields) mHeaderFields.add(field);
|
||||||
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
for (String key : metaData.keySet()) mMetaData.put(key, metaData.get(key));
|
||||||
for (String col : genotypeSampleNames) mGenotypeSampleNames.add(col);
|
for (String col : genotypeSampleNames) {
|
||||||
|
if (!col.equals("FORMAT"))
|
||||||
|
mGenotypeSampleNames.add(col);
|
||||||
|
}
|
||||||
hasGenotypingData = true;
|
hasGenotypingData = true;
|
||||||
checkVCFVersion();
|
checkVCFVersion();
|
||||||
}
|
}
|
||||||
|
|
@ -87,10 +81,10 @@ public class VCFHeader {
|
||||||
if (mMetaData.containsKey("format")) {
|
if (mMetaData.containsKey("format")) {
|
||||||
if (mMetaData.get("format").equals(VCF_VERSION))
|
if (mMetaData.get("format").equals(VCF_VERSION))
|
||||||
return;
|
return;
|
||||||
throw new StingException("VCFHeader: VCF version of " + mMetaData.get("format") +
|
throw new RuntimeException("VCFHeader: VCF version of " + mMetaData.get("format") +
|
||||||
" doesn't match the supported version of " + VCF_VERSION);
|
" doesn't match the supported version of " + VCF_VERSION);
|
||||||
}
|
}
|
||||||
throw new StingException("VCFHeader: VCF version isn't present");
|
throw new RuntimeException("VCFHeader: VCF version isn't present");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -129,9 +123,7 @@ public class VCFHeader {
|
||||||
return hasGenotypingData;
|
return hasGenotypingData;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/** @return the column count, */
|
||||||
* @return the column count,
|
|
||||||
*/
|
|
||||||
public int getColumnCount() {
|
public int getColumnCount() {
|
||||||
return mHeaderFields.size() + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
return mHeaderFields.size() + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
@ -25,7 +24,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
|
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
|
||||||
|
|
||||||
// our pattern matching for the genotype mFields
|
// our pattern matching for the genotype mFields
|
||||||
private static final Pattern basicSplit = Pattern.compile("([0-9]*)([\\\\|\\/])([0-9]*):(\\S*)");
|
private static final Pattern gtPattern = Pattern.compile("([0-9]+)([\\\\|\\/])([0-9]*)");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a VCF reader, given a VCF file
|
* Create a VCF reader, given a VCF file
|
||||||
|
|
@ -40,7 +39,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
new FileInputStream(vcfFile),
|
new FileInputStream(vcfFile),
|
||||||
utf8));
|
utf8));
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new StingException("VCFReader: Unable to find VCF file: " + vcfFile, e);
|
throw new RuntimeException("VCFReader: Unable to find VCF file: " + vcfFile, e);
|
||||||
}
|
}
|
||||||
|
|
||||||
String line = null;
|
String line = null;
|
||||||
|
|
@ -52,9 +51,9 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
line = mReader.readLine();
|
line = mReader.readLine();
|
||||||
}
|
}
|
||||||
mHeader = this.createHeader(lines);
|
mHeader = this.createHeader(lines);
|
||||||
mNextRecord = createRecord(mReader.readLine());
|
mNextRecord = createRecord(line, mHeader);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new StingException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
@ -74,7 +73,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
try {
|
try {
|
||||||
String line = mReader.readLine();
|
String line = mReader.readLine();
|
||||||
if (line == null) mNextRecord = null;
|
if (line == null) mNextRecord = null;
|
||||||
else mNextRecord = createRecord(line);
|
else mNextRecord = createRecord(line, mHeader);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
mNextRecord = null;
|
mNextRecord = null;
|
||||||
}
|
}
|
||||||
|
|
@ -116,7 +115,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
String[] strings = str.substring(1).split("\\s+");
|
String[] strings = str.substring(1).split("\\s+");
|
||||||
for (String s : strings) {
|
for (String s : strings) {
|
||||||
if (headerFields.contains(s))
|
if (headerFields.contains(s))
|
||||||
throw new StingException("VCFReader: Header field duplication is not allowed");
|
throw new RuntimeException("VCFReader: Header field duplication is not allowed");
|
||||||
try {
|
try {
|
||||||
headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s));
|
headerFields.add(VCFHeader.HEADER_FIELDS.valueOf(s));
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
|
|
@ -127,7 +126,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (headerFields.size() != VCFHeader.HEADER_FIELDS.values().length) {
|
if (headerFields.size() != VCFHeader.HEADER_FIELDS.values().length) {
|
||||||
throw new StingException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size())
|
throw new RuntimeException("VCFReader: The VCF column header line is missing " + (VCFHeader.HEADER_FIELDS.values().length - headerFields.size())
|
||||||
+ " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields");
|
+ " of the " + VCFHeader.HEADER_FIELDS.values().length + " required fields");
|
||||||
}
|
}
|
||||||
return new VCFHeader(headerFields, metaData, auxTags);
|
return new VCFHeader(headerFields, metaData, auxTags);
|
||||||
|
|
@ -140,14 +139,14 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
*
|
*
|
||||||
* @return the VCFRecord
|
* @return the VCFRecord
|
||||||
*/
|
*/
|
||||||
public VCFRecord createRecord(String line) {
|
public static VCFRecord createRecord(String line, VCFHeader mHeader) {
|
||||||
// things we need to make a VCF record
|
// things we need to make a VCF record
|
||||||
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
Map<VCFHeader.HEADER_FIELDS, String> values = new HashMap<VCFHeader.HEADER_FIELDS, String>();
|
||||||
String tokens[] = line.split("\\s+");
|
String tokens[] = line.split("\\s+");
|
||||||
|
|
||||||
// check to ensure that the column count of tokens is right
|
// check to ensure that the column count of tokens is right
|
||||||
if (tokens.length != mHeader.getColumnCount()) {
|
if (tokens.length != mHeader.getColumnCount()) {
|
||||||
throw new StingException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has" + values.size());
|
throw new RuntimeException("The input file line doesn't contain enough fields, it should have " + mHeader.getColumnCount() + " fields, it has " + tokens.length);
|
||||||
}
|
}
|
||||||
|
|
||||||
int index = 0;
|
int index = 0;
|
||||||
|
|
@ -159,10 +158,10 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||||
index++;
|
index++;
|
||||||
for (String str : mHeader.getGenotypeSamples()) {
|
for (String str : mHeader.getGenotypeSamples()) {
|
||||||
genotypeRecords.add(getVCFGenotype(mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
genotypeRecords.add(getVCFGenotype(str, mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
return new VCFRecord(mHeader,values,mFormatString,genotypeRecords);
|
return new VCFRecord(mHeader, values, mFormatString, genotypeRecords);
|
||||||
}
|
}
|
||||||
return new VCFRecord(mHeader, values);
|
return new VCFRecord(mHeader, values);
|
||||||
}
|
}
|
||||||
|
|
@ -170,61 +169,46 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
/**
|
/**
|
||||||
* generate a VCF genotype record, given its format string, the genotype string, and allele info
|
* generate a VCF genotype record, given its format string, the genotype string, and allele info
|
||||||
*
|
*
|
||||||
|
* @param sampleName the sample name
|
||||||
* @param formatString the format string for this record, which contains the keys for the genotype parameters
|
* @param formatString the format string for this record, which contains the keys for the genotype parameters
|
||||||
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
* @param genotypeString contains the phasing information, allele information, and values for genotype parameters
|
||||||
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
* @param altAlleles the alternate allele string array, which we index into based on the field parameters
|
||||||
* @param referenceBase the reference base
|
* @param referenceBase the reference base
|
||||||
*/
|
*/
|
||||||
public VCFGenotypeRecord getVCFGenotype(String formatString, String genotypeString, String altAlleles[], char referenceBase) {
|
public static VCFGenotypeRecord getVCFGenotype(String sampleName, String formatString, String genotypeString, String altAlleles[], char referenceBase) {
|
||||||
// check that the first format field is GT, which is required
|
// parameters to create the VCF genotype record
|
||||||
String keys[] = formatString.split(":");
|
Map<String,String> tagToValue = new HashMap<String, String>();
|
||||||
List<String> alleles = new ArrayList<String>();
|
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
||||||
if (keys.length < 0 || !keys[0].equals("GT"))
|
List<String> bases = new ArrayList<String>();
|
||||||
throw new IllegalArgumentException("The format string must have fields, and the first must be GT (genotype)");
|
|
||||||
|
|
||||||
// find the values for each of the keys, of which the GT field should be the first
|
String keyStrings[] = formatString.split(":");
|
||||||
Matcher match = basicSplit.matcher(genotypeString);
|
for (String key : keyStrings) {
|
||||||
if (!match.matches() || match.groupCount() < 3)
|
String parse;
|
||||||
throw new IllegalArgumentException("Unable to match genotype string to expected regex");
|
int nextDivider;
|
||||||
|
if (!genotypeString.contains(":")) {
|
||||||
// add the alternate base (which can be ref by specifying 0)
|
nextDivider = genotypeString.length();
|
||||||
addAllele(match.group(1), altAlleles, referenceBase, alleles);
|
parse = genotypeString;
|
||||||
|
} else {
|
||||||
VCFGenotypeRecord.GT_GENOTYPE phase = VCFGenotypeRecord.determinePhase(match.group(2));
|
nextDivider = (genotypeString.indexOf(":") > genotypeString.length()) ? genotypeString.length() : genotypeString.indexOf(":");
|
||||||
|
parse = genotypeString.substring(0, nextDivider);
|
||||||
// do we have a second alt allele?
|
}
|
||||||
if (match.group(3).length() > 0) {
|
if (key.equals("GT")) {
|
||||||
addAllele(match.group(3), altAlleles, referenceBase, alleles);
|
Matcher m = gtPattern.matcher(parse);
|
||||||
|
if (!m.matches())
|
||||||
|
throw new RuntimeException("Ubable to match GT genotype flag to it's regular expression");
|
||||||
|
phase = VCFGenotypeRecord.determinePhase(m.group(2));
|
||||||
|
addAllele(m.group(1),altAlleles,referenceBase,bases);
|
||||||
|
if (m.group(3).length() > 0) addAllele(m.group(3),altAlleles,referenceBase,bases);
|
||||||
|
}
|
||||||
|
tagToValue.put(key,parse);
|
||||||
|
if (nextDivider+1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
||||||
|
genotypeString = genotypeString.substring(nextDivider+1,genotypeString.length());
|
||||||
}
|
}
|
||||||
|
if (keyStrings.length != tagToValue.size() || genotypeString.length() > 0) throw new RuntimeException("genotype value count doesn't match the key count");
|
||||||
Map<String, String> fields = new HashMap<String, String>();
|
return new VCFGenotypeRecord(sampleName,tagToValue,bases,phase,referenceBase);
|
||||||
// check to see what other records we have
|
|
||||||
if (match.groupCount() == 4) {
|
|
||||||
// make sure we'll have enough occurances
|
|
||||||
String tokens[] = match.group(4).split(":{1}"); // the {1} was required, since string.split does a greedy match of the specified regex, like :+
|
|
||||||
int keyIndex = 1;
|
|
||||||
try {
|
|
||||||
for (String token : tokens) {
|
|
||||||
fields.put(keys[keyIndex], token);
|
|
||||||
keyIndex++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// we catch the follow exception. What this generally means is that the format string specified less mFields then the genotype string contains
|
|
||||||
catch (ArrayIndexOutOfBoundsException e) {
|
|
||||||
throw new StingException("VCFGenotypeRecord: ArrayIndexOutOfBoundsException, most likely the field list was less then the genotype " + "" +
|
|
||||||
"values provided. Format String = " + formatString + ", genotype value string = " + genotypeString, e);
|
|
||||||
}
|
|
||||||
|
|
||||||
// you're allowed to leave out mFields, if any field doesn't have a value fill it in
|
|
||||||
if (keyIndex < tokens.length && match.group(4).contains(":")) {
|
|
||||||
while (keyIndex < keys.length)
|
|
||||||
if (!fields.containsKey(keys[keyIndex]))
|
|
||||||
fields.put(keys[keyIndex++], "");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new VCFGenotypeRecord(fields, alleles, phase, referenceBase);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* add an alternate allele to the list of alleles we have for a VCF genotype record
|
* add an alternate allele to the list of alleles we have for a VCF genotype record
|
||||||
*
|
*
|
||||||
|
|
@ -232,7 +216,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
* @param altAlleles the list of alternate alleles
|
* @param altAlleles the list of alternate alleles
|
||||||
* @param referenceBase the reference base
|
* @param referenceBase the reference base
|
||||||
*/
|
*/
|
||||||
private void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<String> bases) {
|
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<String> bases) {
|
||||||
if (Integer.valueOf(alleleNumber) == 0)
|
if (Integer.valueOf(alleleNumber) == 0)
|
||||||
bases.add(String.valueOf(referenceBase));
|
bases.add(String.valueOf(referenceBase));
|
||||||
else
|
else
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
@ -153,7 +152,7 @@ public class VCFRecord {
|
||||||
for (String s : infoSplit) {
|
for (String s : infoSplit) {
|
||||||
String keyValue[] = s.split("=");
|
String keyValue[] = s.split("=");
|
||||||
if (keyValue.length != 2)
|
if (keyValue.length != 2)
|
||||||
throw new StingException("Key value pairs must have both a key and a value; pair: " + s);
|
throw new RuntimeException("Key value pairs must have both a key and a value; pair: " + s);
|
||||||
ret.put(keyValue[0], keyValue[1]);
|
ret.put(keyValue[0], keyValue[1]);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
|
|
@ -161,7 +160,8 @@ public class VCFRecord {
|
||||||
|
|
||||||
/** @return the number of columns of data we're storing */
|
/** @return the number of columns of data we're storing */
|
||||||
public int getColumnCount() {
|
public int getColumnCount() {
|
||||||
return mGenotypeFields.size() + mValues.size();
|
if (this.hasGenotypeData()) return mGenotypeFields.size() + mValues.size();
|
||||||
|
return mValues.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
|
@ -13,6 +12,7 @@ public class VCFWriter {
|
||||||
|
|
||||||
// the print stream we're writing to
|
// the print stream we're writing to
|
||||||
BufferedWriter mWriter;
|
BufferedWriter mWriter;
|
||||||
|
private final String FIELD_SEPERATOR = "\t";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* create a VCF writer, given a VCF header and a file to write to
|
* create a VCF writer, given a VCF header and a file to write to
|
||||||
|
|
@ -29,10 +29,9 @@ public class VCFWriter {
|
||||||
new FileOutputStream(location),
|
new FileOutputStream(location),
|
||||||
utf8));
|
utf8));
|
||||||
} catch (FileNotFoundException e) {
|
} catch (FileNotFoundException e) {
|
||||||
throw new StingException("Unable to create VCF file: " + location, e);
|
throw new RuntimeException("Unable to create VCF file: " + location, e);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
|
|
||||||
// write the header meta-data out
|
// write the header meta-data out
|
||||||
for (String metadata : header.getMetaData().keySet()) {
|
for (String metadata : header.getMetaData().keySet()) {
|
||||||
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "=" + header.getMetaData().get(metadata) + "\n");
|
mWriter.write(VCFHeader.METADATA_INDICATOR + metadata + "=" + header.getMetaData().get(metadata) + "\n");
|
||||||
|
|
@ -40,12 +39,15 @@ public class VCFWriter {
|
||||||
// write out the column line
|
// write out the column line
|
||||||
StringBuilder b = new StringBuilder();
|
StringBuilder b = new StringBuilder();
|
||||||
b.append(VCFHeader.HEADER_INDICATOR);
|
b.append(VCFHeader.HEADER_INDICATOR);
|
||||||
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + "\t");
|
for (VCFHeader.HEADER_FIELDS field : header.getHeaderFields()) b.append(field + FIELD_SEPERATOR);
|
||||||
for (String field : header.getGenotypeSamples()) b.append(field + "\t");
|
if (header.hasGenotypingData()) {
|
||||||
mWriter.write(b.toString() + "\n");
|
b.append("FORMAT" + FIELD_SEPERATOR);
|
||||||
|
for (String field : header.getGenotypeSamples()) b.append(field + FIELD_SEPERATOR);
|
||||||
|
mWriter.write(b.toString() + "\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (IOException e) {
|
catch (IOException e) {
|
||||||
throw new StingException("IOException writing the VCF header", e);
|
throw new RuntimeException("IOException writing the VCF header", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -56,7 +58,7 @@ public class VCFWriter {
|
||||||
*/
|
*/
|
||||||
public void addRecord(VCFRecord record) {
|
public void addRecord(VCFRecord record) {
|
||||||
if (record.getColumnCount() != mHeader.getGenotypeSamples().size() + mHeader.getHeaderFields().size()) {
|
if (record.getColumnCount() != mHeader.getGenotypeSamples().size() + mHeader.getHeaderFields().size()) {
|
||||||
throw new StingException("Record has " + record.getColumnCount() +
|
throw new RuntimeException("Record has " + record.getColumnCount() +
|
||||||
" columns, when is should have " + mHeader.getColumnCount());
|
" columns, when is should have " + mHeader.getColumnCount());
|
||||||
}
|
}
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
|
|
@ -67,17 +69,24 @@ public class VCFWriter {
|
||||||
if (first) {
|
if (first) {
|
||||||
first = false;
|
first = false;
|
||||||
builder.append(record.getValue(field));
|
builder.append(record.getValue(field));
|
||||||
} else builder.append("\t" + record.getValue(field));
|
} else builder.append(FIELD_SEPERATOR + record.getValue(field));
|
||||||
}
|
}
|
||||||
for (VCFGenotypeRecord rec : record.getVCFGenotypeRecords()) {
|
if (record.hasGenotypeData()) {
|
||||||
builder.append("\t");
|
builder.append(FIELD_SEPERATOR + record.getFormatString());
|
||||||
for (String s : rec.getFields().keySet())
|
for (VCFGenotypeRecord rec : record.getVCFGenotypeRecords()) {
|
||||||
builder.append(":" + rec.getFields().get(s));
|
builder.append(FIELD_SEPERATOR);
|
||||||
}
|
boolean ft = true;
|
||||||
try {
|
for (String s : rec.getFields().keySet()) {
|
||||||
mWriter.write(builder.toString() + "\n");
|
if (!ft) builder.append(":");
|
||||||
} catch (IOException e) {
|
else ft = true;
|
||||||
throw new StingException("Unable to write the VCF object to a file");
|
builder.append(rec.getFields().get(s));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
mWriter.write(builder.toString() + "\n");
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Unable to write the VCF object to a file");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -86,7 +95,7 @@ public class VCFWriter {
|
||||||
try {
|
try {
|
||||||
mWriter.close();
|
mWriter.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new StingException("Unable to close VCFFile");
|
throw new RuntimeException("Unable to close VCFFile");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,83 +0,0 @@
|
||||||
package org.broadinstitute.sting.utils.genotype.vcf;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.BaseTest;
|
|
||||||
import org.junit.Assert;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @author aaron
|
|
||||||
*
|
|
||||||
* Class VCFGenotypeRecordTest
|
|
||||||
*
|
|
||||||
* A descriptions should go here. Blame aaron if it's missing.
|
|
||||||
*/
|
|
||||||
public class VCFGenotypeRecordTest extends BaseTest {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* test the basic parsing
|
|
||||||
*/
|
|
||||||
@Test
|
|
||||||
public void testBasicParsing() {
|
|
||||||
String formatString = "GT:B:C:D";
|
|
||||||
String genotypeString = "0|1:2:3:4";
|
|
||||||
String altAlleles[] = {"A","C","G","T"};
|
|
||||||
char referenceBase = 'N';
|
|
||||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(formatString,genotypeString,altAlleles,referenceBase);
|
|
||||||
Assert.assertEquals(VCFGenotypeRecord.GT_GENOTYPE.PHASED,rec.getPhaseType());
|
|
||||||
Assert.assertEquals(referenceBase,rec.getReference());
|
|
||||||
Assert.assertEquals("N",rec.getAllele().get(0));
|
|
||||||
Assert.assertEquals("A",rec.getAllele().get(1));
|
|
||||||
Map<String,String> values = rec.getFields();
|
|
||||||
Assert.assertEquals(3,values.size());
|
|
||||||
Assert.assertTrue(values.get("B").equals("2"));
|
|
||||||
Assert.assertTrue(values.get("C").equals("3"));
|
|
||||||
Assert.assertTrue(values.get("D").equals("4"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* test the parsing of a genotype field with missing parameters
|
|
||||||
*/
|
|
||||||
@Test
|
|
||||||
public void testMissingFieldParsing() {
|
|
||||||
String formatString = "GT:B:C:D";
|
|
||||||
String genotypeString = "0|1:::4";
|
|
||||||
String altAlleles[] = {"A","C","G","T"};
|
|
||||||
char referenceBase = 'N';
|
|
||||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(formatString,genotypeString,altAlleles,referenceBase);
|
|
||||||
Assert.assertEquals(VCFGenotypeRecord.GT_GENOTYPE.PHASED,rec.getPhaseType());
|
|
||||||
Assert.assertEquals(referenceBase,rec.getReference());
|
|
||||||
Assert.assertEquals("N",rec.getAllele().get(0));
|
|
||||||
Assert.assertEquals("A",rec.getAllele().get(1));
|
|
||||||
Map<String,String> values = rec.getFields();
|
|
||||||
Assert.assertEquals(3,values.size());
|
|
||||||
Assert.assertTrue(values.get("B").equals(""));
|
|
||||||
Assert.assertTrue(values.get("C").equals(""));
|
|
||||||
Assert.assertTrue(values.get("D").equals("4"));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* test the parsing of a genotype field with different missing parameters
|
|
||||||
*/
|
|
||||||
@Test
|
|
||||||
public void testMissingAllFields() {
|
|
||||||
String formatString = "GT:B:C:D";
|
|
||||||
String genotypeString = "0|1:::";
|
|
||||||
String altAlleles[] = {"A","C","G","T"};
|
|
||||||
char referenceBase = 'N';
|
|
||||||
VCFGenotypeRecord rec = new VCFGenotypeRecord(formatString,genotypeString,altAlleles,referenceBase);
|
|
||||||
Assert.assertEquals(VCFGenotypeRecord.GT_GENOTYPE.PHASED,rec.getPhaseType());
|
|
||||||
Assert.assertEquals(referenceBase,rec.getReference());
|
|
||||||
Assert.assertEquals("N",rec.getAllele().get(0));
|
|
||||||
Assert.assertEquals("A",rec.getAllele().get(1));
|
|
||||||
Map<String,String> values = rec.getFields();
|
|
||||||
Assert.assertEquals(3,values.size());
|
|
||||||
Assert.assertTrue(values.get("B").equals(""));
|
|
||||||
Assert.assertTrue(values.get("C").equals(""));
|
|
||||||
Assert.assertTrue(values.get("D").equals(""));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -5,6 +5,7 @@ import org.junit.Assert;
|
||||||
import org.broadinstitute.sting.BaseTest;
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* test the VCFReader class test
|
* test the VCFReader class test
|
||||||
|
|
@ -24,5 +25,67 @@ public class VCFReaderTest extends BaseTest {
|
||||||
Assert.assertEquals(5,counter);
|
Assert.assertEquals(5,counter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test the basic parsing
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testBasicParsing() {
|
||||||
|
String formatString = "GT:B:C:D";
|
||||||
|
String genotypeString = "0|1:2:3:4";
|
||||||
|
String altAlleles[] = {"A","C","G","T"};
|
||||||
|
char referenceBase = 'N';
|
||||||
|
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
|
||||||
|
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
|
||||||
|
Assert.assertEquals(referenceBase,rec.getReference());
|
||||||
|
Assert.assertEquals("N",rec.getAllele().get(0));
|
||||||
|
Assert.assertEquals("A",rec.getAllele().get(1));
|
||||||
|
Map<String,String> values = rec.getFields();
|
||||||
|
Assert.assertEquals(4,values.size());
|
||||||
|
Assert.assertTrue(values.get("B").equals("2"));
|
||||||
|
Assert.assertTrue(values.get("C").equals("3"));
|
||||||
|
Assert.assertTrue(values.get("D").equals("4"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test the parsing of a genotype field with missing parameters
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testMissingFieldParsing() {
|
||||||
|
String formatString = "GT:B:C:D";
|
||||||
|
String genotypeString = "0|1:::4";
|
||||||
|
String altAlleles[] = {"A","C","G","T"};
|
||||||
|
char referenceBase = 'N';
|
||||||
|
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
|
||||||
|
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
|
||||||
|
Assert.assertEquals(referenceBase,rec.getReference());
|
||||||
|
Assert.assertEquals("N",rec.getAllele().get(0));
|
||||||
|
Assert.assertEquals("A",rec.getAllele().get(1));
|
||||||
|
Map<String,String> values = rec.getFields();
|
||||||
|
Assert.assertEquals(4,values.size());
|
||||||
|
Assert.assertTrue(values.get("B").equals(""));
|
||||||
|
Assert.assertTrue(values.get("C").equals(""));
|
||||||
|
Assert.assertTrue(values.get("D").equals("4"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* test the parsing of a genotype field with different missing parameters
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testMissingAllFields() {
|
||||||
|
String formatString = "GT:B:C:D";
|
||||||
|
String genotypeString = "0|1:::";
|
||||||
|
String altAlleles[] = {"A","C","G","T"};
|
||||||
|
char referenceBase = 'N';
|
||||||
|
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
|
||||||
|
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
|
||||||
|
Assert.assertEquals(referenceBase,rec.getReference());
|
||||||
|
Assert.assertEquals("N",rec.getAllele().get(0));
|
||||||
|
Assert.assertEquals("A",rec.getAllele().get(1));
|
||||||
|
Map<String,String> values = rec.getFields();
|
||||||
|
Assert.assertEquals(4,values.size());
|
||||||
|
Assert.assertTrue(values.get("B").equals(""));
|
||||||
|
Assert.assertTrue(values.get("C").equals(""));
|
||||||
|
Assert.assertTrue(values.get("D").equals(""));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -54,18 +54,27 @@ public class VCFWriterTest extends BaseTest {
|
||||||
additionalColumns.add("FORMAT");
|
additionalColumns.add("FORMAT");
|
||||||
additionalColumns.add("extra1");
|
additionalColumns.add("extra1");
|
||||||
additionalColumns.add("extra2");
|
additionalColumns.add("extra2");
|
||||||
// this should create a header that is valid
|
|
||||||
|
|
||||||
return new VCFHeader(headerFields, metaData, additionalColumns);
|
return new VCFHeader(headerFields, metaData, additionalColumns);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create a fake VCF record
|
||||||
|
* @param header the VCF header
|
||||||
|
* @return a VCFRecord
|
||||||
|
*/
|
||||||
private VCFRecord createVCFRecord(VCFHeader header) {
|
private VCFRecord createVCFRecord(VCFHeader header) {
|
||||||
int totalVals = header.getColumnCount();
|
Map<VCFHeader.HEADER_FIELDS,String> map = new HashMap<VCFHeader.HEADER_FIELDS,String>();
|
||||||
List<String> array = new ArrayList<String>();
|
for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values())
|
||||||
for (int x = 0; x < totalVals; x++) {
|
map.put(field,String.valueOf(1));
|
||||||
array.add(String.valueOf(x));
|
List<VCFGenotypeRecord> gt = new ArrayList<VCFGenotypeRecord>();
|
||||||
|
for (String name : header.getGenotypeSamples()) {
|
||||||
|
Map<String,String> str = new HashMap<String,String>();
|
||||||
|
str.put("key","0|0");
|
||||||
|
List<String> alleles = new ArrayList<String>();
|
||||||
|
alleles.add("AAA");
|
||||||
|
gt.add(new VCFGenotypeRecord(name,str,alleles, VCFGenotypeRecord.PHASE.PHASED,'A'));
|
||||||
}
|
}
|
||||||
return new VCFRecord(header,array);
|
return new VCFRecord(header,map,"GT",gt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue