clean-up and fixes to the VCF input
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1849 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
a32470cea1
commit
a9094c835c
|
|
@ -15,6 +15,9 @@ import java.util.Map;
|
||||||
* so they were broken off into their own class
|
* so they were broken off into their own class
|
||||||
*/
|
*/
|
||||||
public class VCFGenotypeRecord {
|
public class VCFGenotypeRecord {
|
||||||
|
// the symbol for an empty genotype
|
||||||
|
public static final String EMPTY_GENOTYPE = ".";
|
||||||
|
|
||||||
// what kind of phasing this genotype has
|
// what kind of phasing this genotype has
|
||||||
public enum PHASE {
|
public enum PHASE {
|
||||||
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
UNPHASED, PHASED, PHASED_SWITCH_PROB, UNKNOWN
|
||||||
|
|
|
||||||
|
|
@ -54,6 +54,7 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
||||||
// setup the header fields
|
// setup the header fields
|
||||||
hInfo.put("format", "VCRv3.2");
|
hInfo.put("format", "VCRv3.2");
|
||||||
hInfo.put("source", mSource);
|
hInfo.put("source", mSource);
|
||||||
|
hInfo.put("reference", mReferenceName);
|
||||||
|
|
||||||
// setup the sample names
|
// setup the sample names
|
||||||
mHeader = new VCFHeader(hInfo, sampleNames);
|
mHeader = new VCFHeader(hInfo, sampleNames);
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,7 @@ class VCFParameters {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getFormatString() {
|
public String getFormatString() {
|
||||||
return Utils.join(";", formatList);
|
return Utils.join(VCFRecord.FORMAT_FIELD_SEPERATOR, formatList);
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<VCFGenotypeRecord> getGenotypesRecords() {
|
public List<VCFGenotypeRecord> getGenotypesRecords() {
|
||||||
|
|
|
||||||
|
|
@ -192,7 +192,8 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
List<VCFGenotypeRecord> genotypeRecords = new ArrayList<VCFGenotypeRecord>();
|
||||||
index++;
|
index++;
|
||||||
for (String str : mHeader.getGenotypeSamples()) {
|
for (String str : mHeader.getGenotypeSamples()) {
|
||||||
genotypeRecords.add(getVCFGenotype(str, mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
if (!tokens[index].equalsIgnoreCase(VCFGenotypeRecord.EMPTY_GENOTYPE))
|
||||||
|
genotypeRecords.add(getVCFGenotype(str, mFormatString, tokens[index], values.get(VCFHeader.HEADER_FIELDS.ALT).split(","), values.get(VCFHeader.HEADER_FIELDS.REF).charAt(0)));
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
return new VCFRecord(values, mFormatString, genotypeRecords);
|
return new VCFRecord(values, mFormatString, genotypeRecords);
|
||||||
|
|
@ -217,6 +218,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
||||||
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
VCFGenotypeRecord.PHASE phase = VCFGenotypeRecord.PHASE.UNKNOWN;
|
||||||
List<String> bases = new ArrayList<String>();
|
List<String> bases = new ArrayList<String>();
|
||||||
String keyStrings[] = formatString.split(":");
|
String keyStrings[] = formatString.split(":");
|
||||||
|
|
||||||
for (String key : keyStrings) {
|
for (String key : keyStrings) {
|
||||||
String parse;
|
String parse;
|
||||||
int nextDivider;
|
int nextDivider;
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,14 @@ import java.util.*;
|
||||||
|
|
||||||
/** the basic VCF record type */
|
/** the basic VCF record type */
|
||||||
public class VCFRecord {
|
public class VCFRecord {
|
||||||
|
// commonly used strings that are in the standard
|
||||||
|
public static final String FORMAT_FIELD_SEPERATOR = ":";
|
||||||
|
public static final String GENOTYPE_FIELD_SEPERATOR = ":";
|
||||||
public static final String FIELD_SEPERATOR = "\t";
|
public static final String FIELD_SEPERATOR = "\t";
|
||||||
|
public static final String FILTER_CODE_SEPERATOR = ";";
|
||||||
|
public static final String INFO_FIELD_SEPERATOR = ";";
|
||||||
|
public static final String EMPTY_INFO_FIELD = ".";
|
||||||
|
public static final String DOUBLE_PRECISION_FORMAT_STRING = "%.2f";
|
||||||
// the reference base
|
// the reference base
|
||||||
private char mReferenceBase;
|
private char mReferenceBase;
|
||||||
// our contig
|
// our contig
|
||||||
|
|
@ -146,10 +153,7 @@ public class VCFRecord {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public boolean hasGenotypeData() {
|
public boolean hasGenotypeData() {
|
||||||
if (mGenotypeFields.size() < 1) {
|
return (mGenotypeFields.size() > 0);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @return the string for the chromosome that this VCF record is associated with */
|
/** @return the string for the chromosome that this VCF record is associated with */
|
||||||
|
|
@ -321,14 +325,14 @@ public class VCFRecord {
|
||||||
String alts = "";
|
String alts = "";
|
||||||
for (String str : this.getAlternateAlleles()) alts += str + ",";
|
for (String str : this.getAlternateAlleles()) alts += str + ",";
|
||||||
builder.append((alts.length() > 0) ? alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR : "." + FIELD_SEPERATOR);
|
builder.append((alts.length() > 0) ? alts.substring(0, alts.length() - 1) + FIELD_SEPERATOR : "." + FIELD_SEPERATOR);
|
||||||
builder.append(String.format("%.2f",getQual()) + FIELD_SEPERATOR);
|
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING,getQual()) + FIELD_SEPERATOR);
|
||||||
builder.append(Utils.join(";", getFilteringCodes()) + FIELD_SEPERATOR);
|
builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()) + FIELD_SEPERATOR);
|
||||||
String info = "";
|
String info = "";
|
||||||
for (String str : this.getInfoValues().keySet()) {
|
for (String str : this.getInfoValues().keySet()) {
|
||||||
if (str.equals("."))
|
if (str.equals(EMPTY_INFO_FIELD))
|
||||||
info = ".";
|
info = EMPTY_INFO_FIELD;
|
||||||
else
|
else
|
||||||
info += str + "=" + getInfoValues().get(str) + ";";
|
info += str + "=" + getInfoValues().get(str) + INFO_FIELD_SEPERATOR;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (info.length() > 1) builder.append(info.substring(0, info.length() - 1));
|
if (info.length() > 1) builder.append(info.substring(0, info.length() - 1));
|
||||||
|
|
@ -363,12 +367,12 @@ public class VCFRecord {
|
||||||
builder.append(rec.toGenotypeString(this.mAlts));
|
builder.append(rec.toGenotypeString(this.mAlts));
|
||||||
for (String s : rec.getFields().keySet()) {
|
for (String s : rec.getFields().keySet()) {
|
||||||
if (rec.getFields().get(s).equals("")) continue;
|
if (rec.getFields().get(s).equals("")) continue;
|
||||||
builder.append(":");
|
builder.append(GENOTYPE_FIELD_SEPERATOR);
|
||||||
builder.append(rec.getFields().get(s));
|
builder.append(rec.getFields().get(s));
|
||||||
}
|
}
|
||||||
gMap.remove(genotype);
|
gMap.remove(genotype);
|
||||||
} else {
|
} else {
|
||||||
builder.append(".");
|
builder.append(VCFGenotypeRecord.EMPTY_GENOTYPE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (gMap.size() != 0) {
|
if (gMap.size() != 0) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue