VCF output now emits no calls as ./.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1863 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d1a4cd2f73
commit
8aacc43203
|
|
@ -90,7 +90,10 @@ public class VCFGenotypeRecord {
|
|||
String str = "";
|
||||
boolean first = true;
|
||||
for (VCFGenotypeEncoding allele : mGenotypeAlleles) {
|
||||
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
|
||||
if (allele.getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
||||
str += VCFGenotypeRecord.EMPTY_GENOTYPE;
|
||||
else
|
||||
str += String.valueOf((altAlleles.contains(allele)) ? altAlleles.indexOf(allele) + 1 : 0);
|
||||
if (first) {
|
||||
switch (mPhaseType) {
|
||||
case UNPHASED:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.*;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.OutputStream;
|
||||
|
|
@ -25,6 +26,10 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
private final File mFile;
|
||||
private final OutputStream mStream;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(VCFGenotypeWriterAdapter.class);
|
||||
|
||||
|
||||
public VCFGenotypeWriterAdapter(String source, String referenceName, File writeTo, Set<String> sampleNames) {
|
||||
mReferenceName = referenceName;
|
||||
mSource = source;
|
||||
|
|
@ -125,12 +130,32 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
VCFParameters params = new VCFParameters();
|
||||
params.addFormatItem("GT");
|
||||
|
||||
for (Genotype gtype : genotypes) {
|
||||
// setup the parameters
|
||||
params.setLocations(gtype.getLocation(), gtype.getReference());
|
||||
// mapping of our sample names to genotypes
|
||||
if (genotypes.size() < 1) {
|
||||
throw new IllegalArgumentException("Unable to parse out the current location: genotype array must at least contain one entry");
|
||||
}
|
||||
|
||||
VCFGenotypeRecord record = createVCFGenotypeRecord(params, gtype);
|
||||
params.addGenotypeRecord(record);
|
||||
// get the location and reference
|
||||
params.setLocations(genotypes.get(0).getLocation(), genotypes.get(0).getReference());
|
||||
|
||||
Map<String, Genotype> genotypeMap = genotypeListToSampleNameMap(genotypes);
|
||||
|
||||
for (String name : mHeader.getGenotypeSamples()) {
|
||||
if (genotypeMap.containsKey(name)) {
|
||||
Genotype gtype = genotypeMap.get(name);
|
||||
VCFGenotypeRecord record = createVCFGenotypeRecord(params, gtype);
|
||||
params.addGenotypeRecord(record);
|
||||
genotypeMap.remove(name);
|
||||
} else {
|
||||
VCFGenotypeRecord record = createNoCallRecord(params, name);
|
||||
params.addGenotypeRecord(record);
|
||||
}
|
||||
}
|
||||
|
||||
if (genotypeMap.size() > 0) {
|
||||
for (String name : genotypeMap.keySet())
|
||||
logger.fatal("Genotype " + name + " was present in the VCFHeader");
|
||||
throw new IllegalArgumentException("Genotype array passed to VCFGenotypeWriterAdapter contained Genotypes not in the VCF header");
|
||||
}
|
||||
|
||||
Map<String, String> infoFields = getInfoFields(metadata, params);
|
||||
|
|
@ -213,6 +238,29 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a no call record
|
||||
*
|
||||
* @param params the VCF parameters object
|
||||
* @param sampleName the sample name
|
||||
*
|
||||
* @return a VCFGenotypeRecord for the no call situation
|
||||
*/
|
||||
private VCFGenotypeRecord createNoCallRecord(VCFParameters params, String sampleName) {
|
||||
Map<String, String> map = new HashMap<String, String>();
|
||||
|
||||
|
||||
List<VCFGenotypeEncoding> alleles = new ArrayList<VCFGenotypeEncoding>();
|
||||
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
|
||||
alleles.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
|
||||
|
||||
VCFGenotypeRecord record = new VCFGenotypeRecord(sampleName,
|
||||
alleles,
|
||||
VCFGenotypeRecord.PHASE.UNPHASED,
|
||||
map);
|
||||
return record;
|
||||
}
|
||||
|
||||
/**
|
||||
* create the allele array?
|
||||
*
|
||||
|
|
@ -234,4 +282,21 @@ public class VCFGenotypeWriterAdapter implements GenotypeWriter {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a genotype mapping from a list and their sample names
|
||||
*
|
||||
* @param list a list of genotype samples
|
||||
*
|
||||
* @return a mapping of the sample name to genotype fields
|
||||
*/
|
||||
private static Map<String, Genotype> genotypeListToSampleNameMap(List<Genotype> list) {
|
||||
Map<String, Genotype> map = new HashMap<String, Genotype>();
|
||||
for (Genotype rec : list) {
|
||||
if (!(rec instanceof SampleBacked))
|
||||
throw new IllegalArgumentException("Genotype must be backed by sample information");
|
||||
map.put(((SampleBacked) rec).getSampleName(), rec);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
private static Pattern pMeta = Pattern.compile("^" + VCFHeader.METADATA_INDICATOR + "\\s*(\\S+)\\s*=\\s*(\\S+)\\s*$");
|
||||
|
||||
// our pattern matching for the genotype mFields
|
||||
private static final Pattern gtPattern = Pattern.compile("([0-9]+)([\\\\|\\/])([0-9]*)");
|
||||
private static final Pattern gtPattern = Pattern.compile("([0-9\\.]+)([\\\\|\\/])([0-9\\.]*)");
|
||||
|
||||
/**
|
||||
* Create a VCF reader, given a VCF file
|
||||
|
|
@ -48,7 +48,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
lines.add(line);
|
||||
line = mReader.readLine();
|
||||
}
|
||||
mHeader = this.createHeader(lines);
|
||||
mHeader = this.createHeader(lines);
|
||||
mNextRecord = createRecord(line, mHeader);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("VCFReader: Failed to parse VCF File on line: " + line, e);
|
||||
|
|
@ -236,8 +236,7 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
phase = VCFGenotypeRecord.determinePhase(m.group(2));
|
||||
addAllele(m.group(1), altAlleles, referenceBase, bases);
|
||||
if (m.group(3).length() > 0) addAllele(m.group(3), altAlleles, referenceBase, bases);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
tagToValue.put(key, parse);
|
||||
}
|
||||
if (nextDivider + 1 >= genotypeString.length()) nextDivider = genotypeString.length() - 1;
|
||||
|
|
@ -263,14 +262,18 @@ public class VCFReader implements Iterator<VCFRecord>, Iterable<VCFRecord> {
|
|||
* @param bases the list of bases for this genotype call
|
||||
*/
|
||||
private static void addAllele(String alleleNumber, String[] altAlleles, char referenceBase, List<VCFGenotypeEncoding> bases) {
|
||||
int alleleValue = Integer.valueOf(alleleNumber);
|
||||
// check to make sure the allele value is within bounds
|
||||
if (alleleValue < 0 || alleleValue > altAlleles.length)
|
||||
throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
|
||||
if (alleleValue == 0)
|
||||
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
|
||||
else
|
||||
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
|
||||
if (alleleNumber.equals(VCFGenotypeRecord.EMPTY_GENOTYPE)) {
|
||||
bases.add(new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_GENOTYPE));
|
||||
} else {
|
||||
int alleleValue = Integer.valueOf(alleleNumber);
|
||||
// check to make sure the allele value is within bounds
|
||||
if (alleleValue < 0 || alleleValue > altAlleles.length)
|
||||
throw new IllegalArgumentException("VCFReader: the allele value of " + alleleValue + " is out of bounds given the alternate allele list.");
|
||||
if (alleleValue == 0)
|
||||
bases.add(new VCFGenotypeEncoding(String.valueOf(referenceBase)));
|
||||
else
|
||||
bases.add(new VCFGenotypeEncoding(altAlleles[alleleValue - 1]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,91 +1,136 @@
|
|||
package org.broadinstitute.sting.utils.genotype.vcf;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* test the VCFReader class test
|
||||
*/
|
||||
/** test the VCFReader class test */
|
||||
public class VCFReaderTest extends BaseTest {
|
||||
|
||||
private static File vcfFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample.vcf");
|
||||
private static File multiSampleVCF = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/MultiSample.vcf");
|
||||
|
||||
@Test
|
||||
public void testVCFInput() {
|
||||
try {
|
||||
Thread.sleep(5000);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
|
||||
}
|
||||
VCFReader reader = new VCFReader(vcfFile);
|
||||
int counter = 0;
|
||||
while (reader.hasNext()) {
|
||||
counter++;
|
||||
reader.next();
|
||||
}
|
||||
Assert.assertEquals(5,counter);
|
||||
Assert.assertEquals(5, counter);
|
||||
}
|
||||
|
||||
/**
|
||||
* test the basic parsing
|
||||
*/
|
||||
@Test
|
||||
public void testMultiSampleVCFInput() {
|
||||
VCFReader reader = new VCFReader(multiSampleVCF);
|
||||
int counter = 0;
|
||||
while (reader.hasNext()) {
|
||||
counter++;
|
||||
reader.next();
|
||||
}
|
||||
Assert.assertEquals(99, counter);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoCallSites() {
|
||||
VCFReader reader = new VCFReader(multiSampleVCF);
|
||||
if (!reader.hasNext()) Assert.fail("The reader should have a record");
|
||||
VCFRecord rec = reader.next();
|
||||
final int numberOfNoCallsTruth = 9;
|
||||
int noCalls = 0;
|
||||
for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) {
|
||||
List<VCFGenotypeEncoding> encodings = record.getAlleles();
|
||||
if (encodings.get(0).getType() == VCFGenotypeEncoding.TYPE.UNCALLED &&
|
||||
encodings.get(1).getType() == VCFGenotypeEncoding.TYPE.UNCALLED)
|
||||
noCalls++;
|
||||
}
|
||||
Assert.assertEquals(numberOfNoCallsTruth, noCalls);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testKnownCallSites() {
|
||||
VCFReader reader = new VCFReader(multiSampleVCF);
|
||||
if (!reader.hasNext()) Assert.fail("The reader should have a record");
|
||||
VCFRecord rec = reader.next();
|
||||
boolean seenNA11992 = false;
|
||||
boolean seenNA12287 = false;
|
||||
for (VCFGenotypeRecord record : rec.getVCFGenotypeRecords()) {
|
||||
if (record.getSampleName().equals("NA11992")) {
|
||||
List<VCFGenotypeEncoding> encodings = record.getAlleles();
|
||||
if (!encodings.get(0).getBases().equals("A") ||
|
||||
!encodings.get(1).getBases().equals("A")) {
|
||||
Assert.fail("Sample NA11992 at site 1:10000005 should be AA");
|
||||
}
|
||||
seenNA11992 = true;
|
||||
}
|
||||
if (record.getSampleName().equals("NA12287")) {
|
||||
List<VCFGenotypeEncoding> encodings = record.getAlleles();
|
||||
if (!encodings.get(0).getBases().equals("A") ||
|
||||
!encodings.get(1).getBases().equals("T")) {
|
||||
Assert.fail("Sample NA11992 at site 1:10000005 should be AA");
|
||||
}
|
||||
seenNA12287 = true;
|
||||
}
|
||||
}
|
||||
Assert.assertTrue(seenNA11992 && seenNA12287);
|
||||
}
|
||||
|
||||
/** test the basic parsing */
|
||||
@Test
|
||||
public void testBasicParsing() {
|
||||
String formatString = "GT:B:C:D";
|
||||
String genotypeString = "0|1:2:3:4";
|
||||
String altAlleles[] = {"A","G","T"};
|
||||
String altAlleles[] = {"A", "G", "T"};
|
||||
char referenceBase = 'C';
|
||||
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
|
||||
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
|
||||
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
|
||||
Map<String,String> values = rec.getFields();
|
||||
Assert.assertEquals(3,values.size());
|
||||
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType());
|
||||
Assert.assertEquals("C", rec.getAlleles().get(0).toString());
|
||||
Assert.assertEquals("A", rec.getAlleles().get(1).toString());
|
||||
Map<String, String> values = rec.getFields();
|
||||
Assert.assertEquals(3, values.size());
|
||||
Assert.assertTrue(values.get("B").equals("2"));
|
||||
Assert.assertTrue(values.get("C").equals("3"));
|
||||
Assert.assertTrue(values.get("D").equals("4"));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* test the parsing of a genotype field with missing parameters
|
||||
*/
|
||||
/** test the parsing of a genotype field with missing parameters */
|
||||
@Test
|
||||
public void testMissingFieldParsing() {
|
||||
String formatString = "GT:B:C:D";
|
||||
String genotypeString = "0|1:::4";
|
||||
String altAlleles[] = {"A","G","T"};
|
||||
String altAlleles[] = {"A", "G", "T"};
|
||||
char referenceBase = 'C';
|
||||
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
|
||||
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
|
||||
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
|
||||
Map<String,String> values = rec.getFields();
|
||||
Assert.assertEquals(3,values.size());
|
||||
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType());
|
||||
Assert.assertEquals("C", rec.getAlleles().get(0).toString());
|
||||
Assert.assertEquals("A", rec.getAlleles().get(1).toString());
|
||||
Map<String, String> values = rec.getFields();
|
||||
Assert.assertEquals(3, values.size());
|
||||
Assert.assertTrue(values.get("B").equals(""));
|
||||
Assert.assertTrue(values.get("C").equals(""));
|
||||
Assert.assertTrue(values.get("D").equals("4"));
|
||||
}
|
||||
|
||||
/**
|
||||
* test the parsing of a genotype field with different missing parameters
|
||||
*/
|
||||
/** test the parsing of a genotype field with different missing parameters */
|
||||
@Test
|
||||
public void testMissingAllFields() {
|
||||
String formatString = "GT:B:C:D";
|
||||
String genotypeString = "0|1:::";
|
||||
String altAlleles[] = {"A","G","T"};
|
||||
String altAlleles[] = {"A", "G", "T"};
|
||||
char referenceBase = 'C';
|
||||
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test",formatString,genotypeString,altAlleles,referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED,rec.getPhaseType());
|
||||
Assert.assertEquals("C",rec.getAlleles().get(0).toString());
|
||||
Assert.assertEquals("A",rec.getAlleles().get(1).toString());
|
||||
Map<String,String> values = rec.getFields();
|
||||
Assert.assertEquals(3,values.size());
|
||||
VCFGenotypeRecord rec = VCFReader.getVCFGenotype("test", formatString, genotypeString, altAlleles, referenceBase);
|
||||
Assert.assertEquals(VCFGenotypeRecord.PHASE.PHASED, rec.getPhaseType());
|
||||
Assert.assertEquals("C", rec.getAlleles().get(0).toString());
|
||||
Assert.assertEquals("A", rec.getAlleles().get(1).toString());
|
||||
Map<String, String> values = rec.getFields();
|
||||
Assert.assertEquals(3, values.size());
|
||||
Assert.assertTrue(values.get("B").equals(""));
|
||||
Assert.assertTrue(values.get("C").equals(""));
|
||||
Assert.assertTrue(values.get("D").equals(""));
|
||||
|
|
|
|||
Loading…
Reference in New Issue