Various improvements to plink, variant context, and VCF code.

We almost completely support indels. Not yet done with plink stuff.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2926 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-03-04 17:58:01 +00:00
parent c8077b7a22
commit 0dd65461a1
10 changed files with 285 additions and 138 deletions

View File

@ -84,8 +84,10 @@ public class Allele implements Comparable<Allele> {
private byte[] bases = null;
public final static String NULL_ALLELE_STRING = "-";
public final static String NO_CALL_STRING = ".";
/** A generic static NO_CALL allele for use */
public final static Allele NO_CALL = new Allele(".");
public final static Allele NO_CALL = new Allele(NO_CALL_STRING);
/**
* Create a new Allele that includes bases and is tagged as the reference allele if isRef == true. If bases

View File

@ -596,6 +596,11 @@ public class VariantContext {
//
// ---------------------------------------------------------------------------------------------------------
/**
* @return the number of samples in the context
*/
public int getNSamples() { return genotypes.size(); }
/**
* @return true if the context has associated genotypes
*/
@ -905,7 +910,7 @@ public class VariantContext {
for ( Allele a : alleles ) {
for ( Allele b : dest ) {
if ( a.basesMatch(b) )
throw new IllegalArgumentException("Duplicate allele added to VariantContext" + a);
throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a);
}
dest.add(a);
@ -915,10 +920,10 @@ public class VariantContext {
}
private static Map<String, Genotype> genotypeCollectionToMap(Map<String, Genotype> dest, Collection<Genotype> genotypes) {
for ( Genotype a : genotypes ) {
if ( dest.containsKey(a.getSampleName() ) )
throw new IllegalArgumentException("Duplicate genotype added to VariantContext " + a);
dest.put(a.getSampleName(), a);
for ( Genotype g : genotypes ) {
if ( dest.containsKey(g.getSampleName() ) )
throw new IllegalArgumentException("Duplicate genotype added to VariantContext: " + g);
dest.put(g.getSampleName(), g);
}
return dest;

View File

@ -16,7 +16,7 @@ import java.util.*;
public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<PlinkRod> {
public static final String SEQUENOM_NO_CALL = "0";
public static final String SEQUENOM_NO_VARIANT = "-";
public static final String SEQUENOM_NO_BASE = "-";
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
@ -27,6 +27,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
private PlinkFileType plinkFileType;
private List<String> sampleNames;
public enum PlinkFileType {
STANDARD_PED, RAW_PED, BINARY_PED
}
@ -35,10 +37,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
super(name);
}
public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter) {
public PlinkRod(String name, PlinkVariantInfo record, ListIterator<PlinkVariantInfo> iter, List<String> sampleNames) {
super(name);
currentVariant = record;
variantIterator = iter;
this.sampleNames = sampleNames;
}
@Override
@ -81,7 +84,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
// get the next record
currentVariant = variantIterator.next();
return new PlinkRod(name, currentVariant, variantIterator);
return new PlinkRod(name, currentVariant, variantIterator, sampleNames);
}
@Override
@ -111,9 +114,12 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
}
public VariantContext getVariantContext() {
assertNotNull();
return currentVariant.getVariantContext();
public Map<String, List<String>> getGenotypes() {
return currentVariant.getGenotypes();
}
public List<String> getSampleNames() {
return sampleNames;
}
public boolean isIndel() {
@ -121,6 +127,16 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return currentVariant.isIndel();
}
public boolean isInsertion() {
assertNotNull();
return currentVariant.isInsertion();
}
public int getLength() {
assertNotNull();
return currentVariant.getLength();
}
// AM I PARSING A TEXT OR A BINARY FILE ??
@ -150,6 +166,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header);
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
sampleNames = new ArrayList<String>();
String line;
do {
line = reader.readLine();
@ -180,6 +198,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
plinkInfo = plinkLine.split("\t");
String individualName = plinkInfo[1];
sampleNames.add(individualName);
int snpNumber = 0;
for ( int i : offsets ) {
@ -237,8 +256,8 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
private ArrayList<PlinkVariantInfo> parseBinaryFormattedPlinkFile(File file) {
PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file);
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
ArrayList<String> sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile);
sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants, binaryFiles.bedFile);
java.util.Collections.sort(updatedVariants);
@ -294,7 +313,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return variants;
}
private ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
private static ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
BufferedReader reader;
try {
reader = new BufferedReader( new FileReader( famFile ));
@ -321,7 +340,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return sampleNames;
}
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, ArrayList<String> samples, File bedFile) {
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, File bedFile) {
FileInputStream inStream;
try {
inStream = new FileInputStream(bedFile);
@ -342,11 +361,11 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
bytesRead++;
if ( genotype != -1 ) {
if ( bytesRead > 3 ) {
addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode);
addGenotypeByte(genotype,variants,snpOffset,sampleOffset, snpMajorMode);
if ( snpMajorMode ) {
sampleOffset = sampleOffset + 4;
if ( sampleOffset > samples.size() -1 ) {
if ( sampleOffset > sampleNames.size() -1 ) {
snpOffset ++;
sampleOffset = 0;
}
@ -372,7 +391,7 @@ public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator<Pli
return variants;
}
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, ArrayList<String> sampleNames, int snpOffset, int sampleOffset, boolean major) {
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, int snpOffset, int sampleOffset, boolean major) {
// four genotypes encoded in this byte
int[] genotypes = parseGenotypes(genotype);
for ( int g : genotypes ) {
@ -406,13 +425,12 @@ class PlinkVariantInfo implements Comparable {
private String variantName;
private GenomeLoc loc;
private HashSet<Allele> alleles = new HashSet<Allele>();
private HashSet<Genotype> genotypes = new HashSet<Genotype>();
private Map<String, List<String>> genotypes = new HashMap<String, List<String>>();
// for indels
private boolean isIndel = false;
private boolean isInsertion = false;
private int indelLength = 0;
private int length = 1;
// for binary parsing
private String locAllele1;
@ -432,18 +450,22 @@ class PlinkVariantInfo implements Comparable {
return variantName;
}
public VariantContext getVariantContext() {
try {
return new VariantContext(variantName, loc, alleles, genotypes);
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
}
public Map<String, List<String>> getGenotypes() {
return genotypes;
}
public boolean isIndel() {
return isIndel;
}
public boolean isInsertion() {
return isInsertion;
}
public int getLength() {
return length;
}
public void setGenomeLoc(GenomeLoc loc) {
this.loc = loc;
}
@ -463,10 +485,17 @@ class PlinkVariantInfo implements Comparable {
String pos = pieces[1].substring(1);
loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
if ( pieces.length > 2 && (pieces[2].equals("gI") || pieces[2].equals("gD")) ) {
if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) {
// it's an indel
isIndel = true;
isInsertion = pieces[2].equals("gI");
isInsertion = pieces[2].startsWith("gI");
try {
// length of insertion on reference is still 1
if ( !isInsertion )
length = Integer.parseInt(pieces[2].substring(2));
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])");
}
}
}
@ -482,32 +511,16 @@ class PlinkVariantInfo implements Comparable {
public void addGenotypeEntry(String[] alleleStrings, String sampleName) {
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
ArrayList<String> alleles = new ArrayList<String>(2);
for ( String alleleString : alleleStrings ) {
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) ) {
myAlleles.add(Allele.NO_CALL);
} else {
Allele allele;
if ( !isIndel ) {
allele = new Allele(alleleString);
} else if ( alleleString.equals(PlinkRod.SEQUENOM_NO_VARIANT) ) {
allele = new Allele(alleleString, isInsertion);
} else {
allele = new Allele(alleleString, !isInsertion);
if ( indelLength == 0 ) {
indelLength = alleleString.length();
loc = GenomeLocParser.parseGenomeLoc(loc.getContig(), loc.getStart(), loc.getStart()+indelLength-1);
}
}
myAlleles.add(allele);
if ( alleles.size() < 2 )
alleles.add(allele);
}
if ( alleleString.equals(PlinkRod.SEQUENOM_NO_CALL) )
alleles.add(Allele.NO_CALL_STRING);
else
alleles.add(alleleString);
}
genotypes.add(new Genotype(sampleName, myAlleles, 20.0));
genotypes.put(sampleName, alleles);
}
public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) {

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Allele;
import org.broadinstitute.sting.gatk.contexts.variantcontext.Genotype;
@ -36,6 +38,7 @@ public class VariantContextAdaptors {
adaptors.put(rodDbSNP.class, new RodDBSnpAdaptor());
adaptors.put(RodVCF.class, new RodVCFAdaptor());
adaptors.put(VCFRecord.class, new VCFRecordAdaptor());
adaptors.put(PlinkRod.class, new PlinkRodAdaptor());
}
public static boolean canBeConvertedToVariantContext(Object variantContainingObject) {
@ -46,6 +49,7 @@ public class VariantContextAdaptors {
/** generic superclass */
private static abstract class VCAdaptor {
abstract VariantContext convert(String name, Object input);
abstract VariantContext convert(String name, Object input, Allele refAllele);
}
public static VariantContext toVariantContext(String name, Object variantContainingObject) {
@ -56,9 +60,17 @@ public class VariantContextAdaptors {
}
}
public static VariantContext toVariantContext(String name, Object variantContainingObject, Allele refAllele) {
if ( ! adaptors.containsKey(variantContainingObject.getClass()) )
return null;
else {
return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, refAllele);
}
}
// --------------------------------------------------------------------------------------------------------------
//
// From here below you can add adaptor classes for new rods (or other types) to convert to VCF
// From here below you can add adaptor classes for new rods (or other types) to convert to VC
//
// --------------------------------------------------------------------------------------------------------------
@ -72,16 +84,17 @@ public class VariantContextAdaptors {
private static class RodDBSnpAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
if ( ! Allele.acceptableAlleleBases(((rodDbSNP)input).getReference()) )
return null;
Allele refAllele = new Allele(((rodDbSNP)input).getReference(), true);
return convert(name, input, refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
rodDbSNP dbsnp = (rodDbSNP)input;
if ( dbsnp.isSNP() || dbsnp.isIndel() || dbsnp.varType.contains("mixed") ) {
// add the reference allele
if ( ! Allele.acceptableAlleleBases(dbsnp.getReference()) ) {
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
return null;
}
List<Allele> alleles = new ArrayList<Allele>();
Allele refAllele = new Allele(dbsnp.getReference(), true);
alleles.add(refAllele);
// add all of the alt alleles
@ -112,18 +125,28 @@ public class VariantContextAdaptors {
private static class RodVCFAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
return vcfToVariantContext(name, ((RodVCF)input).getRecord());
Allele refAllele = new Allele(((RodVCF)input).getReference(), true);
return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
return vcfToVariantContext(name, ((RodVCF)input).getRecord(), refAllele);
}
}
private static class VCFRecordAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
return vcfToVariantContext(name, (VCFRecord)input);
Allele refAllele = new Allele(((VCFRecord)input).getReference(), true);
return vcfToVariantContext(name, (VCFRecord)input, refAllele);
}
VariantContext convert(String name, Object input, Allele refAllele) {
return vcfToVariantContext(name, (VCFRecord)input, refAllele);
}
}
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf) {
private static VariantContext vcfToVariantContext(String name, VCFRecord vcf, Allele refAllele) {
if ( vcf.isSNP() || vcf.isIndel() ) {
// add the reference allele
if ( ! Allele.acceptableAlleleBases(vcf.getReference()) ) {
@ -137,7 +160,6 @@ public class VariantContextAdaptors {
// add all of the alt alleles
List<Allele> alleles = new ArrayList<Allele>();
Allele refAllele = new Allele(vcf.getReference(), true);
alleles.add(refAllele);
for ( String alt : vcf.getAlternateAlleleList() ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
@ -194,9 +216,9 @@ public class VariantContextAdaptors {
return new VCFHeader(hInfo == null ? new HashSet<VCFHeaderLine>() : hInfo, names);
}
public static VCFRecord toVCF(VariantContext vc) {
public static VCFRecord toVCF(VariantContext vc, char previousRefBase) {
// deal with the reference
String referenceBase = new String(vc.getReference().getBases());
String referenceBases = new String(vc.getReference().getBases());
String contig = vc.getLocation().getContig();
long position = vc.getLocation().getStart();
@ -205,15 +227,51 @@ public class VariantContextAdaptors {
String filters = vc.isFiltered() ? Utils.join(";", Utils.sorted(vc.getFilters())) : VCFRecord.PASSES_FILTERS;
Map<Allele, VCFGenotypeEncoding> alleleMap = new HashMap<Allele, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL, new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
// TODO - fixme: temporarily using Strings as keys because Alleles aren't hashing correctly
Map<String, VCFGenotypeEncoding> alleleMap = new HashMap<String, VCFGenotypeEncoding>();
alleleMap.put(Allele.NO_CALL.toString(), new VCFGenotypeEncoding(VCFGenotypeRecord.EMPTY_ALLELE)); // convenience for lookup
List<VCFGenotypeEncoding> vcfAltAlleles = new ArrayList<VCFGenotypeEncoding>();
for ( Allele a : vc.getAlleles() ) {
VCFGenotypeEncoding encoding = new VCFGenotypeEncoding(new String(a.getBases()));
VCFGenotypeEncoding encoding;
// This is tricky because the VCF spec states that the reference must be a single
// character, whereas the reference alleles for deletions of length > 1 are strings.
// To be safe, we require the reference base be passed in and we use that whenever
// we know that the given allele is the reference.
String alleleString = new String(a.getBases());
if ( vc.getType() == VariantContext.Type.MIXED ) {
throw new UnsupportedOperationException("Conversion from a mixed type isn't currently supported");
} else if ( vc.getType() == VariantContext.Type.INDEL ) {
if ( a.isNull() ) {
if ( a.isReference() ) {
// ref, where alt is insertion
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
} else {
// non-ref deletion
encoding = new VCFGenotypeEncoding("D" + Integer.toString(referenceBases.length()));
}
} else if ( a.isReference() ) {
// ref, where alt is deletion
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
} else {
// non-ref insertion
encoding = new VCFGenotypeEncoding("I" + alleleString);
}
} else if ( vc.getType() == VariantContext.Type.NO_VARIATION ) {
// ref
encoding = new VCFGenotypeEncoding(Character.toString(previousRefBase));
} else {
// ref or alt for snp
encoding = new VCFGenotypeEncoding(alleleString);
}
if ( a.isNonReference() ) {
vcfAltAlleles.add(encoding);
}
alleleMap.put(a, encoding);
alleleMap.put(a.toString(), encoding);
}
List<String> vcfGenotypeAttributeKeys = new ArrayList<String>(Arrays.asList(VCFGenotypeRecord.GENOTYPE_KEY));
@ -225,7 +283,7 @@ public class VariantContextAdaptors {
for ( Genotype g : vc.getGenotypesSortedByName() ) {
List<VCFGenotypeEncoding> encodings = new ArrayList<VCFGenotypeEncoding>(g.getPloidy());
for ( Allele a : g.getAlleles() ) {
encodings.add(alleleMap.get(a));
encodings.add(alleleMap.get(a.toString()));
}
VCFGenotypeRecord.PHASE phasing = g.genotypesArePhased() ? VCFGenotypeRecord.PHASE.PHASED : VCFGenotypeRecord.PHASE.UNPHASED;
@ -240,7 +298,6 @@ public class VariantContextAdaptors {
genotypeObjects.add(vcfG);
}
// info fields
Map<String, String> infoFields = new HashMap<String, String>();
for ( Map.Entry<String, Object> elt : vc.getAttributes().entrySet() ) {
@ -251,7 +308,7 @@ public class VariantContextAdaptors {
}
}
return new VCFRecord(referenceBase, contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects);
return new VCFRecord(Character.toString(previousRefBase), contig, position, ID, vcfAltAlleles, qual, filters, infoFields, genotypeFormatString, genotypeObjects);
}
private static List<String> calcVCFGenotypeKeys(VariantContext vc) {
@ -266,4 +323,66 @@ public class VariantContextAdaptors {
keys.add(VCFGenotypeRecord.GENOTYPE_QUALITY_KEY);
return Utils.sorted(new ArrayList<String>(keys));
}
// --------------------------------------------------------------------------------------------------------------
//
// Plink to VariantContext
//
// --------------------------------------------------------------------------------------------------------------
private static class PlinkRodAdaptor extends VCAdaptor {
VariantContext convert(String name, Object input) {
throw new UnsupportedOperationException("Conversion from Plink to VC requires a reference allele");
}
VariantContext convert(String name, Object input, Allele refAllele) {
PlinkRod plink = (PlinkRod)input;
HashMap<String, Allele> alleles = new HashMap<String, Allele>(); // use String keys to help maintain uniqueness
alleles.put(refAllele.isNull() ? Allele.NULL_ALLELE_STRING : new String(refAllele.getBases()), refAllele);
Set<Genotype> genotypes = new HashSet<Genotype>();
Map<String, List<String>> genotypeSets = plink.getGenotypes();
// for each sample
for ( Map.Entry<String, List<String>> genotype : genotypeSets.entrySet() ) {
ArrayList<Allele> myAlleles = new ArrayList<Allele>(2);
// for each allele
for ( String alleleString : genotype.getValue() ) {
Allele allele;
if ( alleleString.equals(Allele.NO_CALL_STRING) ) {
allele = Allele.NO_CALL;
} else {
if ( !plink.isIndel() ) {
allele = new Allele(alleleString, refAllele.basesMatch(alleleString));
} else if ( alleleString.equals(Allele.NULL_ALLELE_STRING) ) {
allele = new Allele(alleleString, plink.isInsertion());
} else {
allele = new Allele(alleleString, !plink.isInsertion());
}
if ( !alleles.containsKey(alleleString) )
alleles.put(alleleString, allele);
}
myAlleles.add(allele);
}
// create the genotype
genotypes.add(new Genotype(genotype.getKey(), myAlleles));
}
// create the variant context
try {
GenomeLoc loc = GenomeLocParser.setStop(plink.getLocation(), plink.getLocation().getStop() + plink.getLength()-1);
VariantContext vc = new VariantContext(plink.getVariantName(), loc, alleles.values(), genotypes);
vc.validate();
return vc;
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file");
}
}
}
}

View File

@ -5,15 +5,10 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.vcf.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.EnumSet;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.io.File;
/**
@ -29,7 +24,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
@Argument(fullName="onlyContextsStartinAtCurrentPosition", doc="Only take variant contexts that actually start at the current position, excluding those that span to the current location but start earlier", required=false)
boolean onlyContextsStartinAtCurrentPosition = false;
@Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false)
@Argument(fullName="printPerLocus", doc="If true, we'll print the variant contexts, in addition to counts", required=false)
boolean printContexts = false;
@Argument(fullName="outputVCF", doc="If provided, we'll convert the first input context into a VCF", required=false)
@ -57,7 +52,7 @@ public class TestVariantContextWalker extends RodWalker<Integer, Integer> {
wroteHeader = true;
}
writer.addRecord(VariantContextAdaptors.toVCF(vc));
writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
}
n++;

View File

@ -396,13 +396,13 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
}
if ( group.enableInterestingSiteCaptures && captureInterestingSitesOfEvalSet(group) )
writeInterestingSite(interestingReasons, vc);
writeInterestingSite(interestingReasons, vc, ref.getBase());
}
return 0;
}
private void writeInterestingSite(List<String> interestingReasons, VariantContext vc) {
private void writeInterestingSite(List<String> interestingReasons, VariantContext vc, char ref) {
if ( writer != null && interestingReasons.size() > 0 ) {
MutableVariantContext mvc = new MutableVariantContext(vc);
@ -431,7 +431,7 @@ public class VariantEval2Walker extends RodWalker<Integer, Integer> {
wroteHeader = true;
}
writer.addRecord(VariantContextAdaptors.toVCF(mvc));
writer.addRecord(VariantContextAdaptors.toVCF(mvc, ref));
//interestingReasons.clear();
}
}

View File

@ -10,19 +10,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.genotype.vcf.VCFRecord;
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeader;
import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.oneoffprojects.walkers.varianteval2.MendelianViolationEvaluator;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import java.io.FileNotFoundException;
/**
* Test routine for new VariantContext object
@ -68,6 +60,9 @@ public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
}
}
if ( vc != null )
writer.addRecord(VariantContextAdaptors.toVCF(vc, ref.getBase()));
return vc;
}
@ -121,13 +116,6 @@ public class BeagleTrioToVCFWalker extends RodWalker<VariantContext, Long> {
}
public Long reduce(VariantContext vc, Long prevReduce) {
if ( vc == null ) {
return prevReduce;
} else {
writer.addRecord(VariantContextAdaptors.toVCF(vc));
prevReduce++;
}
return prevReduce;
return ( vc == null ? prevReduce : prevReduce+1);
}
}

View File

@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.refdata.PlinkRod;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.genotype.vcf.*;
@ -22,6 +24,7 @@ import java.util.*;
/**
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
*/
@Reference(window=@Window(start=0,stop=40))
public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="outputVCF", shortName="vcf", doc="The VCF file to write to", required=true)
public File vcfFile = null;
@ -39,27 +42,19 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
@Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false)
public double maxHomNonref = 1.1;
private final Set<String> HEADER_FIELDS = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype",
"FID","IID","PAT","MAT","SEX","PHENOTYPE"));
private static final int MAX_INDEL_SIZE = 40;
private final int INIT_NUMBER_OF_POPULATIONS = 10;
private final int DEFAULT_QUALITY = 20;
private ArrayList<String> sampleNames = new ArrayList<String>(nSamples);
private VCFGenotypeWriterAdapter vcfWriter;
private VCFWriter vcfWriter = null;
private final HardyWeinberg HWCalc = new HardyWeinberg();
private final boolean useSmartHardy = popFile != null;
private HashMap<String,String> samplesToPopulation;
public void initialize() {
vcfWriter = new VCFGenotypeWriterAdapter(vcfFile);
if ( useSmartHardy ) {
samplesToPopulation = parsePopulationFile(popFile);
}
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit()));
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
vcfWriter.writeHeader(new TreeSet<String>(sampleNames), hInfo);
nSamples = sampleNames.size();
}
public Integer reduceInit() {
@ -85,50 +80,69 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
if ( plinkRod == null )
return null;
if ( vcfWriter == null )
initializeWriter(plinkRod);
return addVariantInformationToCall(ref, plinkRod);
}
private void initializeWriter(PlinkRod plinkRod) {
vcfWriter = new VCFWriter(vcfFile);
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.add(new VCFHeaderLine("source", "PlinkToVCF"));
hInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName()));
VCFHeader header = new VCFHeader(hInfo, new TreeSet<String>(plinkRod.getSampleNames()));
vcfWriter.writeHeader(header);
nSamples = sampleNames.size();
}
public Integer reduce(VCFRecord call, Integer numVariants) {
if ( call != null ) {
numVariants++;
printToVCF(call);
vcfWriter.addRecord(call);
}
return numVariants;
}
public void onTraversalDone(Integer finalReduce) {
logger.info("Variants processed="+finalReduce.toString());
vcfWriter.close();
if ( vcfWriter != null )
vcfWriter.close();
}
private void printToVCF(VCFRecord call) {
try {
vcfWriter.addRecord(call);
} catch ( RuntimeException e ) {
if ( e.getLocalizedMessage().equalsIgnoreCase("We have more genotype samples than the header specified")) {
throw new StingException("We have more sample genotypes than sample names -- check that there are no duplicates in the .ped file",e);
} else {
throw new StingException("Error in VCF creation: "+e.getLocalizedMessage(),e);
}
}
}
private VCFRecord addVariantInformationToCall(ReferenceContext ref, PlinkRod plinkRod) {
VariantContext vContext = plinkRod.getVariantContext();
VCFRecord record = VariantContextAdaptors.toVCF(vContext);
// determine the reference allele
Allele refAllele;
if ( !plinkRod.isIndel() ) {
refAllele = new Allele(Character.toString(ref.getBase()), true);
} else if ( plinkRod.isInsertion() ) {
refAllele = new Allele(PlinkRod.SEQUENOM_NO_BASE, true);
} else {
if ( plinkRod.getLength() > MAX_INDEL_SIZE )
throw new UnsupportedOperationException("PlinkToVCF currently can only handle indels up to length " + MAX_INDEL_SIZE);
char[] deletion = new char[plinkRod.getLength()];
System.arraycopy(ref.getBases(), 1, deletion, 0, plinkRod.getLength());
refAllele = new Allele(new String(deletion), true);
}
VariantContext vContext = VariantContextAdaptors.toVariantContext(plinkRod.getName(), plinkRod, refAllele);
VCFRecord record = VariantContextAdaptors.toVCF(vContext, ref.getBase());
record.setGenotypeFormatString("GT");
int numNoCalls = vContext.getNoCallCount();
int numHomVarCalls = vContext.getHomVarCount();
if ( true )
return record;
int numHetCalls = vContext.getHetCount();
double noCallProp = (double)vContext.getNoCallCount() / (double)vContext.getNSamples();
double homVarProp = (double)vContext.getHomVarCount() / (double)vContext.getNSamples();
double noCallProp = ( (double) numNoCalls )/( (double) sampleNames.size());
double homNonRProp = ( (double) numHomVarCalls )/( (double) sampleNames.size() - numNoCalls);
record.setQual(DEFAULT_QUALITY);
String hw = hardyWeinbergCalculation(ref,record);
double hwScore = hw != null ? Double.valueOf(hw) : -0.0;
// TODO -- record.addInfoFields(generateInfoField(record, numNoCalls,numHomVarCalls,numNonrefAlleles,ref, plinkRod, hwScore));
@ -136,7 +150,7 @@ public class PlinkToVCF extends RodWalker<VCFRecord,Integer> {
record.setFilterString("Hardy-Weinberg");
} else if ( noCallProp > maxNoCall ) {
record.setFilterString("No-calls");
} else if ( homNonRProp > maxHomNonref) {
} else if ( homVarProp > maxHomNonref) {
record.setFilterString("HomNonref-calls");
}

View File

@ -582,7 +582,10 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
builder.append(EMPTY_ALLELE_FIELD);
}
builder.append(FIELD_SEPERATOR);
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, getQual()));
if ( MathUtils.compareDoubles(mQual, -1.0) == 0 )
builder.append("-1");
else
builder.append(String.format(DOUBLE_PRECISION_FORMAT_STRING, mQual));
builder.append(FIELD_SEPERATOR);
builder.append(Utils.join(FILTER_CODE_SEPERATOR, getFilteringCodes()));
builder.append(FIELD_SEPERATOR);
@ -623,12 +626,20 @@ public class VCFRecord implements Variation, VariantBackedByGenotype {
* @param header the header object
*/
private void addGenotypeData(StringBuilder builder, VCFHeader header) {
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
StringBuffer tempStr = new StringBuffer();
if ( header.getGenotypeSamples().size() < getGenotypes().size() )
throw new IllegalStateException("We have more genotype samples than the header specified");
if ( header.getGenotypeSamples().size() < getGenotypes().size() ) {
for ( String sample : gMap.keySet() ) {
if ( !header.getGenotypeSamples().contains(sample) )
System.err.println("Sample " + sample + " is a duplicate or is otherwise not present in the header");
else
header.getGenotypeSamples().remove(sample);
}
throw new IllegalStateException("We have more genotype samples than the header specified; please check that samples aren't duplicated");
}
tempStr.append(FIELD_SEPERATOR + mGenotypeFormatString);
Map<String, VCFGenotypeRecord> gMap = genotypeListToMap(getGenotypes());
String[] genotypeFormatStrings = mGenotypeFormatString.split(":");
for ( String genotype : header.getGenotypeSamples() ) {

View File

@ -7,7 +7,7 @@ import java.util.Arrays;
public class IndelRealignerIntegrationTest extends WalkerTest {
@Test
public void testIntervals() {
public void testRealigner() {
String[] md5lod5 = {"67c3fc25e9d192cc5fbfd48ade0efc84", "86778f92b0fa6aa7c26e651c8c1eb320"};
WalkerTestSpec spec1 = new WalkerTestSpec(
@ -23,4 +23,4 @@ public class IndelRealignerIntegrationTest extends WalkerTest {
Arrays.asList(md5lod200));
executeTest("testLod200", spec2);
}
}
}