From 286b658fab8bd062e70bfa3158694dc212fb602b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 21:25:14 -0400 Subject: [PATCH 1/4] Re-enabling parallelism in the BaseRecalibrator now that the release is out. --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 9 ++++----- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 4 ---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 580667ee2..bd75806dd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -75,11 +75,10 @@ public class BQSRIntegrationTest extends WalkerTest { Arrays.asList(params.md5)); executeTest("testBQSR-"+params.args, spec).getFirst(); - // TODO -- re-enable once parallelization is fixed in BaseRecalibrator - //WalkerTestSpec specNT2 = new WalkerTestSpec( - // params.getCommandLine() + " -nt 2", - // Arrays.asList(params.md5)); - //executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); + WalkerTestSpec specNT2 = new WalkerTestSpec( + params.getCommandLine() + " -nt 2", + Arrays.asList(params.md5)); + executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 91d982f20..e45cad971 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -136,10 +136,6 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed */ public void initialize() { - // TODO -- remove me after the 2.1 release - if ( getToolkit().getArguments().numberOfThreads > 1 ) - throw new UserException("We have temporarily disabled the ability to run BaseRecalibrator multi-threaded for performance reasons. We hope to have this fixed for the next GATK release (2.2) and apologize for the inconvenience."); - // check for unsupported access if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals) throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument"); From 3514fb6e6620a0e289837a8fb6c216de9d14c608 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 20 Aug 2012 21:39:38 -0400 Subject: [PATCH 2/4] Changed the default memory limit from none to 2GB upon suggestions from delangel, carneiro, and depristo. --- .../scala/src/org/broadinstitute/sting/queue/QSettings.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 1a50301f1..429428c4c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -52,8 +52,8 @@ class QSettings { @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) var jobEnvironmentNames: Seq[String] = Nil - @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) - var memoryLimit: Option[Double] = None + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. If not set defaults to 2GB.", required=false) + var memoryLimit: Option[Double] = Some(2) @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) var memoryLimitThreshold: Option[Double] = None From 40d5efc8043d2fc93642f35d6731a7b297858ccf Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 23:12:41 -0400 Subject: [PATCH 3/4] Fix for Adam K's reported bug: we weren't handling reads that were entirely insertions properly in LIBS. Specifically, the event bases were off-by-one (which was disasterous in Adam's case with a 1bp read). Added a unit test to cover this case. --- .../gatk/iterators/LocusIteratorByState.java | 9 ++-- .../LocusIteratorByStateUnitTest.java | 45 +++++++++++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index f97069189..b5d5dfc0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -301,6 +301,7 @@ public class LocusIteratorByState extends LocusIterator { final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator final int readOffset = state.getReadOffset(); // the base offset on this read @@ -308,7 +309,7 @@ public class LocusIteratorByState extends LocusIterator { final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; - final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); int nextElementLength = nextElement.getLength(); @@ -328,8 +329,10 @@ public class LocusIteratorByState extends LocusIterator { else { if (!filterBaseInRead(read, location.getStart())) { String insertedBaseString = null; - if (nextOp == CigarOperator.I) - insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength())); + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); size++; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index dc908c323..e65a7fb27 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -104,7 +104,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); after.setCigarString("10M"); - List reads = Arrays.asList(before,during,after); + List reads = Arrays.asList(before, during, after); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads,readAttributes); @@ -134,9 +134,9 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // create a test version of the Reads object ReadProperties readAttributes = createTestReadProperties(); - SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,firstLocus,76); + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); indelOnlyRead.setCigarString("76I"); List reads = Arrays.asList(indelOnlyRead); @@ -148,10 +148,10 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // and considers it to be an indel-containing read. Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus,"Base pileup is at incorrect location."); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); ReadBackedPileup basePileup = alignmentContext.getBasePileup(); Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); - Assert.assertSame(basePileup.getReads().get(0),indelOnlyRead,"Read in pileup is incorrect"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); } /** @@ -168,7 +168,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { leadingRead.setCigarString("1M75I"); SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); indelOnlyRead.setCigarString("76I"); @@ -177,7 +177,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); fullMatchAfterIndel.setCigarString("75I1M"); - List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads, createTestReadProperties()); @@ -204,7 +204,36 @@ public class LocusIteratorByStateUnitTest extends BaseTest { numAlignmentContextsFound++; } - Assert.assertEquals(numAlignmentContextsFound,2,"Found incorrect number of alignment contexts"); + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. + */ + @Test + public void testWholeIndelReadTest2() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read",0,secondLocus,1); + read.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read.setCigarString("1I"); + + List reads = Arrays.asList(read); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } } private static ReadProperties createTestReadProperties() { From ba8622ff0d30298dce4418760ed4a434fa1bda02 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 21 Aug 2012 07:03:50 -0400 Subject: [PATCH 4/4] number of stashed changes are lurking in here. In order of importance: - Fix for M_Trieb's error report on the forum, and addition of integration tests to cover the walker. - Addition of StructuralIndel as a class of variation within the VariantContext. These are for variants with a full alt allele that's >150bp in length. - Adaptation of the MVLikelihoodRatio to work for a set of trios (takes the max over the trios of the MVLR) - InsertSizeDistribution changed to use the new gatk report output (it was previously broken) - RetrogeneDiscovery changed to be compatible with the new gatk report - A maxIndelSize argument added to SelectVariants - ByTranscriptEvaluator rewritten for cleanliness - VariantRecalibrator modified to not exclude structural indels from recalibration if the mode is INDEL - Documentation added to DepthOfCoverageIntegrationTest (no, don't yell at chartl ;_; ) Also sorry for the long commit history behind this that is the result of fixing merge conflicts. Because this *also* fixes a conflict (from git stash apply), for some reason I can't rebase all of them away. I'm pretty sure some of the commit notes say "this note isn't important because I'm going to rebase it anyway". --- .../walkers/annotator/MVLikelihoodRatio.java | 64 +++++---- .../VariantDataManager.java | 2 +- .../walkers/variantutils/SelectVariants.java | 20 +++ .../variantutils/VariantsToBinaryPed.java | 136 +++++++++++------- .../utils/variantcontext/VariantContext.java | 25 +++- .../DepthOfCoverageIntegrationTest.java | 2 +- .../VariantsToBinaryPedIntegrationTest.java | 92 ++++++++++++ 7 files changed, 260 insertions(+), 81 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b6f24433e..7b0db6855 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -28,9 +29,13 @@ import java.util.*; public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; - private String motherId; - private String fatherId; - private String childId; + private Set trios; + + private class Trio { + String motherId; + String fatherId; + String childId; + } public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( mendelianViolation == null ) { @@ -38,17 +43,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && - vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && - vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); - if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); + //double pNoMV = 1.0; + double maxMVLR = Double.MIN_VALUE; + for ( Trio trio : trios ) { + boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() && + vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() && + vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods(); + if ( hasAppropriateGenotypes ) { + Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId); + maxMVLR = likR > maxMVLR ? likR : maxMVLR; + //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); + } + } + //double pSomeMV = 1.0-pNoMV; + //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); + toRet.put("MVLR",maxMVLR); return toRet; } @@ -58,25 +73,24 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } private boolean checkAndSetSamples(SampleDB db){ + trios = new HashSet(); Set families = db.getFamilyIDs(); - if(families.size() != 1) - return false; - - Set family = db.getFamily(families.iterator().next()); - if(family.size() != 3) - return false; - - Iterator sampleIter = family.iterator(); - Sample sample; - for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ - if(sample.getParents().size()==2){ - motherId = sample.getMaternalID(); - fatherId = sample.getPaternalID(); - childId = sample.getID(); - return true; + for ( String familyString : families ) { + Set family = db.getFamily(familyString); + Iterator sampleIterator = family.iterator(); + Sample sample; + for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) { + if ( sample.getParents().size() == 2 ) { + Trio trio = new Trio(); + trio.childId = sample.getID(); + trio.fatherId = sample.getFather().getID(); + trio.motherId = sample.getMother().getID(); + trios.add(trio); + } } } - return false; + + return trios.size() > 0; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e88505f99..1f06cc249 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -297,7 +297,7 @@ public class VariantDataManager { case SNP: return evalVC.isSNP() || evalVC.isMNP(); case INDEL: - return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); case BOTH: return true; default: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bfd9aa52f..4c0c0cabf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -322,6 +322,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false) private boolean justRead = false; + @Argument(doc="indel size select",required=false,fullName="maxIndelSize") + private int maxIndelSize = Integer.MAX_VALUE; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -541,6 +544,9 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; + if ( badIndelSize(vc) ) + continue; + VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { @@ -572,6 +578,20 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } + private boolean badIndelSize(final VariantContext vc) { + if ( vc.getReference().length() > maxIndelSize ) { + return true; + } + + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.length() > maxIndelSize ) { + return true; + } + } + + return false; + } + private boolean hasPLs(final VariantContext vc) { for ( Genotype g : vc.getGenotypes() ) { if ( g.hasLikelihoods() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 3fba8fa77..7111bac46 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -76,47 +76,11 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; + initializeValidator(); + writeBedHeader(); + Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers - // write magic bits into the ped file - try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); - // ultimately, the bed will be in individual-major mode - } catch (IOException e) { - throw new ReviewedStingException("error writing to output file."); - } - // write to the fam file, the first six columns of the standard ped file - // first, load data from the input meta data file - Map> metaValues = new HashMap>(); - logger.debug("Reading in metadata..."); - try { - if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { - for ( String line : new XReadLines(metaDataFile) ) { - String[] famSplit = line.split("\\t"); - String sid = famSplit[1]; - outFam.printf("%s%n",line); - } - } else { - for ( String line : new XReadLines(metaDataFile) ) { - logger.debug(line); - String[] split = line.split("\\t"); - String sampleID = split[0]; - String keyVals = split[1]; - HashMap values = new HashMap(); - for ( String kvp : keyVals.split(";") ) { - String[] kvp_split = kvp.split("="); - values.put(kvp_split[0],kvp_split[1]); - } - metaValues.put(sampleID,values); - } - } - } catch (FileNotFoundException e) { - throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); - } // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype int dummyID = 0; // increments for dummy parental and family IDs used // want to be especially careful to maintain order here @@ -126,21 +90,23 @@ public class VariantsToBinaryPed extends RodWalker { continue; } for ( String sample : header.getValue().getGenotypeSamples() ) { - Map mVals = metaValues.get(sample); - if ( mVals == null ) { - throw new UserException("No metadata provided for sample "+sample); + if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) { + Map mVals = sampleMetaValues.get(sample); + if ( mVals == null ) { + throw new UserException("No metadata provided for sample "+sample); + } + if ( ! mVals.containsKey("phenotype") ) { + throw new UserException("No phenotype data provided for sample "+sample); + } + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; + String pheno = mVals.get("phenotype"); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } - if ( ! mVals.containsKey("phenotype") ) { - throw new UserException("No phenotype data provided for sample "+sample); - } - String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); - String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); - String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); - String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; - String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); try { - File temp = File.createTempFile(sample, ".tmp"); + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); printMap.put(sample,new PrintStream(temp)); tempFiles.put(sample,temp); } catch (IOException e) { @@ -216,6 +182,7 @@ public class VariantsToBinaryPed extends RodWalker { // reset the buffer for this sample genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } + byteCount = 0; } genotypeCount = 0; } @@ -337,4 +304,69 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF."); } } + + private void initializeValidator() { + vv.variantCollection = variantCollection; + vv.dbsnp = dbsnp; + vv.DO_NOT_VALIDATE_FILTERED = true; + vv.type = ValidateVariants.ValidationType.REF; + } + + private void writeBedHeader() { + // write magic bits into the ped file + try { + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode + } catch (IOException e) { + throw new ReviewedStingException("error writing to output file."); + } + } + + private Map> parseMetaData() { + // write to the fam file, the first six columns of the standard ped file + // first, load data from the input meta data file + Map> metaValues = new HashMap>(); + logger.debug("Reading in metadata..."); + try { + if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { + for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\s+"); + if ( famSplit.length != 6 ) { + throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line); + } + String sid = famSplit[1]; + String fid = famSplit[0]; + String mom = famSplit[2]; + String dad = famSplit[3]; + String sex = famSplit[4]; + String pheno = famSplit[5]; + HashMap values = new HashMap(); + values.put("mom",mom); + values.put("dad",dad); + values.put("fid",fid); + values.put("sex",sex); + values.put("phenotype",pheno); + metaValues.put(sid,values); + outFam.printf("%s%n",line); + } + } else { + for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); + String[] split = line.split("\\s+"); + String sampleID = split[0]; + String keyVals = split[1]; + HashMap values = new HashMap(); + for ( String kvp : keyVals.split(";") ) { + String[] kvp_split = kvp.split("="); + values.put(kvp_split[0],kvp_split[1]); + } + metaValues.put(sampleID,values); + } + } + } catch (FileNotFoundException e) { + throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); + } + + return metaValues; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1fe6b8652..8015889f5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.apache.commons.math.stat.descriptive.rank.Max; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; @@ -178,9 +179,8 @@ import java.util.*; */ public class VariantContext implements Feature { // to enable tribble integration private final static boolean WARN_ABOUT_BAD_END = true; + private final static long MAX_ALLELE_SIZE_FOR_NON_SV = 150; final protected static Logger logger = Logger.getLogger(VariantContext.class); - - private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -458,6 +458,7 @@ public class VariantContext implements Feature { // to enable tribble integratio SNP, MNP, // a multi-nucleotide polymorphism INDEL, + STRUCTURAL_INDEL, SYMBOLIC, MIXED, } @@ -530,6 +531,18 @@ public class VariantContext implements Feature { // to enable tribble integratio return getType() == Type.SYMBOLIC; } + public boolean isStructuralIndel() { + return getType() == Type.STRUCTURAL_INDEL; + } + + /** + * + * @return true if the variant is symbolic or a large indel + */ + public boolean isSymbolicOrSV() { + return isSymbolic() || isStructuralIndel(); + } + public boolean isMNP() { return getType() == Type.MNP; } @@ -1250,6 +1263,14 @@ public class VariantContext implements Feature { // to enable tribble integratio // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. + + // Because a number of structural variation callers write the whole alternate allele into the VCF where possible, + // this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now + // classify these as structural events, rather than indel events, as we think differently about the mechanism, + // representation, and handling of these events. Check for this case here: + if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV ) + return Type.STRUCTURAL_INDEL; + return Type.INDEL; // old incorrect logic: diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 6f1370008..9bec1b75d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -9,7 +9,7 @@ import java.util.Arrays; import java.util.List; /** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * Integration tests for the Depth of Coverage walker * * @Author chartl * @Date Feb 25, 2010 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java new file mode 100644 index 000000000..07e82b869 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -0,0 +1,92 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/20/12 + * Time: 9:57 PM + * To change this template use File | Settings | File Templates. + */ +public class VariantsToBinaryPedIntegrationTest extends WalkerTest { + + public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/"; + + public static String baseTestString(String inputVCF, String inputMetaData, int gq) { + return "-T VariantsToBinaryPed -R " + b37KGReference + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + + @Test + public void testNA12878Alone() { + String testName = "testNA12878Alone"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testNA12878AloneMetaData() { + String testName = "testNA12878AloneMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrio() { + String testName = "testCEUTrio"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrioMetaData() { + String testName = "testCEUTrioMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testMalformedFam() { + String testName = "testMalformedFam"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } +} + +