From 1be8a88909abe9fbab855e8b63f1ca73e0175e84 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 3 Oct 2012 16:02:42 -0400 Subject: [PATCH] Changes: 1) GATKArgumentCollection has a command to turn off randomization if setting the seed isn't enough. Right now it's only hooked into RankSumTest. 2) RankSumTest now can be passed a boolean telling it whether to use a dithering or non-randomizing comparator. Unit tested. 3) VariantsToBinaryPed can now output in both individual-major and SNP-major mode. Integration test. 4) Updates to PlinkBed-handling python scripts and utilities. 5) Tool for calculating (LD-corrected) GRMs put under version control. This is analysis for T2D, but I don't want to lose it should something happen to my computer. --- .../arguments/GATKArgumentCollection.java | 3 + .../gatk/walkers/annotator/RankSumTest.java | 21 ++++-- .../variantutils/VariantsToBinaryPed.java | 69 ++++++++++++++++--- .../VariantsToBinaryPedIntegrationTest.java | 29 ++++++++ 4 files changed, 107 insertions(+), 15 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index c8887b8b2..7875ced5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -140,6 +140,9 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; + @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.") + public boolean disableRandomization = false; + // -------------------------------------------------------------------------------------------------------------- // // Downsampling Arguments diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ec873c5dd..7c7391812 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsC import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -19,10 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -30,6 +29,7 @@ import java.util.Map; */ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { static final boolean DEBUG = false; + private boolean useDithering = true; public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -70,7 +70,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR if (refQuals.isEmpty() && altQuals.isEmpty()) return null; - final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } @@ -131,4 +131,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here } + + /** + * Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if + * engine randomization is turned off, and if so does not dither. + * @param walker + * @param toolkit + * @param headerLines + */ + public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + useDithering = ! toolkit.getArguments().disableRandomization; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index b7ef85a04..48a7ead5a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -66,6 +66,9 @@ public class VariantsToBinaryPed extends RodWalker { "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; + @Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)") + OutputMode mode = OutputMode.INDIVIDUAL_MAJOR; + @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") PrintStream outBed; @@ -81,6 +84,8 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; + enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR } + private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static final byte HOM_REF = 0x0; @@ -138,14 +143,18 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("No metadata provided for sample "+sample); } } - try { - File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); - printMap.put(sample,new PrintStream(temp)); - tempFiles.put(sample,temp); - } catch (IOException e) { - throw new ReviewedStingException("Error creating temporary file",e); + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + // only need to instantiate the files and buffers if in individual major. + // Cut down on memory. + try { + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); + printMap.put(sample,new PrintStream(temp)); + tempFiles.put(sample,temp); + } catch (IOException e) { + throw new ReviewedStingException("Error creating temporary file",e); + } + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } - genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); famOrder.add(sample); } } @@ -195,6 +204,17 @@ public class VariantsToBinaryPed extends RodWalker { // write an entry into the map file outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), refOut,altOut); + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + writeIndividualMajor(vc,altMajor); + } else { + writeSNPMajor(vc,altMajor); + } + + + return 1; + } + + public void writeIndividualMajor(VariantContext vc, boolean altMajor) { // store genotypes per sample into the buffer for ( Genotype g : vc.getGenotypes() ) { String sample = g.getSampleName(); @@ -202,6 +222,7 @@ public class VariantsToBinaryPed extends RodWalker { byte enc = getEncoding(g,genotypeCount,altMajor); samBuf[byteCount] |= enc; } + genotypeCount++; if ( genotypeCount % 4 == 0 ) { byteCount++; @@ -222,8 +243,29 @@ public class VariantsToBinaryPed extends RodWalker { } genotypeCount = 0; } + } - return 1; + public void writeSNPMajor(VariantContext vc, boolean altMajor) { + // for each sample, write the genotype into the bed file, in the + // order of the fam file + genotypeCount = 0; + byteCount = 0; + byte[] bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2 + for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) { + byte enc = getEncoding(g,genotypeCount,altMajor); + bytes[byteCount] |= enc; + genotypeCount++; + if ( genotypeCount % 4 == 0 ) { + byteCount++; + genotypeCount = 0; + } + } + + try { + outBed.write(bytes); + } catch (IOException e) { + throw new ReviewedStingException("Error writing to output bed file",e); + } } public Integer reduce(Integer m, Integer r) { @@ -236,6 +278,14 @@ public class VariantsToBinaryPed extends RodWalker { public void onTraversalDone(Integer numSites) { logger.info(String.format("%d sites processed!",numSites)); + + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + mergeGenotypeTempFiles(numSites); + } + + } + + private void mergeGenotypeTempFiles(int numSites) { // push out the remaining genotypes and close stream for ( String sample : printMap.keySet() ) { try { @@ -278,7 +328,6 @@ public class VariantsToBinaryPed extends RodWalker { throw new ReviewedStingException("Error reading form temp file for input.",e); } } - } private byte getEncoding(Genotype g, int offset, boolean altMajor) { @@ -355,7 +404,7 @@ public class VariantsToBinaryPed extends RodWalker { private void writeBedHeader() { // write magic bits into the ped file try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 0x0 : 0x1)}); // ultimately, the bed will be in individual-major mode } catch (IOException e) { throw new ReviewedStingException("error writing to output file."); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index 3e59508bc..8f11c09f6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { } + public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) { + return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + @Test public void testNA12878Alone() { String testName = "testNA12878Alone"; @@ -52,6 +59,18 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + @Test + public void testNA12878AloneSNPMajor() { + String testName = "testNA12878AloneSNPMajor"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446") + ); + + executeTest(testName, spec); + } + @Test public void testNA12878HighGQ() { String testName = "testNA12878HighGQ"; @@ -86,6 +105,16 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { ); } + @Test + public void test1000GWithIndelsSNPMajor() { + String testName = "test1000GWithIndelsSNPMajor"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"), + 3, + Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af") + ); + } + @Test public void test1000G_Symbolic() { String testName = "test1000G_Symbolic";