Changes:
1) GATKArgumentCollection has a command to turn off randomization if setting the seed isn't enough. Right now it's only hooked into RankSumTest. 2) RankSumTest now can be passed a boolean telling it whether to use a dithering or non-randomizing comparator. Unit tested. 3) VariantsToBinaryPed can now output in both individual-major and SNP-major mode. Integration test. 4) Updates to PlinkBed-handling python scripts and utilities. 5) Tool for calculating (LD-corrected) GRMs put under version control. This is analysis for T2D, but I don't want to lose it should something happen to my computer.
This commit is contained in:
parent
ac87ed47bb
commit
1be8a88909
|
|
@ -140,6 +140,9 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
|
||||
public boolean nonDeterministicRandomSeed = false;
|
||||
|
||||
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
|
||||
public boolean disableRandomization = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Downsampling Arguments
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.annotator;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
|
|
@ -10,6 +11,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsC
|
|||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.MannWhitneyU;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -19,10 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
|||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -30,6 +29,7 @@ import java.util.Map;
|
|||
*/
|
||||
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
|
||||
static final boolean DEBUG = false;
|
||||
private boolean useDithering = true;
|
||||
|
||||
public Map<String, Object> annotate(final RefMetaDataTracker tracker,
|
||||
final AnnotatorCompatible walker,
|
||||
|
|
@ -70,7 +70,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
if (refQuals.isEmpty() && altQuals.isEmpty())
|
||||
return null;
|
||||
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU();
|
||||
final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
|
||||
for (final Double qual : altQuals) {
|
||||
mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
|
||||
}
|
||||
|
|
@ -131,4 +131,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
|
|||
p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ||
|
||||
((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if
|
||||
* engine randomization is turned off, and if so does not dither.
|
||||
* @param walker
|
||||
* @param toolkit
|
||||
* @param headerLines
|
||||
*/
|
||||
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
|
||||
useDithering = ! toolkit.getArguments().disableRandomization;
|
||||
}
|
||||
}
|
||||
|
|
@ -66,6 +66,9 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
"(in which case it will be copied to the file you provide as fam output).")
|
||||
File metaDataFile;
|
||||
|
||||
@Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)")
|
||||
OutputMode mode = OutputMode.INDIVIDUAL_MAJOR;
|
||||
|
||||
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
|
||||
PrintStream outBed;
|
||||
|
||||
|
|
@ -81,6 +84,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
||||
boolean majorAlleleFirst = false;
|
||||
|
||||
enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR }
|
||||
|
||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||
|
||||
private static final byte HOM_REF = 0x0;
|
||||
|
|
@ -138,14 +143,18 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
throw new UserException("No metadata provided for sample "+sample);
|
||||
}
|
||||
}
|
||||
try {
|
||||
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
|
||||
printMap.put(sample,new PrintStream(temp));
|
||||
tempFiles.put(sample,temp);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error creating temporary file",e);
|
||||
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
|
||||
// only need to instantiate the files and buffers if in individual major.
|
||||
// Cut down on memory.
|
||||
try {
|
||||
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
|
||||
printMap.put(sample,new PrintStream(temp));
|
||||
tempFiles.put(sample,temp);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error creating temporary file",e);
|
||||
}
|
||||
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
|
||||
}
|
||||
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
|
||||
famOrder.add(sample);
|
||||
}
|
||||
}
|
||||
|
|
@ -195,6 +204,17 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
// write an entry into the map file
|
||||
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
|
||||
refOut,altOut);
|
||||
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
|
||||
writeIndividualMajor(vc,altMajor);
|
||||
} else {
|
||||
writeSNPMajor(vc,altMajor);
|
||||
}
|
||||
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public void writeIndividualMajor(VariantContext vc, boolean altMajor) {
|
||||
// store genotypes per sample into the buffer
|
||||
for ( Genotype g : vc.getGenotypes() ) {
|
||||
String sample = g.getSampleName();
|
||||
|
|
@ -202,6 +222,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
byte enc = getEncoding(g,genotypeCount,altMajor);
|
||||
samBuf[byteCount] |= enc;
|
||||
}
|
||||
|
||||
genotypeCount++;
|
||||
if ( genotypeCount % 4 == 0 ) {
|
||||
byteCount++;
|
||||
|
|
@ -222,8 +243,29 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
genotypeCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
public void writeSNPMajor(VariantContext vc, boolean altMajor) {
|
||||
// for each sample, write the genotype into the bed file, in the
|
||||
// order of the fam file
|
||||
genotypeCount = 0;
|
||||
byteCount = 0;
|
||||
byte[] bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2
|
||||
for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) {
|
||||
byte enc = getEncoding(g,genotypeCount,altMajor);
|
||||
bytes[byteCount] |= enc;
|
||||
genotypeCount++;
|
||||
if ( genotypeCount % 4 == 0 ) {
|
||||
byteCount++;
|
||||
genotypeCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
outBed.write(bytes);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error writing to output bed file",e);
|
||||
}
|
||||
}
|
||||
|
||||
public Integer reduce(Integer m, Integer r) {
|
||||
|
|
@ -236,6 +278,14 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
|
||||
public void onTraversalDone(Integer numSites) {
|
||||
logger.info(String.format("%d sites processed!",numSites));
|
||||
|
||||
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
|
||||
mergeGenotypeTempFiles(numSites);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void mergeGenotypeTempFiles(int numSites) {
|
||||
// push out the remaining genotypes and close stream
|
||||
for ( String sample : printMap.keySet() ) {
|
||||
try {
|
||||
|
|
@ -278,7 +328,6 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
throw new ReviewedStingException("Error reading form temp file for input.",e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private byte getEncoding(Genotype g, int offset, boolean altMajor) {
|
||||
|
|
@ -355,7 +404,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
|||
private void writeBedHeader() {
|
||||
// write magic bits into the ped file
|
||||
try {
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 0x0 : 0x1)});
|
||||
// ultimately, the bed will be in individual-major mode
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("error writing to output file.");
|
||||
|
|
|
|||
|
|
@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
|
||||
}
|
||||
|
||||
public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) {
|
||||
return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode +
|
||||
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
|
||||
" -bim %s -fam %s -bed %s";
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878Alone() {
|
||||
String testName = "testNA12878Alone";
|
||||
|
|
@ -52,6 +59,18 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878AloneSNPMajor() {
|
||||
String testName = "testNA12878AloneSNPMajor";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"),
|
||||
3,
|
||||
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446")
|
||||
);
|
||||
|
||||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNA12878HighGQ() {
|
||||
String testName = "testNA12878HighGQ";
|
||||
|
|
@ -86,6 +105,16 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
|
|||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000GWithIndelsSNPMajor() {
|
||||
String testName = "test1000GWithIndelsSNPMajor";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"),
|
||||
3,
|
||||
Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af")
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1000G_Symbolic() {
|
||||
String testName = "test1000G_Symbolic";
|
||||
|
|
|
|||
Loading…
Reference in New Issue