Fix for GSA-589: SelectVariants with -number gives biased results. The implementation was not good and it's not worth keeping this busted code around given that we have a working implementation of a fractional random sampling already in place, so I removed it.
This commit is contained in:
parent
e8a6460a33
commit
bfc551f612
|
|
@ -50,7 +50,6 @@ import org.broadinstitute.sting.utils.variantcontext.*;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.PrintStream;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -278,13 +277,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
|
||||||
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
|
||||||
|
|
||||||
/**
|
|
||||||
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory
|
|
||||||
* given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants.
|
|
||||||
*/
|
|
||||||
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
|
|
||||||
protected int numRandom = 0;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
|
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
|
||||||
*/
|
*/
|
||||||
|
|
@ -330,20 +322,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
|
private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false;
|
||||||
|
|
||||||
|
|
||||||
/* Private class used to store the intermediate variants in the integer random selection process */
|
|
||||||
private static class RandomVariantStructure {
|
|
||||||
private VariantContext vc;
|
|
||||||
|
|
||||||
RandomVariantStructure(VariantContext vcP) {
|
|
||||||
vc = vcP;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void set (VariantContext vcP) {
|
|
||||||
vc = vcP;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum NumberAlleleRestriction {
|
public enum NumberAlleleRestriction {
|
||||||
ALL,
|
ALL,
|
||||||
BIALLELIC,
|
BIALLELIC,
|
||||||
|
|
@ -364,12 +342,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
|
|
||||||
|
|
||||||
/* variables used by the SELECT RANDOM modules */
|
/* variables used by the SELECT RANDOM modules */
|
||||||
private boolean SELECT_RANDOM_NUMBER = false;
|
|
||||||
private boolean SELECT_RANDOM_FRACTION = false;
|
private boolean SELECT_RANDOM_FRACTION = false;
|
||||||
private int variantNumber = 0;
|
|
||||||
private int nVariantsAdded = 0;
|
|
||||||
private int positionToAdd = 0;
|
|
||||||
private RandomVariantStructure [] variantArray;
|
|
||||||
|
|
||||||
//Random number generator for the genotypes to remove
|
//Random number generator for the genotypes to remove
|
||||||
private Random randomGenotypes = new Random();
|
private Random randomGenotypes = new Random();
|
||||||
|
|
@ -478,12 +451,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
|
mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true);
|
||||||
}
|
}
|
||||||
|
|
||||||
SELECT_RANDOM_NUMBER = numRandom > 0;
|
|
||||||
if (SELECT_RANDOM_NUMBER) {
|
|
||||||
logger.info("Selecting " + numRandom + " variants at random from the variant track");
|
|
||||||
variantArray = new RandomVariantStructure[numRandom];
|
|
||||||
}
|
|
||||||
|
|
||||||
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
SELECT_RANDOM_FRACTION = fractionRandom > 0;
|
||||||
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track");
|
||||||
|
|
||||||
|
|
@ -588,14 +555,10 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ( !failedJexlMatch ) {
|
if ( !failedJexlMatch &&
|
||||||
if (SELECT_RANDOM_NUMBER) {
|
!justRead &&
|
||||||
randomlyAddVariant(++variantNumber, sub);
|
( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) {
|
||||||
}
|
vcfWriter.add(sub);
|
||||||
else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) {
|
|
||||||
if ( ! justRead )
|
|
||||||
vcfWriter.add(sub);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -718,14 +681,6 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
|
|
||||||
public void onTraversalDone(Integer result) {
|
public void onTraversalDone(Integer result) {
|
||||||
logger.info(result + " records processed.");
|
logger.info(result + " records processed.");
|
||||||
|
|
||||||
if (SELECT_RANDOM_NUMBER) {
|
|
||||||
int positionToPrint = positionToAdd;
|
|
||||||
for (int i=0; i<numRandom; i++) {
|
|
||||||
vcfWriter.add(variantArray[positionToPrint].vc);
|
|
||||||
positionToPrint = nextCircularPosition(positionToPrint);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -809,25 +764,4 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
|
||||||
if ( sawDP )
|
if ( sawDP )
|
||||||
builder.attribute("DP", depth);
|
builder.attribute("DP", depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void randomlyAddVariant(int rank, VariantContext vc) {
|
|
||||||
if (nVariantsAdded < numRandom)
|
|
||||||
variantArray[nVariantsAdded++] = new RandomVariantStructure(vc);
|
|
||||||
|
|
||||||
else {
|
|
||||||
double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
|
|
||||||
double t = (1.0/(rank-numRandom+1));
|
|
||||||
if ( v < t) {
|
|
||||||
variantArray[positionToAdd].set(vc);
|
|
||||||
nVariantsAdded++;
|
|
||||||
positionToAdd = nextCircularPosition(positionToAdd);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private int nextCircularPosition(int cur) {
|
|
||||||
if ((cur + 1) == variantArray.length)
|
|
||||||
return 0;
|
|
||||||
return cur + 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -573,7 +573,7 @@ public class VariantContextUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
// if we have more alternate alleles in the merged VC than in one or more of the
|
// if we have more alternate alleles in the merged VC than in one or more of the
|
||||||
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF
|
// original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD
|
||||||
for ( final VariantContext vc : VCs ) {
|
for ( final VariantContext vc : VCs ) {
|
||||||
if (vc.alleles.size() == 1)
|
if (vc.alleles.size() == 1)
|
||||||
continue;
|
continue;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue