warn user when dbSNP rod looks suspicious
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1400 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2841e151d0
commit
ecae619a1b
|
|
@ -10,6 +10,8 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
import org.broadinstitute.sting.utils.Pair;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
@ -45,6 +47,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
private long counted_bases = 0; // number of bases used to count covariates
|
private long counted_bases = 0; // number of bases used to count covariates
|
||||||
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||||
|
|
||||||
|
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for dbSNP loci
|
||||||
|
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0l, 0l); // mismatch/base counts for non-dbSNP loci
|
||||||
|
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||||
|
private static final int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||||
|
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
|
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
|
||||||
|
|
@ -107,13 +114,61 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
counted_sites += 1;
|
counted_sites += 1;
|
||||||
|
updateMismatchCounts(novel_counts, context, ref.getBase());
|
||||||
} else {
|
} else {
|
||||||
|
updateMismatchCounts(dbSNP_counts, context, ref.getBase());
|
||||||
skipped_sites += 1;
|
skipped_sites += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( ++lociSinceLastDbsnpCheck == DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||||
|
lociSinceLastDbsnpCheck = 0;
|
||||||
|
validateDbsnpMismatchRate();
|
||||||
|
}
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Update the mismatch / total_base counts for a given class of loci.
|
||||||
|
*
|
||||||
|
* @param counts
|
||||||
|
* @param context
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private static void updateMismatchCounts(Pair<Long, Long> counts, AlignmentContext context, char ref) {
|
||||||
|
List<SAMRecord> reads = context.getReads();
|
||||||
|
List<Integer> offsets = context.getOffsets();
|
||||||
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
|
char readChar = reads.get(i).getReadString().charAt(offsets.get(i));
|
||||||
|
int readCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(readChar);
|
||||||
|
int refCharBaseIndex = BaseUtils.simpleBaseToBaseIndex(ref);
|
||||||
|
|
||||||
|
if ( readCharBaseIndex != -1 && refCharBaseIndex != -1 ) {
|
||||||
|
if ( readCharBaseIndex != refCharBaseIndex )
|
||||||
|
counts.first++;
|
||||||
|
counts.second++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validate the dbSNP mismatch rates.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private void validateDbsnpMismatchRate() {
|
||||||
|
if ( novel_counts.second == 0 || dbSNP_counts.second == 0 )
|
||||||
|
return;
|
||||||
|
|
||||||
|
double fractionMM_novel = (double)novel_counts.first / (double)novel_counts.second;
|
||||||
|
double fractionMM_dbsnp = (double)dbSNP_counts.first / (double)dbSNP_counts.second;
|
||||||
|
//System.out.println(String.format("dbSNP rate = %.2f, novel rate = %.2f", fractionMM_dbsnp, fractionMM_novel));
|
||||||
|
|
||||||
|
if ( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel )
|
||||||
|
Utils.warnUser("The variation rate of dbSNP sites seems suspicious! Please double-check that the correct DBSNP ROD is being used.");
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check to see whether this read group should be processed. Returns true if the
|
* Check to see whether this read group should be processed. Returns true if the
|
||||||
* read group is in the list of platforms to process or the platform == *, indicating
|
* read group is in the list of platforms to process or the platform == *, indicating
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue