Added Hapmap data track (using rodGFF class for GFF file format) to toolkit as a command line option, Hapmap metrics to AlleleFrequencyMetricsWalker, and a python Geli2GFF file converter.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@163 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f7363cf935
commit
9dee9ab51c
|
|
@ -34,6 +34,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
public String REGION_STR = null;
|
||||
public String Analysis_Name = null;
|
||||
public String DBSNP_FILE = null;
|
||||
public String HAPMAP_FILE = null;
|
||||
public Boolean ENABLED_THREADED_IO = false;
|
||||
public Boolean UNSAFE = false;
|
||||
public Boolean ENABLED_SORT_ON_FLY = false;
|
||||
|
|
@ -66,6 +67,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR");
|
||||
m_parser.addRequiredlArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name");
|
||||
m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE");
|
||||
m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE");
|
||||
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
||||
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
||||
m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY");
|
||||
|
|
@ -85,7 +87,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
walkerManager = new WalkerManager(pluginPathName);
|
||||
|
||||
final boolean TEST_ROD = false;
|
||||
ReferenceOrderedData[] rods = null;
|
||||
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
|
||||
|
||||
if (TEST_ROD) {
|
||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
|
||||
|
|
@ -94,13 +96,16 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
||||
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class);
|
||||
//dbsnp.testMe();
|
||||
rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp };
|
||||
rods.add(dbsnp); // { gff, dbsnp };
|
||||
} else if (DBSNP_FILE != null) {
|
||||
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class);
|
||||
//dbsnp.testMe();
|
||||
rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp };
|
||||
} else {
|
||||
rods = new ReferenceOrderedData[]{}; // { gff, dbsnp };
|
||||
rods.add(dbsnp); // { gff, dbsnp };
|
||||
}
|
||||
|
||||
if (HAPMAP_FILE != null) {
|
||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File(HAPMAP_FILE), rodGFF.class);
|
||||
rods.add(gff);
|
||||
}
|
||||
|
||||
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
|
||||
|
|
|
|||
|
|
@ -108,10 +108,10 @@ public class TraversalEngine {
|
|||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
||||
* @param rods Array of reference ordered data sets
|
||||
*/
|
||||
public TraversalEngine(File reads, File ref, ReferenceOrderedData[] rods) {
|
||||
public TraversalEngine(File reads, File ref, List<ReferenceOrderedData> rods) {
|
||||
readsFile = reads;
|
||||
refFileName = ref;
|
||||
this.rods = Arrays.asList(rods);
|
||||
this.rods = rods;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers;
|
|||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||
import org.broadinstitute.sting.gatk.walkers.BasicLociWalker;
|
||||
import org.broadinstitute.sting.gatk.LocusContext;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||
|
|
@ -25,6 +26,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
long num_loci_total=0;
|
||||
long num_loci_confident=0;
|
||||
double LOD_cutoff = 5;
|
||||
long hapmap_tp=0;
|
||||
long hapmap_fp=0;
|
||||
|
||||
AlleleFrequencyWalker caller;
|
||||
|
||||
|
|
@ -32,17 +35,6 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
{
|
||||
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context);
|
||||
|
||||
boolean is_dbSNP_SNP = false;
|
||||
|
||||
for ( ReferenceOrderedDatum datum : rodData )
|
||||
{
|
||||
if ( datum != null && datum instanceof rodDbSNP)
|
||||
{
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||
}
|
||||
}
|
||||
|
||||
num_loci_total += 1;
|
||||
|
||||
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
|
||||
|
|
@ -53,7 +45,37 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
|
||||
num_variants += 1;
|
||||
|
||||
if (is_dbSNP_SNP)
|
||||
boolean is_dbSNP_SNP = false;
|
||||
boolean hapmap_hit = false;
|
||||
|
||||
for ( ReferenceOrderedDatum datum : rodData )
|
||||
{
|
||||
if ( datum != null )
|
||||
{
|
||||
if ( datum instanceof rodDbSNP )
|
||||
{
|
||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||
}
|
||||
|
||||
if ( datum instanceof rodGFF )
|
||||
{
|
||||
rodGFF hapmap = (rodGFF) datum;
|
||||
String hapmap_genotype = hapmap.getFeature();
|
||||
String called_genotype = alleleFreq.asString();
|
||||
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
|
||||
// There is a hapmap site here, is it correct?
|
||||
if (hapmap_genotype.equals(called_genotype))
|
||||
{
|
||||
hapmap_tp++;
|
||||
} else {
|
||||
hapmap_fp++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_dbSNP_SNP)
|
||||
{
|
||||
dbsnp_hits += 1;
|
||||
}
|
||||
|
|
@ -69,12 +91,16 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
|||
System.out.printf("\n");
|
||||
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
|
||||
System.out.printf("METRICS -------------------------------------------------\n");
|
||||
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
|
||||
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
|
||||
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||
if (num_variants != 0)
|
||||
{
|
||||
System.out.printf("METRICS Number of Variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp);
|
||||
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp);
|
||||
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp);
|
||||
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp));
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,6 +43,10 @@ public class AlleleFrequencyEstimate {
|
|||
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
|
||||
long numNonrefBases = Math.round(qstar * N);
|
||||
long numRefBases = N - numNonrefBases;
|
||||
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
|
||||
if (ref < alt) { // order bases alphabetically
|
||||
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
|
||||
}else{
|
||||
return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Converts Geli files (genotype likelihood in picard) to GFF format for GATK
|
||||
|
||||
import sys, os, subprocess
|
||||
|
||||
Index2AffyChr = []
|
||||
Index2AffyChr.append("chrM")
|
||||
for i in range(1,23):
|
||||
Index2AffyChr.append("chr"+str(i))
|
||||
Index2AffyChr.append("chrX")
|
||||
Index2AffyChr.append("chrY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 3:
|
||||
print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY"
|
||||
sys.exit()
|
||||
else:
|
||||
infile = sys.argv[1]
|
||||
outdir = sys.argv[2]
|
||||
|
||||
output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff"
|
||||
outfile = open(output_file, "w")
|
||||
|
||||
cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile
|
||||
pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout
|
||||
|
||||
pipe.readline()
|
||||
for line in pipe:
|
||||
if line[0] not in ('@', '#'):
|
||||
fields = line.split("\t")
|
||||
try:
|
||||
contig = Index2AffyChr[int(fields[0])]
|
||||
except KeyError, ValueError:
|
||||
print "skipping "+fields[0]
|
||||
continue # Skip entries not in chr 0 through 24
|
||||
|
||||
print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue