Added Hapmap data track (using rodGFF class for GFF file format) to toolkit as a command line option, Hapmap metrics to AlleleFrequencyMetricsWalker, and a python Geli2GFF file converter.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@163 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
andrewk 2009-03-24 03:58:03 +00:00
parent f7363cf935
commit 9dee9ab51c
5 changed files with 107 additions and 24 deletions

View File

@ -34,6 +34,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
public String REGION_STR = null;
public String Analysis_Name = null;
public String DBSNP_FILE = null;
public String HAPMAP_FILE = null;
public Boolean ENABLED_THREADED_IO = false;
public Boolean UNSAFE = false;
public Boolean ENABLED_SORT_ON_FLY = false;
@ -66,6 +67,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR");
m_parser.addRequiredlArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name");
m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE");
m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE");
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY");
@ -85,7 +87,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
walkerManager = new WalkerManager(pluginPathName);
final boolean TEST_ROD = false;
ReferenceOrderedData[] rods = null;
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
if (TEST_ROD) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
@ -94,13 +96,16 @@ public class GenomeAnalysisTK extends CommandLineProgram {
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class);
//dbsnp.testMe();
rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp };
rods.add(dbsnp); // { gff, dbsnp };
} else if (DBSNP_FILE != null) {
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class);
//dbsnp.testMe();
rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp };
} else {
rods = new ReferenceOrderedData[]{}; // { gff, dbsnp };
rods.add(dbsnp); // { gff, dbsnp };
}
if (HAPMAP_FILE != null) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File(HAPMAP_FILE), rodGFF.class);
rods.add(gff);
}
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);

View File

@ -108,10 +108,10 @@ public class TraversalEngine {
* @param ref Reference file in FASTA format, assumes a .dict file is also available
* @param rods Array of reference ordered data sets
*/
public TraversalEngine(File reads, File ref, ReferenceOrderedData[] rods) {
public TraversalEngine(File reads, File ref, List<ReferenceOrderedData> rods) {
readsFile = reads;
refFileName = ref;
this.rods = Arrays.asList(rods);
this.rods = rods;
}
// --------------------------------------------------------------------------------------------------------------

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.gatk.walkers.BasicLociWalker;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
@ -25,6 +26,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
long num_loci_total=0;
long num_loci_confident=0;
double LOD_cutoff = 5;
long hapmap_tp=0;
long hapmap_fp=0;
AlleleFrequencyWalker caller;
@ -32,17 +35,6 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
{
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context);
boolean is_dbSNP_SNP = false;
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null && datum instanceof rodDbSNP)
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
}
num_loci_total += 1;
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
@ -53,7 +45,37 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
num_variants += 1;
if (is_dbSNP_SNP)
boolean is_dbSNP_SNP = false;
boolean hapmap_hit = false;
for ( ReferenceOrderedDatum datum : rodData )
{
if ( datum != null )
{
if ( datum instanceof rodDbSNP )
{
rodDbSNP dbsnp = (rodDbSNP)datum;
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
}
if ( datum instanceof rodGFF )
{
rodGFF hapmap = (rodGFF) datum;
String hapmap_genotype = hapmap.getFeature();
String called_genotype = alleleFreq.asString();
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
// There is a hapmap site here, is it correct?
if (hapmap_genotype.equals(called_genotype))
{
hapmap_tp++;
} else {
hapmap_fp++;
}
}
}
}
if (is_dbSNP_SNP)
{
dbsnp_hits += 1;
}
@ -69,12 +91,16 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
System.out.printf("\n");
System.out.printf("METRICS Allele Frequency Metrics (LOD >= %.0f)\n", LOD_cutoff);
System.out.printf("METRICS -------------------------------------------------\n");
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
System.out.printf("METRICS Total loci : %d\n", num_loci_total);
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
if (num_variants != 0)
{
System.out.printf("METRICS Number of Variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp);
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp);
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp);
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp));
}
System.out.println();
}

View File

@ -43,6 +43,10 @@ public class AlleleFrequencyEstimate {
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
long numNonrefBases = Math.round(qstar * N);
long numRefBases = N - numNonrefBases;
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
if (ref < alt) { // order bases alphabetically
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
}else{
return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases);
}
}
}

48
python/Geli2GFF.py 100755
View File

@ -0,0 +1,48 @@
#!/usr/bin/env python
# Converts Geli files (genotype likelihood in picard) to GFF format for GATK
import sys, os, subprocess
Index2AffyChr = []
Index2AffyChr.append("chrM")
for i in range(1,23):
Index2AffyChr.append("chr"+str(i))
Index2AffyChr.append("chrX")
Index2AffyChr.append("chrY")
if __name__ == "__main__":
if len(sys.argv) < 3:
print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY"
sys.exit()
else:
infile = sys.argv[1]
outdir = sys.argv[2]
output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff"
outfile = open(output_file, "w")
cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile
pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout
pipe.readline()
for line in pipe:
if line[0] not in ('@', '#'):
fields = line.split("\t")
try:
contig = Index2AffyChr[int(fields[0])]
except KeyError, ValueError:
print "skipping "+fields[0]
continue # Skip entries not in chr 0 through 24
print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t"