diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index abec6723e..39512935b 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -34,6 +34,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { public String REGION_STR = null; public String Analysis_Name = null; public String DBSNP_FILE = null; + public String HAPMAP_FILE = null; public Boolean ENABLED_THREADED_IO = false; public Boolean UNSAFE = false; public Boolean ENABLED_SORT_ON_FLY = false; @@ -66,6 +67,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR"); m_parser.addRequiredlArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name"); m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE"); + m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE"); m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO"); m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE"); m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY"); @@ -85,7 +87,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { walkerManager = new WalkerManager(pluginPathName); final boolean TEST_ROD = false; - ReferenceOrderedData[] rods = null; + List rods = new ArrayList(); if (TEST_ROD) { ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class); @@ -94,13 +96,16 @@ public class GenomeAnalysisTK extends CommandLineProgram { //ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class ); ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class); //dbsnp.testMe(); - rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp }; + rods.add(dbsnp); // { gff, dbsnp }; } else if (DBSNP_FILE != null) { ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class); //dbsnp.testMe(); - rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp }; - } else { - rods = new ReferenceOrderedData[]{}; // { gff, dbsnp }; + rods.add(dbsnp); // { gff, dbsnp }; + } + + if (HAPMAP_FILE != null) { + ReferenceOrderedData gff = new ReferenceOrderedData(new File(HAPMAP_FILE), rodGFF.class); + rods.add(gff); } this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods); diff --git a/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java index 29cc7a0cb..3bd2ecad4 100755 --- a/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java @@ -108,10 +108,10 @@ public class TraversalEngine { * @param ref Reference file in FASTA format, assumes a .dict file is also available * @param rods Array of reference ordered data sets */ - public TraversalEngine(File reads, File ref, ReferenceOrderedData[] rods) { + public TraversalEngine(File reads, File ref, List rods) { readsFile = reads; refFileName = ref; - this.rods = Arrays.asList(rods); + this.rods = rods; } // -------------------------------------------------------------------------------------------------------------- diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java index e3eee8e5e..50b160a05 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/AlleleFrequencyMetricsWalker.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; +import org.broadinstitute.sting.gatk.refdata.rodGFF; import org.broadinstitute.sting.gatk.walkers.BasicLociWalker; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; @@ -25,6 +26,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker= LOD_cutoff) { num_loci_confident += 1; } @@ -53,7 +45,37 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker= %.0f)\n", LOD_cutoff); System.out.printf("METRICS -------------------------------------------------\n"); - System.out.printf("METRICS Total loci : %d\n", num_loci_total); - System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); + System.out.printf("METRICS Total loci : %d\n", num_loci_total); + System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total); if (num_variants != 0) { - System.out.printf("METRICS Number of Variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); - System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); + System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants); + System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants); + System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp); + System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp); + System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp); + System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp)); } System.out.println(); } diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index 87ffda954..4a54bcb8a 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -43,6 +43,10 @@ public class AlleleFrequencyEstimate { // to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1 long numNonrefBases = Math.round(qstar * N); long numRefBases = N - numNonrefBases; - return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases); + if (ref < alt) { // order bases alphabetically + return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases); + }else{ + return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases); + } } } diff --git a/python/Geli2GFF.py b/python/Geli2GFF.py new file mode 100755 index 000000000..928434be0 --- /dev/null +++ b/python/Geli2GFF.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +# Converts Geli files (genotype likelihood in picard) to GFF format for GATK + +import sys, os, subprocess + +Index2AffyChr = [] +Index2AffyChr.append("chrM") +for i in range(1,23): + Index2AffyChr.append("chr"+str(i)) +Index2AffyChr.append("chrX") +Index2AffyChr.append("chrY") + +if __name__ == "__main__": + if len(sys.argv) < 3: + print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY" + sys.exit() + else: + infile = sys.argv[1] + outdir = sys.argv[2] + + output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff" + outfile = open(output_file, "w") + + cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile + pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout + + pipe.readline() + for line in pipe: + if line[0] not in ('@', '#'): + fields = line.split("\t") + try: + contig = Index2AffyChr[int(fields[0])] + except KeyError, ValueError: + print "skipping "+fields[0] + continue # Skip entries not in chr 0 through 24 + + print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t" + + + + + + + + + +