Added Hapmap data track (using rodGFF class for GFF file format) to toolkit as a command line option, Hapmap metrics to AlleleFrequencyMetricsWalker, and a python Geli2GFF file converter.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@163 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
f7363cf935
commit
9dee9ab51c
|
|
@ -34,6 +34,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
public String REGION_STR = null;
|
public String REGION_STR = null;
|
||||||
public String Analysis_Name = null;
|
public String Analysis_Name = null;
|
||||||
public String DBSNP_FILE = null;
|
public String DBSNP_FILE = null;
|
||||||
|
public String HAPMAP_FILE = null;
|
||||||
public Boolean ENABLED_THREADED_IO = false;
|
public Boolean ENABLED_THREADED_IO = false;
|
||||||
public Boolean UNSAFE = false;
|
public Boolean UNSAFE = false;
|
||||||
public Boolean ENABLED_SORT_ON_FLY = false;
|
public Boolean ENABLED_SORT_ON_FLY = false;
|
||||||
|
|
@ -66,6 +67,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR");
|
m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR");
|
||||||
m_parser.addRequiredlArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name");
|
m_parser.addRequiredlArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name");
|
||||||
m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE");
|
m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE");
|
||||||
|
m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE");
|
||||||
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
||||||
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
||||||
m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY");
|
m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY");
|
||||||
|
|
@ -85,7 +87,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
walkerManager = new WalkerManager(pluginPathName);
|
walkerManager = new WalkerManager(pluginPathName);
|
||||||
|
|
||||||
final boolean TEST_ROD = false;
|
final boolean TEST_ROD = false;
|
||||||
ReferenceOrderedData[] rods = null;
|
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
|
||||||
|
|
||||||
if (TEST_ROD) {
|
if (TEST_ROD) {
|
||||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
|
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
|
||||||
|
|
@ -94,13 +96,16 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
||||||
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class);
|
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class);
|
||||||
//dbsnp.testMe();
|
//dbsnp.testMe();
|
||||||
rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp };
|
rods.add(dbsnp); // { gff, dbsnp };
|
||||||
} else if (DBSNP_FILE != null) {
|
} else if (DBSNP_FILE != null) {
|
||||||
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class);
|
ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class);
|
||||||
//dbsnp.testMe();
|
//dbsnp.testMe();
|
||||||
rods = new ReferenceOrderedData[]{dbsnp}; // { gff, dbsnp };
|
rods.add(dbsnp); // { gff, dbsnp };
|
||||||
} else {
|
}
|
||||||
rods = new ReferenceOrderedData[]{}; // { gff, dbsnp };
|
|
||||||
|
if (HAPMAP_FILE != null) {
|
||||||
|
ReferenceOrderedData gff = new ReferenceOrderedData(new File(HAPMAP_FILE), rodGFF.class);
|
||||||
|
rods.add(gff);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
|
this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods);
|
||||||
|
|
|
||||||
|
|
@ -108,10 +108,10 @@ public class TraversalEngine {
|
||||||
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
* @param ref Reference file in FASTA format, assumes a .dict file is also available
|
||||||
* @param rods Array of reference ordered data sets
|
* @param rods Array of reference ordered data sets
|
||||||
*/
|
*/
|
||||||
public TraversalEngine(File reads, File ref, ReferenceOrderedData[] rods) {
|
public TraversalEngine(File reads, File ref, List<ReferenceOrderedData> rods) {
|
||||||
readsFile = reads;
|
readsFile = reads;
|
||||||
refFileName = ref;
|
refFileName = ref;
|
||||||
this.rods = Arrays.asList(rods);
|
this.rods = rods;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.playground.gatk.walkers;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
||||||
import org.broadinstitute.sting.gatk.walkers.BasicLociWalker;
|
import org.broadinstitute.sting.gatk.walkers.BasicLociWalker;
|
||||||
import org.broadinstitute.sting.gatk.LocusContext;
|
import org.broadinstitute.sting.gatk.LocusContext;
|
||||||
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker;
|
||||||
|
|
@ -25,6 +26,8 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
long num_loci_total=0;
|
long num_loci_total=0;
|
||||||
long num_loci_confident=0;
|
long num_loci_confident=0;
|
||||||
double LOD_cutoff = 5;
|
double LOD_cutoff = 5;
|
||||||
|
long hapmap_tp=0;
|
||||||
|
long hapmap_fp=0;
|
||||||
|
|
||||||
AlleleFrequencyWalker caller;
|
AlleleFrequencyWalker caller;
|
||||||
|
|
||||||
|
|
@ -32,17 +35,6 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
{
|
{
|
||||||
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context);
|
AlleleFrequencyEstimate alleleFreq = caller.map(rodData, ref, context);
|
||||||
|
|
||||||
boolean is_dbSNP_SNP = false;
|
|
||||||
|
|
||||||
for ( ReferenceOrderedDatum datum : rodData )
|
|
||||||
{
|
|
||||||
if ( datum != null && datum instanceof rodDbSNP)
|
|
||||||
{
|
|
||||||
rodDbSNP dbsnp = (rodDbSNP)datum;
|
|
||||||
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
num_loci_total += 1;
|
num_loci_total += 1;
|
||||||
|
|
||||||
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
|
if (Math.abs(alleleFreq.LOD) >= LOD_cutoff) { num_loci_confident += 1; }
|
||||||
|
|
@ -53,6 +45,36 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
|
|
||||||
num_variants += 1;
|
num_variants += 1;
|
||||||
|
|
||||||
|
boolean is_dbSNP_SNP = false;
|
||||||
|
boolean hapmap_hit = false;
|
||||||
|
|
||||||
|
for ( ReferenceOrderedDatum datum : rodData )
|
||||||
|
{
|
||||||
|
if ( datum != null )
|
||||||
|
{
|
||||||
|
if ( datum instanceof rodDbSNP )
|
||||||
|
{
|
||||||
|
rodDbSNP dbsnp = (rodDbSNP)datum;
|
||||||
|
if (dbsnp.isSNP()) is_dbSNP_SNP = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( datum instanceof rodGFF )
|
||||||
|
{
|
||||||
|
rodGFF hapmap = (rodGFF) datum;
|
||||||
|
String hapmap_genotype = hapmap.getFeature();
|
||||||
|
String called_genotype = alleleFreq.asString();
|
||||||
|
System.out.format("HAPMAP %s %s %.2f\n", hapmap_genotype, called_genotype, alleleFreq.getLOD());
|
||||||
|
// There is a hapmap site here, is it correct?
|
||||||
|
if (hapmap_genotype.equals(called_genotype))
|
||||||
|
{
|
||||||
|
hapmap_tp++;
|
||||||
|
} else {
|
||||||
|
hapmap_fp++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (is_dbSNP_SNP)
|
if (is_dbSNP_SNP)
|
||||||
{
|
{
|
||||||
dbsnp_hits += 1;
|
dbsnp_hits += 1;
|
||||||
|
|
@ -73,8 +95,12 @@ public class AlleleFrequencyMetricsWalker extends BasicLociWalker<AlleleFrequenc
|
||||||
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
System.out.printf("METRICS Total called with confidence : %d (%.2f%%)\n", num_loci_confident, 100.0 * (float)num_loci_confident / (float)num_loci_total);
|
||||||
if (num_variants != 0)
|
if (num_variants != 0)
|
||||||
{
|
{
|
||||||
System.out.printf("METRICS Number of Variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
System.out.printf("METRICS Number of variants : %d (%.2f%%) (1/%d)\n", num_variants, 100.0 * (float)num_variants / (float)num_loci_confident, num_loci_confident / num_variants);
|
||||||
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
System.out.printf("METRICS Fraction of variant sites in dbSNP : %.2f%%\n", 100.0 * (float)dbsnp_hits / (float)num_variants);
|
||||||
|
System.out.printf("METRICS Number of variants with hapmap calls : %d\n", hapmap_tp + hapmap_fp);
|
||||||
|
System.out.printf("METRICS Hapmap true positives : %d\n", hapmap_tp);
|
||||||
|
System.out.printf("METRICS Hapmap false positives : %d\n", hapmap_fp);
|
||||||
|
System.out.printf("METRICS Hapmap precision : %.2f%%\n", 100.0 * (float)hapmap_tp / (float)(hapmap_tp + hapmap_fp));
|
||||||
}
|
}
|
||||||
System.out.println();
|
System.out.println();
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,10 @@ public class AlleleFrequencyEstimate {
|
||||||
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
|
// to use qstar, but make N (number of chormosomes) switch to n (number of reads at locus) for n=1
|
||||||
long numNonrefBases = Math.round(qstar * N);
|
long numNonrefBases = Math.round(qstar * N);
|
||||||
long numRefBases = N - numNonrefBases;
|
long numRefBases = N - numNonrefBases;
|
||||||
|
if (ref < alt) { // order bases alphabetically
|
||||||
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
|
return AlleleFrequencyWalker.repeat(ref, numRefBases) + AlleleFrequencyWalker.repeat(alt, numNonrefBases);
|
||||||
|
}else{
|
||||||
|
return AlleleFrequencyWalker.repeat(alt, numNonrefBases) + AlleleFrequencyWalker.repeat(ref, numRefBases);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Converts Geli files (genotype likelihood in picard) to GFF format for GATK
|
||||||
|
|
||||||
|
import sys, os, subprocess
|
||||||
|
|
||||||
|
Index2AffyChr = []
|
||||||
|
Index2AffyChr.append("chrM")
|
||||||
|
for i in range(1,23):
|
||||||
|
Index2AffyChr.append("chr"+str(i))
|
||||||
|
Index2AffyChr.append("chrX")
|
||||||
|
Index2AffyChr.append("chrY")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
if len(sys.argv) < 3:
|
||||||
|
print "usage: Geli2GFF.py INPUT_FILE OUTPUT_DIRECTORY"
|
||||||
|
sys.exit()
|
||||||
|
else:
|
||||||
|
infile = sys.argv[1]
|
||||||
|
outdir = sys.argv[2]
|
||||||
|
|
||||||
|
output_file = outdir+"/"+os.path.splitext(os.path.basename(infile))[0]+".gff"
|
||||||
|
outfile = open(output_file, "w")
|
||||||
|
|
||||||
|
cmd = "/seq/software/picard/current/bin/GeliToText.jar I="+infile
|
||||||
|
pipe = subprocess.Popen(cmd, shell=True, bufsize=4000, stdout=subprocess.PIPE).stdout
|
||||||
|
|
||||||
|
pipe.readline()
|
||||||
|
for line in pipe:
|
||||||
|
if line[0] not in ('@', '#'):
|
||||||
|
fields = line.split("\t")
|
||||||
|
try:
|
||||||
|
contig = Index2AffyChr[int(fields[0])]
|
||||||
|
except KeyError, ValueError:
|
||||||
|
print "skipping "+fields[0]
|
||||||
|
continue # Skip entries not in chr 0 through 24
|
||||||
|
|
||||||
|
print >>outfile, contig+"\tHapmapGenotype\t"+fields[5]+"\t"+fields[1]+"\t"+str(int(fields[1])+1)+"\t.\t.\t.\t"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in New Issue