diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index dff984e25..131d40bd6 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -10,6 +10,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.rodGFF; +import org.broadinstitute.sting.gatk.refdata.HapMapAlleleFrequenciesROD; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -26,7 +27,6 @@ import java.io.File; import java.io.PrintStream; import java.io.FileNotFoundException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; public class GenomeAnalysisTK extends CommandLineProgram { @@ -83,7 +83,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { * Used by the walker. */ public PrintStream err = System.err; - + /** * our log, which we want to capture anything from this class */ @@ -103,7 +103,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalArg("genome_region", "L", "Genome region to operation on: from chr:start-end", "REGION_STR"); m_parser.addRequiredArg("analysis_type", "T", "Type of analysis to run", "Analysis_Name"); m_parser.addOptionalArg("DBSNP", "D", "DBSNP file", "DBSNP_FILE"); - m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE"); + m_parser.addOptionalArg("hapmap", "H", "Hapmap file", "HAPMAP_FILE"); m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO"); m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE"); m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS"); @@ -113,7 +113,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI"); m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" ); m_parser.addOptionalArg("err", "e", "An error output file presented to the walker. Will overwrite contents if file exists.", "errFileName" ); - m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName"); + m_parser.addOptionalArg("outerr", "oe", "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", "outErrFileName"); } /** @@ -142,7 +142,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { @Override protected String getArgumentSourceName( Class argumentSource ) { - return WalkerManager.getWalkerName( (Class)argumentSource ); + return WalkerManager.getWalkerName( (Class)argumentSource ); } /** @@ -157,23 +157,27 @@ public class GenomeAnalysisTK extends CommandLineProgram { final boolean TEST_ROD = false; List rods = new ArrayList(); - if (TEST_ROD) { - ReferenceOrderedData gff = new ReferenceOrderedData(new File("single.gff"), rodGFF.class); + + + if ( TEST_ROD ) { + ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class ); gff.testMe(); //ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class ); - ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class); + ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("/Volumes/Users/mdepristo/broad/ATK/exampleSAMs/dbSNP_chr20.txt"), rodDbSNP.class ); //dbsnp.testMe(); rods.add(dbsnp); // { gff, dbsnp }; - } else if (DBSNP_FILE != null) { - ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class); - //dbsnp.testMe(); - rods.add(dbsnp); // { gff, dbsnp }; - } - - if (HAPMAP_FILE != null) { - ReferenceOrderedData gff = new ReferenceOrderedData(new File(HAPMAP_FILE), rodGFF.class); - rods.add(gff); + } else { + if ( DBSNP_FILE != null ) { + ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File(DBSNP_FILE), rodDbSNP.class ); + //dbsnp.testMe(); + rods.add(dbsnp); // { gff, dbsnp }; + } + if ( HAPMAP_FILE != null ) { + ReferenceOrderedData hapmap = new ReferenceOrderedData(new File(HAPMAP_FILE), HapMapAlleleFrequenciesROD.class ); + //dbsnp.testMe(); + rods.add(hapmap); // { gff, dbsnp }; + } } initializeOutputStreams(); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java new file mode 100644 index 000000000..4fdd2333a --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java @@ -0,0 +1,105 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; + +import java.util.List; +import java.util.Arrays; + +import edu.mit.broad.picard.util.SequenceUtil; + +/** + * ReferenceOrderedDatum class to hold HapMap AlleleFrequency Data + */ +public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { + public GenomeLoc loc; // genome location of SNP + // Reference sequence chromosome or scaffold + // Start and stop positions in chrom + + public String rsNumber; // dbsnp rsNumber for this site + + public String hgBuild; + + public char Strand; // strand of the supplied alleles + + public char refAllele; + public char varAllele; + + public double refFreq; + public double varFreq; + + + public String strand; // maybe we don't need these? + public String alleles; // maybe we don't need these? + public Integer refCounts; // maybe we don't need these? + public Integer varCounts; // maybe we don't need these? + public Integer totalCounts; // maybe we don't need these? + + + public GenomeLoc getLocation() { return loc; } + + public String toString() { + //rs11511647 HG18 chr10 62765 + T/C T C 21 97 0.178 0.822 118 + + return String.format( + "%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%1.3f\t%1.3f\t%d", + rsNumber, hgBuild, getContig(), getStart(), strand, alleles, refAllele, varAllele, + refCounts, varCounts, refFreq, varFreq, totalCounts); + } + + public String toSimpleString() { + return String.format("%s:%s:%s:%1.3f", rsNumber, alleles, strand, varFreq); + } + + public String repl() { + return toString(); + } + + public void parseLine(final String[] parts) { + try { + // rs11511647 <=> HG18 <=> chr10 <=> 62765 <=> + <=> T/C <=> T <=> C <=> 21 <=> 97 <=> 0.178 <=> 0.822 <=> 118 + + rsNumber = parts[0]; //rs# + hgBuild = parts[1]; // build + + String contig = parts[2]; // chrom + long start = Long.parseLong(parts[3]); // The final is 1 based + long stop = start; + + strand = parts[4]; // strand + alleles = parts[5]; //alleles + refAllele = parts[6].charAt(0); // ref_allele + varAllele = parts[7].charAt(0); // var_allele + refCounts = Integer.parseInt(parts[8]); // CEU_ref + varCounts = Integer.parseInt(parts[9]); // CEU_var + refFreq = Double.parseDouble(parts[10]); // CEU_ref_freq + varFreq = Double.parseDouble(parts[11]); // CEU_var_freq + totalCounts = Integer.parseInt(parts[12]); // CEU_var + + loc = new GenomeLoc(contig, start, stop); + + } catch ( RuntimeException e ) { + System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts)); + throw e; + } + } + + public double getVarAlleleFreq() { return this.varFreq; } + + public List getAllelesFWD() { + List alleleList; + if ( onFwdStrand() ) + alleleList = Arrays.asList(alleles.split("/")); + else + alleleList = Arrays.asList(SequenceUtil.reverseComplement(alleles).split("/")); + + return alleleList; + } + + public boolean onFwdStrand() { + return strand.equals("+"); + } + + + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java b/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java index 77774acc1..bf0aca249 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/PrepareROD.java @@ -8,10 +8,7 @@ import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; import edu.mit.broad.picard.reference.ReferenceSequenceFile; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; -import org.broadinstitute.sting.gatk.refdata.rodDbSNP; -import org.broadinstitute.sting.gatk.refdata.rodGFF; +import org.broadinstitute.sting.gatk.refdata.*; import java.io.*; import java.util.HashMap; @@ -36,6 +33,7 @@ public class PrepareROD extends CommandLineProgram { static { addModule("GFF", rodGFF.class); addModule("dbSNP", rodDbSNP.class); + addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class); } /** Required main method implementation. */