From 782e0018e4fc338c917ae0943c9c90868636373e Mon Sep 17 00:00:00 2001 From: aaron Date: Wed, 15 Sep 2010 22:54:49 +0000 Subject: [PATCH] removal of most of the old GATK ROD system; also a fix for -Dsingle so we can again run just a single unit or integration test (single tests in tribble can be run with the -DsingleTest option now). More to come. *** Three integration tests had to change: *** RecalibarationWalkersIntegrationTest: One of the tests was using the interval as the snp track, and wasn't supplying a DbSNP track (for CountCovariates) SequenomValidationConverterIntegrationTest: relies on Plink ROD which we've removed. PileupWalkerIntegrationTest: we no longer have implicit interval tracks, so there isn't a rod name over the specified region. Otherwise the same result. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4292 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisEngine.java | 9 +- .../ReferenceOrderedDataSource.java | 11 +- .../refdata/BasicReferenceOrderedDatum.java | 46 -- .../sting/gatk/refdata/IntervalRod.java | 40 -- .../gatk/refdata/IntervalRodIterator.java | 56 -- .../sting/gatk/refdata/PlinkRod.java | 579 ------------------ .../sting/gatk/refdata/TabularROD.java | 358 ----------- .../gatk/refdata/VariantContextAdaptors.java | 164 ----- .../refdata/features/table/TableCodec.java | 63 ++ .../refdata/features/table/TableFeature.java | 56 ++ .../sting/gatk/refdata/rodGELI.java | 162 ----- .../sting/gatk/refdata/tracks/RMDTrack.java | 114 +++- .../gatk/refdata/tracks/RMDTrackManager.java | 183 ------ .../gatk/refdata/tracks/RODRMDTrack.java | 82 --- .../gatk/refdata/tracks/TribbleTrack.java | 142 ----- .../tracks/builders/RMDTrackBuilder.java | 396 +++++++++++- .../tracks/builders/RODTrackBuilder.java | 107 ---- .../builders/TribbleRMDTrackBuilder.java | 334 ---------- .../gatk/walkers/annotator/Alignability.java | 42 -- .../coverage/DepthOfCoverageWalker.java | 4 +- .../indels/IndelGenotyperV2Walker.java | 4 +- .../walkers/sequenom/PickSequenomProbes.java | 11 +- .../sequenom/SequenomValidationConverter.java | 5 - .../walkers/DbSNPWindowCounter.java | 4 +- .../walkers/DesignFileGeneratorWalker.java | 2 +- .../walkers/GCCalculatorWalker.java | 2 - .../walkers/IndelAnnotator.java | 4 +- .../varianteval/VariantEvaluatorBySample.java | 1 - .../CoverageAcrossBaitsWalker.java | 192 ------ .../HybSelPerformanceWalker.java | 320 ---------- .../utils/report/utils/ComplexDataUtils.java | 2 - .../utils/report/utils/NodeUtils.java | 2 - .../gatk/GATKExtensionsGenerator.java | 6 +- .../queue/extensions/gatk/RodBindField.java | 8 +- .../ReferenceOrderedViewUnitTest.java | 31 +- .../ReferenceOrderedDataPoolUnitTest.java | 27 +- .../sting/gatk/refdata/PlinkRodUnitTest.java | 247 -------- .../gatk/refdata/TabularRODUnitTest.java | 236 ------- .../tracks/RMDTrackManagerUnitTest.java | 137 ----- .../builders/IndexPerformanceTests.java | 274 --------- ...Test.java => RMDTrackBuilderUnitTest.java} | 12 +- .../walkers/PileupWalkerIntegrationTest.java | 2 +- .../RecalibrationWalkersIntegrationTest.java | 5 +- ...nomValidationConverterIntegrationTest.java | 7 +- .../report/templates/TextTableUnitTest.java | 1 - .../utils/genotype/vcf/VCFWriterUnitTest.java | 4 +- 46 files changed, 664 insertions(+), 3830 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java create mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/rodGELI.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManager.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/tracks/RODRMDTrack.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java delete mode 100644 java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/Alignability.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/CoverageAcrossBaitsWalker.java delete mode 100755 java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java delete mode 100755 java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodUnitTest.java delete mode 100755 java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java delete mode 100644 java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java delete mode 100644 java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java rename java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/{TribbleRMDTrackBuilderUnitTest.java => RMDTrackBuilderUnitTest.java} (96%) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index c88de62c9..755b58c36 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -30,9 +30,11 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.*; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; + import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.sample.SampleFileParser; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -50,7 +52,6 @@ import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.*; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager; import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; @@ -381,12 +382,8 @@ public class GenomeAnalysisEngine { referenceDataSource = openReferenceSequenceFile(argCollection.referenceFile); if (argCollection.DBSNPFile != null) bindConvenienceRods(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, "dbsnp", argCollection.DBSNPFile); - // TODO: The ROD iterator currently does not understand multiple intervals file. Fix this by cleaning the ROD system. - if (argCollection.intervals != null && argCollection.intervals.size() == 1) { - bindConvenienceRods("interval", "Intervals", argCollection.intervals.get(0).replaceAll(",", "")); - } - RMDTrackManager manager = new RMDTrackManager(); + RMDTrackBuilder manager = new RMDTrackBuilder(); List tracks = manager.getReferenceMetaDataSources(this,argCollection.RODBindings); validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java index f67ed96f6..4c0d3b4ac 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataSource.java @@ -3,9 +3,8 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broad.tribble.FeatureSource; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.FlashBackIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; @@ -51,7 +50,7 @@ public class ReferenceOrderedDataSource implements SimpleDataSource { public ReferenceOrderedDataSource( Walker walker, RMDTrack rod) { this.rod = rod; if (rod.supportsQuery()) - iteratorPool = new ReferenceOrderedQueryDataPool(new TribbleRMDTrackBuilder(), (TribbleTrack)rod); + iteratorPool = new ReferenceOrderedQueryDataPool(new RMDTrackBuilder(),rod); else iteratorPool = new ReferenceOrderedDataPool( walker, rod ); } @@ -112,7 +111,7 @@ class ReferenceOrderedDataPool extends ResourcePool %d%n", getLocation(), that.getLocation(), getLocation().compareTo(that.getLocation())); - return getLocation().compareTo(that.getLocation()); - } - - public Object initialize(final File source) throws FileNotFoundException { - //System.out.printf("Initialize called with %s%n", source); - return null; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java deleted file mode 100755 index 63ee1a62b..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRod.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.io.File; -import java.io.IOException; - -public class IntervalRod extends BasicReferenceOrderedDatum { - private GenomeLoc loc = null; - - public IntervalRod(String name) { - super(name); - } - - public IntervalRod(String name, GenomeLoc loc) { - super(name); - this.loc = loc; - } - - public GenomeLoc getLocation() { return loc; } - - /** Required by ReferenceOrderedDatum interface; this method does nothing (always returns false), - * since this rod provides its own iterator for reading underlying data files. - */ - public boolean parseLine(Object header, String[] parts) { - return false; // this rod has its own iterator - } - - public String repl() { - throw new ReviewedStingException("repl() is not implemented yet"); - } - - public String toSimpleString() { return loc.toString(); } - public String toString() { return toSimpleString(); } - - public static IntervalRodIterator createIterator(String trackName, File f) throws IOException { - return IntervalRodIterator.IntervalRodIteratorFromLocsFile(trackName, f); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java deleted file mode 100755 index 944aca7f6..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/IntervalRodIterator.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import java.io.File; -import java.util.*; - -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.interval.IntervalUtils; - -public class IntervalRodIterator implements Iterator { - //private List locations = null; - private Iterator iter; - private String trackName = null; - - //public static RODIterator IntervalRodIteratorFromLocs(List locs) { - // IntervalRodIterator it = new IntervalRodIterator("interval", locs); - // return new RODIterator(it); - //} - - public static IntervalRodIterator IntervalRodIteratorFromLocsFile(final String trackName, final File file) { - //System.out.printf("Parsing %s for intervals %s%n", file, trackName); - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(IntervalUtils.parseIntervalArguments(Collections.singletonList(file.getPath())), - GenomeAnalysisEngine.instance.getArguments().intervalMerging); - //System.out.printf(" => got %d entries %n", locs.size()); - return new IntervalRodIterator(trackName, locs); - } - - public IntervalRodIterator(String trackName, GenomeLocSortedSet locs) { - this.trackName = trackName; - //locations = locs; - iter = locs.iterator(); - } - - @Override - public boolean hasNext() { - return iter.hasNext(); - } - - /** - * @return the next element in the iteration. - * @throws NoSuchElementException - iterator has no more elements. - */ - @Override - public IntervalRod next() { - if (!this.hasNext()) throw new NoSuchElementException("IntervalRodIterator next called on iterator with no more elements"); - IntervalRod r = new IntervalRod(trackName, iter.next()); - //System.out.printf("IntervalRod next is %s%n", r); - return r; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java b/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java deleted file mode 100644 index 3b879b232..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/PlinkRod.java +++ /dev/null @@ -1,579 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl, ebanks - * - */ -public class PlinkRod extends BasicReferenceOrderedDatum implements Iterator { - - public static final String SEQUENOM_NO_CALL = "0"; - public static final String SEQUENOM_NO_BASE = "-"; - - private final Set headerEntries = new HashSet(Arrays.asList("#Family ID","Individual ID","Sex", - "Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE","#Individual ID")); - private final byte SNP_MAJOR_MODE = 1; - - private PlinkVariantInfo currentVariant; - private ListIterator variantIterator; - - private PlinkFileType plinkFileType; - - private List sampleNames; - - private String[] fileHeader; - - public enum PlinkFileType { - STANDARD_PED, RAW_PED, BINARY_PED - } - - public PlinkRod(String name) { - super(name); - } - - public PlinkRod(String name, PlinkVariantInfo record, ListIterator iter, List sampleNames) { - super(name); - currentVariant = record; - variantIterator = iter; - this.sampleNames = sampleNames; - } - - @Override - public Object initialize(final File plinkFile) throws FileNotFoundException { - if ( ! plinkFile.exists() ) { - throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist."); - } - - ArrayList variants = parsePlinkFile(plinkFile); - if ( variants == null ) - throw new IllegalStateException("Error parsing Plink file"); - - variantIterator = variants.listIterator(); - return null; - } - - public static PlinkRod createIterator(String name, File file) { - PlinkRod plink = new PlinkRod(name); - try { - plink.initialize(file); - } catch (FileNotFoundException e) { - throw new ReviewedStingException("Unable to find file " + file); - } - return plink; - } - - private void assertNotNull() { - if ( currentVariant == null ) { - throw new IllegalStateException("Current variant information is null"); - } - } - - public boolean hasNext() { - return variantIterator.hasNext(); - } - - public PlinkRod next() { - if ( !this.hasNext() ) - throw new NoSuchElementException("PlinkRod next called on iterator with no more elements"); - - // get the next record - currentVariant = variantIterator.next(); - return new PlinkRod(name, currentVariant, variantIterator, sampleNames); - } - - @Override - public boolean parseLine(Object obj, String[] args) { - throw new UnsupportedOperationException("PlinkRod does not support the parseLine method"); - } - - public void remove() { - throw new UnsupportedOperationException("The remove operation is not supported for a PlinkRod"); - } - - @Override - public GenomeLoc getLocation() { - assertNotNull(); - return currentVariant.getLocation(); - } - - @Override - public String toString() { - assertNotNull(); - return currentVariant.toString(); - } - - public String getVariantName() { - assertNotNull(); - return currentVariant.getName(); - - } - - /* Get the mapping from sample name to genotypes (array of Alleles). - * Important note: none of the Alleles returned here are annotated as reference - * (since the rod doesn't know offhand what the reference allele is). - * - * @return mapping from sample name to genotype - */ - public Map getGenotypes() { - return currentVariant.getGenotypes(); - } - - public List getSampleNames() { - return sampleNames; - } - - public boolean isIndel() { - assertNotNull(); - return currentVariant.isIndel(); - } - - public boolean isInsertion() { - assertNotNull(); - return currentVariant.isInsertion(); - } - - public int getLength() { - assertNotNull(); - return currentVariant.getLength(); - } - - -// AM I PARSING A TEXT OR A BINARY FILE ?? - - private ArrayList parsePlinkFile(File file) { - String[] splitFileName = file.getName().split("\\."); - String extension = splitFileName[splitFileName.length-1]; - if ( extension.equals("ped") || extension.equals("raw") ) { - return parseTextFormattedPlinkFile(file); - } else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) { - plinkFileType = PlinkFileType.BINARY_PED; - return parseBinaryFormattedPlinkFile(file); - } else { - System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format"); - return parseTextFormattedPlinkFile(file); - } - - } - - /* *** *** *** *** *** ** *** ** *** ** *** ** *** ** *** - * * PARSING STANDARD TEXT PED FILES * ** - * *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/ - - private ArrayList parseTextFormattedPlinkFile( File file ) { - try { - BufferedReader reader = new BufferedReader( new FileReader ( file ) ); - ArrayList seqVars = new ArrayList(); - int headerFieldCount = instantiateVariantListFromHeader(seqVars, reader.readLine()); - - sampleNames = new ArrayList(); - - String line; - do { - line = reader.readLine(); - incorporateInfo(seqVars, line, headerFieldCount); - } while ( line != null ); - - - java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these - // are sorted in standard reference order - - return seqVars; - - } catch ( FileNotFoundException e ) { - throw new ReviewedStingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e); - } catch ( IOException e ) { - throw new ReviewedStingException("Error reading file "+file.getAbsolutePath()+".",e); - } - } - - private void incorporateInfo(List vars, String plinkLine, int headerFieldCount) { - if ( plinkLine == null ) { - return; - } - - if ( plinkFileType != PlinkFileType.STANDARD_PED ) - throw new ReviewedStingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); - - StringTokenizer st = new StringTokenizer(plinkLine, "\t"); - int offset = 0; - String sample = st.nextToken(); - while ( ! fileHeader[offset].equals("Individual ID") && ! fileHeader[offset].equals("#Individual ID") ) { - sample = st.nextToken(); // kill nonstandard tokens - offset ++; - } - - sampleNames.add(sample); - for (int i = offset+1; i < headerFieldCount ; i++) - st.nextToken(); - - int snpNumber = 0; - while ( snpNumber < vars.size() ) { - vars.get(snpNumber++).addGenotypeEntry(st.nextToken().split("\\s+")); - } - } - - private int instantiateVariantListFromHeader(ArrayList seqVars, String header) { - // if the first line is not a comment (what we're used to seeing), - // then it's the raw header (comes from de-binary-ing a .bed file) - if ( !header.startsWith("#") ) - throw new ReviewedStingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file."); - - plinkFileType = PlinkFileType.STANDARD_PED; - - String[] headerFields = header.split("\t"); - fileHeader = headerFields; - int skippedFields = 0; - for ( String field : headerFields ) { - if ( headerEntries.contains(field) ) - skippedFields++; - else - // not a standard header, so a variant - seqVars.add(new PlinkVariantInfo(field)); - } - - return skippedFields; - } - - /* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** - * * PARSING BINARY PLINK BED/BIM/FAM FILES * * - * *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/ - - private ArrayList parseBinaryFormattedPlinkFile(File file) { - PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file); - ArrayList parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile); - sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile); - ArrayList updatedVariants = getGenotypesFromBedFile(parsedVariants, binaryFiles.bedFile); - - java.util.Collections.sort(updatedVariants); - - return updatedVariants; - } - - private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) { - // just gonna parse the name - PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta(); - String absolute_path = file.getAbsolutePath(); - String[] directory_tree = absolute_path.split("/"); - String file_name = directory_tree[directory_tree.length-1].split("\\.")[0]; - StringBuilder pathBuilder = new StringBuilder(); - for ( int i = 0; i < directory_tree.length - 1; i ++ ) { - pathBuilder.append(String.format("%s/",directory_tree[i])); - } - String path = pathBuilder.toString(); - trifecta.bedFile = new File(path+file_name+".bed"); - trifecta.bimFile = new File(path+file_name+".bim"); - trifecta.famFile = new File(path+file_name+".fam"); - - return trifecta; - - } - - private ArrayList instantiateVariantsFromBimFile(File bimFile) { - BufferedReader reader; - try { - reader = new BufferedReader( new FileReader( bimFile )); - } catch ( FileNotFoundException e) { - throw new ReviewedStingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+ - "Please ensure that it is in the same directory as the .bed and .fam files. The file we "+ - "Were looking for was "+bimFile.getAbsolutePath(),e); - } - - ArrayList variants = new ArrayList(); - - try { - String line = reader.readLine(); - while ( line != null ) { - String[] snpInfo = line.split("\\s+"); - PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1]); - variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[3]), Long.valueOf(snpInfo[3]))); - variant.setAlleles(snpInfo[4],snpInfo[5]); - variants.add(variant); - - line = reader.readLine(); - } - } catch ( IOException e ) { - throw new ReviewedStingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e); - } - - return variants; - } - - private static ArrayList getSampleNameOrderingFromFamFile(File famFile) { - BufferedReader reader; - try { - reader = new BufferedReader( new FileReader( famFile )); - } catch ( FileNotFoundException e) { - throw new ReviewedStingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+ - "Please ensure that it is in the same directory as the .bed and .bim files. The file we "+ - "Were looking for was "+famFile.getAbsolutePath(),e); - } - - ArrayList sampleNames = new ArrayList(); - - try { - String line; - do { - line = reader.readLine(); - if ( line != null ) { - sampleNames.add(line.split("\\s+")[1]); - } - } while ( line != null ); - } catch (IOException e) { - throw new ReviewedStingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e); - } - - return sampleNames; - } - - private ArrayList getGenotypesFromBedFile(ArrayList variants, File bedFile) { - FileInputStream inStream; - try { - inStream = new FileInputStream(bedFile); - } catch (FileNotFoundException e) { - throw new ReviewedStingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+ - "Please ensure that it is in the same directory as the .bim and .fam files. The file we "+ - "Were looking for was "+bedFile.getAbsolutePath(),e); - } - - try { - byte genotype; - long bytesRead = 0; - int snpOffset = 0; - int sampleOffset = 0; - boolean snpMajorMode = true; - do { - genotype = (byte) inStream.read(); - bytesRead++; - if ( genotype != -1 ) { - if ( bytesRead > 3 ) { - addGenotypeByte(genotype,variants,snpOffset,sampleOffset, snpMajorMode); - - if ( snpMajorMode ) { - sampleOffset = sampleOffset + 4; - if ( sampleOffset > sampleNames.size() -1 ) { - snpOffset ++; - sampleOffset = 0; - } - } else { - snpOffset = snpOffset + 4; - if ( snpOffset > variants.size() -1 ) { - sampleOffset ++; - snpOffset = 0; - } - } - - } else { - if ( bytesRead == 3) { - snpMajorMode = genotype == SNP_MAJOR_MODE; - } - } - } - } while ( genotype != -1 ); - } catch ( IOException e) { - throw new ReviewedStingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e); - } - - return variants; - } - - private void addGenotypeByte(byte genotype, ArrayList variants, int snpOffset, int sampleOffset, boolean major) { - // four genotypes encoded in this byte - int[] genotypes = parseGenotypes(genotype); - for ( int g : genotypes ) { - variants.get(snpOffset).addBinaryGenotypeEntry(g); - - if ( major ) { - sampleOffset++; - if ( sampleOffset > sampleNames.size()-1 ) {// using offsets for comparison; size 5 == offset 4 - return; - } - } else { - snpOffset++; - if( snpOffset > variants.size()-1 ) { - return; - } - } - } - } - - private int[] parseGenotypes(byte genotype) { - int[] genotypes = new int[4]; - genotypes[0] = ( genotype & 3 ); - genotypes[1] = ( ( genotype & 12 ) >>> 2 ); - genotypes[2] = ( ( genotype & 48 ) >>> 4 ); - genotypes[3] = ( ( genotype & 192 ) >>> 6 ); - return genotypes; - } - - class PlinkVariantInfo implements Comparable { - - private String variantName; - private GenomeLoc loc; - - // the list of genotypes in the same order as in sampleNames (using a map here is inefficient) - private List genotypes = new ArrayList(); - - // map of Alleles seen (so that we can share Allele objects among samples) - HashMap alleles = new HashMap(4); - - // for indels - private boolean isIndel = false; - private boolean isInsertion = false; - private int length = 1; - - // for binary parsing - private String locAllele1; - private String locAllele2; - - - public PlinkVariantInfo(String variantName) { - this.variantName = variantName; - parseName(); - } - - public GenomeLoc getLocation() { - return loc; - } - - public String getName() { - return variantName; - } - - public Map getGenotypes() { - Map genotypeMap = new HashMap(); - int index = 0; - for ( Allele[] myAlleles : genotypes ) - genotypeMap.put(sampleNames.get(index++), myAlleles); - return genotypeMap; - } - - public boolean isIndel() { - return isIndel; - } - - public boolean isInsertion() { - return isInsertion; - } - - public int getLength() { - return length; - } - - public void setGenomeLoc(GenomeLoc loc) { - this.loc = loc; - } - - private void parseName() { - int chromIdx = variantName.indexOf("|c"); - if ( chromIdx == -1 ) - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c...)"); - String[] pieces = variantName.substring(chromIdx+2).split("_"); - if ( pieces.length < 2 ) - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); - - String chrom = pieces[0].trim(); - if ( pieces[1].charAt(0) != 'p' ) - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p...)"); - - String pos = pieces[1].substring(1).trim(); - loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos); - - if ( pieces.length > 2 && (pieces[2].startsWith("gI") || pieces[2].startsWith("gD")) ) { - // it's an indel - isIndel = true; - isInsertion = pieces[2].startsWith("gI"); - try { - // length of insertion on reference is still 1 - if ( !isInsertion ) - length = Integer.parseInt(pieces[2].substring(2)); - } catch (NumberFormatException e) { - throw new IllegalArgumentException("Variant name " + variantName + " does not adhere to required convention (...|c..._p..._g[I/D][length])"); - } - } - } - - public void setAlleles(String al1, String al2) { - if ( al1.equals(PlinkRod.SEQUENOM_NO_CALL) ) { - // encoding for a site at which no variants were detected - locAllele1 = al2; - } else { - locAllele1 = al1; - } - locAllele2 = al2; - } - - public void addGenotypeEntry(String[] alleleStrings) { - - Allele[] myAlleles = new Allele[2]; - - for (int i = 0; i < 2; i++) { - if ( alleleStrings.length <= i ) { - myAlleles[i] = null; - continue; - } - - String alleleString = alleleStrings[i]; - - Allele allele; - if ( alleles.containsKey(alleleString) ) { - allele = alleles.get(alleleString); - } else { - if ( PlinkRod.SEQUENOM_NO_CALL.equals(alleleString) ) - allele = Allele.NO_CALL; - else - allele = Allele.create(alleleString); - alleles.put(alleleString, allele); - } - - myAlleles[i] = allele; - } - - genotypes.add(myAlleles); - } - - public void addBinaryGenotypeEntry(int genoTYPE) { - String[] alleleStr = new String[2]; - if ( genoTYPE == 0 ) { - alleleStr[0] = locAllele1; - alleleStr[1] = locAllele1; - } else if (genoTYPE == 2) { - alleleStr[0] = locAllele1; - alleleStr[1] = locAllele2; - } else if (genoTYPE == 3 ) { - alleleStr[0] = locAllele2; - alleleStr[1] = locAllele2; - } else { - alleleStr[0] = "0"; - alleleStr[1] = "0"; - } - - addGenotypeEntry(alleleStr); - } - - public int compareTo(Object obj) { - if ( ! ( obj instanceof PlinkVariantInfo) ) { - return 1; - } - - return loc.compareTo(((PlinkVariantInfo) obj).getLocation()); - } - } -} - -class PlinkBinaryTrifecta { - - public PlinkBinaryTrifecta() {} - - public File bedFile; - public File bimFile; - public File famFile; - -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java deleted file mode 100755 index 43a237c53..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - -import java.util.*; -import java.util.regex.Pattern; -import java.util.regex.Matcher; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.apache.log4j.Logger; - -/** - * Class for representing arbitrary reference ordered data sets - * - * User: mdepristo - * Date: Feb 27, 2009 - * Time: 10:47:14 AM - * - * System for interacting with tabular formatted data of the following format: - * - * # comment line - * # must include HEADER KEYWORD - * HEADER COL1 ... COLN - * chr:pos data1 ... dataN - * - * The system supports the rod interface. You can just access tabularRODs through the normal ROD system. - * - * You can also write your own files, as such: - * - * ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); - * assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); - * String rowData = String.format("%d %d %d", 1, 2, 3); - * TabularROD row = new TabularROD("myName", header, GenomeLoc.parseGenomeLoc("chrM", 1), rowData.split(" ")); - * assertTrue(row.toString().equals("chrM:1\t1\t2\t3")); - */ -public class TabularROD extends BasicReferenceOrderedDatum implements Map { - private static Logger logger = Logger.getLogger(TabularROD.class); - - protected GenomeLoc loc; - private HashMap attributes; - private ArrayList header; - - public static String DEFAULT_DELIMITER = "\t"; - public static String DEFAULT_DELIMITER_REGEX = "\\s+"; - - public static String DELIMITER = DEFAULT_DELIMITER; - public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX; - - private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000; - private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*((HEADER)|(loc)|(rs#)).*"); - private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - - private static int parsedRecords = 0; - final static boolean printRecordsParsed = false; - - /** - * Set the global tabular ROD delimiter and the regex to split the delimiter. - * - * The delimiter to put between fields, while the regex is used to split lines - * - * @param delimiter - * @param delimeterRegex - */ - public static void setDelimiter(final String delimiter, final String delimeterRegex) { - DELIMITER = delimiter; - DELIMITER_REGEX = delimeterRegex; - } - - /** - * Returns a parsable string representation for the - * @param header - */ - public static String headerString(final ArrayList header) { - requireGoodHeader(header); - return Utils.join(DELIMITER, header); - } - - /** - * Returns a comment line containing the *single line* string msg - * - * @param msg - * @return - */ - public static String commentString(final String msg) { - return "# " + msg; - } - - private static boolean headerIsGood(final ArrayList header) { - if ( header.size() == 0 ) return false; - //if ( ! header.get(0).equals("HEADER") ) return false; - return true; - } - - private static void requireGoodHeader(final ArrayList header) { - if ( ! headerIsGood(header) ) - throw new RuntimeException("Header must begin with HEADER keyword"); - } - - // ---------------------------------------------------------------------- - // - // Constructors - // - // ---------------------------------------------------------------------- - public TabularROD(final String name) { - super(name); - attributes = new HashMap(); - } - - public TabularROD(final String name, ArrayList header) { - this(name); - this.header = header; - } - - /** - * Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data - * must be bound to each corresponding header[i] field before the object is really usable. - * - * @param name - * @param header - * @param loc - */ - public TabularROD(final String name, ArrayList header, GenomeLoc loc) { - this(name); - this.header = header; - this.loc = loc; - requireGoodHeader(this.header); - } - - /** - * Make a new TabularROD with name, using header columns header, at loc, with data associated with the - * header columns. data and header are assumed to be in the same order, and bindings will be established - * from header[i] = data[i]. The TabularROD at this stage can be printed, manipulated, it is considered - * a full fledged, initialized object. - * - * @param name - * @param header - * @param loc - * @param data - */ - public TabularROD(final String name, ArrayList header, GenomeLoc loc, String[] data) { - this(name, header, loc); - - if ( header.size() != data.length + 1 ) - throw new RuntimeException(String.format("Incorrect tabular data format: header has %d columns but %d data elements were provided: %s", - header.size(), data.length, Utils.join("\t", data))); - for ( int i = 0; i < data.length; i++ ) { - put(header.get(i+1), data[i]); - } - } - - /** - * Walks through the source files looking for the header line, which it returns as a - * list of strings. - * - * @param source - * @return - */ - public Object initialize(final File source) throws FileNotFoundException { - return readHeader(source); - } - - /** - * By default, all records (i.e. RODs) must contain the same number of fields as specified - * by the header. Subclasses should override this method to disable this requirement. - * - * @return true if incomplete records are allowed; false otherwise - */ - public boolean allowIncompleteRecords() { - return false; - } - - public static ArrayList readHeader(final File source) throws FileNotFoundException { - ArrayList header = null; - int linesLookedAt = 0; - XReadLines reader = new XReadLines(source); - - for ( String line : reader ) { - Matcher m = HEADER_PATTERN.matcher(line); - if ( m.matches() ) { - //System.out.printf("Found a header line: %s%n", line); - header = new ArrayList(Arrays.asList(line.split(DELIMITER_REGEX))); - //System.out.printf("HEADER IS %s%n", Utils.join(":", header)); - } - - if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER ) - break; - } - - // check that we found the header - if ( header != null ) { - logger.debug(String.format("HEADER IS %s%n", Utils.join(":", header))); - } else { - // use the indexes as the header fields - logger.debug("USING INDEXES FOR ROD HEADER"); - // reset if necessary - if ( !reader.hasNext() ) - reader = new XReadLines(source); - header = new ArrayList(); - int tokens = reader.next().split(DELIMITER_REGEX).length; - for ( int i = 0; i < tokens; i++) - header.add(Integer.toString(i)); - } - - try { - reader.close(); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - - return header; - } - - - // ---------------------------------------------------------------------- - // - // ROD accessors - // - // ---------------------------------------------------------------------- - public GenomeLoc getLocation() { - if ( loc != null ) - return loc; - String s = get(header.get(0)); - if ( s == null ) - return null; - return GenomeLocParser.parseGenomeLoc(s); - } - - public ArrayList getHeader() { - return header; - } - - public String get(final Object key) { - return attributes.get(key); - } - - public String put(final String key, final String object) { - return attributes.put(key, object); - } - - public boolean containsKey(final Object key) { - return attributes.containsKey(key); - } - - public HashMap getAttributes() { - return attributes; - } - - public String getAttributeString() { - List strings = new ArrayList(attributes.size()); - for ( String key : header ) { - if ( containsKey(key) ) { // avoid the header - strings.add(this.get(key)); - //System.out.printf("Adding %s%n", this.get(key)); - } - } - return Utils.join("\t", strings); - } - - // ---------------------------------------------------------------------- - // - // map functions - // - // ---------------------------------------------------------------------- - public int size() { return attributes.size(); } - public boolean isEmpty() { return attributes.isEmpty(); } - public boolean containsValue(Object o) { return attributes.containsValue(o); } - public String remove(Object o) { return attributes.remove(o); } - public void clear() { attributes.clear(); } - public java.util.Set keySet() { return attributes.keySet(); } - public java.util.Collection values() { return attributes.values(); } - - public void putAll(java.util.Map map) { - attributes.putAll(map); - } - - public java.util.Set> entrySet() { - return attributes.entrySet(); - } - - // ---------------------------------------------------------------------- - // - // formatting - // - // ---------------------------------------------------------------------- - public String toString() { - if ( loc != null ) - return String.format("%s\t%s", loc, getAttributeString()); - return String.format("%s", getAttributeString()); - } - - /** - * The delimiter regular expression that should be used to separate fields in data rows - * and header. - * - * @return - */ - public String delimiterRegex() { - return DELIMITER_REGEX; - } - - /** - * Used by ROD management system to set the data in this ROD associated with a line in a rod - * - * @param headerObj - * @param parts - * @return - * @throws IOException - */ - public boolean parseLine(final Object headerObj, final String[] parts) throws IOException { - header = (ArrayList)(headerObj); - - //System.out.printf("parts [len=%d] is '%s'%n", parts.length, Utils.join(":", parts)); - - if ( parts.length == 0 || COMMENT_PATTERN.matcher(parts[0]).matches() || HEADER_PATTERN.matcher(parts[0]).matches() ) - return false; - - if ( !allowIncompleteRecords() && header.size() != parts.length) { - throw new IOException(String.format("Header length %d not equal to Tabular parts length %d", header.size(), parts.length)); - } - - for ( int i = 0; i < parts.length; i++ ) { - put(header.get(i), parts[i]); - } - - if ( printRecordsParsed ) System.out.printf("Parsed %d records %s%n", ++parsedRecords, this); - - return true; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 462aa3d3e..89d332608 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -42,10 +42,8 @@ public class VariantContextAdaptors { static { adaptors.put(DbSNPFeature.class, new DBSnpAdaptor()); - adaptors.put(PlinkRod.class, new PlinkRodAdaptor()); adaptors.put(HapMapFeature.class, new HapMapAdaptor()); adaptors.put(GeliTextFeature.class, new GeliTextAdaptor()); - adaptors.put(rodGELI.class, new GeliAdaptor()); adaptors.put(VariantContext.class, new VariantContextAdaptor()); } @@ -134,96 +132,6 @@ public class VariantContextAdaptors { return new VCFHeader(hInfo == null ? new HashSet() : hInfo, names); } - // -------------------------------------------------------------------------------------------------------------- - // - // Plink to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class PlinkRodAdaptor extends VCAdaptor { -// VariantContext convert(String name, Object input) { -// return convert(name, input, null); -// } - - VariantContext convert(String name, Object input, ReferenceContext ref) { - if ( ref == null ) - throw new UnsupportedOperationException("Conversion from Plink to VariantContext requires a reference context"); - - PlinkRod plink = (PlinkRod)input; - - HashSet VCAlleles = new HashSet(); - Allele refAllele = determineRefAllele(plink, ref); - VCAlleles.add(refAllele); - - // mapping from Plink Allele to VC Allele (since the PlinkRod does not annotate any of the Alleles as reference) - HashMap plinkAlleleToVCAllele = new HashMap(); - - Set genotypes = new HashSet(); - - Map genotypeSets = plink.getGenotypes(); - - // We need to iterate through this list and recreate the Alleles since the - // PlinkRod does not annotate any of the Alleles as reference. - // for each sample... - for ( Map.Entry genotype : genotypeSets.entrySet() ) { - ArrayList myVCAlleles = new ArrayList(2); - - // for each allele... - for ( Allele myPlinkAllele : genotype.getValue() ) { - Allele VCAllele; - if ( myPlinkAllele.isNoCall() ) { - VCAllele = Allele.NO_CALL; - } else { - if ( plinkAlleleToVCAllele.containsKey(myPlinkAllele) ) { - VCAllele = plinkAlleleToVCAllele.get(myPlinkAllele); - } else { - if ( !plink.isIndel() ) { - VCAllele = Allele.create(myPlinkAllele.getBases(), refAllele.equals(myPlinkAllele, true)); - } else if ( myPlinkAllele.isNull() ) { - VCAllele = Allele.create(Allele.NULL_ALLELE_STRING, plink.isInsertion()); - } else { - VCAllele = Allele.create(myPlinkAllele.getBases(), !plink.isInsertion()); - } - plinkAlleleToVCAllele.put(myPlinkAllele, VCAllele); - VCAlleles.add(VCAllele); - } - } - - myVCAlleles.add(VCAllele); - } - - // create the genotype - genotypes.add(new Genotype(genotype.getKey(), myVCAlleles)); - } - - // create the variant context - try { - GenomeLoc loc = GenomeLocParser.setStop(plink.getLocation(), plink.getLocation().getStop() + plink.getLength()-1); - VariantContext vc = VariantContextUtils.toVC(plink.getVariantName(), loc, VCAlleles, genotypes); - return vc; - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; please make sure that e.g. a sample isn't present more than one time in your ped file"); - } - } - - private Allele determineRefAllele(PlinkRod plink, ReferenceContext ref) { - Allele refAllele; - if ( !plink.isIndel() ) { - refAllele = Allele.create(ref.getBase(), true); - } else if ( plink.isInsertion() ) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - } else { - long maxLength = ref.getWindow().getStop() - ref.getLocus().getStop(); - if ( plink.getLength() > maxLength ) - throw new UnsupportedOperationException("Plink conversion currently can only handle indels up to length " + maxLength); - refAllele = deletionAllele(ref, 1, plink.getLength()); - - } - return refAllele; - } - } - - // -------------------------------------------------------------------------------------------------------------- // // GELI to VariantContext @@ -291,78 +199,6 @@ public class VariantContextAdaptors { } } - // -------------------------------------------------------------------------------------------------------------- - // - // GELI to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class GeliAdaptor extends VCAdaptor { - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @return a VariantContext object - */ -// VariantContext convert(String name, Object input) { -// return convert(name, input, null); -// } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @param ref the reference context - * @return a VariantContext object - */ - VariantContext convert(String name, Object input, ReferenceContext ref) { - GenotypeLikelihoods geli = ((rodGELI)input).getGeliRecord(); - if ( ! Allele.acceptableAlleleBases(String.valueOf((char)geli.getReferenceBase()),true) ) - return null; - Allele refAllele = Allele.create(String.valueOf((char)geli.getReferenceBase()), true); - - // add the reference allele - List alleles = new ArrayList(); - alleles.add(refAllele); - - // add the two alt alleles - if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele1()),false)) { - return null; - } - Allele allele1 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele1()), false); - if (!alleles.contains(allele1) && !allele1.equals(refAllele, true)) alleles.add(allele1); - - // add the two alt alleles - if (!Allele.acceptableAlleleBases(String.valueOf((char) geli.getBestGenotype().getAllele2()),false)) { - return null; - } - Allele allele2 = Allele.create(String.valueOf((char) geli.getBestGenotype().getAllele2()), false); - if (!alleles.contains(allele2) && !allele2.equals(refAllele, true)) alleles.add(allele2); - - - Map attributes = new HashMap(); - Collection genotypes = new ArrayList(); - MutableGenotype call = new MutableGenotype(name, alleles); - - // setup the genotype likelihoods - double[] post = new double[10]; - String[] gTypes = {"AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT"}; - for (int x = 0; x < 10; x++) - post[x] = Double.valueOf(geli.getLikelihood(DiploidGenotype.fromBases((byte) gTypes[x].charAt(0), (byte) gTypes[x].charAt(1)))); - - // set the likelihoods, depth, and RMS mapping quality values - //call.putAttribute(CalledGenotype.POSTERIORS_ATTRIBUTE_KEY, post); - //call.putAttribute(GeliTextWriter.MAXIMUM_MAPPING_QUALITY_ATTRIBUTE_KEY, (int) geli.getMaxMappingQuality()); - //call.putAttribute(GeliTextWriter.READ_COUNT_ATTRIBUTE_KEY, geli.getNumReads()); - - // add the call to the genotype list, and then use this list to create a VariantContext - genotypes.add(call); - VariantContext vc = VariantContextUtils.toVC(name, ((rodGELI) input).getLocation(), alleles, genotypes, geli.getBestToReferenceLod(), null, attributes); - return vc; - - } - } - // -------------------------------------------------------------------------------------------------------------- // // HapMap to VariantContext diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java new file mode 100644 index 000000000..c24eb94e6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableCodec.java @@ -0,0 +1,63 @@ +package org.broadinstitute.sting.gatk.refdata.features.table; + +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.readers.LineReader; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.io.IOException; +import java.util.*; + +/** + * implementation of a simple table (tab or comma delimited format) input files... more improvements to come + */ +public class TableCodec implements FeatureCodec { + private String delimiterRegex = "\\s+"; + private String headerDelimiter = "HEADER"; + private String commentDelimiter = "#"; + private ArrayList header = new ArrayList(); + + @Override + public Feature decodeLoc(String line) { + return decode(line); + } + + @Override + public Feature decode(String line) { + if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter)) + return null; + String[] split = line.split(delimiterRegex); + if (split.length < 1) + throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); + + + return new TableFeature(GenomeLocParser.parseGenomeLoc(split[0]),Arrays.asList(split),header); + } + + @Override + public Class getFeatureType() { + return TableFeature.class; + } + + @Override + public Object readHeader(LineReader reader) { + String line = ""; + try { + while ((line = reader.readLine()) != null) { + if (line.startsWith(headerDelimiter)) { + if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines. The second is = " + line); + String spl[] = line.split(delimiterRegex); + for (String s : spl) header.add(s); + } else if (!line.startsWith(commentDelimiter)) { + break; + } + } + } catch (IOException e) { + throw new UserException.MalformedFile("unable to parse header from TableCodec file",e); + } + return header; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java new file mode 100644 index 000000000..6ba983ebc --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/features/table/TableFeature.java @@ -0,0 +1,56 @@ +package org.broadinstitute.sting.gatk.refdata.features.table; + +import org.broad.tribble.Feature; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.util.*; + +/** + * A feature representing a single row out of a text table + */ +public class TableFeature implements Feature { + // stores the values for the columns seperated out + private final List values; + + // if we have column names, we store them here + private final List keys; + + // our location + private final GenomeLoc position; + + public TableFeature(GenomeLoc position, List values, List keys) { + this.values = values; + this.keys = keys; + this.position = position; + } + + @Override + public String getChr() { + return position.getContig(); + } + + @Override + public int getStart() { + return (int)position.getStart(); + } + + @Override + public int getEnd() { + return (int)position.getStop(); + } + + public String getValue(int columnPosition) { + if (columnPosition >= values.size()) throw new IllegalArgumentException("We only have " + values.size() + "columns, the requested column = " + columnPosition); + return values.get(columnPosition); + } + + public String get(String columnName) { + int position = keys.indexOf(columnName); + if (position < 0) throw new IllegalArgumentException("We don't have a column named " + columnName); + return values.get(position); + } + + public GenomeLoc getLocation() { + return this.position; + } +} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodGELI.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodGELI.java deleted file mode 100755 index ed83035c1..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodGELI.java +++ /dev/null @@ -1,162 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; - -import java.util.*; -import java.io.IOException; -import java.io.File; - -import edu.mit.broad.picard.genotype.geli.GeliFileReader; -import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; -import edu.mit.broad.picard.genotype.DiploidGenotype; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import net.sf.samtools.util.CloseableIterator; - - -/** - * This class wraps Picard Geli CHiP data and presents it as a ROD. - */ - -public class rodGELI extends BasicReferenceOrderedDatum { - // ---------------------------------------------------------------------- - // - // Constructors - // - // ---------------------------------------------------------------------- - - private GenotypeLikelihoods gh = null; - - public rodGELI(final String name, GenotypeLikelihoods gh) { - super(name); - this.gh = gh; - } - - @Override - public GenomeLoc getLocation() { - return GenomeLocParser.createGenomeLoc(gh.getSequenceIndex(), gh.getPosition()); - } - - public GenotypeLikelihoods getGeliRecord() { - return gh; - } - - /** Required by ReferenceOrderedDatum interface. This implementation provides its own iterator, - * so this method does nothing at all (always returns false). - * - */ - @Override - public boolean parseLine(Object header, String[] parts) throws IOException { - return false; - } - - @Override - public String toString() { - final StringBuilder builder = new StringBuilder(); - builder.append(gh.getSequenceName() + "\t"); - builder.append(gh.getPosition() + "\t"); - builder.append(Character.toString((char)(gh.getReferenceBase() & 0xff)) + "\t"); - builder.append(gh.getNumReads() + "\t"); - builder.append(gh.getMaxMappingQuality() + "\t"); - builder.append(gh.getBestGenotype().name() + "\t"); - builder.append(gh.getBestToReferenceLod() + "\t"); - builder.append(gh.getBestToSecondBestLod() + "\t"); - builder.append("\t"); // no dbSNP info in GenotypeLikelihoods class - for (final DiploidGenotype genotype : DiploidGenotype.values()) - builder.append(gh.getLikelihood(genotype) + "\t"); - builder.append("\n"); - return builder.toString(); - } - - private static class rodGELIIterator implements Iterator { - - private String rodName = null; - private GeliFileReader parser = null; - private CloseableIterator iterator = null; - - rodGELIIterator(String name, File f) { - rodName = name; - parser = new GeliFileReader(f); - iterator = parser.iterator(); - } - - public boolean hasNext() { - return iterator.hasNext(); - } - /** - * @return the next element in the iteration. - * @throws NoSuchElementException - iterator has no more elements. - */ - public rodGELI next() { - if (!this.hasNext()) throw new NoSuchElementException("RodGELI next called on iterator with no more elements"); - return new rodGELI(rodName, iterator.next()); - } - - public void remove() { - throw new UnsupportedOperationException("'remove' operation is not supported for GELIs"); - } - - } - - public static Iterator createIterator(String name, File file) { - return new rodGELI.rodGELIIterator(name,file); - } - - /**** - * We no longer want to use this class as truth data (sigh) - * - public List getFWDAlleles() { - return new ArrayList(); - } - - public String getFWDRefBases() { - return ""; - } - - public char getRef() { - return (char)(gh.getReferenceBase() & 0xff); - } - - public boolean isPointGenotype() { return true; } - public boolean isIndelGenotype() { return false; } - public boolean isSNP() { return true; } - public boolean isReference() { return gh.isHomozygousReference(); } - public boolean isInsertion() { return false; } - public boolean isDeletion() { return false; } - public boolean isIndel() { return false; } - public boolean isBiallelic() { return false; } - public boolean isHom() { return gh.isHomozyous(); } - public boolean isHet() { return !isHom(); } - - public double getVariantConfidence() { - return gh.getBestToReferenceLod(); - } - - public double getConsensusConfidence() { - return gh.getBestToSecondBestLod(); - } - * - */ - - public static void main(String argv[]) { - String testFile = "NA12878.geli"; - - Iterator it = createIterator("test-geli", new File(testFile)); - - net.sf.picard.reference.ReferenceSequenceFileWalker reference = new net.sf.picard.reference.ReferenceSequenceFileWalker(new File( "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); - - if ( reference.getSequenceDictionary() == null ) { - System.out.println("No reference sequence dictionary found. Abort."); - System.exit(1); - } - - GenomeLocParser.setupRefContigOrdering(reference.getSequenceDictionary()); - - int counter = 0; - - while ( it.hasNext() && counter < 500 ) { - rodGELI rg = it.next(); - System.out.println(rg.toString()); - counter++; - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java index 1dea7fbf0..6801842a5 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrack.java @@ -25,9 +25,16 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.util.CloseableIterator; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.FeatureSource; +import org.broad.tribble.source.BasicFeatureSource; +import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.File; +import java.io.IOException; import java.lang.reflect.Type; import java.util.Iterator; @@ -39,7 +46,7 @@ import java.util.Iterator; *

* the basics of what a reference metadata track must contain. */ -public abstract class RMDTrack { +public class RMDTrack { // the basics of a track: private final Class type; // our type @@ -47,20 +54,14 @@ public abstract class RMDTrack { private final String name; // the name private final File file; // the associated file we create the reader from - /** - * Create a track - * - * @param type the type of track, used for track lookup - * @param recordType the type of record produced - * @param name the name of this specific track - * @param file the associated file, for reference or recreating the reader - */ - protected RMDTrack(Class type, Class recordType, String name, File file) { - this.type = type; - this.recordType = recordType; - this.name = name; - this.file = file; - } + // our feature reader - allows queries + private BasicFeatureSource reader; + + // our sequence dictionary, which can be null + private final SAMSequenceDictionary dictionary; + + // our codec type + private final FeatureCodec codec; public Class getType() { return type; @@ -78,12 +79,6 @@ public abstract class RMDTrack { return recordType; } - /** - * @return how to get an iterator of the underlying data. This is all a track has to support, - * but other more advanced tracks support the query interface - */ - public abstract CloseableIterator getIterator(); - /** * helper function for determining if we are the same track based on name and record type * @@ -96,21 +91,90 @@ public abstract class RMDTrack { return (name.equals(this.name) && (type.getClass().isAssignableFrom(this.type.getClass()))); } + + /** + * Create a track + * + * @param type the type of track, used for track lookup + * @param name the name of this specific track + * @param file the associated file, for reference or recreating the reader + * @param reader the feature reader to use as the underlying data source + * @param dict the sam sequence dictionary + * @param codec the feature codec we use to decode this type + */ + public RMDTrack(Class type, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict, FeatureCodec codec) { + this.type = type; + this.recordType = codec.getFeatureType(); + this.name = name; + this.file = file; + this.reader = reader; + this.dictionary = dict; + this.codec = codec; + } + + /** + * @return how to get an iterator of the underlying data. This is all a track has to support, + * but other more advanced tracks support the query interface + */ + public CloseableIterator getIterator() { + try { + return new FeatureToGATKFeatureIterator(reader.iterator(),this.getName()); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e); + } + } + /** * do we support the query interface? - * @return true if we can be cast to the QueryableTrack interface + * + * @return true */ - public abstract boolean supportsQuery(); + public boolean supportsQuery() { + return true; + } + + public CloseableIterator query(GenomeLoc interval) throws IOException { + return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName()); + } + + public CloseableIterator query(GenomeLoc interval, boolean contained) throws IOException { + return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName()); + } + + public CloseableIterator query(String contig, int start, int stop) throws IOException { + return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName()); + } + + public CloseableIterator query(String contig, int start, int stop, boolean contained) throws IOException { + return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName()); + } + + public void close() { + try { + reader.close(); + } catch (IOException e) { + throw new UserException.MalformedFile("Unable to close reader " + reader.toString(),e); + } + reader = null; + } + + public FeatureSource getReader() { + return reader; + } /** * get the sequence dictionary from the track, if available * @return a SAMSequenceDictionary if available, null if unavailable */ public SAMSequenceDictionary getSequenceDictionary() { - return null; // default, others can override this + return dictionary; } public Object getHeader() { - return null; + return reader.getHeader(); + } + + public FeatureCodec getCodec() { + return codec; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManager.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManager.java deleted file mode 100644 index b96ff102c..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManager.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.util.*; - -/** - * Find the available track builders, and create the requisite tracks from the command line. - * - * In Tribble RMD tracks have two classes: - * - a Feature that is the model/view for the data - * - a Codec that is the controller to generate the Feature. - * - * In this class, the track types are the Codecs. The track record types are the Features. - */ -public class RMDTrackManager extends PluginManager { - // the input strings we use to create RODs from - List inputs = new ArrayList(); - - // create an active mapping of builder instances, and a map of the name -> class for convenience - /** the tracks that are available to us, associated with their builder */ - Map availableTrackBuilders; - /** the classes names, with their class description (think the Controller Codecs) */ - Map availableTrackTypes; - /** the available track record types (think the Model/View Features) */ - Map availableTrackRecordTypes; - - /** Create a new track plugin manager. */ - public RMDTrackManager() { - super(RMDTrackBuilder.class, "TrackBuilders", null); - } - - /** - * find the associated reference meta data - * - * @param bindings the bindings of strings from the -B command line option - * - * @return a list of RMDTracks, one for each -B option - */ - public List getReferenceMetaDataSources(GenomeAnalysisEngine engine,List bindings) { - initializeTrackTypes(); - initializeBindings(engine,bindings); - // try and make the tracks given their requests - return createRequestedTrackObjects(); - } - - - /** - * Returns a collection of track names that match the record type. - * @param trackRecordType the record type specified in the @RMD annotation - * @return a collection of available track record type names that match the record type - */ - public Collection getTrackRecordTypeNames(Class trackRecordType) { - initializeTrackTypes(); - initializeTrackRecordTypes(); - Set names = new TreeSet(); - for (Map.Entry availableTrackRecordType: availableTrackRecordTypes.entrySet()) { - if (trackRecordType.isAssignableFrom(availableTrackRecordType.getValue())) - names.add(availableTrackRecordType.getKey()); - } - return names; - } - - /** - * initialize our lists of bindings - * @param engine The engine, used to populate tags. - * @param bindings the input to the GATK, as a list of strings passed in through the -B options - */ - private void initializeBindings(GenomeAnalysisEngine engine,List bindings) { - // NOTE: Method acts as a static. Once the inputs have been passed once they are locked in. - if (inputs.size() > 0 || bindings.size() == 0) - return; - - for (String binding: bindings) { - if(engine != null && engine.getTags(binding).size() == 2) { - // Assume that if tags are present, those tags are name and type. - // Name is always first, followed by type. - List parameters = engine.getTags(binding); - String name = parameters.get(0); - String type = parameters.get(1); - inputs.add(new RMDTriplet(name,type,binding)); - } - else { - // Otherwise, use old-format bindings. - String[] split = binding.split(","); - if (split.length != 3) throw new IllegalArgumentException(binding + " is not a valid reference metadata track description"); - inputs.add(new RMDTriplet(split[0], split[1], split[2])); - } - } - } - - /** - * initialize our lists of tracks and builders - */ - private void initializeTrackTypes() { - if (availableTrackBuilders != null && availableTrackTypes != null) - return; - - // create an active mapping of builder instances, and a map of the name -> class for convenience - availableTrackBuilders = new HashMap(); - availableTrackTypes = new HashMap(); - createBuilderObjects(); - } - - /** - * create the builder objects from the retrieved list - */ - private void createBuilderObjects() { - // create a track builder instance for each track builder, and find out what tracks we can make - for (String builderName : this.pluginsByName.keySet()) { - RMDTrackBuilder builder = this.createByName(builderName); - Map mapping = builder.getAvailableTrackNamesAndTypes(); - for (String name : mapping.keySet()) { - availableTrackBuilders.put(name.toUpperCase(), builder); - availableTrackTypes.put(name.toUpperCase(), mapping.get(name)); - } - } - } - - /** - * initialize our list of track record types - */ - private void initializeTrackRecordTypes() { - if (availableTrackRecordTypes != null) - return; - - availableTrackRecordTypes = new HashMap(); - for (RMDTrackBuilder builder : availableTrackBuilders.values()) { - Map mapping = builder.getAvailableTrackNamesAndRecordTypes(); - for (String name : mapping.keySet()) { - availableTrackRecordTypes.put(name.toUpperCase(), mapping.get(name)); - } - } - } - - /** - * create the requested track objects - * - * @return a list of the tracks, one for each of the requested input tracks - */ - private List createRequestedTrackObjects() { - // create of live instances of the tracks - List tracks = new ArrayList(); - - // create instances of each of the requested types - for (RMDTriplet trip : inputs) { - RMDTrackBuilder b = availableTrackBuilders.get(trip.getType().toUpperCase()); - if (b == null) throw new UserException.CommandLineException("Unable to find track for " + trip.getType()); - tracks.add(b.createInstanceOfTrack(availableTrackTypes.get(trip.getType().toUpperCase()), trip.getName(), new File(trip.getFile()))); - } - return tracks; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RODRMDTrack.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RODRMDTrack.java deleted file mode 100644 index eee6093bc..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RODRMDTrack.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; - -import java.io.File; -import java.util.Iterator; - - - -/** - * - * @author aaron - * - * Class RODRMDTrack - * - * wrap a reference ordered data object in the new track style. This track will hopefully be phased-out as we move to - * a FeatureReader based system. - */ -public class RODRMDTrack extends RMDTrack { - - // our ROD - private ReferenceOrderedData data; - - /** - * Create a track - * - * @param type the type of track, used for track lookup - * @param name the name of this specific track - * @param file the associated file, for reference or recreating the reader - * @param data the ROD to use as the underlying data source for this track - */ - public RODRMDTrack(Class type, String name, File file, ReferenceOrderedData data) { - super(type, type, name, file); // the decoder type and the record type are the same in this case - this.data = data; - } - - /** - * @return how to get an iterator of the underlying data. This is all a track has to support, - * but other more advanced tracks support the query interface - */ - @Override - public CloseableIterator getIterator() { - return new GATKFeatureIterator(data.iterator()); - } - - /** - * do we support the query interface? - * - * @return false - */ - @Override - public boolean supportsQuery() { - return false; // sadly no, we don't - } -} - diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java deleted file mode 100644 index 4890bb9a0..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/TribbleTrack.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.util.CloseableIterator; -import org.broad.tribble.FeatureCodec; -import org.broad.tribble.FeatureSource; -import org.broad.tribble.source.BasicFeatureSource; -import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; - - -/** - * - * @author aaron - * - * Class TribbleTrack - * - * A feature reader track, implementing the RMDTrack for tracks that are generated out of Tribble - */ -public class TribbleTrack extends RMDTrack implements QueryableTrack { - // our feature reader - allows queries - private BasicFeatureSource reader; - - // our sequence dictionary, which can be null - private final SAMSequenceDictionary dictionary; - - // our codec type - private final FeatureCodec codec; - /** - * Create a track - * - * @param type the type of track, used for track lookup - * @param name the name of this specific track - * @param file the associated file, for reference or recreating the reader - * @param reader the feature reader to use as the underlying data source - * @param dict the sam sequence dictionary - * @param codec the feature codec we use to decode this type - */ - public TribbleTrack(Class type, String name, File file, BasicFeatureSource reader, SAMSequenceDictionary dict, FeatureCodec codec) { - super(type, codec.getFeatureType(), name, file); - this.reader = reader; - this.dictionary = dict; - this.codec = codec; - } - - /** - * @return how to get an iterator of the underlying data. This is all a track has to support, - * but other more advanced tracks support the query interface - */ - @Override - public CloseableIterator getIterator() { - try { - return new FeatureToGATKFeatureIterator(reader.iterator(),this.getName()); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e); - } - } - - /** - * do we support the query interface? - * - * @return true - */ - @Override - public boolean supportsQuery() { - return true; - } - - public CloseableIterator query(GenomeLoc interval) throws IOException { - return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName()); - } - - public CloseableIterator query(GenomeLoc interval, boolean contained) throws IOException { - return new FeatureToGATKFeatureIterator(reader.query(interval.getContig(),(int)interval.getStart(),(int)interval.getStop()),this.getName()); - } - - public CloseableIterator query(String contig, int start, int stop) throws IOException { - return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName()); - } - - public CloseableIterator query(String contig, int start, int stop, boolean contained) throws IOException { - return new FeatureToGATKFeatureIterator(reader.query(contig,start,stop),this.getName()); - } - - public void close() { - try { - reader.close(); - } catch (IOException e) { - throw new ReviewedStingException("Unable to close reader " + reader.toString(),e); - } - reader = null; - } - - public FeatureSource getReader() { - return reader; - } - - /** - * get the sequence dictionary from the track, if available - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public SAMSequenceDictionary getSequenceDictionary() { - return dictionary; - } - - public Object getHeader() { - return reader.getHeader(); - } - - public FeatureCodec getCodec() { - return codec; - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index 17b778f45..8ed3d2c7f 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -1,5 +1,6 @@ /* - * Copyright (c) 2010. The Broad Institute + * Copyright (c) 2010 The Broad Institute + * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -11,41 +12,88 @@ * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.refdata.tracks.builders; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broad.tribble.*; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.source.BasicFeatureSource; +import org.broad.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; +import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.file.FSLockWithShared; +import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; -import java.io.File; -import java.util.Map; - +import java.io.*; +import java.util.*; /** - * @author aaron - *

- * Interface RMDTrackBuilder - *

- * The basic interface for finding and parsing RMDTracks. Track builders present an interface that allows - * the track manager to find and create tracks of the specified type. + * + * @author aaron + * + * Class RMDTrackBuilder + * + * This class keeps track of the available codecs, and knows how to put together a track of + * that gets iterators from the FeatureReader using Tribble. + * */ -public interface RMDTrackBuilder { +public class RMDTrackBuilder extends PluginManager { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - /** @return a list of all available tracks types we currently have access to create */ - public Map getAvailableTrackNamesAndTypes(); + // the input strings we use to create RODs from + List inputs = new ArrayList(); + + // the linear index extension + public static final String indexExtension = ".idx"; + + private Map classes = null; + + /** Create a new plugin manager. */ + public RMDTrackBuilder() { + super(FeatureCodec.class, "Codecs", "Codec"); + } + + /** @return a list of all available track types we currently have access to create */ + public Map getAvailableTrackNamesAndTypes() { + classes = new HashMap(); + for (String name: this.pluginsByName.keySet()) { + classes.put(name.toUpperCase(), pluginsByName.get(name)); + } + return classes; + } /** @return a list of all available track record types we currently have access to create */ - public Map getAvailableTrackNamesAndRecordTypes(); + public Map getAvailableTrackNamesAndRecordTypes() { + HashMap classToRecord = new HashMap(); + for (String name: this.pluginsByName.keySet()) { + FeatureCodec codec = this.createByName(name); + classToRecord.put(name, codec.getFeatureType()); + } + return classToRecord; + } /** * create a RMDTrack of the specified type @@ -55,7 +103,317 @@ public interface RMDTrackBuilder { * @param inputFile the input file * * @return an instance of the track - * @throws RMDTrackCreationException if we don't know of the target class or we couldn't create it + * @throws RMDTrackCreationException + * if we don't know of the target class or we couldn't create it */ - public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException; + public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException { + // return a feature reader track + Pair pair = createFeatureReader(targetClass, name, inputFile); + if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); + return new RMDTrack(targetClass, name, inputFile, pair.first, pair.second, createCodec(targetClass, name)); + } + + /** + * create a tribble feature reader class, given the target class and the input file + * @param targetClass the target class, of a Tribble Codec type + * @param inputFile the input file, that corresponds to the feature type + * @return a pair of + */ + public Pair createFeatureReader(Class targetClass, File inputFile) { + return createFeatureReader(targetClass, "anonymous", inputFile); + } + + /** + * create a feature reader of the specified type + * @param targetClass the target codec type + * @param name the target name + * @param inputFile the input file to create the track from (of the codec type) + * @return the FeatureReader instance + */ + public Pair createFeatureReader(Class targetClass, String name, File inputFile) { + Pair pair; + if (inputFile.getAbsolutePath().endsWith(".gz")) + pair = createBasicFeatureSourceNoAssumedIndex(targetClass, name, inputFile); + else + pair = getLinearFeatureReader(targetClass, name, inputFile); + return pair; + } + + /** + * create a feature reader, without assuming there exists an index. This code assumes the feature + * reader of the appropriate type will figure out what the right index type is, and determine if it + * exists. + * + * @param targetClass the codec class type + * @param name the name of the track + * @param inputFile the file to load + * @return a feature reader implementation + */ + private Pair createBasicFeatureSourceNoAssumedIndex(Class targetClass, String name, File inputFile) { + // we might not know the index type, try loading with the default reader constructor + logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); + try { + return new Pair(BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create feature reader from file", e); + } + } + + /** + * add a name to the codec, if it takes one + * @param targetClass the class to create a codec for + * @param name the name to assign this codec + * @return the feature codec itself + */ + private FeatureCodec createCodec(Class targetClass, String name) { + FeatureCodec codex = this.createByType(targetClass); + if ( codex instanceof NameAwareCodec ) + ((NameAwareCodec)codex).setName(name); + return codex; + } + + /** + * create a linear feature reader, where we create the index ahead of time + * @param targetClass the target class + * @param name the name of the codec + * @param inputFile the tribble file to parse + * @return the input file as a FeatureReader + */ + private Pair getLinearFeatureReader(Class targetClass, String name, File inputFile) { + Pair reader; + try { + Index index = loadIndex(inputFile, createCodec(targetClass, name), true); + reader = new Pair(new BasicFeatureSource(inputFile.getAbsolutePath(), + index, + createCodec(targetClass, name)), + sequenceSetToDictionary(index.getSequenceNames())); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create reader with file", e); + } catch (IOException e) { + throw new UserException("Unable to make the index file for " + inputFile, e); + } + return reader; + } + + /** + * create an index for the input file + * @param inputFile the input file + * @param codec the codec to use + * @param onDisk write the index to disk? + * @return a linear index for the specified type + * @throws IOException if we cannot write the index file + */ + public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException { + + // create the index file name, locking on the index file name + File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension); + FSLockWithShared lock = new FSLockWithShared(indexFile); + + // acquire a lock on the file + Index idx = null; + if (indexFile.canRead()) + idx = attemptIndexFromDisk(inputFile, codec, indexFile, lock); + + // if we managed to make an index, return + if (idx != null) return idx; + + // we couldn't read the file, or we fell out of the conditions above, continue on to making a new index + return createNewIndex(inputFile, codec, onDisk, indexFile, lock); + } + + /** + * attempt to read the index from disk + * @param inputFile the input file + * @param codec the codec to read from + * @param indexFile the index file itself + * @param lock the lock file + * @return an index, or null if we couldn't load one + * @throws IOException if we fail for FS issues + */ + protected static Index attemptIndexFromDisk(File inputFile, FeatureCodec codec, File indexFile, FSLockWithShared lock) throws IOException { + boolean locked; + try { + locked = lock.sharedLock(); + } + catch(FileSystemInabilityToLockException ex) { + throw new UserException.MissortedFile(inputFile, "Unexpected inability to lock exception", ex); + } + Index idx; + try { + if (!locked) // can't lock file + idx = createIndexInMemory(inputFile, codec); + else + idx = loadFromDisk(inputFile, indexFile); + } finally { + if (locked) lock.unlock(); + } + return idx; + } + + /** + * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) + * @param inputFile the input file + * @param indexFile the input file, plus the index extension + * @return an Index, or null if we're unable to load + */ + public static Index loadFromDisk(File inputFile, File indexFile) { + logger.info("Loading Tribble index from disk for file " + inputFile); + Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + // check if the file is up-to date (filestamp and version check) + if (index.isCurrentVersion() && indexFile.lastModified() > inputFile.lastModified()) + return index; + else if (indexFile.lastModified() < inputFile.lastModified()) + logger.warn("Index file " + indexFile + " is out of date (index older than input file), deleting and updating the index file"); + else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable + logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file"); + + // however we got here, remove the index and return null + boolean deleted = indexFile.delete(); + + if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); + return null; + } + + + /** + * attempt to create the index, and write it to disk + * @param inputFile the input file + * @param codec the codec to use + * @param onDisk if they asked for disk storage or now + * @param indexFile the index file location + * @param lock the locking object + * @return the index object + * @throws IOException when unable to create the new index + */ + private static Index createNewIndex(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, FSLockWithShared lock) throws IOException { + Index index = createIndexInMemory(inputFile, codec); + + boolean locked = false; // could we exclusive lock the file? + try { + locked = lock.exclusiveLock(); + if (locked) { + logger.info("Writing Tribble index to disk for file " + inputFile); + LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + index.write(stream); + stream.close(); + } + else // we can't write it to disk, just store it in memory, tell them this + if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only"); + return index; + } + catch(FileSystemInabilityToLockException ex) { + throw new UserException.MissortedFile(inputFile,"Unexpected inability to lock exception", ex); + } + finally { + if (locked) lock.unlock(); + } + + } + + /** + * create the index in memory, given the input file and feature codec + * @param inputFile the input file + * @param codec the codec + * @return a LinearIndex, given the file location + * @throws IOException when unable to create the index in memory + */ + private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException { + // this can take a while, let them know what we're doing + logger.info("Creating Tribble index in memory for file " + inputFile); + return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + } + + /** + * convert a list of Strings into a sequence dictionary + * @param contigList the contig list, in coordinate order, this is allowed to be null + * @return a SAMSequenceDictionary, WITHOUT contig sizes + */ + private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet contigList) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + if (contigList == null) return dict; + + for (String name : contigList) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + /** + * Returns a collection of track names that match the record type. + * @param trackRecordType the record type specified in the @RMD annotation + * @return a collection of available track record type names that match the record type + */ + public Collection getTrackRecordTypeNames(Class trackRecordType) { + Set names = new TreeSet(); + if (trackRecordType == null) + throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); + + for (Map.Entry availableTrackRecordType: getAvailableTrackNamesAndRecordTypes().entrySet()) { + if (availableTrackRecordType.getValue() != null && trackRecordType.isAssignableFrom(availableTrackRecordType.getValue())) + names.add(availableTrackRecordType.getKey()); + } + return names; + } + + /** + * find the associated reference meta data + * + * @param bindings the bindings of strings from the -B command line option + * + * @return a list of RMDTracks, one for each -B option + */ + public List getReferenceMetaDataSources(GenomeAnalysisEngine engine,List bindings) { + initializeBindings(engine,bindings); + // try and make the tracks given their requests + return createRequestedTrackObjects(); + } + + /** + * initialize our lists of bindings + * @param engine The engine, used to populate tags. + * @param bindings the input to the GATK, as a list of strings passed in through the -B options + */ + private void initializeBindings(GenomeAnalysisEngine engine,List bindings) { + // NOTE: Method acts as a static. Once the inputs have been passed once they are locked in. + if (inputs.size() > 0 || bindings.size() == 0) + return; + + for (String binding: bindings) { + if(engine != null && engine.getTags(binding).size() == 2) { + // Assume that if tags are present, those tags are name and type. + // Name is always first, followed by type. + List parameters = engine.getTags(binding); + String name = parameters.get(0); + String type = parameters.get(1); + inputs.add(new RMDTriplet(name,type,binding)); + } + else { + // Otherwise, use old-format bindings. + String[] split = binding.split(","); + if (split.length != 3) throw new IllegalArgumentException(binding + " is not a valid reference metadata track description"); + inputs.add(new RMDTriplet(split[0], split[1], split[2])); + } + } + } + + /** + * create the requested track objects + * + * @return a list of the tracks, one for each of the requested input tracks + */ + private List createRequestedTrackObjects() { + // create of live instances of the tracks + List tracks = new ArrayList(); + + // create instances of each of the requested types + for (RMDTriplet trip : inputs) { + Class featureCodecClass = getAvailableTrackNamesAndTypes().get(trip.getType().toUpperCase()); + if (featureCodecClass == null) + throw new UserException.BadArgumentValue("-B",trip.getType()); + tracks.add(createInstanceOfTrack(featureCodecClass, trip.getName(), new File(trip.getFile()))); + } + return tracks; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java deleted file mode 100644 index b04b2dad1..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RODTrackBuilder.java +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks.builders; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; -import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack; - -import java.io.File; -import java.util.HashMap; -import java.util.Map; - - -/** - * @author aaron - *

- * Class RODTrackBuilder - *

- * the builder for tracks of the current ROD system, a holdover until Tribble supports binary and multi-line formats - */ -public class RODTrackBuilder implements RMDTrackBuilder { - - /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(ReferenceOrderedData.class); - - /** - * the bindings from track name to the ROD class we use - */ - private static HashMap> Types = new HashMap>(); - - static { - // All known ROD types - Types.put("GELI", rodGELI.class); - Types.put("Table", TabularROD.class); - Types.put("Intervals", IntervalRod.class); - Types.put("Plink", PlinkRod.class); - } - - /** - * create a RMDTrack of the specified type - * - * @param targetClass the target class of track - * @param name what to call the track - * @param inputFile the input file - * - * @return an instance of the track - * @throws org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException - * if we don't know of the target class or we couldn't create it - */ - //@Override - public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException { - return new RODRMDTrack(targetClass, name, inputFile, createROD(name,targetClass,inputFile)); - } - - /** @return a map of all available track types we currently have access to create */ - @Override - public Map getAvailableTrackNamesAndTypes() { - return new HashMap(Types); - } - - /** @return a map of all available track record types we currently have access to create */ - @Override - public Map getAvailableTrackNamesAndRecordTypes() { - return new HashMap(Types); - } - - /** - * Helpful function that parses a single triplet of and returns the corresponding ROD with - * , of type that reads its input from . - * - * @param trackName the name of the track to create - * @param type the type of the track to create - * @param fileName the filename to create the track from - * @return a reference ordered data track - */ - public ReferenceOrderedData createROD(final String trackName, Class type, File fileName) { - - // Create the ROD - ReferenceOrderedData rod = new ReferenceOrderedData(trackName, fileName, type ); - logger.info(String.format("Created binding from %s to %s of type %s", trackName, fileName, type)); - return rod; - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java deleted file mode 100644 index e0fc5dd52..000000000 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java +++ /dev/null @@ -1,334 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks.builders; - -import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broad.tribble.*; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.source.BasicFeatureSource; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackCreationException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.classloader.PluginManager; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.file.FSLockWithShared; -import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; - -import java.io.*; -import java.util.*; - - -/** - * - * @author aaron - * - * Class TribbleRMDTrackBuilder - * - * This class keeps track of the available codecs, and knows how to put together a track of - * that gets iterators from the FeatureReader using Tribble. - * - */ -public class TribbleRMDTrackBuilder extends PluginManager implements RMDTrackBuilder { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(TribbleRMDTrackBuilder.class); - - // the linear index extension - public static final String indexExtension = ".idx"; - - /** Create a new plugin manager. */ - public TribbleRMDTrackBuilder() { - super(FeatureCodec.class, "Codecs", "Codec"); - } - - /** @return a list of all available track types we currently have access to create */ - public Map getAvailableTrackNamesAndTypes() { - return new HashMap(this.pluginsByName); - } - - /** @return a list of all available track record types we currently have access to create */ - public Map getAvailableTrackNamesAndRecordTypes() { - Map classes = new HashMap(); - for (String name: this.pluginsByName.keySet()) { - FeatureCodec codec = this.createByName(name); - classes.put(name, codec.getFeatureType()); - } - return classes; - } - - /** - * create a RMDTrack of the specified type - * - * @param targetClass the target class of track - * @param name what to call the track - * @param inputFile the input file - * - * @return an instance of the track - * @throws RMDTrackCreationException - * if we don't know of the target class or we couldn't create it - */ - public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException { - // return a feature reader track - Pair pair = createFeatureReader(targetClass, name, inputFile); - if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); - return new TribbleTrack(targetClass, name, inputFile, pair.first, pair.second, createCodec(targetClass, name)); - } - - /** - * create a tribble feature reader class, given the target class and the input file - * @param targetClass the target class, of a Tribble Codec type - * @param inputFile the input file, that corresponds to the feature type - * @return a pair of - */ - public Pair createFeatureReader(Class targetClass, File inputFile) { - return createFeatureReader(targetClass, "anonymous", inputFile); - } - - /** - * create a feature reader of the specified type - * @param targetClass the target codec type - * @param name the target name - * @param inputFile the input file to create the track from (of the codec type) - * @return the FeatureReader instance - */ - public Pair createFeatureReader(Class targetClass, String name, File inputFile) { - Pair pair; - if (inputFile.getAbsolutePath().endsWith(".gz")) - pair = createBasicFeatureSourceNoAssumedIndex(targetClass, name, inputFile); - else - pair = getLinearFeatureReader(targetClass, name, inputFile); - return pair; - } - - /** - * create a feature reader, without assuming there exists an index. This code assumes the feature - * reader of the appropriate type will figure out what the right index type is, and determine if it - * exists. - * - * @param targetClass the codec class type - * @param name the name of the track - * @param inputFile the file to load - * @return a feature reader implementation - */ - private Pair createBasicFeatureSourceNoAssumedIndex(Class targetClass, String name, File inputFile) { - // we might not know the index type, try loading with the default reader constructor - logger.info("Attempting to blindly load " + inputFile + " as a tabix indexed file"); - try { - return new Pair(BasicFeatureSource.getFeatureSource(inputFile.getAbsolutePath(), createCodec(targetClass, name)),null); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create feature reader from file", e); - } - } - - /** - * add a name to the codec, if it takes one - * @param targetClass the class to create a codec for - * @param name the name to assign this codec - * @return the feature codec itself - */ - private FeatureCodec createCodec(Class targetClass, String name) { - FeatureCodec codex = this.createByType(targetClass); - if ( codex instanceof NameAwareCodec ) - ((NameAwareCodec)codex).setName(name); - return codex; - } - - /** - * create a linear feature reader, where we create the index ahead of time - * @param targetClass the target class - * @param name the name of the codec - * @param inputFile the tribble file to parse - * @return the input file as a FeatureReader - */ - private Pair getLinearFeatureReader(Class targetClass, String name, File inputFile) { - Pair reader; - try { - Index index = loadIndex(inputFile, createCodec(targetClass, name), true); - reader = new Pair(new BasicFeatureSource(inputFile.getAbsolutePath(), - index, - createCodec(targetClass, name)), - sequenceSetToDictionary(index.getSequenceNames())); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(inputFile, "Unable to create reader with file", e); - } catch (IOException e) { - throw new UserException("Unable to make the index file for " + inputFile, e); - } - return reader; - } - - /** - * create an index for the input file - * @param inputFile the input file - * @param codec the codec to use - * @param onDisk write the index to disk? - * @return a linear index for the specified type - * @throws IOException if we cannot write the index file - */ - public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException { - - // create the index file name, locking on the index file name - File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension); - FSLockWithShared lock = new FSLockWithShared(indexFile); - - // acquire a lock on the file - Index idx = null; - if (indexFile.canRead()) - idx = attemptIndexFromDisk(inputFile, codec, indexFile, lock); - - // if we managed to make an index, return - if (idx != null) return idx; - - // we couldn't read the file, or we fell out of the conditions above, continue on to making a new index - return createNewIndex(inputFile, codec, onDisk, indexFile, lock); - } - - /** - * attempt to read the index from disk - * @param inputFile the input file - * @param codec the codec to read from - * @param indexFile the index file itself - * @param lock the lock file - * @return an index, or null if we couldn't load one - * @throws IOException if we fail for FS issues - */ - protected static Index attemptIndexFromDisk(File inputFile, FeatureCodec codec, File indexFile, FSLockWithShared lock) throws IOException { - boolean locked; - try { - locked = lock.sharedLock(); - } - catch(FileSystemInabilityToLockException ex) { - throw new ReviewedStingException("Unexpected inability to lock exception", ex); - } - Index idx; - try { - if (!locked) // can't lock file - idx = createIndexInMemory(inputFile, codec); - else - idx = loadFromDisk(inputFile, indexFile); - } finally { - if (locked) lock.unlock(); - } - return idx; - } - - /** - * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) - * @param inputFile the input file - * @param indexFile the input file, plus the index extension - * @return an Index, or null if we're unable to load - */ - public static Index loadFromDisk(File inputFile, File indexFile) { - logger.info("Loading Tribble index from disk for file " + inputFile); - Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - - // check if the file is up-to date (filestamp and version check) - if (index.isCurrentVersion() && indexFile.lastModified() > inputFile.lastModified()) - return index; - else if (indexFile.lastModified() < inputFile.lastModified()) - logger.warn("Index file " + indexFile + " is out of date (index older than input file), deleting and updating the index file"); - else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable - logger.warn("Index file " + indexFile + " is out of date (old version), deleting and updating the index file"); - - // however we got here, remove the index and return null - boolean deleted = indexFile.delete(); - - if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); - return null; - } - - - /** - * attempt to create the index, and write it to disk - * @param inputFile the input file - * @param codec the codec to use - * @param onDisk if they asked for disk storage or now - * @param indexFile the index file location - * @param lock the locking object - * @return the index object - * @throws IOException when unable to create the new index - */ - private static Index createNewIndex(File inputFile, FeatureCodec codec, boolean onDisk, File indexFile, FSLockWithShared lock) throws IOException { - Index index = createIndexInMemory(inputFile, codec); - - boolean locked = false; // could we exclusive lock the file? - try { - locked = lock.exclusiveLock(); - if (locked) { - logger.info("Writing Tribble index to disk for file " + inputFile); - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - } - else // we can't write it to disk, just store it in memory, tell them this - if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only"); - return index; - } - catch(FileSystemInabilityToLockException ex) { - throw new ReviewedStingException("Unexpected inability to lock exception", ex); - } - finally { - if (locked) lock.unlock(); - } - - } - - /** - * create the index in memory, given the input file and feature codec - * @param inputFile the input file - * @param codec the codec - * @return a LinearIndex, given the file location - * @throws IOException when unable to create the index in memory - */ - private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException { - // this can take a while, let them know what we're doing - logger.info("Creating Tribble index in memory for file " + inputFile); - return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - } - - /** - * convert a list of Strings into a sequence dictionary - * @param contigList the contig list, in coordinate order, this is allowed to be null - * @return a SAMSequenceDictionary, WITHOUT contig sizes - */ - private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet contigList) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - if (contigList == null) return dict; - - for (String name : contigList) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Alignability.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Alignability.java deleted file mode 100755 index bd52443b3..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/Alignability.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFHeaderLineType; -import org.broad.tribble.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.TabularROD; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; - -import java.util.HashMap; -import java.util.Map; -import java.util.List; -import java.util.Arrays; - - -public class Alignability implements InfoFieldAnnotation { - - public Map annotate(RefMetaDataTracker tracker, - ReferenceContext ref, - Map stratifiedContexts, - VariantContext vc) - { - TabularROD record = tracker.lookup("alignability",TabularROD.class); - if (record == null) - return null; - - if (record.get("alignability") == null) - throw new RuntimeException("ERROR: alignability column not defined in alignability input.\n"); - - int value = Integer.parseInt(record.get("alignability")); - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%d", value)); - return map; - } - - public List getKeyNames() { return Arrays.asList("Alignability"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Integer, "Alignability according to a mask file (3 is best, 0 is worst)")); } -} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index e93b579a6..f140abe3a 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; @@ -399,7 +399,7 @@ public class DepthOfCoverageWalker extends LocusWalker { if ( RefseqFileName != null ) { logger.info("Using RefSeq annotations from "+RefseqFileName); - TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + RMDTrackBuilder builder = new RMDTrackBuilder(); FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first; try { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java index 1d736a602..8c205070e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java @@ -31,11 +31,9 @@ import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; @@ -88,14 +86,13 @@ public class PickSequenomProbes extends RodWalker { logger.info("Loading SNP mask... "); ReferenceOrderedData snp_mask; if ( SNP_MASK.contains(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)) { - TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + RMDTrackBuilder builder = new RMDTrackBuilder(); CloseableIterator iter = builder.createInstanceOfTrack(DbSNPCodec.class,"snp_mask",new java.io.File(SNP_MASK)).getIterator(); snpMaskIterator = new SeekableRODIterator(iter); } else { - snp_mask = new ReferenceOrderedData("snp_mask", - new java.io.File(SNP_MASK), TabularROD.class); - snpMaskIterator = new SeekableRODIterator(new GATKFeatureIterator(snp_mask.iterator())); + // TODO: fix me when Plink is back + throw new IllegalArgumentException("We currently do not support other snp_mask tracks (like Plink)"); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java index c74f72210..f7c7997fa 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverter.java @@ -32,7 +32,6 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; -import org.broadinstitute.sting.gatk.refdata.PlinkRod; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors; import org.broadinstitute.sting.gatk.walkers.*; @@ -208,10 +207,6 @@ public class SequenomValidationConverter extends RodWalker(vContext, ref.getBase()); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java index dcc96d66f..3c5c27173 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DbSNPWindowCounter.java @@ -9,7 +9,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; @@ -47,7 +47,7 @@ public class DbSNPWindowCounter extends LocusWalker { public void initialize() { - TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + RMDTrackBuilder builder = new RMDTrackBuilder(); reader = builder.createFeatureReader(DbSNPCodec.class,myDbSNPFile).first; } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java index 2c907ebd1..a2ff62a72 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DesignFileGeneratorWalker.java @@ -47,7 +47,7 @@ public class DesignFileGeneratorWalker extends RodWalker { if ( intervalsList != null && intervalsList.size() > 0 ) { for ( Object interval : intervalsList ) { - GenomeLoc loc = ((IntervalRod) interval).getLocation(); + GenomeLoc loc = ((GATKFeature)interval).getLocation(); if ( ! intervalBuffer.keySet().contains(loc) ) { intervalBuffer.put(loc,new IntervalInfoBuilder()); } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GCCalculatorWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GCCalculatorWalker.java index 0d7b922f4..ab78c7dac 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GCCalculatorWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/GCCalculatorWalker.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.oneoffprojects.walkers; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.IntervalRod; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.RefWalker; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java index 2c8df083c..454014991 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/IndelAnnotator.java @@ -11,7 +11,7 @@ import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils import org.broadinstitute.sting.gatk.refdata.*; import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.walkers.RodWalker; @@ -36,7 +36,7 @@ public class IndelAnnotator extends RodWalker { public void initialize() { if ( RefseqFileName != null ) { - TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); + RMDTrackBuilder builder = new RMDTrackBuilder(); FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class,new File(RefseqFileName)).first; try { diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/VariantEvaluatorBySample.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/VariantEvaluatorBySample.java index 834ff7382..273603f8e 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/VariantEvaluatorBySample.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/varianteval/VariantEvaluatorBySample.java @@ -5,7 +5,6 @@ import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvaluator; import org.broadinstitute.sting.playground.utils.report.tags.DataPoint; import org.broadinstitute.sting.playground.utils.report.utils.TableType; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/CoverageAcrossBaitsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/CoverageAcrossBaitsWalker.java deleted file mode 100755 index 614e266d6..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/CoverageAcrossBaitsWalker.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.hybridselection; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.IntervalRod; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.io.PrintStream; - -/** - * Accumulates coverage across hybrid selection bait intervals to assess effect of bait adjacency and overlap on coverage - */ - -@By(DataSource.REFERENCE) -public class CoverageAcrossBaitsWalker extends LocusWalker, CoverageAcrossBaitsWalker.Coverage> { - @Output - public PrintStream out; - - /* Accumulates coverage across hybrid selection bait intervals to assess effect of bait adjacency. - Requires input bait intervals that have an overhang beyond the actual bait interval to capture coverage data at these points. - Outputs R parseable file that has all data in lists and then does some basic plotting. - */ - @Argument(fullName="include_duplicates", shortName="idup", required=false, doc="consider duplicate reads") - public boolean INCLUDE_DUPLICATE_READS = false; - - @Argument(fullName="min_mapq", shortName="mmq", required=false, doc="Minimum mapping quality of reads to consider") - public Integer MIN_MAPQ = 1; - - @Argument(fullName="min_len", shortName="min", required=false, doc="Minimum length of interval to consider") - public Integer MIN_LEN = Integer.MIN_VALUE; - - @Argument(fullName="max_len", shortName="max", required=false, doc="Maximum length of interval to consider") - public Integer MAX_LEN = 5000; - - @Argument(fullName="max_bait_count", shortName="mbc", required=false, doc="Maximum number of baits to consider") - public Integer MAX_BAIT_COUNT = 24; - - @Argument(fullName="free_standing_distance", shortName="fsd", required=false, doc="minimum distance to next interval to consider freestanding") - public Integer FREE_STANDING_DISTANCE = 500; - - public static class Coverage { - // Container for carrying positive and negative strand coverage - public ArrayList pos = new ArrayList(); - public ArrayList neg = new ArrayList(); - public void addCoveragePoints(Pair pn) { - pos.add(pn.first); - neg.add(pn.second); - } - } - - public Pair map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List reads = context.getReads(); - IntervalRod intervalROD = tracker.lookup("interval",IntervalRod.class); - - GenomeLoc interval = intervalROD == null ? null : intervalROD.getLocation(); - if (interval == null) { throw new ReviewedStingException("No intervals at locus; should not happen"); } - int offset = (int)(context.getPosition() - interval.getStart()); - - int depth[] = new int[2]; - for ( int i = 0; i < reads.size(); i++ ) - { - SAMRecord read = reads.get(i); - - if (read.getNotPrimaryAlignmentFlag()) { continue; } - if (read.getReadUnmappedFlag()) { continue; } - if (!INCLUDE_DUPLICATE_READS && read.getDuplicateReadFlag()) { continue; } - if (read.getMappingQuality() < MIN_MAPQ) { continue; } - - depth[read.getReadNegativeStrandFlag() ? 0 : 1]++; - } - - return new Pair(depth[0],depth[1]); - } - - public Coverage reduceInit() { return new Coverage(); } - public Coverage reduce(Pair depths, Coverage cov) { - cov.addCoveragePoints(depths); - return cov; - } - - /** - * Return true if your walker wants to reduce each interval separately. Default is false. - * If you set this flag, several things will happen. - * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce - * Reduce will accumulate normally at each map unit in the interval - * However, onTraversalDone(reduce) will be called after each interval is processed. - * The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done, - * which is overloaded here to call onTraversalDone(reduce) for each location - */ - public boolean isReduceByInterval() { - return true; - } - - public void onTraversalDone(List> results) { - int TOTAL_BAIT_WINDOW = MAX_BAIT_COUNT*120+FREE_STANDING_DISTANCE*2; - int depths[][][] = new int[2][MAX_BAIT_COUNT+1][TOTAL_BAIT_WINDOW]; // Indices: pos/neg strand count, bait count, position in bait - int bait_count[] = new int[MAX_BAIT_COUNT+1]; - - for (Pair pair : results) { - GenomeLoc loc = pair.first; - //if (loc.size() < MIN_LEN || loc.size() > MAX_LEN) { continue; } - - long interval_width = loc.size() - FREE_STANDING_DISTANCE * 2; - int baits = (int)(interval_width / 120); - //out.format("#Interval_width: %d Bait_count: %d\n", interval_width, baits); - if (interval_width % 120 != 0) { continue; } - if (interval_width > MAX_BAIT_COUNT*120) { continue; } - - // This target is good; count it - bait_count[baits]++; - Coverage pn_cov = pair.second; - for (int pn=0; pn<2; pn++) { - ArrayList cov = pn == 0 ? pn_cov.neg : pn_cov.pos; - for (int i=0; i0; baits--) { - int norm_depth_width = baits*120+FREE_STANDING_DISTANCE*2; - float norm_depths[] = new float[norm_depth_width]; - for (int pn=0; pn<2; pn++) { - for (int i=0; i 0) { - String depth_str = Arrays.toString(norm_depths); - depth_str = depth_str.substring(1,depth_str.length()-1); - out.format("b%c%d <- c(%s)\n", pn_char, baits, depth_str); - if (plot_begun) { - out.format("points(b%c%d, type='l', col=\"%s\")\n", pn_char, baits, pn_color); - }else{ - out.format("plot(b%c%d, type='l', col=\"%s\", xlab='Position from start of 1st bait', ylab='Coverage')\n", pn_char, baits, pn_color); - plot_begun = true; - } - } - out.format("%ccounts <- c(%ccounts,%d)\n", pn_char, pn_char, bait_count[baits]); - } - } - } - -} - diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java deleted file mode 100755 index 067c30571..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/hybridselection/HybSelPerformanceWalker.java +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.hybridselection; - -import net.sf.picard.reference.ReferenceSequence; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.util.Interval; -import net.sf.picard.util.IntervalList; -import net.sf.picard.util.OverlapDetector; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.StringUtil; -import org.broad.tribble.FeatureSource; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec; -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.*; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.Collection; -import java.util.List; - -/** - * Given intervals corresponding to targets or baits in a hybrid selection experiment, this walker gives the following interval-by-interval data: - * coverage, %GC, corresponding gene name, bait quantity, number of adjacent baits, and whether the bait is boosted. - * It can be useful to manually disable the merging of intervals in the GATK when using this walker; this feature is not yet part of the GATK. - */ -@By(DataSource.REFERENCE) -public class HybSelPerformanceWalker extends LocusWalker implements TreeReducible { - @Output - public PrintStream out; - - @Argument(fullName="min_mapq", shortName="mmq", required=false, doc="Minimum mapping quality of reads to consider") - public Integer MIN_MAPQ = 1; - - @Argument(fullName="min_baseq", shortName="mbq", required=false, doc="Minimum base call quality of reads to consider (default: 0)") - public Integer MIN_BASEQ = 0; - - @Argument(fullName="include_duplicates", shortName="idup", required=false, doc="consider duplicate reads") - public boolean INCLUDE_DUPLICATE_READS = false; - - @Argument(fullName="free_standing_distance", shortName="fsd", required=false, doc="minimum distance to next interval to consider freestanding") - public Integer FREE_STANDING_DISTANCE = 500; - - @Argument(fullName="booster", required=false, doc="interval list of booster baits") - public File BOOSTER_FILE; - - @Argument(fullName="booster_distance", required=false, doc="distance up to which a booster can affect a target") - public Integer BOOSTER_DISTANCE = 100; // how far away can a booster be to "hit" its target? - - @Argument(fullName="bait_quantity", required=false, doc="interval list of baits with quantity/concentration (obtainied via sequencing, Nanostring)") - public File BAIT_QUANT_FILE; - - @Argument(fullName="refseq", shortName="refseq", - doc="Name of RefSeq transcript annotation file. If specified, intervals will be specified with gene names", required=false) - String REFSEQ_FILE = null; - - private LocationAwareSeekableRODIterator refseqIterator=null; - - public static class TargetInfo { - public int counts = 0; - - // did at least two reads hit this target - public boolean hitTwice = false; - - public int positionsOver2x = 0; - public int positionsOver8x = 0; - public int positionsOver10x = 0; - public int positionsOver20x = 0; - public int positionsOver30x = 0; - } - -// @Argument(fullName="suppressLocusPrinting",required=false,defaultValue="false") -// public boolean suppressPrinting; - - @Override - public void initialize() { - if ( REFSEQ_FILE != null ) { - TribbleRMDTrackBuilder builder = new TribbleRMDTrackBuilder(); - FeatureSource refseq = builder.createFeatureReader(RefSeqCodec.class, new File(REFSEQ_FILE)).first; - - try { - refseqIterator = new SeekableRODIterator(new FeatureToGATKFeatureIterator(refseq.iterator(), "refseq")); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(REFSEQ_FILE, e); - } - - logger.info("Using RefSeq annotations from "+REFSEQ_FILE); - } - - if ( refseqIterator == null ) logger.info("No annotations available"); - - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - List reads = context.getReads(); - List offsets = context.getOffsets(); - - int depth = 0; - for ( int i = 0; i < reads.size(); i++ ) - { - SAMRecord read = reads.get(i); - - if (read.getNotPrimaryAlignmentFlag()) { continue; } - if (read.getReadUnmappedFlag()) { continue; } - if (!INCLUDE_DUPLICATE_READS && read.getDuplicateReadFlag()) { continue; } - if (read.getMappingQuality() < MIN_MAPQ) { continue; } - if (read.getBaseQualities()[offsets.get(i)] < MIN_BASEQ) { continue; } - depth++; - } - - return depth; - } - - /** - * Return true if your walker wants to reduce each interval separately. Default is false. - * - * If you set this flag, several things will happen. - * - * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce - * Reduce will accumulate normally at each map unit in the interval - * However, onTraversalDone(reduce) will be called after each interval is processed. - * The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done, - * which is overloaded here to call onTraversalDone(reduce) for each location - */ - public boolean isReduceByInterval() { - return true; - } - - public TargetInfo reduceInit() { return new TargetInfo(); } - - public TargetInfo reduce(Integer depth, TargetInfo sum) { - sum.counts += depth; - if (depth >= 2) { sum.hitTwice = true; sum.positionsOver2x++;} - if (depth >= 8) { sum.positionsOver8x++; } - if (depth >= 10) { sum.positionsOver10x++; } - if (depth >= 20) { sum.positionsOver20x++; } - if (depth >= 30) { sum.positionsOver30x++; } - return sum; - } - - public TargetInfo treeReduce(TargetInfo lhs, TargetInfo rhs) { - return lhs; // dummy function to test if tree reduce by interval is working - } - - public void onTraversalDone(TargetInfo result) { - } - - @Override - public void onTraversalDone(List> results) { - out.println("location\tlength\tgc\tavg_coverage\tnormalized_coverage\thit_twice\tfreestanding\tboosted\tbases_over_2x\tbases_over_8x\tbases_over_10x\tbases_over_20x\tbases_over_30x\tgene_name\tbait_quantity\tadjacent_baits"); - - // first zip through and build an overlap detector of all the intervals, so later - // we can calculate if this interval is free-standing - OverlapDetector od = new OverlapDetector(-FREE_STANDING_DISTANCE,0); - for(Pair pair : results) { - GenomeLoc target = pair.getFirst(); - Interval interval = makeInterval(target); - od.addLhs(interval, interval); - } - - OverlapDetector booster = new OverlapDetector(-BOOSTER_DISTANCE,0); - if (BOOSTER_FILE != null) { - IntervalList il = IntervalList.fromFile(BOOSTER_FILE); - List l = il.getUniqueIntervals(); - booster.addAll(l, l); - } - - // Create a interval detector that will give us the bait quantity specified in - // an optional bait quantity file - OverlapDetector bait_quant = new OverlapDetector(0,0); - if (BAIT_QUANT_FILE != null) { - IntervalList il = IntervalList.fromFile(BAIT_QUANT_FILE); - List l = il.getIntervals(); - bait_quant.addAll(l, l); - } - - // Create a detector of adjacent baits - OverlapDetector adjacent_bait_detector = new OverlapDetector(-1,-1); - for(Pair pair : results) { - GenomeLoc target = pair.getFirst(); - Interval interval = makeInterval(target); - adjacent_bait_detector.addLhs(interval, interval); - } - - // now zip through and calculate the total average coverage - long totalCoverage = 0; - long basesConsidered = 0; - for(Pair pair : results) { - GenomeLoc target = pair.getFirst(); - TargetInfo ti = pair.getSecond(); - - // as long as it was hit twice, count it - if(ti.hitTwice) { - long length = target.getStop() - target.getStart() + 1; - totalCoverage += ti.counts; - basesConsidered += length; - } - } - double meanTargetCoverage = (1.0*totalCoverage) / basesConsidered; - - - for(Pair pair : results) { - GenomeLoc target = pair.getFirst(); - TargetInfo ti = pair.getSecond(); - long length = target.getStop() - target.getStart() + 1; - - double avgCoverage = ((double)ti.counts / (double)length); - double normCoverage = avgCoverage / meanTargetCoverage; - - // calculate gc for the target - double gc = calculateGC(target); - - // if there is more than one hit on the overlap detector, it's not freestanding - Interval targetInterval = makeInterval(target); - - Collection hits = od.getOverlaps(targetInterval); - boolean freestanding = (hits.size() == 1); - - boolean boosted = (booster.getOverlaps(targetInterval).size() > 0); - - // look up the gene name info - String geneName = getGeneName(target); - - Collection bait_quant_hits = bait_quant.getOverlaps(targetInterval); - String bait_quant_string = (bait_quant_hits.size() == 1) ? bait_quant_hits.iterator().next().getName() : "0"; - if (bait_quant_hits.size() > 1) { out.printf("Warning: multiple bait quantity intervals detected; perhaps bait quantity interval lengths don't match primary interval list specified with -L\n"); } - int adjacent_baits = adjacent_bait_detector.getOverlaps(targetInterval).size() - 1; - - out.printf("%s:%d-%d\t%d\t%6.4f\t%6.4f\t%6.4f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%d\n", - target.getContig(), target.getStart(), target.getStop(), length, gc, - avgCoverage, normCoverage, ((ti.hitTwice)?1:0), ((freestanding)?1:0), ((boosted)?1:0), - ti.positionsOver2x, - ti.positionsOver8x, - ti.positionsOver10x, - ti.positionsOver20x, - ti.positionsOver30x, - geneName, - bait_quant_string, - adjacent_baits - ); - - - } - } - - private Interval makeInterval(GenomeLoc target) { - return new Interval(target.getContig(), (int) target.getStart(), (int) target.getStop()); - } - - private String getGeneName(GenomeLoc target) { - if (refseqIterator == null) { return "UNKNOWN"; } - - RODRecordList annotationList = refseqIterator.seekForward(target); - if (annotationList == null) { return "UNKNOWN"; } - - for(GATKFeature rec : annotationList) { - if ( ((RefSeqFeature)rec.getUnderlyingObject()).overlapsExonP(target) ) { - return ((RefSeqFeature)rec.getUnderlyingObject()).getGeneName(); - } - } - - return "UNKNOWN"; - - } - - IndexedFastaSequenceFile seqFile = null; - - private double calculateGC(GenomeLoc target) { - if (seqFile == null) { - seqFile = new IndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); - } - ReferenceSequence refSeq = seqFile.getSubsequenceAt(target.getContig(),target.getStart(), target.getStop()); - - - int gcCount = 0; - for(char base : StringUtil.bytesToString(refSeq.getBases()).toCharArray()) { - if (base == 'C' || base == 'c' || base == 'G' || base == 'g') { gcCount++; } - } - return ( (double) gcCount ) / ((double) refSeq.getBases().length); - - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/utils/report/utils/ComplexDataUtils.java b/java/src/org/broadinstitute/sting/playground/utils/report/utils/ComplexDataUtils.java index 05d3cd817..aa6f87938 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/report/utils/ComplexDataUtils.java +++ b/java/src/org/broadinstitute/sting/playground/utils/report/utils/ComplexDataUtils.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.playground.utils.report.utils; -import org.broadinstitute.sting.utils.Utils; - import java.util.*; diff --git a/java/src/org/broadinstitute/sting/playground/utils/report/utils/NodeUtils.java b/java/src/org/broadinstitute/sting/playground/utils/report/utils/NodeUtils.java index d7f4df109..1ffb66d67 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/report/utils/NodeUtils.java +++ b/java/src/org/broadinstitute/sting/playground/utils/report/utils/NodeUtils.java @@ -1,9 +1,7 @@ package org.broadinstitute.sting.playground.utils.report.utils; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; /** diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java b/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java index 92574d4ad..da62305a4 100644 --- a/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java +++ b/java/src/org/broadinstitute/sting/queue/extensions/gatk/GATKExtensionsGenerator.java @@ -37,7 +37,7 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.SAMFileReaderArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -65,7 +65,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { GenomeAnalysisEngine GATKEngine = new GenomeAnalysisEngine(); WalkerManager walkerManager = new WalkerManager(); FilterManager filterManager = new FilterManager(); - RMDTrackManager rmdTrackManager = new RMDTrackManager(); + RMDTrackBuilder trackBuilder = new RMDTrackBuilder(); /** * Required main method implementation. @@ -113,7 +113,7 @@ public class GATKExtensionsGenerator extends CommandLineProgram { List argumentFields = new ArrayList(); argumentFields.addAll(ArgumentDefinitionField.getArgumentFields(walkerType)); - argumentFields.addAll(RodBindField.getRodArguments(walkerType, rmdTrackManager)); + argumentFields.addAll(RodBindField.getRodArguments(walkerType, trackBuilder)); argumentFields.addAll(ReadFilterField.getFilterArguments(walkerType)); writeClass(COMMANDLINE_PACKAGE_NAME + "." + clpClassName, WALKER_PACKAGE_NAME, diff --git a/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java b/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java index a2bfe3f88..b98b2aa91 100644 --- a/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java +++ b/java/src/org/broadinstitute/sting/queue/extensions/gatk/RodBindField.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackManager; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.walkers.RMD; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -88,7 +88,7 @@ public class RodBindField extends ArgumentField { return exclusiveOf.toString(); } - public static List getRodArguments(Class walkerClass, RMDTrackManager rmdTrackManager) { + public static List getRodArguments(Class walkerClass, RMDTrackBuilder trackBuilder) { List argumentFields = new ArrayList(); List requires = WalkerManager.getRequiredMetaData(walkerClass); @@ -101,7 +101,7 @@ public class RodBindField extends ArgumentField { // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers //fields.add(new RodBindArgumentField(argumentDefinition, true)); } else { - for (String typeName: rmdTrackManager.getTrackRecordTypeNames(required.type())) + for (String typeName: trackBuilder.getTrackRecordTypeNames(required.type())) fields.add(new RodBindField(trackName, typeName, fields, true)); } argumentFields.addAll(fields); @@ -114,7 +114,7 @@ public class RodBindField extends ArgumentField { // TODO: Add the field triplet for name=* after @Allows and @Requires are fixed on walkers //fields.add(new RodBindArgumentField(argumentDefinition, false)); } else { - for (String typeName: rmdTrackManager.getTrackRecordTypeNames(allowed.type())) + for (String typeName: trackBuilder.getTrackRecordTypeNames(allowed.type())) fields.add(new RodBindField(trackName, typeName, fields, true)); } argumentFields.addAll(fields); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index 8a2066235..5cf701016 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -1,14 +1,15 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.datasources.shards.LocusShard; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.TabularROD; -import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack; +import org.broadinstitute.sting.gatk.refdata.features.table.TableCodec; +import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; +import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.utils.GenomeLocParser; import org.junit.Assert; import org.junit.BeforeClass; @@ -43,6 +44,11 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { */ private static IndexedFastaSequenceFile seq; + /** + * our track builder + */ + RMDTrackBuilder builder = new RMDTrackBuilder(); + @BeforeClass public static void init() throws FileNotFoundException { // sequence @@ -69,8 +75,8 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { @Test public void testSingleBinding() { File file = new File(testDir + "TabularDataTest.dat"); - ReferenceOrderedData rod = new ReferenceOrderedData("tableTest", file, TabularROD.class); - ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(null, new RODRMDTrack(TabularROD.class,"tableTest",file,rod)); + RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest",file); + ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(null,track); Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30))); @@ -78,7 +84,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20)); - TabularROD datum = tracker.lookup("tableTest",TabularROD.class); + TableFeature datum = tracker.lookup("tableTest",TableFeature.class); Assert.assertEquals("datum parameter for COL1 is incorrect", "C", datum.get("COL1")); Assert.assertEquals("datum parameter for COL2 is incorrect", "D", datum.get("COL2")); @@ -92,10 +98,11 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { public void testMultipleBinding() { File file = new File(testDir + "TabularDataTest.dat"); - ReferenceOrderedData rod1 = new ReferenceOrderedData("tableTest1", file, TabularROD.class); - ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(null,new RODRMDTrack(TabularROD.class,"tableTest1",file,rod1)); - ReferenceOrderedData rod2 = new ReferenceOrderedData("tableTest2", file, TabularROD.class); - ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(null,new RODRMDTrack(TabularROD.class,"tableTest2",file,rod2));; + + RMDTrack track = builder.createInstanceOfTrack(TableCodec.class,"tableTest1",file); + ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(null,track); + RMDTrack track2 = builder.createInstanceOfTrack(TableCodec.class,"tableTest2",file); + ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(null,track2); Shard shard = new MockLocusShard(Collections.singletonList(GenomeLocParser.createGenomeLoc("chrM",1,30))); @@ -104,13 +111,13 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(GenomeLocParser.createGenomeLoc("chrM",20)); - TabularROD datum1 = tracker.lookup("tableTest1",TabularROD.class); + TableFeature datum1 = tracker.lookup("tableTest1",TableFeature.class); Assert.assertEquals("datum1 parameter for COL1 is incorrect", "C", datum1.get("COL1")); Assert.assertEquals("datum1 parameter for COL2 is incorrect", "D", datum1.get("COL2")); Assert.assertEquals("datum1 parameter for COL3 is incorrect", "E", datum1.get("COL3")); - TabularROD datum2 = tracker.lookup("tableTest2",TabularROD.class); + TableFeature datum2 = tracker.lookup("tableTest2", TableFeature.class); Assert.assertEquals("datum2 parameter for COL1 is incorrect", "C", datum2.get("COL1")); Assert.assertEquals("datum2 parameter for COL2 is incorrect", "D", datum2.get("COL2")); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java index a9c866bed..fe6460961 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceOrderedDataPoolUnitTest.java @@ -2,9 +2,10 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; -import org.broadinstitute.sting.gatk.refdata.TabularROD; +import org.broadinstitute.sting.gatk.refdata.features.table.TableCodec; +import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; -import org.broadinstitute.sting.gatk.refdata.tracks.RODRMDTrack; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -47,13 +48,13 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { public static void init() throws FileNotFoundException { File sequenceFile = new File(hg18Reference); GenomeLocParser.setupRefContigOrdering(new IndexedFastaSequenceFile(sequenceFile)); - TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX); } @Before public void setUp() { File file = new File(testDir + "TabularDataTest.dat"); - rod = new RODRMDTrack(TabularROD.class, "tableTest", file, new ReferenceOrderedData("tableTest", file, TabularROD.class)); + RMDTrackBuilder builder = new RMDTrackBuilder(); + rod = builder.createInstanceOfTrack(TableCodec.class, "tableTest", file); } @Test @@ -64,7 +65,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - TabularROD datum = (TabularROD)iterator.next().get(0).getUnderlyingObject(); + TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); @@ -90,26 +91,26 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { // Test out-of-order access: first iterator2, then iterator1. // Ugh...first call to a region needs to be a seek. - TabularROD datum = (TabularROD)iterator2.seekForward(testSite2).get(0).getUnderlyingObject(); + TableFeature datum = (TableFeature)iterator2.seekForward(testSite2).get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite2)); assertTrue(datum.get("COL1").equals("C")); assertTrue(datum.get("COL2").equals("D")); assertTrue(datum.get("COL3").equals("E")); - datum = (TabularROD)iterator1.next().get(0).getUnderlyingObject(); + datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); assertTrue(datum.get("COL2").equals("B")); assertTrue(datum.get("COL3").equals("C")); // Advance iterator2, and make sure both iterator's contents are still correct. - datum = (TabularROD)iterator2.next().get(0).getUnderlyingObject(); + datum = (TableFeature)iterator2.next().get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite3)); assertTrue(datum.get("COL1").equals("F")); assertTrue(datum.get("COL2").equals("G")); assertTrue(datum.get("COL3").equals("H")); - datum = (TabularROD)iterator1.next().get(0).getUnderlyingObject(); + datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite2)); assertTrue(datum.get("COL1").equals("C")); assertTrue(datum.get("COL2").equals("D")); @@ -135,7 +136,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - TabularROD datum = (TabularROD)iterator.next().get(0).getUnderlyingObject(); + TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); assertTrue(datum.get("COL2").equals("B")); @@ -150,7 +151,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - datum = (TabularROD)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); + datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite3)); assertTrue(datum.get("COL1").equals("F")); assertTrue(datum.get("COL2").equals("G")); @@ -170,7 +171,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 1, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 0, iteratorPool.numAvailableIterators()); - TabularROD datum = (TabularROD)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); + TableFeature datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite3)); assertTrue(datum.get("COL1").equals("F")); assertTrue(datum.get("COL2").equals("G")); @@ -185,7 +186,7 @@ public class ReferenceOrderedDataPoolUnitTest extends BaseTest { Assert.assertEquals("Number of iterators in the pool is incorrect", 2, iteratorPool.numIterators()); Assert.assertEquals("Number of available iterators in the pool is incorrect", 1, iteratorPool.numAvailableIterators()); - datum = (TabularROD)iterator.next().get(0).getUnderlyingObject(); + datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); assertTrue(datum.getLocation().equals(testSite1)); assertTrue(datum.get("COL1").equals("A")); assertTrue(datum.get("COL2").equals("B")); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodUnitTest.java deleted file mode 100755 index 342f05fac..000000000 --- a/java/test/org/broadinstitute/sting/gatk/refdata/PlinkRodUnitTest.java +++ /dev/null @@ -1,247 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata; -/* -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.oneoffprojects.variantcontext.Allele; -import org.broadinstitute.sting.oneoffprojects.variantcontext.Genotype; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.junit.BeforeClass; -import org.junit.Test; -import org.junit.Assert; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.BufferedReader; -import java.io.FileReader; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Jan 22, 2010 - * Time: 11:27:33 PM - * To change this template use File | Settings | File Templates. - * -public class PlinkRodUnitTest extends BaseTest { - // todo :: get the isIndel() isInsertion() and isDeletion() tests working again -- this may require new - // todo :: methods in the objects themselves - private static IndexedFastaSequenceFile seq; - - @BeforeClass - public static void beforeTests() { - try { - seq = new IndexedFastaSequenceFile(new File(b36KGReference)); - } catch (FileNotFoundException e) { - throw new StingException("unable to load the sequence dictionary"); - } - GenomeLocParser.setupRefContigOrdering(seq); - - } - - public BufferedReader openFile(String filename) { - try { - return new BufferedReader(new FileReader(filename)); - } catch (FileNotFoundException e) { - throw new StingException("Couldn't open file " + filename); - } - - } - - @Test - public void testStandardPedFile() { - PlinkRod rod = new PlinkRod("test"); - try { - rod.initialize( new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_test.ped") ); - } catch ( FileNotFoundException e ) { - throw new StingException("test file for testStandardPedFile() does not exist",e); - } - - // test that the sample names are correct - - List rodNames = rod.getVariantSampleNames(); - List expectedNames = Arrays.asList("NA12887","NAMELY","COWBA"); - - Assert.assertEquals("That there are as many samples in the rod as in the expected list",expectedNames.size(),rodNames.size()); - - boolean namesCorrect = true; - for ( int i = 0; i < expectedNames.size(); i++ ) { - namesCorrect = namesCorrect && ( rodNames.get(i).equals(expectedNames.get(i)) ); - } - - Assert.assertTrue("That the names are correct and in the proper order",namesCorrect); - - // test that rod can be iterated over - - ArrayList> genotypesInRod = new ArrayList>(); - ArrayList> sampleNamesInRod = new ArrayList>(); - ArrayList lociInRod = new ArrayList(); - do { - genotypesInRod.add(rod.getGenotypes()); - sampleNamesInRod.add(rod.getVariantSampleNames()); - lociInRod.add(rod.getLocation()); - } while ( rod.parseLine(null,null) ); - - Assert.assertEquals("That there are three SNPs in the ROD",3,genotypesInRod.size()); - - ArrayList snp1 = genotypesInRod.get(0); - ArrayList snp3 = genotypesInRod.get(2); - - Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size()); - Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size()); - Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size()); - - - List snp1_individual3_alleles = snp1.get(2).getAlleles(); - List snp3_individual2_alleles = snp3.get(1).getAlleles(); - - String alleleStr1 = new String(snp1_individual3_alleles.get(0).getBases())+new String(snp1_individual3_alleles.get(1).getBases()); - String alleleStr2 = new String(snp3_individual2_alleles.get(0).getBases())+new String(snp3_individual2_alleles.get(1).getBases()); - - Assert.assertEquals("That the third genotype of snp 1 is correctly no-call","00",alleleStr1); - Assert.assertEquals("That the second genotype of snp 3 is correctly G G","GG",alleleStr2); - - boolean name2isSame = true; - - for ( ArrayList names : sampleNamesInRod ) { - name2isSame = name2isSame && names.get(1).equals("NAMELY"); - } - - Assert.assertTrue("That the second name of all the genotypes is the same and is correct",name2isSame); - - // test that the loci are correctly parsed and in order - - List expectedLoci = Arrays.asList("1:123456","2:13274","3:11111"); - boolean lociCorrect = true; - for ( int i = 0; i < 3; i ++ ) { - lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i)); - } - } - - @Test - public void testStandardPedFileWithIndels() { - PlinkRod rod = new PlinkRod("test"); - try { - rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") ); - } catch ( FileNotFoundException e) { - throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e); - } - - // Iterate through the rod - - List> genotypesInRod = new ArrayList>(); - ArrayList> sampleNamesInRod = new ArrayList>(); - ArrayList lociInRod = new ArrayList(); - ArrayList snpSites = new ArrayList(); - do { - genotypesInRod.add(rod.getGenotypes()); - sampleNamesInRod.add(rod.getVariantSampleNames()); - lociInRod.add(rod.getLocation()); - snpSites.add(rod.variantIsSNP()); - } while ( rod.parseLine(null,null) ); - - boolean snpOrder = true; - List expectedOrder = Arrays.asList(true,false,true,false); - for ( int i = 0; i < 4; i ++ ) { - snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) ); - } - - Assert.assertTrue("That the variant type order is as expected", snpOrder); - // Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1). ); - // Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() ); - Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom()); - Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet()); - Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases()); - Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases()); - Assert.assertEquals("That the snp bases are correct","GC",new String(genotypesInRod.get(2).get(2).getAlleles().get(0).getBases())+new String(genotypesInRod.get(2).get(2).getAlleles().get(1).getBases())); - } - - @Test - public void testBinaryPedFileNoIndels() { - PlinkRod rod = new PlinkRod("binaryTest1"); - try { - rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_noindel_test.bed")); - } catch (FileNotFoundException e) { - throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e); - } - - // iterate through the ROD and get stuff - ArrayList lociInRod = new ArrayList(); - ArrayList> genotypesInRod = new ArrayList>(); - ArrayList> samplesInRod = new ArrayList>(); - - do { - lociInRod.add(rod.getLocation()); - genotypesInRod.add(rod.getGenotypes()); - samplesInRod.add(rod.getVariantSampleNames()); - } while ( rod.parseLine(null,null) ); - - List expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713"); - - for ( int i = 0; i < expecLoc.size(); i ++ ) { - Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString()); - } - - List expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT", - "GG","GG","AG","TT","CC","CT","TG","GG","GG"); - List expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false, - false,false,false,true,false,false,true,true,false,false); - List expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000", - "NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000", - "NA12878","NA12890","NA07000"); - int snpNo = 1; - int indiv = 1; - int alleleOffset = 0; - for ( ArrayList snp : genotypesInRod ) { - for ( Genotype gen : snp ) { - String alStr = new String(gen.getAlleles().get(0).getBases())+new String(gen.getAlleles().get(1).getBases()); - Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+ - "(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr); - Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet()); - Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1)); - indiv++; - alleleOffset++; - } - indiv = 1; - snpNo++; - } - } - - @Test - public void testIndelBinary() { - PlinkRod rod = new PlinkRod("binaryTest2"); - try { - rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/binary_indel_test.bed")); - } catch (FileNotFoundException e) { - throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e); - } - - ArrayList> genotypesInRod = new ArrayList>(); - do { - genotypesInRod.add(rod.getGenotypes()); - } while ( rod.parseLine(null,null) ); - - List expecAlleles = Arrays.asList("ACCA","","ACCAACCA","GGGG","GG","","AA","TA","00","","CCTCCT","CCT", - "TC","CC","TT","GG","GG","AG","","CTTGCTTG","CTTG","TG","GG","GG"); - List expecDeletion = Arrays.asList(false,false,false,false,false,false,false,false,false,true,false,true, - false,false,false,false,false,false,true,false,true,false,false,false); - List expecInsertion = Arrays.asList(true,false,true,true,true,false,false,false,false,false,false,false, - false,false,false,false,false,false,false,false,false,false,false,false); - - int al = 0; - for ( ArrayList snp : genotypesInRod ) { - for ( Genotype gen : snp ) { - String alStr = new String(gen.getAlleles().get(0).getBases())+new String(gen.getAlleles().get(1).getBases()); - Allele firstAl = gen.getAlleles().get(0); - Allele secondAl = gen.getAlleles().get(1); - // boolean isInsertion = ( firstAl.getType() == Allele.AlleleType.INSERTION || secondAl.getType() == Allele.AlleleType.INSERTION ); - // boolean isDeletion = ( firstAl.getType() == Allele.AlleleType.DELETION || secondAl.getType() == Allele.AlleleType.DELETION ); - Assert.assertEquals("That the indel file has the correct alleles for genotype "+al,expecAlleles.get(al), alStr); - // Assert.assertEquals("That the insertion property of genotype "+al+" is correct",expecInsertion.get(al),isInsertion); - // Assert.assertEquals("That the deletion property of genotype "+al+" is correct", expecDeletion.get(al), isDeletion); - al++; - } - } - } -}*/ diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java deleted file mode 100755 index d4af0eebc..000000000 --- a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODUnitTest.java +++ /dev/null @@ -1,236 +0,0 @@ -// our package -package org.broadinstitute.sting.gatk.refdata; - - -// the imports for unit testing. - -import net.sf.picard.reference.ReferenceSequenceFile; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeatureIterator; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; - -import static org.junit.Assert.assertTrue; - -/** - * Basic unit test for TabularROD - * - */ -public class TabularRODUnitTest extends BaseTest { - private static ReferenceSequenceFile seq; - private ReferenceOrderedData ROD; - private LocationAwareSeekableRODIterator iter; - - - @BeforeClass - public static void init() throws FileNotFoundException { - // sequence - seq = new IndexedFastaSequenceFile(new File(hg18Reference)); - GenomeLocParser.setupRefContigOrdering(seq); - } - - @Before - public void setupTabularROD() { - TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX); - File file = new File(testDir + "TabularDataTest.dat"); - ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class); - iter = new SeekableRODIterator(new GATKFeatureIterator(ROD.iterator())); - - } - - @Test - public void test1() { - logger.warn("Executing test1"); - RODRecordList oneList = iter.next(); - TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject(); - assertTrue(one.size() == 4); - assertTrue(one.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10))); - assertTrue(one.get("COL1").equals("A")); - assertTrue(one.get("COL2").equals("B")); - assertTrue(one.get("COL3").equals("C")); - } - - @Test - public void test2() { - logger.warn("Executing test2"); - RODRecordList oneList = iter.next(); - RODRecordList twoList = iter.next(); - TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject(); - TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject(); - assertTrue(two.size() == 4); - assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 20))); - assertTrue(two.get("COL1").equals("C")); - assertTrue(two.get("COL2").equals("D")); - assertTrue(two.get("COL3").equals("E")); - } - - @Test - public void test3() { - logger.warn("Executing test3"); - RODRecordList oneList = iter.next(); - RODRecordList twoList = iter.next(); - RODRecordList threeList = iter.next(); - TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject(); - TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject(); - TabularROD three = (TabularROD)threeList.get(0).getUnderlyingObject(); - assertTrue(three.size() == 4); - assertTrue(three.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 30))); - assertTrue(three.get("COL1").equals("F")); - assertTrue(three.get("COL2").equals("G")); - assertTrue(three.get("COL3").equals("H")); - } - - @Test - public void testDone() { - logger.warn("Executing testDone"); - RODRecordList oneList = iter.next(); - RODRecordList twoList = iter.next(); - RODRecordList threeList = iter.next(); - TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject(); - TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject(); - TabularROD three = (TabularROD)threeList.get(0).getUnderlyingObject(); - assertTrue(!iter.hasNext()); - } - - @Test - public void testSeek() { - logger.warn("Executing testSeek"); - RODRecordList twoList = iter.seekForward(GenomeLocParser.createGenomeLoc("chrM", 20)); - TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject(); - assertTrue(two.size() == 4); - assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 20))); - assertTrue(two.get("COL1").equals("C")); - assertTrue(two.get("COL2").equals("D")); - assertTrue(two.get("COL3").equals("E")); - } - - @Test - public void testToString() { - logger.warn("Executing testToString"); - RODRecordList oneList = iter.next(); - TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject(); - assertTrue(one.toString().equals("chrM:10\tA\tB\tC")); - } - - // Didn't change the delimiter - @Test (expected = RuntimeException.class) - public void testDelim1() { - File file2 = new File(testDir + "TabularDataTest2.dat"); - ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - LocationAwareSeekableRODIterator iter_commas = new SeekableRODIterator(new GATKFeatureIterator(ROD_commas.iterator())); - - logger.warn("Executing testDelim1"); - RODRecordList one2List = iter_commas.next(); - TabularROD one2 = (TabularROD)one2List.get(0).getUnderlyingObject(); - assertTrue(one2.size() == 5); - assertTrue(one2.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10))); - assertTrue(one2.get("COL1").equals("A")); - assertTrue(one2.get("COL2").equals("B")); - assertTrue(one2.get("COL3").equals("C")); - assertTrue(one2.get("COL4").equals("1")); - } - - @Test - public void testDelim2() { - TabularROD.setDelimiter(",",","); - File file2 = new File(testDir + "TabularDataTest2.dat"); - ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - LocationAwareSeekableRODIterator iter_commas = new SeekableRODIterator(new GATKFeatureIterator(ROD_commas.iterator())); - - logger.warn("Executing testDelim1"); - RODRecordList one2List = iter_commas.next(); - TabularROD one2 = (TabularROD)one2List.get(0).getUnderlyingObject(); - assertTrue(one2.size() == 5); - assertTrue(one2.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 10))); - assertTrue(one2.get("COL1").equals("A")); - assertTrue(one2.get("COL2").equals("B")); - assertTrue(one2.get("COL3").equals("C")); - assertTrue(one2.get("COL4").equals("1")); - } - - @Test - public void testCreation() { - logger.warn("Executing testCreation"); - ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); - assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); - String rowData = String.format("%d %d %d", 1, 2, 3); - TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1), rowData.split(" ")); - System.out.println(">>>>> " + row.toString()); - assertTrue(row.toString().equals("chrM:1\t1\t2\t3")); - } - - @Test - public void testCreationAndWriting() throws FileNotFoundException { - logger.warn("Executing testCreationAndWriting"); - - File outputFile = new File(testDir + "testTabularRodOutputTemp.dat"); - PrintStream out = new PrintStream(new FileOutputStream(outputFile)); - - ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); - out.println(TabularROD.commentString("Hello, created from test")); - out.println(TabularROD.commentString("")); - out.println(TabularROD.headerString(header)); - - String rowData = String.format("%d %d %d", 1, 2, 3); - TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1), rowData.split(" ")); - out.println(row.toString()); - - rowData = String.format("%d %d %d", 3, 4, 5); - row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 2), rowData.split(" ")); - out.println(row.toString()); - - ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class); - LocationAwareSeekableRODIterator iter_commas = new SeekableRODIterator(new GATKFeatureIterator(ROD_commas.iterator())); - - RODRecordList oneList = iter_commas.next(); - TabularROD one = (TabularROD)oneList.get(0).getUnderlyingObject(); - assertTrue(one.size() == 4); - assertTrue(one.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 1))); - assertTrue(one.get("col1").equals("1")); - assertTrue(one.get("col2").equals("2")); - assertTrue(one.get("col3").equals("3")); - - RODRecordList twoList = iter_commas.next(); - TabularROD two = (TabularROD)twoList.get(0).getUnderlyingObject(); - assertTrue(two.size() == 4); - assertTrue(two.getLocation().equals(GenomeLocParser.createGenomeLoc("chrM", 2))); - assertTrue(two.get("col1").equals("3")); - assertTrue(two.get("col2").equals("4")); - assertTrue(two.get("col3").equals("5")); - } - -/* @Test (expected=RuntimeException.class ) - public void testBadHeader1() { - logger.warn("Executing testBadHeader1"); - ArrayList header = new ArrayList(); - TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1)); - }*/ - -/* @Test (expected=RuntimeException.class ) - public void testBadHeader2() { - logger.warn("Executing testBadHeader2"); - ArrayList header = new ArrayList(Arrays.asList("col1", "col2", "col3")); - TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1)); - }*/ - - @Test (expected=RuntimeException.class ) - public void testBadData1() { - logger.warn("Executing testBadData1"); - ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); - assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); - String rowData = String.format("%d %d %d %d", 1, 2, 3, 4); - TabularROD row = new TabularROD("myName", header, GenomeLocParser.createGenomeLoc("chrM", 1), rowData.split(" ")); - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java deleted file mode 100644 index 8e81d83f0..000000000 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackManagerUnitTest.java +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata.tracks; - -import net.sf.samtools.SAMSequenceRecord; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -/** - * class RMDTrackManagerUnitTest - * tests out the ability of the RMDTrackManager to correctly create RMDtracks based on the requested types. - */ -public class RMDTrackManagerUnitTest extends BaseTest { - List triplets; - List tracks; - - @Before - public void setup() { - RMDTrackManager manager = new RMDTrackManager(); - triplets = new ArrayList(); - - // add our db snp data - triplets.add("MyDbSNP,DBSNP,testdata/small.dbsnp.rod"); - // TODO: Aaron remove following comment, reinstate line - //tracks = manager.getReferenceMetaDataSources(triplets); - } - - @Test // TODO: Aaron remove me - public void voidTest() { - - } - - //@Test -- TODO: Aaron fix with next round of Tribble integration - public void testBuilderQuery() { - for (RMDTrack t : tracks) { - System.err.println("name = " + t.getName() + " type = " + t.getType().getSimpleName() + " file = " + t.getFile()); - int count = 0; - Iterator fIter; - try { - fIter = ((TribbleTrack) t).query("1", 1, 5000); - } catch (IOException e) { - throw new ReviewedStingException("blah I/O exception"); - } - while (fIter.hasNext()) { - fIter.next(); - count++; - } - Assert.assertEquals(100, count); - } - } - - //@Test - public void testBuilderIterator() { - for (RMDTrack t : tracks) { - System.err.println("name = " + t.getName() + " type = " + t.getType().getSimpleName() + " file = " + t.getFile()); - int count = 0; - Iterator fIter = t.getIterator(); - while (fIter.hasNext()) { - fIter.next(); - count++; - } - Assert.assertEquals(100, count); - } - } - - // @Test used only to determine how fast queries are, don't uncomment! (unless you know what you're doing). - public void testSpeedOfRealQuery() { - IndexedFastaSequenceFile file = null; - file = new IndexedFastaSequenceFile(new File(b36KGReference)); - final int intervalSize = 10000000; - GenomeLocParser.setupRefContigOrdering(file.getSequenceDictionary()); - RMDTrackManager manager = new RMDTrackManager(); - // add our db snp data - triplets.clear(); - triplets.add("db"); - triplets.add("DBSNP"); - triplets.add("../../GATK_Data/dbsnp_130_b36.rod"); - Assert.assertEquals(1, manager.getReferenceMetaDataSources(null,triplets).size()); - RMDTrack t = manager.getReferenceMetaDataSources(null,triplets).get(0); - // make sure we have a single track - // lets test the first and 20th contigs of the human reference - - for (int loop = 1; loop <= 22; loop++) { - SAMSequenceRecord seqRec = GenomeLocParser.getContigInfo(String.valueOf(loop)); - String name = seqRec.getSequenceName(); - Iterator fIter; - for (int x = 1; x < seqRec.getSequenceLength() - intervalSize; x += intervalSize) { - long firstTime = System.currentTimeMillis(); - long count = 0; - try { - fIter = ((TribbleTrack) t).query("1", x, x + intervalSize); - } catch (IOException e) { - throw new ReviewedStingException("blah I/O exception"); - } - while (fIter.hasNext()) { - fIter.next(); - count++; - } - System.err.println(name + "," + count + "," + (System.currentTimeMillis() - firstTime)); - } - } - } -} - diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java deleted file mode 100644 index 11a90d5d8..000000000 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java +++ /dev/null @@ -1,274 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata.tracks.builders; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMSequenceDictionary; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.broad.tribble.Feature; -import org.broad.tribble.index.Index; -import org.broad.tribble.index.linear.LinearIndex; -import org.broad.tribble.iterators.CloseableTribbleIterator; -import org.broad.tribble.source.BasicFeatureSource; -import org.broad.tribble.util.LittleEndianOutputStream; -import org.broad.tribble.vcf.VCFCodec; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; -import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.collections.Pair; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.*; -import java.util.*; - -/** - * performance tests for different index types - */ -public class IndexPerformanceTests extends BaseTest { - // the RMD track builder - private TribbleRMDTrackBuilder builder; - - // set the logger level - Logger logger = Logger.getLogger(IndexPerformanceTests.class); - - // the input files to test - Map inputFiles = new LinkedHashMap(); - - // the input types - Map inputTypes = new HashMap(); - - // where the vcf files are located - String fileLocation = validationDataLocation + "Index_Performance_Data/"; - - // bin sizes to try - int[] binSizes = {10, 100, 1000, 5000, 10000, 50000}; - - PrintWriter writer; - PrintWriter writer2; - /** setup the files we're going to run with, including their names */ - @Before - public void setupFilesAndIndexes() { - logger.setLevel(Level.INFO); - builder = new TribbleRMDTrackBuilder(); - IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference)); - GenomeLocParser.setupRefContigOrdering(seq); - - // the input files - /*inputFiles.put("\"10\"",new File(fileLocation + "tip10.vcf")); - inputFiles.put("\"100\"",new File(fileLocation + "tip100.vcf")); - inputFiles.put("\"1,000\"",new File(fileLocation + "tip1000.vcf")); - inputFiles.put("\"10,000\"",new File(fileLocation + "tip10000.vcf")); - inputFiles.put("\"100,000\"",new File(fileLocation + "tip100000.vcf")); - inputFiles.put("\"1,000,000\"",new File(fileLocation + "tip1000000.vcf"));*/ - - for (String name : inputFiles.keySet()) { - inputTypes.put(name,VCFCodec.class); - } - inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt")); - inputTypes.put("Big Table", AnnotatorInputTableCodec.class); - } - - @Test - public void emptyTest() { - // do nothing - } - - //@Test - public void performanceTest() { - try { - writer = new PrintWriter(new FileWriter("testOutput_linear.txt")); - writer2 = new PrintWriter(new FileWriter("testOutput_tree.txt")); - } catch (IOException e) { - Assert.fail("Unable to open file testOutput.txt"); - } - writer.println("name,index,binSize,createTime,seekTime,thousandPerThousand,record_count,index_size"); - writer2.println("name,index,binSize,createTime,seekTime,thousandPerThousand,record_count,index_size"); - for (String name : inputFiles.keySet()) { - for (int size : binSizes) { - System.err.println("running " + name + " with bin size " + size); - printTestLine(name, true, size); - printTestLine(name, false, size); - } - } - writer.close(); - writer2.close(); - } - - private void printTestLine(String name, boolean useLinear, int size) { - PrintWriter wr = (useLinear) ? writer : writer2; - List values = performIndexTest(name,useLinear, size); - wr.print(name + "," + ((useLinear) ? "linear" : "tree") + "," + size); - for (Long l : values) { - wr.print(","); - wr.print(l); - } - wr.println(); - } - - /** - * time various tasks using the specified index - * @param name the name to get - * @return a five-piece: the time to create the index, the time to seek to chromosome 1, and the time to process reading - * every other 1000 bases of chr1 (of the first 100M), the count of records seen in the last operation, and the index size - */ - public List performIndexTest(String name, boolean useLinear, int size) { - deleteIndex(inputFiles.get(name)); - // time creating the index - long createTime = System.currentTimeMillis(); - Pair pairing = builder.createFeatureReader(inputTypes.get(name),inputFiles.get(name)); - createTime = System.currentTimeMillis() - createTime; - //System.err.println("index creation took " + createTime); - - // seek to chr1 - long seekTo1 = seekToChr1(pairing); - - // seek every 1000 bases in Chr1 - long count = 0; - long thousandEveryThousand = System.currentTimeMillis(); - try { - for (int x = 1; x < 1000000; x = x + 1000) { - //CloseableTribbleIterator iter = pairing.first.query("chr1", x+(int)Math.floor(Math.random()*1000), x+1000); // query - CloseableTribbleIterator iter = pairing.first.query("chr1", x, x+1000); // query - for (Feature feat : iter) { - count++; - } - } - - } catch (IOException e) { - Assert.fail("Unable to load file for query!!"); - } - thousandEveryThousand = System.currentTimeMillis() - thousandEveryThousand; - //System.err.println("thousand every thousand (for first million) took " + thousandEveryThousand); - return Arrays.asList(createTime,seekTo1,thousandEveryThousand,count,new File(inputFiles.get(name) + ".idx").length()); - } - - private long seekToChr1(Pair pairing) { - // time seeking to the first 1M bases of Chr1 - long seekTo1 = System.currentTimeMillis(); - try { - CloseableTribbleIterator iter = pairing.first.query("chr1",1,10000000); // query - } catch (IOException e) { - Assert.fail("Unable to load file for query!!"); - } - seekTo1 = System.currentTimeMillis() - seekTo1; - //System.err.println("seeking to chr1 took " + seekTo1); - return seekTo1; - } - - //@Test - public void testBigTable() { - // store the mapping of location -> variant; this will get big - Map features = new TreeMap(); - - Map bucketToCount = getMapOfFeatures(features,false); - Pair pairing; - - Map features2 = new TreeMap(); - Map bucketToCount2 = getMapOfFeatures(features2,true); - - System.err.println("Summary: "); - System.err.println("Summary: tree " + features.size()); - System.err.println("Summary: linear " + features2.size()); - - // compare the two - for (Map.Entry entry: features.entrySet()) { - if (!features2.containsKey(entry.getKey())) { - System.err.println("key " + entry + " missing from linear, count " + entry.getValue()); - - } - else if (features2.get(entry.getKey()) != entry.getValue()) { - System.err.println("counts are not equal at " + - entry.getKey() + - " features2.get(entry.getKey()) = " + - features2.get(entry.getKey()) + - " feature1 = " + entry.getValue()); - } - if (features2.containsKey(entry.getKey())) features2.remove(entry.getKey()); - } - System.err.println("Missing from the tree :"); - for (Map.Entry entry2: features2.entrySet()) { - System.err.println("Position " + entry2.getKey() + " count = " + entry2.getValue()); - } - - for (Integer bucket : bucketToCount.keySet()) { - if (!bucketToCount2.get(bucket).equals(bucketToCount.get(bucket))) { - System.err.println("Bucket " + bucket + " tree != linear, " + bucketToCount2.get(bucket) + " " + bucketToCount.get(bucket)); - } - } - } - - private Map getMapOfFeatures(Map features, boolean useLinear) { - File bigTable = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"); - - deleteIndex(inputFiles.get("Big Table")); - // time creating the index - logger.warn("creating index, linear = " + useLinear); - - Map bucketToCount = new TreeMap(); - Pair pairing = builder.createFeatureReader(inputTypes.get("Big Table"),inputFiles.get("Big Table")); - logger.warn("created index, traversing"); - try { - for (Integer x = 5000; x < 6000; x = x + 1000) { - int bucketCount = 0; - CloseableTribbleIterator iter = pairing.first.query("chr1", x, x+1000); // query - for (Feature feat : iter) { - GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd()); - if (loc.getStop() < 5000 || loc.getStart() > 6000) continue; - int count = 0; - if (features.containsKey(loc)) - count = features.get(loc)+1; - features.put(loc,count); - if (bucketToCount.containsKey(x)) bucketToCount.put(x,bucketToCount.get(x)+1); - else bucketToCount.put(x,1); - } - //bucketToCount.put(x,bucketCount); - } - logger.warn("Done, returning"); - } catch (IOException e) { - Assert.fail("Unable to load file for query!!"); - } - return bucketToCount; - } - - //@Test - public void testGetTreeIndexLocation() { - File bigTable = new File("small.table.txt"); - - deleteIndex(bigTable); - // time creating the index - logger.warn("creating index"); - - Map bucketToCount = new TreeMap(); - Pair pairing = builder.createFeatureReader(inputTypes.get("Big Table"),bigTable); - logger.warn("created index, traversing"); - try { - int count= 0; - CloseableTribbleIterator iter = null; - for (int x = 5000; x < 6000; x = x + 1000) - iter = pairing.first.query("chr1", x, x+1000); // query - for (Feature feat : iter) { - GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd()); - if (loc.getStop() < 5000 || loc.getStart() > 6000) continue; - count++; - System.err.println(feat.toString()); - } - System.err.println(count); - } catch (IOException e) { - Assert.fail("Unable to load file for query!!"); - } - } - - - private void deleteIndex(File fl) { - File indexFile = new File(fl + TribbleRMDTrackBuilder.indexExtension); - boolean deleted = true; - if (indexFile.exists()) - deleted = indexFile.delete(); - if (!deleted) - Assert.fail("Unable to delete index file"); - } - -} diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java similarity index 96% rename from java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java rename to java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java index 6b0e0a9d3..873455df7 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java @@ -42,16 +42,16 @@ import java.util.Map; /** * @author aaron *

- * Class TribbleRMDTrackBuilderUnitTest + * Class RMDTrackBuilderUnitTest *

* Testing out the builder for tribble Tracks */ -public class TribbleRMDTrackBuilderUnitTest extends BaseTest { - private TribbleRMDTrackBuilder builder; +public class RMDTrackBuilderUnitTest extends BaseTest { + private RMDTrackBuilder builder; @Before public void setup() { - builder = new TribbleRMDTrackBuilder(); + builder = new RMDTrackBuilder(); IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(b36KGReference)); GenomeLocParser.setupRefContigOrdering(seq); } @@ -73,8 +73,8 @@ public class TribbleRMDTrackBuilderUnitTest extends BaseTest { Assert.fail("IO exception unexpected" + e.getMessage()); } // make sure we didn't write the file (check that it's timestamp is within bounds) - //System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified()); - Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified()) < 100); + //System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()); + Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()) < 100); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 1dc166193..8ff48bc0e 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -19,7 +19,7 @@ public class PileupWalkerIntegrationTest extends WalkerTest { String gatk_args = "-T Pileup -I " + validationDataLocation + "FHS_Pileup_Test.bam " + "-R " + hg18Reference + " -L chr15:46,347,148 -o %s"; - String expected_md5 = "052187dd2bf2516a027578c8775856a8"; + String expected_md5 = "872e89df166b90e06dd2737535c5d8b3"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(expected_md5)); executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java index 9e8b6f85e..014dd93ac 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationWalkersIntegrationTest.java @@ -83,7 +83,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { @Test public void testCountCovariatesUseOriginalQuals() { HashMap e = new HashMap(); - e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "72b79646061d78674a3752272823d47f"); + e.put( validationDataLocation + "originalQuals.1kg.chr1.1-1K.bam", "db6c0dca1ec121f8c2e6d20f4969dee5"); for ( Map.Entry entry : e.entrySet() ) { String bam = entry.getKey(); @@ -96,7 +96,8 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest { " -L 1:1-1,000" + " -standard" + " -OQ" + - " -recalFile %s", + " -recalFile %s" + + " --DBSNP /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod", 1, // just one output file Arrays.asList(md5)); executeTest("testCountCovariatesUseOriginalQuals", spec); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java index b2416cf13..5c965f735 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/sequenom/SequenomValidationConverterIntegrationTest.java @@ -6,7 +6,12 @@ import org.junit.Test; import java.util.Arrays; public class SequenomValidationConverterIntegrationTest extends WalkerTest { - @Test + + public void testEmpty() { + System.err.println("Reinstate these tests when plink is back in"); + } + + //@Test TODO: reinstate the test when the Plink rod is back public void testSNPs() { String testPedFile = validationDataLocation + "Sequenom_Test_File.txt"; String testArgs = "-R "+b36KGReference + " -T SequenomValidationConverter -B:sequenom,Plink "+testPedFile+" -o %s"; diff --git a/java/test/org/broadinstitute/sting/playground/utils/report/templates/TextTableUnitTest.java b/java/test/org/broadinstitute/sting/playground/utils/report/templates/TextTableUnitTest.java index 0369a16cf..b77438b27 100644 --- a/java/test/org/broadinstitute/sting/playground/utils/report/templates/TextTableUnitTest.java +++ b/java/test/org/broadinstitute/sting/playground/utils/report/templates/TextTableUnitTest.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.playground.utils.report.templates; -import org.broadinstitute.sting.playground.utils.report.templates.TextTable; import org.junit.Assert; import org.junit.Test; diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index ab01ae5fc..e68a7cbb9 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -6,7 +6,7 @@ import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.tracks.builders.TribbleRMDTrackBuilder; +import org.broadinstitute.sting.gatk.refdata.tracks.builders.RMDTrackBuilder; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -76,7 +76,7 @@ public class VCFWriterUnitTest extends BaseTest { counter++; } Assert.assertEquals(2,counter); - new File(fakeVCFFile + TribbleRMDTrackBuilder.indexExtension).delete(); + new File(fakeVCFFile + RMDTrackBuilder.indexExtension).delete(); fakeVCFFile.delete(); } catch (IOException e ) {