From ea0a826f8f970ee3a4fa80f5e3b76a58c801d68d Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 10 Mar 2009 19:34:00 +0000 Subject: [PATCH] Clean up 3rd party dependencies. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@27 348d0f76-0448-11de-a6fe-93d51630548a --- java/build.xml | 27 +- java/lib/edu/mit/broad/arachne/Alignment.java | 242 --- .../edu/mit/broad/arachne/Fastb2Fasta.java | 132 -- .../edu/mit/broad/arachne/FastbReader.java | 220 --- .../lib/edu/mit/broad/arachne/GenomeMask.java | 83 - .../mit/broad/arachne/LookAlignReader.java | 136 -- java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java | 437 ----- .../edu/mit/broad/cnv/CountAlignments.java | 283 ---- java/lib/edu/mit/broad/cnv/CountKMers.java | 1301 -------------- java/lib/edu/mit/broad/cnv/CountKMers3.java | 1426 ---------------- .../edu/mit/broad/cnv/GatherAlignments.java | 399 ----- .../edu/mit/broad/cnv/kmer/CountKMers.java | 1494 ----------------- .../cnv/kmer/DistributedKMerCounter.java | 151 -- .../mit/broad/cnv/util/GenomeBaseIndex.java | 184 -- .../mit/broad/cnv/util/GenomeBinIndex.java | 167 -- .../mit/broad/cnv/util/SequenceIterator.java | 145 -- java/lib/edu/mit/broad/dcp/CallStatus.java | 18 - java/lib/edu/mit/broad/dcp/CommandRunner.java | 309 ---- .../mit/broad/dcp/DistributedAlgorithm.java | 618 ------- .../broad/dcp/DistributedAlgorithmWorker.java | 134 -- .../mit/broad/dcp/DistributedCallServer.java | 133 -- .../mit/broad/dcp/DistributedCallService.java | 25 - .../dcp/message/DistributedCallMessage.java | 90 - .../broad/dcp/message/DistributedMessage.java | 54 - .../edu/mit/broad/picard/PicardException.java | 27 - .../picard/aligner/AbstractBaseAligner.java | 97 -- .../edu/mit/broad/picard/aligner/Aligner.java | 45 - .../picard/aligner/maq/BamToBfqWriter.java | 319 ---- .../picard/aligner/maq/MapFileIterator.java | 357 ---- .../broad/picard/aligner/maq/MaqAligner.java | 211 --- .../picard/aligner/maq/MaqConstants.java | 39 - .../picard/aligner/maq/MaqMapMerger.java | 125 -- .../mit/broad/picard/aligner/maq/RunMaq.java | 133 -- .../cmdline/CommandLineParseException.java | 27 - .../picard/cmdline/CommandLineParser.java | 638 ------- .../CommandLineParserDefinitionException.java | 27 - .../picard/cmdline/CommandLineProgram.java | 141 -- .../picard/cmdline/CommandLineUtils.java | 39 - .../edu/mit/broad/picard/cmdline/Option.java | 60 - .../picard/cmdline/PositionalArguments.java | 38 - .../edu/mit/broad/picard/cmdline/Usage.java | 26 - .../directed/ArachneMapToIntervalList.java | 62 - .../picard/directed/CalculateHsMetrics.java | 51 - .../mit/broad/picard/directed/GenomeMask.java | 52 - .../picard/directed/GenomeMaskFactory.java | 47 - .../mit/broad/picard/directed/HsMetrics.java | 108 -- .../picard/directed/HsMetricsCalculator.java | 207 --- .../broad/picard/directed/IntervalList.java | 240 --- .../broad/picard/filter/AggregateFilter.java | 46 - .../filter/FailsVendorReadQualityFilter.java | 28 - .../picard/filter/FilteringIterator.java | 94 -- .../broad/picard/filter/SamRecordFilter.java | 26 - .../picard/filter/SolexaNoiseFilter.java | 37 - .../mit/broad/picard/filter/TagFilter.java | 56 - .../broad/picard/genotype/GeliException.java | 30 - .../picard/genotype/GeliFileConstants.java | 20 - .../broad/picard/genotype/GeliFileReader.java | 103 -- .../GeliFileReaderImplementation.java | 189 --- .../broad/picard/genotype/GeliFileWriter.java | 168 -- .../picard/genotype/GenotypeLikelihoods.java | 164 -- .../genotype/GenotypeLikelihoodsCodec.java | 126 -- .../genotype/caller/AbstractAlleleCaller.java | 192 --- .../picard/genotype/caller/CallGenotypes.java | 93 - .../genotype/caller/DiploidGenotype.java | 27 - .../caller/FlatQualityAlleleCaller.java | 76 - .../genotype/caller/GenotypeTheory.java | 46 - .../caller/QualityScoreAlleleCaller.java | 82 - .../picard/illumina/BustardFileParser.java | 257 --- .../illumina/BustardFilenameComparator.java | 78 - .../picard/illumina/BustardReadData.java | 128 -- .../broad/picard/illumina/BustardToSam.java | 58 - .../picard/illumina/BustardToSamWriter.java | 138 -- .../broad/picard/illumina/GeraldParser.java | 235 --- .../picard/illumina/GeraldParserFactory.java | 58 - .../broad/picard/illumina/GeraldToSam.java | 348 ---- .../broad/picard/illumina/SimpleMapping.java | 117 -- .../illumina/SolexaQualityConverter.java | 58 - .../illumina/SquashedCoordinateMap.java | 75 - .../importer/genotype/BedFileReader.java | 82 - .../picard/importer/genotype/BedToGeli.java | 371 ---- .../broad/picard/importer/genotype/SNP.java | 35 - java/lib/edu/mit/broad/picard/io/IoUtil.java | 183 -- .../metrics/AggregateMetricCollector.java | 50 - .../edu/mit/broad/picard/metrics/Header.java | 17 - .../mit/broad/picard/metrics/MetricBase.java | 77 - .../broad/picard/metrics/MetricCollector.java | 24 - .../mit/broad/picard/metrics/MetricsFile.java | 370 ---- .../broad/picard/metrics/StringHeader.java | 43 - .../broad/picard/metrics/VersionHeader.java | 50 - .../quality/CalibrateQualityScores.java | 148 -- .../quality/QualityScoreCalibrator.java | 155 -- .../picard/quality/QualityScoreMatrix.java | 133 -- .../picard/reference/FastaSequenceFile.java | 137 -- .../picard/reference/ReferenceSequence.java | 48 - .../reference/ReferenceSequenceFile.java | 29 - .../ReferenceSequenceFileFactory.java | 28 - .../sam/CollectAlignmentSummaryMetrics.java | 352 ---- .../picard/sam/CollectInsertSizeMetrics.java | 154 -- .../sam/ComparableSamRecordIterator.java | 64 - .../picard/sam/CreateSequenceDictionary.java | 145 -- .../broad/picard/sam/DuplicationMetrics.java | 116 -- .../broad/picard/sam/InsertSizeMetrics.java | 38 - .../mit/broad/picard/sam/MarkDuplicates.java | 461 ----- .../mit/broad/picard/sam/MarkDuplicates2.java | 461 ----- .../mit/broad/picard/sam/MergeSamFiles.java | 95 -- .../picard/sam/MergingSamRecordIterator.java | 136 -- .../picard/sam/ReservedTagConstants.java | 18 - .../broad/picard/sam/SamFileHeaderMerger.java | 286 ---- .../broad/picard/sam/SamLocusIterator.java | 280 --- .../picard/util/AbstractTextFileParser.java | 203 --- .../edu/mit/broad/picard/util/ArrayUtil.java | 33 - .../picard/util/BasicTextFileParser.java | 102 -- .../picard/util/CloseableIteratorWrapper.java | 42 - .../edu/mit/broad/picard/util/CloserUtil.java | 50 - .../edu/mit/broad/picard/util/CoordMath.java | 59 - .../edu/mit/broad/picard/util/Coverage.java | 36 - .../picard/util/CreateAnalysisDirectory.java | 88 - .../edu/mit/broad/picard/util/FormatUtil.java | 135 -- .../edu/mit/broad/picard/util/Histogram.java | 152 -- .../edu/mit/broad/picard/util/Interval.java | 139 -- .../mit/broad/picard/util/IntervalTree.java | 1304 -------------- .../edu/mit/broad/picard/util/ListMap.java | 24 - java/lib/edu/mit/broad/picard/util/Log.java | 182 -- .../edu/mit/broad/picard/util/MathUtil.java | 33 - .../broad/picard/util/OverlapDetector.java | 96 -- .../mit/broad/picard/util/PasteParser.java | 132 -- .../broad/picard/util/PeekableIterator.java | 65 - .../broad/picard/util/ProcessExecutor.java | 121 -- .../edu/mit/broad/picard/util/RExecutor.java | 93 - .../mit/broad/picard/util/SamPairUtil.java | 74 - .../mit/broad/picard/util/SequenceUtil.java | 76 - .../util/StringSortingCollectionFactory.java | 121 -- .../edu/mit/broad/picard/util/StringUtil.java | 108 -- .../picard/util/TabbedTextFileParser.java | 41 - .../picard/variation/DbSnpFileGenerator.java | 172 -- .../picard/variation/DbSnpFileReader.java | 149 -- .../picard/variation/GenerateDbSnpFile.java | 51 - .../broad/picard/variation/KnownVariant.java | 115 -- .../picard/variation/KnownVariantCodec.java | 179 -- .../variation/KnownVariantIterator.java | 31 - .../broad/picard/variation/VariantType.java | 30 - .../lib/edu/mit/broad/sam/AlignmentBlock.java | 31 - .../edu/mit/broad/sam/BAMFileConstants.java | 33 - java/lib/edu/mit/broad/sam/BAMFileIndex.java | 277 --- java/lib/edu/mit/broad/sam/BAMFileReader.java | 317 ---- java/lib/edu/mit/broad/sam/BAMFileWriter.java | 64 - java/lib/edu/mit/broad/sam/BAMRecord.java | 280 --- .../lib/edu/mit/broad/sam/BAMRecordCodec.java | 163 -- .../edu/mit/broad/sam/BinaryCigarCodec.java | 68 - .../lib/edu/mit/broad/sam/BinaryTagCodec.java | 211 --- java/lib/edu/mit/broad/sam/Cigar.java | 93 - java/lib/edu/mit/broad/sam/CigarElement.java | 52 - java/lib/edu/mit/broad/sam/CigarOperator.java | 113 -- .../broad/sam/NotPrimarySkippingIterator.java | 37 - java/lib/edu/mit/broad/sam/SAMFileHeader.java | 191 --- java/lib/edu/mit/broad/sam/SAMFileReader.java | 213 --- java/lib/edu/mit/broad/sam/SAMFileWriter.java | 23 - .../mit/broad/sam/SAMFileWriterFactory.java | 64 - .../edu/mit/broad/sam/SAMFileWriterImpl.java | 157 -- .../edu/mit/broad/sam/SAMFormatException.java | 30 - .../edu/mit/broad/sam/SAMLocusIterator.java | 308 ---- .../edu/mit/broad/sam/SAMProgramRecord.java | 85 - .../edu/mit/broad/sam/SAMReadGroupRecord.java | 84 - java/lib/edu/mit/broad/sam/SAMRecord.java | 732 -------- .../mit/broad/sam/SAMRecordComparator.java | 23 - .../sam/SAMRecordCoordinateComparator.java | 58 - .../sam/SAMRecordQueryNameComparator.java | 38 - .../mit/broad/sam/SAMRecordSetBuilder.java | 274 --- .../edu/mit/broad/sam/SAMSequenceRecord.java | 148 -- java/lib/edu/mit/broad/sam/SAMTag.java | 16 - .../edu/mit/broad/sam/SAMTextHeaderCodec.java | 323 ---- java/lib/edu/mit/broad/sam/SAMTextReader.java | 336 ---- java/lib/edu/mit/broad/sam/SAMTextWriter.java | 121 -- java/lib/edu/mit/broad/sam/SAMTools.java | 106 -- java/lib/edu/mit/broad/sam/SAMUtils.java | 269 --- .../lib/edu/mit/broad/sam/TextCigarCodec.java | 78 - java/lib/edu/mit/broad/sam/TextTagCodec.java | 96 -- .../broad/sam/apps/AccumulateCoverage.java | 132 -- .../edu/mit/broad/sam/apps/CompareSAMs.java | 486 ------ .../allelecaller/AbstractAlleleCaller.java | 166 -- .../sam/apps/allelecaller/AlleleCaller.java | 93 - .../apps/allelecaller/DiploidGenotype.java | 27 - .../allelecaller/FlatQualityAlleleCaller.java | 74 - .../sam/apps/allelecaller/GenotypeTheory.java | 46 - .../QualityScoreAlleleCaller.java | 80 - .../mit/broad/sam/util/AsciiLineReader.java | 172 -- .../edu/mit/broad/sam/util/AsciiWriter.java | 55 - .../edu/mit/broad/sam/util/BinaryCodec.java | 478 ------ .../sam/util/BlockCompressedInputStream.java | 258 --- .../sam/util/BlockCompressedOutputStream.java | 177 -- .../util/BlockCompressedStreamConstants.java | 63 - .../mit/broad/sam/util/CloseableIterator.java | 32 - .../lib/edu/mit/broad/sam/util/CoordMath.java | 75 - .../edu/mit/broad/sam/util/LineReader.java | 33 - .../sam/util/NonDestructiveIterator.java | 48 - .../edu/mit/broad/sam/util/PeekIterator.java | 49 - .../broad/sam/util/RuntimeEOFException.java | 27 - .../broad/sam/util/RuntimeIOException.java | 27 - .../mit/broad/sam/util/SortingCollection.java | 369 ---- .../mit/broad/sam/util/StringLineReader.java | 65 - .../edu/mit/broad/sam/util/StringUtil.java | 136 -- java/{jars => lib}/functionalj.jar | Bin java/lib/picard.jar | Bin 0 -> 302876 bytes java/lib/sam-1.0.jar | Bin 0 -> 266761 bytes java/src/edu/mit/broad/sting/ValidateSAM.java | 8 +- .../edu/mit/broad/sting/atk/AnalysisTK.java | 2 +- .../edu/mit/broad/sting/atk/LocusContext.java | 2 +- .../mit/broad/sting/atk/LocusIterator.java | 4 +- .../edu/mit/broad/sting/atk/PrepareROD.java | 6 +- .../edu/mit/broad/sting/atk/ReadWalker.java | 4 +- .../mit/broad/sting/atk/TraversalEngine.java | 8 +- .../atk/modules/BaseQualityHistoWalker.java | 4 +- .../sting/atk/modules/BasicLociWalker.java | 4 +- .../sting/atk/modules/BasicReadWalker.java | 4 +- .../sting/atk/modules/CountReadsWalker.java | 2 +- .../broad/sting/atk/modules/PileupWalker.java | 4 +- .../sting/atk/modules/PrintReadsWalker.java | 4 +- .../broad/sting/utils/ReferenceIterator.java | 2 +- java/src/edu/mit/broad/sting/utils/Utils.java | 4 +- .../edu/mit/broad/sting/utils/rodDbSNP.java | 6 +- .../src/edu/mit/broad/sting/utils/rodGFF.java | 6 +- 221 files changed, 58 insertions(+), 31995 deletions(-) delete mode 100755 java/lib/edu/mit/broad/arachne/Alignment.java delete mode 100644 java/lib/edu/mit/broad/arachne/Fastb2Fasta.java delete mode 100755 java/lib/edu/mit/broad/arachne/FastbReader.java delete mode 100644 java/lib/edu/mit/broad/arachne/GenomeMask.java delete mode 100755 java/lib/edu/mit/broad/arachne/LookAlignReader.java delete mode 100755 java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java delete mode 100644 java/lib/edu/mit/broad/cnv/CountAlignments.java delete mode 100644 java/lib/edu/mit/broad/cnv/CountKMers.java delete mode 100644 java/lib/edu/mit/broad/cnv/CountKMers3.java delete mode 100644 java/lib/edu/mit/broad/cnv/GatherAlignments.java delete mode 100644 java/lib/edu/mit/broad/cnv/kmer/CountKMers.java delete mode 100644 java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java delete mode 100644 java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java delete mode 100644 java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java delete mode 100644 java/lib/edu/mit/broad/cnv/util/SequenceIterator.java delete mode 100644 java/lib/edu/mit/broad/dcp/CallStatus.java delete mode 100644 java/lib/edu/mit/broad/dcp/CommandRunner.java delete mode 100644 java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java delete mode 100644 java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java delete mode 100644 java/lib/edu/mit/broad/dcp/DistributedCallServer.java delete mode 100644 java/lib/edu/mit/broad/dcp/DistributedCallService.java delete mode 100644 java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java delete mode 100644 java/lib/edu/mit/broad/dcp/message/DistributedMessage.java delete mode 100644 java/lib/edu/mit/broad/picard/PicardException.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/Aligner.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java delete mode 100644 java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/Option.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java delete mode 100644 java/lib/edu/mit/broad/picard/cmdline/Usage.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/GenomeMask.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/HsMetrics.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java delete mode 100644 java/lib/edu/mit/broad/picard/directed/IntervalList.java delete mode 100644 java/lib/edu/mit/broad/picard/filter/AggregateFilter.java delete mode 100644 java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java delete mode 100644 java/lib/edu/mit/broad/picard/filter/FilteringIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java delete mode 100644 java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java delete mode 100644 java/lib/edu/mit/broad/picard/filter/TagFilter.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GeliException.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java delete mode 100644 java/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/BustardFileParser.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/BustardReadData.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/BustardToSam.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/GeraldParser.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/GeraldParserFactory.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/GeraldToSam.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/SimpleMapping.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java delete mode 100644 java/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java delete mode 100644 java/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java delete mode 100644 java/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java delete mode 100644 java/lib/edu/mit/broad/picard/importer/genotype/SNP.java delete mode 100644 java/lib/edu/mit/broad/picard/io/IoUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/Header.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/MetricBase.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/MetricCollector.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/MetricsFile.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/StringHeader.java delete mode 100644 java/lib/edu/mit/broad/picard/metrics/VersionHeader.java delete mode 100644 java/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java delete mode 100644 java/lib/edu/mit/broad/picard/quality/QualityScoreCalibrator.java delete mode 100644 java/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java delete mode 100644 java/lib/edu/mit/broad/picard/reference/FastaSequenceFile.java delete mode 100644 java/lib/edu/mit/broad/picard/reference/ReferenceSequence.java delete mode 100644 java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java delete mode 100644 java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/MergeSamFiles.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java delete mode 100644 java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java delete mode 100644 java/lib/edu/mit/broad/picard/util/ArrayUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/BasicTextFileParser.java delete mode 100644 java/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java delete mode 100644 java/lib/edu/mit/broad/picard/util/CloserUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/CoordMath.java delete mode 100644 java/lib/edu/mit/broad/picard/util/Coverage.java delete mode 100644 java/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java delete mode 100644 java/lib/edu/mit/broad/picard/util/FormatUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/Histogram.java delete mode 100644 java/lib/edu/mit/broad/picard/util/Interval.java delete mode 100644 java/lib/edu/mit/broad/picard/util/IntervalTree.java delete mode 100644 java/lib/edu/mit/broad/picard/util/ListMap.java delete mode 100644 java/lib/edu/mit/broad/picard/util/Log.java delete mode 100644 java/lib/edu/mit/broad/picard/util/MathUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/OverlapDetector.java delete mode 100644 java/lib/edu/mit/broad/picard/util/PasteParser.java delete mode 100644 java/lib/edu/mit/broad/picard/util/PeekableIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/util/ProcessExecutor.java delete mode 100644 java/lib/edu/mit/broad/picard/util/RExecutor.java delete mode 100644 java/lib/edu/mit/broad/picard/util/SamPairUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/SequenceUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java delete mode 100644 java/lib/edu/mit/broad/picard/util/StringUtil.java delete mode 100644 java/lib/edu/mit/broad/picard/util/TabbedTextFileParser.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/KnownVariant.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java delete mode 100644 java/lib/edu/mit/broad/picard/variation/VariantType.java delete mode 100644 java/lib/edu/mit/broad/sam/AlignmentBlock.java delete mode 100644 java/lib/edu/mit/broad/sam/BAMFileConstants.java delete mode 100644 java/lib/edu/mit/broad/sam/BAMFileIndex.java delete mode 100644 java/lib/edu/mit/broad/sam/BAMFileReader.java delete mode 100644 java/lib/edu/mit/broad/sam/BAMFileWriter.java delete mode 100644 java/lib/edu/mit/broad/sam/BAMRecord.java delete mode 100644 java/lib/edu/mit/broad/sam/BAMRecordCodec.java delete mode 100644 java/lib/edu/mit/broad/sam/BinaryCigarCodec.java delete mode 100644 java/lib/edu/mit/broad/sam/BinaryTagCodec.java delete mode 100644 java/lib/edu/mit/broad/sam/Cigar.java delete mode 100644 java/lib/edu/mit/broad/sam/CigarElement.java delete mode 100644 java/lib/edu/mit/broad/sam/CigarOperator.java delete mode 100644 java/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMFileHeader.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMFileReader.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMFileWriter.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMFileWriterFactory.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMFileWriterImpl.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMFormatException.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMLocusIterator.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMProgramRecord.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMReadGroupRecord.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMRecord.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMRecordComparator.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMSequenceRecord.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMTag.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMTextReader.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMTextWriter.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMTools.java delete mode 100644 java/lib/edu/mit/broad/sam/SAMUtils.java delete mode 100755 java/lib/edu/mit/broad/sam/TextCigarCodec.java delete mode 100644 java/lib/edu/mit/broad/sam/TextTagCodec.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/CompareSAMs.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/allelecaller/AlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java delete mode 100644 java/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java delete mode 100644 java/lib/edu/mit/broad/sam/util/AsciiLineReader.java delete mode 100644 java/lib/edu/mit/broad/sam/util/AsciiWriter.java delete mode 100644 java/lib/edu/mit/broad/sam/util/BinaryCodec.java delete mode 100755 java/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java delete mode 100644 java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java delete mode 100644 java/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java delete mode 100755 java/lib/edu/mit/broad/sam/util/CloseableIterator.java delete mode 100644 java/lib/edu/mit/broad/sam/util/CoordMath.java delete mode 100644 java/lib/edu/mit/broad/sam/util/LineReader.java delete mode 100644 java/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java delete mode 100644 java/lib/edu/mit/broad/sam/util/PeekIterator.java delete mode 100644 java/lib/edu/mit/broad/sam/util/RuntimeEOFException.java delete mode 100644 java/lib/edu/mit/broad/sam/util/RuntimeIOException.java delete mode 100644 java/lib/edu/mit/broad/sam/util/SortingCollection.java delete mode 100644 java/lib/edu/mit/broad/sam/util/StringLineReader.java delete mode 100644 java/lib/edu/mit/broad/sam/util/StringUtil.java rename java/{jars => lib}/functionalj.jar (100%) create mode 100644 java/lib/picard.jar create mode 100644 java/lib/sam-1.0.jar diff --git a/java/build.xml b/java/build.xml index bb02df9b3..826208bbd 100644 --- a/java/build.xml +++ b/java/build.xml @@ -3,10 +3,10 @@ simple build file - - + + - + @@ -18,16 +18,31 @@ - + + + + + + + - + - + + + + + + + + + + = seqALength || seqAEnd > seqALength || seqAStart >= seqAEnd) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - if (seqBStart < 0 || seqBEnd <= 0 || seqBLength <= 0 || - seqBStart >= seqBLength || seqBEnd > seqBLength || seqBStart >= seqBEnd) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - if (orientation < 0 || orientation > 1) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - if (fields.length != (11 + 3*blockCount)) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - - int[] alignmentBlocks = new int[3*blockCount]; - for (int i = 0; i < 3*blockCount; i++) { - alignmentBlocks[i] = parseIntField(fields[11 + i]); - } - - Alignment alignment = new Alignment(); - alignment.setASequenceId(seqAId); - alignment.setASequenceLength(seqALength); - alignment.setAStart(seqAStart+1); - alignment.setAEnd(seqAEnd); - alignment.setBSequenceId(seqBId); - alignment.setBSequenceLength(seqBLength); - alignment.setBStart(seqBStart+1); - alignment.setBEnd(seqBEnd); - alignment.setOrientation((orientation == 0) ? '+' : '-'); - alignment.setAlignmentBlocks(alignmentBlocks); - return alignment; - } - - private static int parseIntField(String text) { - try { - return Integer.parseInt(text); - } catch (NumberFormatException exc) { - throw new IllegalArgumentException("Illegal alignment field: " + text); - } - } - - public String arachneFormat() { - StringBuilder builder = new StringBuilder(); - builder.append("QUERY"); - builder.append(TAB); - builder.append(mASequenceId); - builder.append(TAB); - builder.append(mAStart-1); // zero based - builder.append(TAB); - builder.append(mAEnd); - builder.append(TAB); - builder.append(mASequenceLength); - builder.append(TAB); - builder.append(mOrientation == '+' ? 0 : 1); - builder.append(TAB); - builder.append(mBSequenceId); - builder.append(TAB); - builder.append(mBStart-1); // zero based - builder.append(TAB); - builder.append(mBEnd); - builder.append(TAB); - builder.append(mBSequenceLength); - builder.append(TAB); - builder.append(mAlignmentBlocks.length / 3); - for (int i = 0; i < mAlignmentBlocks.length; i++) { - builder.append(TAB); - builder.append(mAlignmentBlocks[i]); - } - return builder.toString(); - } - - public String format() { - StringBuilder builder = new StringBuilder(); - builder.append("Alignment"); - builder.append(' '); - builder.append(mASequenceId); - builder.append(' '); - builder.append(mAStart); - builder.append(' '); - builder.append(mAEnd); - builder.append(' '); - builder.append(mOrientation); - builder.append(' '); - builder.append(mBSequenceId); - builder.append(' '); - builder.append(mBStart); - builder.append(' '); - builder.append(mBEnd); - builder.append(' '); - builder.append(mAlignmentBlocks.length / 3); - for (int i = 0; i < mAlignmentBlocks.length; i++) { - builder.append(' '); - builder.append(mAlignmentBlocks[i]); - } - return builder.toString(); - } -} diff --git a/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java b/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java deleted file mode 100644 index 964e054ef..000000000 --- a/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.arachne; - -import java.io.*; - -/** - * Utility to convert fastb to fasta files. - * More importantly, can be used to extract a subset of the reads. - */ -public class Fastb2Fasta { - - private boolean mVerbose = false; - private boolean mDebug = false; - private String mInputPath = null; - private String mIdListFilePath = null; - - - public static void main(String[] args) - throws Exception { - new Fastb2Fasta().run(args); - } - - private void usage() { - System.out.println("Usage: Fastb2Fasta ... "); - System.out.println(" -idlist "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-idlist") && argsleft > 1) { - argpos++; - mIdListFilePath = args[argpos++]; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 1) { - usage(); - return false; - } - - mInputPath = args[argpos]; - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - FastbReader fastbReader = new FastbReader(new File(mInputPath)); - try { - if (mIdListFilePath != null) { - LineNumberReader reader = new LineNumberReader(new FileReader(mIdListFilePath)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - Integer id = parseReadId(line); - if (id == null) { - continue; - } - if (id < 0 || id >= fastbReader.getSequenceCount()) { - System.out.println("ERROR: Illegal sequence id: " + id); - System.exit(1); - } - String sequence = fastbReader.readSequence(id); - System.out.println(">" + id); - System.out.println(sequence); - } - } else { - int id = 0; - while (fastbReader.hasNext()) { - String sequence = fastbReader.next(); - System.out.println(">" + id); - System.out.println(sequence); - id++; - } - } - } finally { - fastbReader.close(); - } - } - - private Integer parseReadId(String line) { - String text = line.trim(); - if (text.length() == 0 || text.charAt(0) == '#') { - return null; - } - String token = text.split("\\s+")[0]; - Integer id = null; - try { - id = new Integer(token); - } catch (NumberFormatException exc) { - System.out.println("ERROR: Invalid sequence id: " + token); - System.exit(1); - } - return id; - } -} diff --git a/java/lib/edu/mit/broad/arachne/FastbReader.java b/java/lib/edu/mit/broad/arachne/FastbReader.java deleted file mode 100755 index 0d6cd3dd5..000000000 --- a/java/lib/edu/mit/broad/arachne/FastbReader.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.arachne; - - -import edu.mit.broad.sam.util.CloseableIterator; - -import java.io.*; - - -/** - * Reader for arachne Fastb files. - */ -public class FastbReader - implements CloseableIterator { - - // Notes on fastb file format - // - // Fastb files contain the serialized contents of an arachne vecbasevector, - // which is a typedef for mastervec. - // The serialization of mastervec objects starts with a 24 byte mv_file_control_block, - // followed by N variable length segments (one for each element of the mastervec vector), - // followed by an offset table containing N 8-byte file offsets to the N variable length - // segments, followed by N fixed length data segments, one for each vector element. - // Thus, reading a single element of the mastervec vector requires reading from three - // separate places in the file (the offset table, the variable length section and the - // fixed length section). - // - // The mastervec file header is 24 bytes arranged as follows: - // n 4-byte signed(?) integer (number of entries) - // c1 1-byte unsigned bit mask (see below) - // reserved 1-byte unused - // sizeX 1-byte unsigned, sizeof first template parameter (16 for fastb files) - // sizeA 1-byte unsigned, sizeof second template parameter (4 for fastb files) - // offsets_start 8-byte signed(?) integer, file offset of offset table - // static_start 8-byte signed(?) integer, file offset of static data (fixed size section) - // - // For fastb files, the fixed size section contains 4 bytes for each object, which is the - // unsigned(?) count of the number of bases in this entry. - // For fastb files, the variable length section contains a bit vector with two bits per base. - // The bases are encoded as follows: A = 0, C = 1, G = 2, T = 3. - // - // For fastb files, in the file header N is the number of entries in the fastb file. - // c1 is unused/unimplemented except that the two low-order bits should be 0x01, indicating - // that we are using the single-file representation. There is also apparently a three-file - // representation that looks the same except that the offset table and static (fixed length) - // table are in separate files named .offsets and .static. - // The sizeX should be 16 for fastb files and sizeA should be 4. - // - // Note that in fastb files, the sequences are not identified by name or id, only by index - // (zero based) into the mastervec object. There is no representation for bases other than - // ACGT (i.e. Ns cannot be encoded). - - private static final char[] BASES = { 'A', 'C', 'G', 'T' }; - - private File mFile; - private RandomAccessFile mRandomFile; - private int mEntryCount; - private long mOffsetTableOffset; - private long mLengthTableOffset; - private int mCurrentPosition; - private byte[] mIOBuffer = new byte[8]; - - - public FastbReader(File file) - throws IOException { - mFile = file; - mRandomFile = new RandomAccessFile(mFile, "r"); - readHeader(); - } - - public int getSequenceCount() { - return mEntryCount; - } - - public boolean hasNext() { - return (mCurrentPosition < mEntryCount); - } - - public String next() { - if (!hasNext()) { - throw new IllegalStateException("Iterator exhausted"); - } - try { - return readSequence(mCurrentPosition); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - public void close() { - if (mRandomFile != null) { - mEntryCount = 0; - mCurrentPosition = 0; - try { - mRandomFile.close(); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } finally { - mRandomFile = null; - } - } - } - - public String readSequence(int n) - throws IOException { - if (mRandomFile == null) { - throw new IllegalStateException("Reader is closed"); - } - if (n < 0 || n >= mEntryCount) { - throw new IndexOutOfBoundsException("Illegal index: " + n); - } - long offset = getEntryOffset(n); - int length = getEntryBaseCount(n); - String result = readBases(offset, length); - mCurrentPosition = n+1; - return result; - } - - private void readHeader() - throws IOException { - - byte[] fileControlBlock = new byte[24]; - mRandomFile.readFully(fileControlBlock, 0, 24); - - int word2 = deserializeInt(fileControlBlock, 4); - int nFiles = word2 & 0x3; - int sizeX = (word2 >> 16) & 0xFF; - int sizeA = (word2 >> 24) & 0xFF; - if (nFiles != 1) { - throw new RuntimeException(mFile + ": Invalid file header: nFiles = " + nFiles); - } - if (sizeX != 16) { - throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeX); - } - if (sizeA != 4) { - throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeA); - } - mEntryCount = deserializeInt(fileControlBlock, 0); - mOffsetTableOffset = deserializeLong(fileControlBlock, 8); - mLengthTableOffset = deserializeLong(fileControlBlock, 16); - } - - private long getEntryOffset(int n) - throws IOException { - mRandomFile.seek(mOffsetTableOffset + 8 * n); - mRandomFile.readFully(mIOBuffer, 0, 8); - return deserializeLong(mIOBuffer, 0); - } - - private int getEntryBaseCount(int n) - throws IOException { - mRandomFile.seek(mLengthTableOffset + 4 * n); - mRandomFile.readFully(mIOBuffer, 0, 4); - return deserializeInt(mIOBuffer, 0); - } - - private String readBases(long fileOffset, int baseCount) - throws IOException { - - - int byteCount = (baseCount + 3) / 4; - byte[] data = new byte[byteCount]; - mRandomFile.seek(fileOffset); - mRandomFile.readFully(data, 0, byteCount); - - int baseIndex = 0; - int dataIndex = 0; - char[] baseBuffer = new char[baseCount]; - while (baseIndex < baseCount) { - int b = data[dataIndex++]; - int count = Math.min(4, baseCount - baseIndex); - for (int i = 0; i < count; i++) { - baseBuffer[baseIndex++] = BASES[b & 0x3]; - b = b >> 2; - } - } - return new String(baseBuffer); - } - - private int deserializeInt(byte[] buffer, int offset) { - int byte1 = buffer[offset] & 0xFF; - int byte2 = buffer[offset+1] & 0xFF; - int byte3 = buffer[offset+2] & 0xFF; - int byte4 = buffer[offset+3] & 0xFF; - return (byte1 | (byte2 << 8) | (byte3 << 16) | (byte4 << 24)); - } - - private long deserializeLong(byte[] buffer, int offset) { - long int1 = deserializeInt(buffer, offset) & 0xFFFFFFFFL; - long int2 = deserializeInt(buffer, offset+4) & 0xFFFFFFFFL; - return (int1 | (int2 << 32)); - } - - // Stub for interactive use (see also Fastb2Fasta) - public static void main(String[] args) - throws Exception { - FastbReader reader = new FastbReader(new File(args[0])); - int readId = 0; - while (reader.hasNext()) { - System.out.println(">" + readId); - System.out.println(reader.next()); - readId++; - } - reader.close(); - } -} - diff --git a/java/lib/edu/mit/broad/arachne/GenomeMask.java b/java/lib/edu/mit/broad/arachne/GenomeMask.java deleted file mode 100644 index 7e7ebdcb0..000000000 --- a/java/lib/edu/mit/broad/arachne/GenomeMask.java +++ /dev/null @@ -1,83 +0,0 @@ -package edu.mit.broad.arachne; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.BitSet; -import java.util.SortedMap; -import java.util.TreeMap; - -/** - * Utility class to read in a set of contig-based genomic intervals in zero-based end inclusive - * and store them efficiently in memory as a 1-based bit-mask - */ -public class GenomeMask { - - // if memory usage becomes a problem... this could be changed to a SparseBitSet - // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html - private SortedMap data = new TreeMap(); - - - public GenomeMask(File maskFile) throws IOException { - BufferedReader baitReader = null; - try { - baitReader = new BufferedReader(new FileReader(maskFile)); - String line; - while ((line = baitReader.readLine()) != null) { - String[] arr = line.split(" "); - int contig = Integer.parseInt(arr[0]); - - // covert the coordinates from 0-based, end inclusive to - // 1-based end inclusive - int startPos = Integer.parseInt(arr[1]) + 1; - int endPos = Integer.parseInt(arr[2]) + 1; - - BitSet bits = data.get(contig); - if (bits == null) { - bits = new BitSet(endPos); - data.put(contig,bits); - } - - bits.set(startPos, endPos + 1); // set method is end exclusive - } - } finally { - if (baitReader != null) { baitReader.close(); } - } - } - - /** - * This ctor is useful if initializing a GenomeMask externally. - */ - public GenomeMask() { - } - - public boolean get(int contig, int position) { - BitSet bits = data.get(contig); - return (bits != null) && bits.get(position); - } - - public BitSet get(int contig) { - return data.get(contig); - } - - /** - * Get an existing BitSet for the given contig, or create one if not already present. This is - * useful when initializing a GenomeMask from an external source. - * @param contig which BitSet - * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size. - * @return the BitSet for the given contig, creating one if necessary - */ - public BitSet getOrCreate(int contig, int numBits) { - BitSet ret = data.get(contig); - if (ret == null) { - ret = new BitSet(numBits); - data.put(contig, ret); - } - return ret; - } - - public int getMaxContig() { - return data.lastKey(); - } -} diff --git a/java/lib/edu/mit/broad/arachne/LookAlignReader.java b/java/lib/edu/mit/broad/arachne/LookAlignReader.java deleted file mode 100755 index a00efcb7c..000000000 --- a/java/lib/edu/mit/broad/arachne/LookAlignReader.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.arachne; - - -import edu.mit.broad.sam.util.CloseableIterator; - -import java.io.*; - - -/** - * Reader for arachne LookAlign text format alignment files. - * Supports filtering of the input by genomic locus. - */ -public class LookAlignReader - implements CloseableIterator { - - private LineNumberReader mReader = null; - private Alignment mNextAlignment = null; - private int mBSequenceId = -1; - private int mBStart = 0; - private int mBEnd = 0; - - - public LookAlignReader(File file) - throws IOException { - this(new FileReader(file)); - } - - public LookAlignReader(Reader reader) { - if (reader instanceof LineNumberReader) { - mReader = (LineNumberReader) reader; - } else { - mReader = new LineNumberReader(reader); - } - } - - public void setBSequenceId(int value) { - mBSequenceId = value; - } - - public void setBStart(int value) { - mBStart = value; - } - - public void setBEnd(int value) { - mBEnd = value; - } - - public boolean hasNext() { - if (mNextAlignment != null) { - return true; - } - try { - mNextAlignment = nextAlignment(); - return (mNextAlignment != null); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - public Alignment next() { - if (!hasNext()) { - throw new IllegalStateException("Iterator exhausted"); - } - try { - Alignment result = mNextAlignment; - mNextAlignment = nextAlignment(); - return result; - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - public void close() { - if (mReader != null) { - try { - mReader.close(); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - mReader = null; - } - } - - private Alignment nextAlignment() - throws IOException { - if (mReader == null) { - return null; - } - while (true) { - String line = mReader.readLine(); - if (line == null) { - close(); - break; - } - if (!line.startsWith("QUERY")) { - continue; - } - Alignment alignment = Alignment.parse(line); - if (matchesFilters(alignment)) { - return alignment; - } - } - return null; - } - - private boolean matchesFilters(Alignment alignment) { - if (mBSequenceId < 0) { - return true; - } - if (alignment.getBSequenceId() != mBSequenceId) { - return false; - } - if (mBStart > 0 && alignment.getBEnd() < mBStart) { - return false; - } - if (mBEnd > 0 && alignment.getBStart() > mBEnd) { - return false; - } - return true; - } -} - diff --git a/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java b/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java deleted file mode 100755 index 07e9b79de..000000000 --- a/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java +++ /dev/null @@ -1,437 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import edu.mit.broad.arachne.Alignment; -import edu.mit.broad.arachne.LookAlignReader; - -import java.io.*; -import java.util.*; - - -/** - * Utility class to do data reduction on CNV data. - */ -public class AnalyzeCnvs { - - public static void main(String[] args) - throws Exception { - new AnalyzeCnvs().run(args); - } - - private void usage() { - System.out.println("Usage: AnalyzeCnvs ..."); - System.out.println(" -action "); - System.out.println(" -alignments or -"); - System.out.println(" -alignmentList "); - System.out.println(" -chromosome "); - System.out.println(" -start "); - System.out.println(" -end "); - System.out.println(" -bestAlignments"); - System.out.println(" -mismatchThreshold "); - System.out.println(" -binsize "); - System.out.println(" -output "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-alignments") && argsleft > 1) { - argpos++; - mAlignmentFilePath = args[argpos++]; - } else if (arg.equals("-alignmentList") && argsleft > 1) { - argpos++; - mAlignmentListFilePath = args[argpos++]; - } else if (arg.equals("-chromosome") && argsleft > 1) { - argpos++; - mChromosome = args[argpos++]; - } else if (arg.equals("-start") && argsleft > 1) { - argpos++; - mStartPosition = new Integer(args[argpos++]); - } else if (arg.equals("-end") && argsleft > 1) { - argpos++; - mEndPosition = new Integer(args[argpos++]); - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-mismatchThreshold") && argsleft > 1) { - argpos++; - mMismatchThreshold = new Integer(args[argpos++]); - } else if (arg.equals("-bestAlignments")) { - argpos++; - mReturnBestHits = true; - } else if (arg.equals("-binsize") && argsleft > 1) { - argpos++; - mBinSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-output") && argsleft > 1) { - argpos++; - mOutputColumns = args[argpos++]; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - if (mAction == null) { - mAction = "alignmentCoverage"; - } - - if (mAction.equals("alignmentCoverage")) { - mainAlignmentCoverage(); - } else { - System.out.println("Unknown action: " + mAction); - usage(); - System.exit(1); - } - } - - private void mainAlignmentCoverage() - throws IOException { - - if (mStartPosition == null || mEndPosition == null) { - usage(); - System.exit(1); - } else if (mStartPosition <= 0 || mEndPosition <= 0 || mStartPosition > mEndPosition) { - System.out.println("Invalid start/end positions: " + mStartPosition + " " + mEndPosition); - usage(); - System.exit(1); - } - - mSequenceId = chromosomeToSequenceId(mChromosome); - if (mSequenceId < 0) { - System.out.println("Invalid chromosome: " + mChromosome); - usage(); - System.exit(1); - } - - if (mBinSize <= 0) { - System.out.println("Invalid bin size: " + mBinSize); - usage(); - System.exit(1); - } - - runAlignmentCoverage(); - } - - private void runAlignmentCoverage() - throws IOException { - - int length = (mEndPosition - mStartPosition + 1); - if (length <= 0) { - throw new RuntimeException("Invalid start/end positions"); - } - - int binSize = mBinSize; - int binCount = (length + binSize - 1) / binSize; - int[] readStarts = new int[binCount]; - int[] readDepths = new int[binCount]; - List alignmentFiles = getAlignmentFiles(); - for (String path : alignmentFiles) { - processAlignmentFile(path, readStarts, readDepths); - } - printStats(readStarts, readDepths); - } - - private List getAlignmentFiles() - throws IOException { - List fileList = new ArrayList(); - if (mAlignmentListFilePath != null) { - LineNumberReader reader = new LineNumberReader(new FileReader(mAlignmentListFilePath)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - String path = line.trim(); - if (path.length() == 0 || path.startsWith("#")) { - continue; - } - fileList.add(path); - } - } else if (mAlignmentFilePath != null) { - fileList.add(mAlignmentFilePath); - } - return fileList; - } - - private void processAlignmentFile(String path, int[] readStarts, int[] readDepths) - throws IOException { - - LookAlignReader reader = null; - if (path == null || path.equals("-")) { - reader = new LookAlignReader(new InputStreamReader(System.in)); - } else { - reader = new LookAlignReader(new File(path)); - } - - while (true) { - Alignment alignment = getNextAlignment(reader); - if (alignment == null) { - reader.close(); - break; - } - processAlignment(alignment, readStarts, readDepths); - } - } - - private void processAlignment(Alignment alignment, - int[] readStarts, - int[] readDepths) { - - if (readStarts != null) { - int baseOffset = alignment.getBStart() - mStartPosition; - int binIndex = baseOffset / mBinSize; - if (binIndex >= 0 && binIndex < readStarts.length) { - readStarts[binIndex]++; - } - } - - if (readDepths != null) { - int baseOffset = alignment.getBStart() - mStartPosition; - int[] alignmentBlocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < alignmentBlocks.length; i += 3) { - int gap = alignmentBlocks[i]; - int duration = alignmentBlocks[i+1]; - if (gap > 0) { - // Gap in B sequence (genome) - // Negative gaps are gaps in A sequence (read) - baseOffset += gap; - } - for (int j = 0; j < duration; j++) { - int binIndex = baseOffset / mBinSize; - if (binIndex >= 0 && binIndex < readDepths.length) { - readDepths[binIndex]++; - } - baseOffset++; - } - } - } - } - - private Alignment getNextAlignment(LookAlignReader reader) - throws IOException { - - if (!mReturnBestHits) { - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (passesAlignmentFilters(alignment)) { - return alignment; - } - } - return null; - } - - while (true) { - Alignment seed = mPendingAlignment; - mPendingAlignment = null; - if (seed == null && reader.hasNext()) { - seed = reader.next(); - } - if (seed == null) { - return null; - } - List secondaryHits = null; - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (alignment.getASequenceId() != seed.getASequenceId()) { - if (alignment.getASequenceId() < seed.getASequenceId()) { - throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); - } - mPendingAlignment = alignment; - break; - } - if (secondaryHits == null) { - secondaryHits = new ArrayList(); - } - secondaryHits.add(alignment); - } - if (secondaryHits == null) { - if (!passesAlignmentFilters(seed)) { - continue; - } - return seed; - } - secondaryHits.add(seed); - Alignment result = getUniqueBestAlignment(secondaryHits); - if (result != null && passesAlignmentFilters(result)) { - return result; - } - } - } - - private Alignment getUniqueBestAlignment(List alignments) { - int bestMismatches = 0; - List best = new ArrayList(); - for (Alignment a : alignments) { - int mismatches = getAlignmentMismatches(a); - if (best.isEmpty()) { - best.add(a); - bestMismatches = mismatches; - } - if (mismatches == bestMismatches) { - best.add(a); - } else if (mismatches < bestMismatches) { - best.clear(); - best.add(a); - bestMismatches = mismatches; - } - } - if (best.size() != 1) { - return null; - } - return best.get(0); - } - - private boolean passesAlignmentFilters(Alignment alignment) { - - if (mMismatchThreshold != null) { - if (getAlignmentMismatches(alignment) > mMismatchThreshold) { - return false; - } - } - - if (mSequenceId != null) { - if (alignment.getBSequenceId() != mSequenceId) { - return false; - } - } - - if (mStartPosition != null) { - if (alignment.getBEnd() < mStartPosition) { - return false; - } - } - - if (mEndPosition != null) { - if (alignment.getBStart() > mEndPosition) { - return false; - } - } - - return true; - } - - private int getAlignmentMismatches(Alignment alignment) { - int mismatches = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - int gap = blocks[i]; - int duration = blocks[i+1]; - int mm = blocks[i+2]; - if (mm > duration) { - throw new RuntimeException("Invalid alignment? : " + alignment.format()); - } - mismatches += Math.abs(gap); - mismatches += mm; - } - return mismatches; - } - - private void printStats(int[] readStarts, int[] readDepths) { - if (mOutputColumns != null && mOutputColumns.equals("coverage")) { - // No headers, just coverage - for (int i = 0; i < readDepths.length; i++) { - String line = ""; - if (mBinSize == 1) { - line += readDepths[i]; - } else { - line += (readDepths[i] / (double) mBinSize); - } - System.out.println(line); - } - } else { - System.out.println("Position" + "\t" + "Starts" + "\t" + "Coverage"); - for (int i = 0; i < readDepths.length; i++) { - String line = ""; - int position = mStartPosition + i*mBinSize; - line += position + "\t" + readStarts[i] + "\t"; - if (mBinSize == 1) { - line += readDepths[i]; - } else { - line += (readDepths[i] / (double) mBinSize); - } - System.out.println(line); - } - } - } - - private int chromosomeToSequenceId(String text) { - if (text == null || text.length() == 0) { - return -1; - } - if (text.matches("\\d+")) { - return Integer.parseInt(text); - } - if (text.startsWith("chr") && text.length() > 3) { - text = text.substring(3); - } - if (text.matches("\\d+") && !text.startsWith("0")) { - return Integer.parseInt(text); - } - if (text.equals("M")) { - return 0; - } else if (text.equals("X")) { - return 23; - } else if (text.equals("Y")) { - return 24; - } else { - return -1; - } - } - - private boolean mDebug = false; - private boolean mVerbose = false; - - private String mAction = null; - private String mAlignmentFilePath = null; - private String mAlignmentListFilePath = null; - private String mChromosome = null; - private Integer mStartPosition = null; - private Integer mEndPosition = null; - private Integer mSequenceId = null; - private boolean mReturnBestHits = false; - private Integer mMismatchThreshold = null; - private int mBinSize = 1; - private String mOutputColumns = null; - private Alignment mPendingAlignment = null; -} diff --git a/java/lib/edu/mit/broad/cnv/CountAlignments.java b/java/lib/edu/mit/broad/cnv/CountAlignments.java deleted file mode 100644 index e0d60255d..000000000 --- a/java/lib/edu/mit/broad/cnv/CountAlignments.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import edu.mit.broad.arachne.Alignment; -import edu.mit.broad.arachne.LookAlignReader; - -import java.io.*; -import java.util.*; - -/** - * Utility to count alignments (rather than gathering). - */ -public class CountAlignments { - - public static void main(String[] args) - throws Exception { - new CountAlignments().run(args); - } - - private void usage() { - System.out.println("Usage: CountAlignments ..."); - System.out.println(" -alignments (- for stdin)"); - System.out.println(" -chromosome "); - System.out.println(" -start "); - System.out.println(" -end "); - System.out.println(" -bestAlignments"); - System.out.println(" -mismatchThreshold "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-alignments") && argsleft > 1) { - argpos++; - mAlignmentFilePath = args[argpos++]; - } else if (arg.equals("-mismatchThreshold") && argsleft > 1) { - argpos++; - mMismatchThreshold = new Integer(args[argpos++]); - } else if (arg.equals("-bestAlignments")) { - argpos++; - mReturnBestHits = true; - } else if (arg.equals("-chromosome") && argsleft > 1) { - argpos++; - String chromosome = args[argpos++]; - mSequenceId = chromosomeToSequenceId(chromosome); - if (mSequenceId < 0) { - System.out.println("Invalid chromosome: " + chromosome); - return false; - } - } else if (arg.equals("-start") && argsleft > 1) { - argpos++; - mStartPosition = new Integer(args[argpos++]); - } else if (arg.equals("-end") && argsleft > 1) { - argpos++; - mEndPosition = new Integer(args[argpos++]); - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - long[] counts = countAlignments(mAlignmentFilePath); - String line = counts[0] + " " + counts[1]; - if (mAlignmentFilePath != null) { - line = mAlignmentFilePath + " " + line; - } - System.out.println(line); - } - - private long[] countAlignments(String path) - throws IOException { - long alignmentCount = 0; - long baseCount = 0; - LookAlignReader reader = null; - if (path == null || path.equals("-")) { - reader = new LookAlignReader(new InputStreamReader(System.in)); - } else { - reader = new LookAlignReader(new File(path)); - } - while (true) { - Alignment alignment = getNextAlignment(reader); - if (alignment == null) { - reader.close(); - break; - } - if (mMismatchThreshold != null) { - if (getAlignmentMismatches(alignment) > mMismatchThreshold) { - continue; - } - } - if (mSequenceId != null) { - if (alignment.getBSequenceId() != mSequenceId) { - continue; - } - } - if (mStartPosition != null) { - if (alignment.getBEnd() < mStartPosition) { - continue; - } - } - if (mEndPosition != null) { - if (alignment.getBStart() > mEndPosition) { - continue; - } - } - alignmentCount++; - baseCount += getBaseCount(alignment); - } - long[] result = { alignmentCount, baseCount }; - return result; - } - - private Alignment getNextAlignment(LookAlignReader reader) - throws IOException { - if (!mReturnBestHits) { - if (!reader.hasNext()) { - return null; - } - return reader.next(); - } - while (true) { - Alignment seed = mPendingAlignment; - mPendingAlignment = null; - if (seed == null && reader.hasNext()) { - seed = reader.next(); - } - if (seed == null) { - return null; - } - List secondaryHits = null; - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (alignment.getASequenceId() != seed.getASequenceId()) { - if (alignment.getASequenceId() < seed.getASequenceId()) { - throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); - } - mPendingAlignment = alignment; - break; - } - if (secondaryHits == null) { - secondaryHits = new ArrayList(); - } - secondaryHits.add(alignment); - } - if (secondaryHits == null) { - return seed; - } - secondaryHits.add(seed); - Alignment result = getUniqueBestAlignment(secondaryHits); - if (result != null) { - return result; - } - } - } - - private Alignment getUniqueBestAlignment(List alignments) { - int bestMismatches = 0; - List best = new ArrayList(); - for (Alignment a : alignments) { - int mismatches = getAlignmentMismatches(a); - if (best.isEmpty()) { - best.add(a); - bestMismatches = mismatches; - } - if (mismatches == bestMismatches) { - best.add(a); - } else if (mismatches < bestMismatches) { - best.clear(); - best.add(a); - bestMismatches = mismatches; - } - } - if (best.size() != 1) { - return null; - } - return best.get(0); - } - - private int getAlignmentMismatches(Alignment alignment) { - int mismatches = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - int gap = blocks[i]; - int duration = blocks[i+1]; - int mm = blocks[i+2]; - if (mm > duration) { - throw new RuntimeException("Invalid alignment? : " + alignment.format()); - } - mismatches += Math.abs(gap); - mismatches += mm; - } - return mismatches; - } - - // Return the number of reference bases covered by this alignment. - private int getBaseCount(Alignment alignment) { - int count = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - // int gap = blocks[i]; - int duration = blocks[i+1]; - // int mm = blocks[i+2]; - count += duration; - } - return count; - } - - private int chromosomeToSequenceId(String text) { - if (text == null || text.length() == 0) { - return -1; - } - if (text.matches("\\d+")) { - return Integer.parseInt(text); - } - if (text.startsWith("chr") && text.length() > 3) { - text = text.substring(3); - } - if (text.matches("\\d+") && !text.startsWith("0")) { - return Integer.parseInt(text); - } - if (text.equals("M")) { - return 0; - } else if (text.equals("X")) { - return 23; - } else if (text.equals("Y")) { - return 24; - } else { - return -1; - } - } - - - private boolean mDebug = false; - private boolean mVerbose = false; - - private String mAlignmentFilePath = null; - private boolean mReturnBestHits = false; - private Integer mMismatchThreshold = null; - private Integer mSequenceId = null; - private Integer mStartPosition = null; - private Integer mEndPosition = null; - private Alignment mPendingAlignment = null; -} diff --git a/java/lib/edu/mit/broad/cnv/CountKMers.java b/java/lib/edu/mit/broad/cnv/CountKMers.java deleted file mode 100644 index 0fa159615..000000000 --- a/java/lib/edu/mit/broad/cnv/CountKMers.java +++ /dev/null @@ -1,1301 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import java.io.*; -import java.util.*; - - -/** - * Tool for counting unique kmers. - */ -public class CountKMers -{ - private static final int NONUNIQUE_MARKER = -1; - private static boolean mUseOldFormat = false; - - private String mAction = null; - private static int mK = 0; - private int mBatchSize = 0; - private List mInputFiles = null; - private File mInputDirectory = null; - private File mOutputDirectory = null; - private boolean mVerbose = false; - private boolean mDebug = false; - - private List mSequenceList = null; - private List mSequenceOffsetList = null; - private List mSpillFileList = null; - private double mSpillFactor = 0.9; - - private long mKMerCount = 0; - private long mUniquePriorCount = 0; - private long mUniqueNewCount = 0; - private long mPriorMapUniqueCount = 0; - - private InputStream mPriorMapStream = null; - private int mPriorMapPosition = -1; - private int mPriorMapValue = 0; - private int mInputFileIndex = 0; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private char[] mKMerBuffer = null; - private int mKMerBufferedCount = 0; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - private int mBaseIndex = -1; - private byte[] mIOBuffer = null; - - /* Design - Inputs: - - One or more fasta files to search (currently one). - - Output directory for the result files. - - Optionally an input k-1-mer file (output from previous pass). - Outputs: - - Unique kmer file: (sorted by kmer) - This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). - - Per chromosome bit map: pos (implicit) new-bit cum-bit - New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. - Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. - - Statistics - Plan: - - Reducing memory footprint is crucial. - - Sequential pass over the input sequences to generate kmers. - - BatchSize kmers are cached in memory, then sorted and uniqified. - - As batch array fills, batches are spilled to disk. - - Batches are reloaded from disk and merged (N-finger algorithm) - - and streamed to a merge file. - - Merge file is read from disk and processed as final results. - */ - - public static void main(String[] args) - throws Exception { - new CountKMers().run(args); - } - - private void usage() { - System.out.println("Usage: CountKMers ..."); - System.out.println(" -action "); - System.out.println(" -genome "); - System.out.println(" -k "); - System.out.println(" -batchSize "); - System.out.println(" -inputDir "); - System.out.println(" -outputDir "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-genome") && argsleft > 1) { - argpos++; - if (mInputFiles == null) { - mInputFiles = new ArrayList(); - } - mInputFiles.add(new File(args[argpos++])); - } else if (arg.equals("-k") && argsleft > 1) { - argpos++; - mK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-batchSize") && argsleft > 1) { - argpos++; - mBatchSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-inputDir") && argsleft > 1) { - argpos++; - mInputDirectory = new File(args[argpos++]); - } else if (arg.equals("-outputDir") && argsleft > 1) { - argpos++; - mOutputDirectory = new File(args[argpos++]); - } else if (arg.equals("-oldFormat")) { - argpos++; - mUseOldFormat = true; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - if (!parseArguments(args)) { - System.exit(1); - } - if (mAction == null || mAction.equals("mapKMers")) { - mapKMers(); - } else if (mAction.equals("mapGaps")) { - mapGaps(); - } - } - - // Can be used to scan genome for sequence names/lengths. - private void scanKMers() - throws IOException { - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - openPriorMap(priorMapFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - if (isUniqueInPriorMap(mBaseIndex)) { - continue; - } - } - } - closePriorMap(); - } - - private void mapGaps() - throws IOException { - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int pos = 0; - int gapStart = 0; - while (true) { - char base = getNextBase(); - if (base == 0) { - break; - } - pos++; - if (base == 'N') { - if (gapStart == 0) { - gapStart = pos; - } - } else { - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - - private void mapKMers() - throws IOException { - - File textKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); - File binaryKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); - File exceptionFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); - File mapFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - File statsFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); - - if (mBatchSize == 0) { - throw new RuntimeException("Batch size not specified"); - } - - int kmerCount = 0; - int batchSize = mBatchSize; - KMerPosition[] kmerArray = new KMerPosition[batchSize]; - List exceptionList = new ArrayList(); - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; - - openPriorMap(priorMapFile); - - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Processing " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - int baseIndex = mBaseIndex; - if (isUniqueInPriorMap(baseIndex)) { - mUniquePriorCount++; - continue; - } - KMerPosition kmp = encodeKMer(kmerChars, baseIndex); - if (kmp == null) { - String kmer = new String(kmerChars); - exceptionList.add(new StringKMerPosition(kmer, baseIndex)); - continue; - } - kmerArray[kmerCount++] = kmp; - if (kmerCount == batchSize) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (kmerCount > mSpillFactor * batchSize) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - } - } - if (kmerCount > 0) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (mSpillFileList != null) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - - closePriorMap(); - - // Write out the exception kmers (text file). - compactKMers(exceptionList); - writeExceptionFile(exceptionList, exceptionFile); - - // Write out the binary file of unique encoded kmers. - if (mSpillFileList == null) { - kmerCount = removeNonUnique(kmerArray, kmerCount); - writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); - mUniqueNewCount = kmerCount; - } else { - mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); - } - mUniqueNewCount += countUniqueKMers(exceptionList); - - // Write out the text file of (all) unique kmers. - writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); - - // Create map file from prior map plus the new unique kmers. - int mapSize = ((mBaseIndex >> 2) & 0x3FFFFFFF) + 1; - createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); - - // Write summary statistics file. - writeSummaryStatistics(statsFile); - } - - private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " kmers at index " + - Integer.toHexString(mBaseIndex) + " ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - KMerPosition current = kmerArray[0]; - for (int i = 1; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (current.compareTo(kmp) == 0) { - current.setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - current = kmp; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " string kmers ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - String kmerString = kmerArray[0].getKMer(); - for (int i = 1; i < kmerCount; i++) { - StringKMerPosition kmp = kmerArray[i]; - String ks = kmp.getKMer(); - if (ks.equals(kmerString)) { - kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - kmerString = ks; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private void compactKMers(List kmerList) { - int kmerCount = kmerList.size(); - if (kmerCount <= 1) { - return; - } - StringKMerPosition[] kmerArray = - kmerList.toArray(new StringKMerPosition[kmerCount]); - kmerCount = compactKMers(kmerArray, kmerCount); - kmerList.clear(); - for (int i = 0; i < kmerCount; i++) { - kmerList.add(kmerArray[i]); - } - } - - private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { - int uniqueCount = 0; - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - kmerArray[uniqueCount++] = kmp; - } - } - return uniqueCount; - } - - private int countUniqueKMers(List kmerList) { - int uniqueCount = 0; - for (StringKMerPosition kmp : kmerList) { - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - } - } - return uniqueCount; - } - - private void spillKMers(KMerPosition[] kmerArray, int kmerCount) - throws IOException { - if (mSpillFileList == null) { - mSpillFileList = new ArrayList(); - } - int fileNumber = mSpillFileList.size() + 1; - log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); - File spillFile = new File(mOutputDirectory, - "spill_" + mK + "_" + fileNumber + ".tmp"); - mSpillFileList.add(spillFile); - writeKMerBinaryFile(kmerArray, kmerCount, spillFile); - log("Spill file written"); - } - - private void writeKMerBinaryFile(KMerPosition[] kmerArray, - int kmerCount, - File outputFile) - throws IOException { - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - writeKMerPosition(outputStream, kmerArray[i]); - } - outputStream.flush(); - outputStream.close(); - } - - private void writeExceptionFile(List kmerList, - File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - for (StringKMerPosition kmer : kmerList) { - writeUniqueKMer(kmer, writer); - } - writer.flush(); - writer.close(); - } - - private KMerPosition readKMerPosition(InputStream stream) - throws IOException { - if (mUseOldFormat) { - return readKMerPositionOldFormat(stream); - } - byte[] buffer = mIOBuffer; - int encodingLength = (mK + 7)/8; - int fileLength = 4 + 2*encodingLength; - int count = readFully(stream, buffer, 0, fileLength); - if (count <= 0) { - return null; - } else if (count != fileLength) { - throw new RuntimeException("Unexpected end of file"); - } - char[] encoding = new char[encodingLength]; - int baseIndex = ((buffer[0] & 0xFF) | - (buffer[1] & 0xFF) << 8 | - (buffer[2] & 0xFF) << 16 | - (buffer[3] & 0xFF) << 24); - for (int i = 0; i < encodingLength; i++) { - encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | - ((buffer[2*i+5] & 0xFF) << 8)); - } - return new KMerPositionN(encoding, baseIndex); - } - - private KMerPosition readKMerPositionOldFormat(InputStream stream) - throws IOException { - byte[] buffer = mIOBuffer; - int length = (mK >= 32 ? 20 : 12); - int count = readFully(stream, buffer, 0, length); - if (count <= 0) { - return null; - } else if (count != length) { - throw new RuntimeException("Unexpected end of file"); - } - long encoding = (((long)(buffer[0] & 0xFF)) | - ((long)(buffer[1] & 0xFF)) << 8 | - ((long)(buffer[2] & 0xFF)) << 16 | - ((long)(buffer[3] & 0xFF)) << 24 | - ((long)(buffer[4] & 0xFF)) << 32 | - ((long)(buffer[5] & 0xFF)) << 40 | - ((long)(buffer[6] & 0xFF)) << 48 | - ((long)(buffer[7] & 0xFF)) << 56); - int baseIndex = ((buffer[length-4] & 0xFF) | - (buffer[length-3] & 0xFF) << 8 | - (buffer[length-2] & 0xFF) << 16 | - (buffer[length-1] & 0xFF) << 24); - if (length == 12) { - return new KMerPosition1(encoding, baseIndex); - } else { - long encoding2 = (((long)(buffer[8] & 0xFF)) | - ((long)(buffer[9] & 0xFF)) << 8 | - ((long)(buffer[10] & 0xFF)) << 16 | - ((long)(buffer[11] & 0xFF)) << 24 | - ((long)(buffer[12] & 0xFF)) << 32 | - ((long)(buffer[13] & 0xFF)) << 40 | - ((long)(buffer[14] & 0xFF)) << 48 | - ((long)(buffer[15] & 0xFF)) << 56); - return new KMerPosition2(encoding, encoding2, baseIndex); - } - } - - private int readFully(InputStream stream, byte[] buffer, int offset, int count) - throws IOException { - int readCount = 0; - while (readCount < count) { - int read = stream.read(buffer, offset, count-readCount); - if (read <= 0) { - break; - } - offset += read; - readCount += read; - } - return readCount; - } - - private void writeKMerPosition(OutputStream stream, KMerPosition kmer) - throws IOException { - if (mUseOldFormat) { - writeKMerPositionOldFormat(stream, kmer); - return; - } - byte[] buffer = mIOBuffer; - int baseIndex = kmer.getBaseIndex(); - char[] encoding = kmer.getKMerEncoding(); - int offset = 0; - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - for (int i = 0; i < encoding.length; i++) { - buffer[offset++] = (byte) ((encoding[i]) & 0xFF); - buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); - } - stream.write(buffer, 0, offset); - } - - private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer) - throws IOException { - byte[] buffer = mIOBuffer; - long encoding1 = kmer.getKMerEncoding1(); - long encoding2 = kmer.getKMerEncoding2(); - int baseIndex = kmer.getBaseIndex(); - int offset = 0; - buffer[offset++] = (byte) ((encoding1) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 56) & 0xFF); - if (mK >= 32) { - buffer[offset++] = (byte) ((encoding2) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 56) & 0xFF); - } - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - stream.write(buffer, 0, offset); - } - - private long mergeSpillFiles(List spillFiles, File outputFile) - throws IOException { - - if (spillFiles == null) { - return 0; - } - - log("Merging spill files ..."); - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - long uniqueCount = 0; - int fileCount = spillFiles.size(); - InputStream[] inputStreams = new InputStream[fileCount]; - KMerPosition[] kmers = new KMerPosition[fileCount]; - for (int i = 0; i < fileCount; i++) { - inputStreams[i] = - new BufferedInputStream(new FileInputStream(spillFiles.get(i))); - } - while (true) { - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == null && inputStreams[i] != null) { - kmers[i] = readKMerPosition(inputStreams[i]); - if (kmers[i] == null) { - inputStreams[i].close(); - inputStreams[i] = null; - } - } - } - int count = 0; - KMerPosition kmer = null; - for (int i = 0; i < fileCount; i++) { - KMerPosition kmp = kmers[i]; - if (kmp == null) { - continue; - } else if (kmer == null) { - kmer = kmp; - count = 1; - } else { - int cmp = kmp.compareTo(kmer); - if (cmp == 0) { - count++; - } else if (cmp < 0) { - kmer = kmp; - count = 1; - } - } - } - if (kmer == null) { - break; - } - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == kmer) { - kmers[i] = null; - } - } - if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - writeKMerPosition(outputStream, kmer); - } - } - outputStream.flush(); - outputStream.close(); - for (int i = 0; i < fileCount; i++) { - // spillFiles.get(i).delete(); - } - log("Spill files merged, unique count is " + uniqueCount); - return uniqueCount; - } - - private void writeKMerTextFile(File inputFile, - List exceptionList, - File outputFile) - throws IOException { - - log("Writing kmer file " + outputFile + " ..."); - int exceptionIndex = 0; - StringKMerPosition excKMer = null; - Iterator excIter = null; - if (!exceptionList.isEmpty()) { - excIter = exceptionList.iterator(); - excKMer = excIter.next(); - } - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(inputFile)); - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - KMerPosition kmer = readKMerPosition(inputStream); - while (kmer != null || excKMer != null) { - if (excKMer == null) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else if (kmer == null) { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } - } - inputStream.close(); - writer.flush(); - writer.close(); - log("Wrote kmer file: " + outputFile); - } - - private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { - String chr = getBaseIndexSequenceName(baseIndex); - int pos = getBaseIndexCoordinate(baseIndex); - writer.println(kmer + "\t" + chr + "\t" + pos); - } - - private void createMapFile(int mapSize, - File kmerFile, - List exceptionList, - File priorMapFile, - File mapFile) - throws IOException { - byte[] map = null; - long uniquePriorCount = 0; - if (priorMapFile.exists()) { - map = readMapFile(priorMapFile); - if (map.length != mapSize) { - throw new RuntimeException("Prior map is wrong size"); - } - // Clear the new bits from prior map. - // Also count the prior unique positions while we are at it. - // Note that this is a count of positions, not kmers. - for (int i = 0; i < mapSize; i++) { - int cumBits = map[i] & 0x55; - uniquePriorCount += Integer.bitCount(cumBits); - map[i] = (byte) cumBits; - } - } else { - map = new byte[mapSize]; - } - for (StringKMerPosition kmp : exceptionList) { - addToMap(kmp, map); - } - mPriorMapUniqueCount = uniquePriorCount; - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(kmerFile)); - while (true) { - KMerPosition kmp = readKMerPosition(inputStream); - if (kmp == null) { - inputStream.close(); - break; - } - addToMap(kmp, map); - } - - long testCum = 0; - for (int i = 0; i < map.length; i++) { - testCum += Integer.bitCount(map[i] & 0x55); - } - - writeMapFile(map, mapFile); - } - - private void addToMap(KMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(StringKMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(int baseIndex, byte[] map) { - int mod = baseIndex & 0x3; - int offset = (baseIndex >> 2) & 0x3FFFFFFF; - if (((map[offset] >> (2*mod)) & 0x3) != 0) { - throw new RuntimeException("Map entry already set: " + baseIndex); - } - map[offset] |= (0x3 << (2*mod)); - } - - private void writeSummaryStatistics(File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; - long uniqueCount = mUniquePriorCount + mUniqueNewCount; - long nonUniqueCount = mKMerCount - uniqueCount; - writer.println("K: " + mK); - writer.println("Sequences: " + mSequenceList.size()); - writer.println("Bases: " + baseCount); - writer.println("KMers: " + mKMerCount); - writer.println("Prior map count: " + mPriorMapUniqueCount); - writer.println("Unique prior: " + mUniquePriorCount + - " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); - writer.println("Unique new: " + mUniqueNewCount + - " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); - writer.println("Unique cumulative: " + uniqueCount + - " (" + formatPercent(uniqueCount, mKMerCount) + ")"); - writer.println("Nonunique: " + nonUniqueCount + - " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); - writer.flush(); - writer.close(); - } - - private String formatPercent(long numerator, long denominator) { - double fraction = 0.0; - if (denominator != 0) { - fraction = numerator / (double) denominator; - } - return String.format("%1.1f%%", fraction * 100.0); - } - - private void openPriorMap(File mapFile) - throws IOException { - if (mapFile.exists()) { - mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - } - - private void closePriorMap() - throws IOException { - if (mPriorMapStream != null) { - mPriorMapStream.close(); - } - mPriorMapStream = null; - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - - private byte[] readMapFile(File file) - throws IOException { - long fileLength = file.length(); - if (fileLength > 1000000000) { - throw new RuntimeException("Prior map too large: " + file); - } - int length = (int) fileLength; - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - private void writeMapFile(byte[] map, File file) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(map); - stream.flush(); - stream.close(); - } - - private boolean isUniqueInPriorMap(int baseIndex) - throws IOException { - if (mPriorMapStream == null) { - return false; - } - int byteOffset = (baseIndex >> 2) & 0x3FFFFFFF; - if (byteOffset != mPriorMapPosition) { - int delta = byteOffset - mPriorMapPosition; - if (delta < 0) { - throw new RuntimeException("Attempt to seek backwards in prior map"); - } - if (delta > 1) { - skipFully(mPriorMapStream, delta-1); - } - mPriorMapValue = mPriorMapStream.read(); - if (mPriorMapValue < 0) { - throw new RuntimeException("Unexpected end of file in prior map"); - } - mPriorMapPosition += delta; - } - int mod = baseIndex & 0x3; - return (((mPriorMapValue >> (2*mod)) & 1) != 0); - } - - private void skipFully(InputStream stream, long amount) - throws IOException { - while (amount > 0) { - long skip = stream.skip(amount); - if (skip <= 0 || skip > amount) { - throw new RuntimeException("Skip failed"); - } - amount -= skip; - } - } - - private String getBaseIndexSequenceName(int baseIndex) { - int sequenceCount = mSequenceList.size(); - for (int i = 0; i < sequenceCount-1; i++) { - int nextOffset = mSequenceOffsetList.get(i+1); - if (compareBaseIndex(nextOffset, baseIndex) > 0) { - return mSequenceList.get(i); - } - } - return mSequenceList.get(sequenceCount-1); - } - - private int getBaseIndexCoordinate(int baseIndex) { - Integer sequenceOffset = null; - for (Integer offset : mSequenceOffsetList) { - if (compareBaseIndex(offset, baseIndex) > 0) { - break; - } - sequenceOffset = offset; - } - if (sequenceOffset == null) { - return 0; - } - int coordinate = baseIndex - sequenceOffset + 1; - if (coordinate <= 0) { - dumpSequenceList(); - System.out.println("coordinate: " + coordinate); - System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); - System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); - throw new RuntimeException("Internal error: illegal coordinate " + - coordinate + " for base index " + baseIndex); - } - return coordinate; - } - - private void dumpSequenceList() { - System.out.println("# Sequences:"); - int count = mSequenceList.size(); - for (int i = 0; i < count; i++) { - String seqName = mSequenceList.get(i); - int offset = mSequenceOffsetList.get(i); - System.out.println("# " + seqName + - "\t" + offset + - "\t" + Integer.toHexString(offset)); - } - } - - private int compareBaseIndex(int baseIndex1, int baseIndex2) { - // Implements unsigned comparison, a la compareTo - if (baseIndex1 < 0 ^ baseIndex2 < 0) { - return ((baseIndex1 < 0) ? 1 : -1); - } else { - return (baseIndex1 - baseIndex2); - } - } - - private String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private char[] getNextKMer() - throws IOException { - - if (mKMerBuffer == null) { - mKMerBuffer = new char[mK]; - } - System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); - if (mKMerBufferedCount > 0) { - mKMerBufferedCount--; - } - - while (mKMerBufferedCount < mK) { - char base = getNextBase(); - if (base == 0) { - incrementBaseIndex(mKMerBufferedCount); - mKMerBufferedCount = 0; - return null; - } else if (base == 'N') { - incrementBaseIndex(mKMerBufferedCount+1); - mKMerBufferedCount = 0; - } else { - mKMerBuffer[mKMerBufferedCount++] = base; - } - } - incrementBaseIndex(1); - return mKMerBuffer; - } - - private char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - return mLineBuffer.charAt(mLineBufferIndex++); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } - - private void log(String text) { - if (mVerbose) { - System.out.println("# " + new Date() + " " + text); - } - } - - private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { - if (mUseOldFormat) { - return encodeKMerOldFormat(kmerChars, baseIndex); - } - if (kmerChars == null) { - return null; - } - int kmerLength = kmerChars.length; - int encodingLength = (kmerLength + 7) / 8; - char[] encoding = new char[encodingLength]; - int offset = kmerLength % 8; - offset = (offset == 0) ? 8 : offset; - int bits = encodeKMerBits(kmerChars, 0, offset); - if (bits < 0) { - return null; - } - encoding[0] = (char) bits; - for (int i = 1; i < encodingLength; i++) { - bits = encodeKMerBits(kmerChars, offset, 8); - if (bits < 0) { - return null; - } - encoding[i] = (char) bits; - offset += 8; - } - return new KMerPositionN(encoding, baseIndex); - } - - private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) { - if (kmerChars == null) { - return null; - } - int length = kmerChars.length; - if (length <= 31) { - long bits = encodeKMerBitsLong(kmerChars, 0, length); - if (bits == -1) { - return null; - } - return new KMerPosition1(bits, baseIndex); - } else if (length <= 62) { - long bits1 = encodeKMerBitsLong(kmerChars, 0, 31); - long bits2 = encodeKMerBitsLong(kmerChars, 31, length - 31); - if (bits1 == -1 || bits2 == -1) { - return null; - } - return new KMerPosition2(bits1, bits2, baseIndex); - } else { - return null; - } - } - - private static int encodeKMerBits(char[] kmerChars, int offset, int length) { - int bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= baseBits << (2*(length-i-1)); - } - return bits; - } - - private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) { - long bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= ((long)baseBits) << (2*(length-i-1)); - } - return bits; - } - - private static String decodeKMer1(long bits) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits, buffer, 0, length); - return new String(buffer); - } - - private static String decodeKMer2(long bits1, long bits2) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits1, buffer, 0, 31); - decodeKMerBits(bits2, buffer, 31, length-31); - return new String(buffer); - } - - private static String decodeKMerN(char[] encoding) { - int length = mK; - char[] buffer = new char[length]; - int offset = length % 8; - offset = (offset == 0) ? 8 : offset; - decodeKMerBits(encoding[0], buffer, 0, offset); - for (int i = 1; i < encoding.length; i++) { - decodeKMerBits(encoding[i], buffer, offset, 8); - offset += 8; - } - return new String(buffer); - } - - private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - static class KMerPosition - implements Comparable { - - private int mBaseIndex; - - KMerPosition(int baseIndex) { - mBaseIndex = baseIndex; - } - - public String getKMer() { - return null; - } - - public long getKMerEncoding1() { - return -1; - } - - public long getKMerEncoding2() { - return -1; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public char[] getKMerEncoding() { - return null; - } - - public int compareTo(KMerPosition kmp) { - char[] encoding1 = getKMerEncoding(); - char[] encoding2 = kmp.getKMerEncoding(); - int length = Math.max(encoding1.length, encoding2.length); - for (int i = 0; i < length; i++) { - int result = encoding1[i] - encoding2[i]; - if (result != 0) { - return result; - } - } - return 0; - } - } - - static class KMerPosition1 - extends KMerPosition { - - private long mKMerEncoding1; - - KMerPosition1(long kmer, int baseIndex) { - super(baseIndex); - mKMerEncoding1 = kmer; - } - - public String getKMer() { - return decodeKMer1(getKMerEncoding1()); - } - - public final long getKMerEncoding1() { - return mKMerEncoding1; - } - - public int compareTo(KMerPosition kmp) { - int result = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1()); - if (result == 0) { - result = Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2()); - } - return result; - } - } - - static class KMerPosition2 - extends KMerPosition1 { - - private long mKMerEncoding2; - - KMerPosition2(long encoding1, long encoding2, int baseIndex) { - super(encoding1, baseIndex); - mKMerEncoding2 = encoding2; - } - - public String getKMer() { - return decodeKMer2(getKMerEncoding1(), getKMerEncoding2()); - } - - public final long getKMerEncoding2() { - return mKMerEncoding2; - } - } - - static class KMerPositionN - extends KMerPosition { - - private char[] mKMerEncoding; - - KMerPositionN(char[] encoding, int baseIndex) { - super(baseIndex); - mKMerEncoding = encoding; - } - - public String getKMer() { - return decodeKMerN(mKMerEncoding); - } - - public final char[] getKMerEncoding() { - return mKMerEncoding; - } - } - - static class StringKMerPosition - implements Comparable { - - private String mKMerString = null; - private int mBaseIndex; - - StringKMerPosition(String kmer, int baseIndex) { - mKMerString = kmer; - mBaseIndex = baseIndex; - } - - public final String getKMer() { - return mKMerString; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public int compareTo(StringKMerPosition kmp) { - return mKMerString.compareTo(kmp.mKMerString); - } - } -} diff --git a/java/lib/edu/mit/broad/cnv/CountKMers3.java b/java/lib/edu/mit/broad/cnv/CountKMers3.java deleted file mode 100644 index 81ddb1745..000000000 --- a/java/lib/edu/mit/broad/cnv/CountKMers3.java +++ /dev/null @@ -1,1426 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import java.io.*; -import java.util.*; - - -/** - * Tool for counting unique kmers. - */ -public class CountKMers3 -{ - private static final int NONUNIQUE_MARKER = -1; - private static boolean mUseOldFormat = false; - - private String mAction = null; - private static int mK = 0; - private int mBatchSize = 0; - private List mInputFiles = null; - private File mInputDirectory = null; - private File mOutputDirectory = null; - private boolean mVerbose = false; - private boolean mDebug = false; - - private List mSequenceList = null; - private List mSequenceOffsetList = null; - private List mSpillFileList = null; - private double mSpillFactor = 0.9; - - private long mKMerCount = 0; - private long mUniquePriorCount = 0; - private long mUniqueNewCount = 0; - private long mPriorMapUniqueCount = 0; - - private InputStream mPriorMapStream = null; - private int mPriorMapPosition = -1; - private int mPriorMapValue = 0; - private int mInputFileIndex = 0; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private char[] mKMerBuffer = null; - private int mKMerBufferedCount = 0; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - private int mBaseIndex = -1; - private byte[] mIOBuffer = null; - - /* Design - Inputs: - - One or more fasta files to search (currently one). - - Output directory for the result files. - - Optionally an input k-1-mer file (output from previous pass). - Outputs: - - Unique kmer file: (sorted by kmer) - This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). - - Per chromosome bit map: pos (implicit) new-bit cum-bit - New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. - Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. - - Statistics - Plan: - - Reducing memory footprint is crucial. - - Sequential pass over the input sequences to generate kmers. - - BatchSize kmers are cached in memory, then sorted and uniqified. - - As batch array fills, batches are spilled to disk. - - Batches are reloaded from disk and merged (N-finger algorithm) - - and streamed to a merge file. - - Merge file is read from disk and processed as final results. - */ - - public static void main(String[] args) - throws Exception { - new CountKMers3().run(args); - } - - private void usage() { - System.out.println("Usage: CountKMers ..."); - System.out.println(" -action "); - System.out.println(" -genome "); - System.out.println(" -k "); - System.out.println(" -batchSize "); - System.out.println(" -inputDir "); - System.out.println(" -outputDir "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-genome") && argsleft > 1) { - argpos++; - if (mInputFiles == null) { - mInputFiles = new ArrayList(); - } - mInputFiles.add(new File(args[argpos++])); - } else if (arg.equals("-k") && argsleft > 1) { - argpos++; - mK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-batchSize") && argsleft > 1) { - argpos++; - mBatchSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-inputDir") && argsleft > 1) { - argpos++; - mInputDirectory = new File(args[argpos++]); - } else if (arg.equals("-outputDir") && argsleft > 1) { - argpos++; - mOutputDirectory = new File(args[argpos++]); - } else if (arg.equals("-oldFormat")) { - argpos++; - mUseOldFormat = true; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - if (!parseArguments(args)) { - System.exit(1); - } - if (mAction == null || mAction.equals("mapKMers")) { - mapKMers(); - } else if (mAction.equals("mapGaps")) { - mapGaps(); - } - } - - // Can be used to scan genome for sequence names/lengths. - private void scanKMers() - throws IOException { - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - openPriorMap(priorMapFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - if (isUniqueInPriorMap(mBaseIndex)) { - continue; - } - } - } - closePriorMap(); - } - - private void mapGaps() - throws IOException { - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int pos = 0; - int gapStart = 0; - while (true) { - char base = getNextBase(); - if (base == 0) { - break; - } - pos++; - if (base == 'N') { - if (gapStart == 0) { - gapStart = pos; - } - } else { - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - - private void mapKMers() - throws IOException { - - File textKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); - File binaryKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); - File exceptionFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); - File mapFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - File statsFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); - - if (mBatchSize == 0) { - throw new RuntimeException("Batch size not specified"); - } - - int kmerCount = 0; - int batchSize = mBatchSize; - KMerPosition[] kmerArray = new KMerPosition[batchSize]; - List exceptionList = new ArrayList(); - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; - - openPriorMap(priorMapFile); - - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Processing " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - int baseIndex = mBaseIndex; - if (isUniqueInPriorMap(baseIndex)) { - mUniquePriorCount++; - continue; - } - - KMerPosition kmp = encodeKMer(kmerChars, baseIndex); - if (kmp == null) { - // Note: We currently do not handle the reverse - // complement of exception characters correctly. - // For hg18, however, this doesn't matter as - // none of the kmers containing non-ACGT characters - // are present on the reverse strand. - String kmer = new String(kmerChars); - exceptionList.add(new StringKMerPosition(kmer, baseIndex)); - continue; - } - kmerArray[kmerCount++] = kmp; - if (kmerCount == batchSize) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (kmerCount > mSpillFactor * batchSize) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - } - } - if (kmerCount > 0) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (mSpillFileList != null) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - - closePriorMap(); - - // Write out the exception kmers (text file). - compactKMers(exceptionList); - writeExceptionFile(exceptionList, exceptionFile); - - // Write out the binary file of unique encoded kmers. - if (mSpillFileList == null) { - kmerCount = removeNonUnique(kmerArray, kmerCount); - writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); - mUniqueNewCount = kmerCount; - } else { - mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); - } - mUniqueNewCount += countUniqueKMers(exceptionList); - - // Write out the text file of (all) unique kmers. - writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); - - // Create map file from prior map plus the new unique kmers. - int mapSize = ((mBaseIndex >> 2) & 0x3FFFFFFF) + 1; - createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); - - // Write summary statistics file. - writeSummaryStatistics(statsFile); - } - - private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " kmers at index " + - Integer.toHexString(mBaseIndex) + " ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - KMerPosition current = kmerArray[0]; - for (int i = 1; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (current.compareTo(kmp) == 0) { - current.setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - current = kmp; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " string kmers ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - String kmerString = kmerArray[0].getKMer(); - for (int i = 1; i < kmerCount; i++) { - StringKMerPosition kmp = kmerArray[i]; - String ks = kmp.getKMer(); - if (ks.equals(kmerString)) { - kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - kmerString = ks; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private void compactKMers(List kmerList) { - int kmerCount = kmerList.size(); - if (kmerCount <= 1) { - return; - } - StringKMerPosition[] kmerArray = - kmerList.toArray(new StringKMerPosition[kmerCount]); - kmerCount = compactKMers(kmerArray, kmerCount); - kmerList.clear(); - for (int i = 0; i < kmerCount; i++) { - kmerList.add(kmerArray[i]); - } - } - - private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { - int uniqueCount = 0; - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - kmerArray[uniqueCount++] = kmp; - } - } - return uniqueCount; - } - - private int countUniqueKMers(List kmerList) { - int uniqueCount = 0; - for (StringKMerPosition kmp : kmerList) { - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - } - } - return uniqueCount; - } - - private void spillKMers(KMerPosition[] kmerArray, int kmerCount) - throws IOException { - if (mSpillFileList == null) { - mSpillFileList = new ArrayList(); - } - int fileNumber = mSpillFileList.size() + 1; - log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); - File spillFile = new File(mOutputDirectory, - "spill_" + mK + "_" + fileNumber + ".tmp"); - mSpillFileList.add(spillFile); - writeKMerBinaryFile(kmerArray, kmerCount, spillFile); - log("Spill file written"); - } - - private void writeKMerBinaryFile(KMerPosition[] kmerArray, - int kmerCount, - File outputFile) - throws IOException { - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - writeKMerPosition(outputStream, kmerArray[i]); - } - outputStream.flush(); - outputStream.close(); - } - - private void writeExceptionFile(List kmerList, - File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - for (StringKMerPosition kmer : kmerList) { - writeUniqueKMer(kmer, writer); - } - writer.flush(); - writer.close(); - } - - private KMerPosition readKMerPosition(InputStream stream) - throws IOException { - if (mUseOldFormat) { - return readKMerPositionOldFormat(stream); - } - byte[] buffer = mIOBuffer; - int encodingLength = (mK + 7)/8; - int fileLength = 4 + 2*encodingLength; - int count = readFully(stream, buffer, 0, fileLength); - if (count <= 0) { - return null; - } else if (count != fileLength) { - throw new RuntimeException("Unexpected end of file"); - } - char[] encoding = new char[encodingLength]; - int baseIndex = ((buffer[0] & 0xFF) | - (buffer[1] & 0xFF) << 8 | - (buffer[2] & 0xFF) << 16 | - (buffer[3] & 0xFF) << 24); - for (int i = 0; i < encodingLength; i++) { - encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | - ((buffer[2*i+5] & 0xFF) << 8)); - } - return new KMerPositionN(encoding, baseIndex); - } - - private KMerPosition readKMerPositionOldFormat(InputStream stream) - throws IOException { - byte[] buffer = mIOBuffer; - int length = (mK >= 32 ? 20 : 12); - int count = readFully(stream, buffer, 0, length); - if (count <= 0) { - return null; - } else if (count != length) { - throw new RuntimeException("Unexpected end of file"); - } - long encoding = (((long)(buffer[0] & 0xFF)) | - ((long)(buffer[1] & 0xFF)) << 8 | - ((long)(buffer[2] & 0xFF)) << 16 | - ((long)(buffer[3] & 0xFF)) << 24 | - ((long)(buffer[4] & 0xFF)) << 32 | - ((long)(buffer[5] & 0xFF)) << 40 | - ((long)(buffer[6] & 0xFF)) << 48 | - ((long)(buffer[7] & 0xFF)) << 56); - int baseIndex = ((buffer[length-4] & 0xFF) | - (buffer[length-3] & 0xFF) << 8 | - (buffer[length-2] & 0xFF) << 16 | - (buffer[length-1] & 0xFF) << 24); - if (length == 12) { - return new KMerPosition1(encoding, baseIndex); - } else { - long encoding2 = (((long)(buffer[8] & 0xFF)) | - ((long)(buffer[9] & 0xFF)) << 8 | - ((long)(buffer[10] & 0xFF)) << 16 | - ((long)(buffer[11] & 0xFF)) << 24 | - ((long)(buffer[12] & 0xFF)) << 32 | - ((long)(buffer[13] & 0xFF)) << 40 | - ((long)(buffer[14] & 0xFF)) << 48 | - ((long)(buffer[15] & 0xFF)) << 56); - return new KMerPosition2(encoding, encoding2, baseIndex); - } - } - - private int readFully(InputStream stream, byte[] buffer, int offset, int count) - throws IOException { - int readCount = 0; - while (readCount < count) { - int read = stream.read(buffer, offset, count-readCount); - if (read <= 0) { - break; - } - offset += read; - readCount += read; - } - return readCount; - } - - private void writeKMerPosition(OutputStream stream, KMerPosition kmer) - throws IOException { - if (mUseOldFormat) { - writeKMerPositionOldFormat(stream, kmer); - return; - } - byte[] buffer = mIOBuffer; - int baseIndex = kmer.getBaseIndex(); - char[] encoding = kmer.getKMerEncoding(); - int offset = 0; - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - for (int i = 0; i < encoding.length; i++) { - buffer[offset++] = (byte) ((encoding[i]) & 0xFF); - buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); - } - stream.write(buffer, 0, offset); - } - - private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer) - throws IOException { - byte[] buffer = mIOBuffer; - long encoding1 = kmer.getKMerEncoding1(); - long encoding2 = kmer.getKMerEncoding2(); - int baseIndex = kmer.getBaseIndex(); - int offset = 0; - buffer[offset++] = (byte) ((encoding1) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 56) & 0xFF); - if (mK >= 32) { - buffer[offset++] = (byte) ((encoding2) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 56) & 0xFF); - } - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - stream.write(buffer, 0, offset); - } - - private long mergeSpillFiles(List spillFiles, File outputFile) - throws IOException { - - if (spillFiles == null) { - return 0; - } - - log("Merging spill files ..."); - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - long uniqueCount = 0; - int fileCount = spillFiles.size(); - InputStream[] inputStreams = new InputStream[fileCount]; - KMerPosition[] kmers = new KMerPosition[fileCount]; - for (int i = 0; i < fileCount; i++) { - inputStreams[i] = - new BufferedInputStream(new FileInputStream(spillFiles.get(i))); - } - while (true) { - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == null && inputStreams[i] != null) { - kmers[i] = readKMerPosition(inputStreams[i]); - if (kmers[i] == null) { - inputStreams[i].close(); - inputStreams[i] = null; - } - } - } - int count = 0; - KMerPosition kmer = null; - for (int i = 0; i < fileCount; i++) { - KMerPosition kmp = kmers[i]; - if (kmp == null) { - continue; - } else if (kmer == null) { - kmer = kmp; - count = 1; - } else { - int cmp = kmp.compareTo(kmer); - if (cmp == 0) { - count++; - } else if (cmp < 0) { - kmer = kmp; - count = 1; - } - } - } - if (kmer == null) { - break; - } - for (int i = 0; i < fileCount; i++) { - if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) { - kmers[i] = null; - } - } - if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - writeKMerPosition(outputStream, kmer); - } - - } - outputStream.flush(); - outputStream.close(); - for (int i = 0; i < fileCount; i++) { - // spillFiles.get(i).delete(); - } - log("Spill files merged, unique count is " + uniqueCount); - return uniqueCount; - } - - private void writeKMerTextFile(File inputFile, - List exceptionList, - File outputFile) - throws IOException { - - log("Writing kmer file " + outputFile + " ..."); - int exceptionIndex = 0; - StringKMerPosition excKMer = null; - Iterator excIter = null; - if (!exceptionList.isEmpty()) { - excIter = exceptionList.iterator(); - excKMer = excIter.next(); - } - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(inputFile)); - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - KMerPosition kmer = readKMerPosition(inputStream); - while (kmer != null || excKMer != null) { - if (excKMer == null) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else if (kmer == null) { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } - } - inputStream.close(); - writer.flush(); - writer.close(); - log("Wrote kmer file: " + outputFile); - } - - private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { - String chr = getBaseIndexSequenceName(baseIndex); - int pos = getBaseIndexCoordinate(baseIndex); - writer.println(kmer + "\t" + chr + "\t" + pos); - } - - private void createMapFile(int mapSize, - File kmerFile, - List exceptionList, - File priorMapFile, - File mapFile) - throws IOException { - byte[] map = null; - long uniquePriorCount = 0; - if (priorMapFile.exists()) { - map = readMapFile(priorMapFile); - if (map.length != mapSize) { - throw new RuntimeException("Prior map is wrong size"); - } - // Clear the new bits from prior map. - // Also count the prior unique positions while we are at it. - // Note that this is a count of positions, not kmers. - for (int i = 0; i < mapSize; i++) { - int cumBits = map[i] & 0x55; - uniquePriorCount += Integer.bitCount(cumBits); - map[i] = (byte) cumBits; - } - } else { - map = new byte[mapSize]; - } - for (StringKMerPosition kmp : exceptionList) { - addToMap(kmp, map); - } - mPriorMapUniqueCount = uniquePriorCount; - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(kmerFile)); - while (true) { - KMerPosition kmp = readKMerPosition(inputStream); - if (kmp == null) { - inputStream.close(); - break; - } - addToMap(kmp, map); - } - - long testCum = 0; - for (int i = 0; i < map.length; i++) { - testCum += Integer.bitCount(map[i] & 0x55); - } - - writeMapFile(map, mapFile); - } - - private void addToMap(KMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(StringKMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(int baseIndex, byte[] map) { - int mod = baseIndex & 0x3; - int offset = (baseIndex >> 2) & 0x3FFFFFFF; - if (((map[offset] >> (2*mod)) & 0x3) != 0) { - throw new RuntimeException("Map entry already set: " + baseIndex); - } - map[offset] |= (0x3 << (2*mod)); - } - - private void writeSummaryStatistics(File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; - long uniqueCount = mUniquePriorCount + mUniqueNewCount; - long nonUniqueCount = mKMerCount - uniqueCount; - writer.println("K: " + mK); - writer.println("Sequences: " + mSequenceList.size()); - writer.println("Bases: " + baseCount); - writer.println("KMers: " + mKMerCount); - writer.println("Prior map count: " + mPriorMapUniqueCount); - writer.println("Unique prior: " + mUniquePriorCount + - " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); - writer.println("Unique new: " + mUniqueNewCount + - " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); - writer.println("Unique cumulative: " + uniqueCount + - " (" + formatPercent(uniqueCount, mKMerCount) + ")"); - writer.println("Nonunique: " + nonUniqueCount + - " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); - writer.flush(); - writer.close(); - } - - private String formatPercent(long numerator, long denominator) { - double fraction = 0.0; - if (denominator != 0) { - fraction = numerator / (double) denominator; - } - return String.format("%1.1f%%", fraction * 100.0); - } - - private void openPriorMap(File mapFile) - throws IOException { - if (mapFile.exists()) { - mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - } - - private void closePriorMap() - throws IOException { - if (mPriorMapStream != null) { - mPriorMapStream.close(); - } - mPriorMapStream = null; - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - - private byte[] readMapFile(File file) - throws IOException { - long fileLength = file.length(); - if (fileLength > 1000000000) { - throw new RuntimeException("Prior map too large: " + file); - } - int length = (int) fileLength; - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - private void writeMapFile(byte[] map, File file) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(map); - stream.flush(); - stream.close(); - } - - private boolean isUniqueInPriorMap(int baseIndex) - throws IOException { - if (mPriorMapStream == null) { - return false; - } - int byteOffset = (baseIndex >> 2) & 0x3FFFFFFF; - if (byteOffset != mPriorMapPosition) { - int delta = byteOffset - mPriorMapPosition; - if (delta < 0) { - throw new RuntimeException("Attempt to seek backwards in prior map"); - } - if (delta > 1) { - skipFully(mPriorMapStream, delta-1); - } - mPriorMapValue = mPriorMapStream.read(); - if (mPriorMapValue < 0) { - throw new RuntimeException("Unexpected end of file in prior map"); - } - mPriorMapPosition += delta; - } - int mod = baseIndex & 0x3; - return (((mPriorMapValue >> (2*mod)) & 1) != 0); - } - - private void skipFully(InputStream stream, long amount) - throws IOException { - while (amount > 0) { - long skip = stream.skip(amount); - if (skip <= 0 || skip > amount) { - throw new RuntimeException("Skip failed"); - } - amount -= skip; - } - } - - private String getBaseIndexSequenceName(int baseIndex) { - int sequenceCount = mSequenceList.size(); - for (int i = 0; i < sequenceCount-1; i++) { - int nextOffset = mSequenceOffsetList.get(i+1); - if (compareBaseIndex(nextOffset, baseIndex) > 0) { - return mSequenceList.get(i); - } - } - return mSequenceList.get(sequenceCount-1); - } - - private int getBaseIndexCoordinate(int baseIndex) { - Integer sequenceOffset = null; - for (Integer offset : mSequenceOffsetList) { - if (compareBaseIndex(offset, baseIndex) > 0) { - break; - } - sequenceOffset = offset; - } - if (sequenceOffset == null) { - return 0; - } - int coordinate = baseIndex - sequenceOffset + 1; - if (coordinate <= 0) { - dumpSequenceList(); - System.out.println("coordinate: " + coordinate); - System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); - System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); - throw new RuntimeException("Internal error: illegal coordinate " + - coordinate + " for base index " + baseIndex); - } - return coordinate; - } - - private void dumpSequenceList() { - System.out.println("# Sequences:"); - int count = mSequenceList.size(); - for (int i = 0; i < count; i++) { - String seqName = mSequenceList.get(i); - int offset = mSequenceOffsetList.get(i); - System.out.println("# " + seqName + - "\t" + offset + - "\t" + Integer.toHexString(offset)); - } - } - - private int compareBaseIndex(int baseIndex1, int baseIndex2) { - // Implements unsigned comparison, a la compareTo - if (baseIndex1 < 0 ^ baseIndex2 < 0) { - return ((baseIndex1 < 0) ? 1 : -1); - } else { - return (baseIndex1 - baseIndex2); - } - } - - private String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private char[] getNextKMer() - throws IOException { - - if (mKMerBuffer == null) { - mKMerBuffer = new char[mK]; - } - System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); - if (mKMerBufferedCount > 0) { - mKMerBufferedCount--; - } - - while (mKMerBufferedCount < mK) { - char base = getNextBase(); - if (base == 0) { - incrementBaseIndex(mKMerBufferedCount); - mKMerBufferedCount = 0; - return null; - } else if (base == 'N') { - incrementBaseIndex(mKMerBufferedCount+1); - mKMerBufferedCount = 0; - } else { - mKMerBuffer[mKMerBufferedCount++] = base; - } - } - incrementBaseIndex(1); - return mKMerBuffer; - } - - private char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - return mLineBuffer.charAt(mLineBufferIndex++); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } - - private void log(String text) { - if (mVerbose) { - System.out.println("# " + new Date() + " " + text); - } - } - - private static void dbg(String text) { - System.out.println("#DBG: " + text); - } - - private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { - if (mUseOldFormat) { - return encodeKMerOldFormat(kmerChars, baseIndex); - } - char[] encoding = encodeKMerChars(kmerChars); - if (encoding == null) { - return null; - } - char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars)); - if (compareEncodings(encoding, reverseEncoding) <= 0) { - return new KMerPositionN(encoding, baseIndex); - } else { - KMerPositionN kmp = new KMerPositionN(reverseEncoding, baseIndex); - kmp.setIsReversed(true); - return kmp; - } - } - - private static char[] encodeKMerChars(char[] kmerChars) { - if (kmerChars == null) { - return null; - } - - int kmerLength = kmerChars.length; - int encodingLength = (kmerLength + 7) / 8; - char[] encoding = new char[encodingLength]; - int offset = kmerLength % 8; - offset = (offset == 0) ? 8 : offset; - int bits = encodeKMerBits(kmerChars, 0, offset); - if (bits < 0) { - return null; - } - encoding[0] = (char) bits; - for (int i = 1; i < encodingLength; i++) { - bits = encodeKMerBits(kmerChars, offset, 8); - if (bits < 0) { - return null; - } - encoding[i] = (char) bits; - offset += 8; - } - return encoding; - } - - private static int compareEncodings(char[] encoding1, char[] encoding2) { - int length = Math.max(encoding1.length, encoding2.length); - for (int i = 0; i < length; i++) { - int result = encoding1[i] - encoding2[i]; - if (result != 0) { - return result; - } - } - return 0; - } - - private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) { - if (kmerChars == null) { - return null; - } - int length = kmerChars.length; - if (length <= 31) { - long bits = encodeKMerBitsLong(kmerChars, 0, length); - if (bits == -1) { - return null; - } - return new KMerPosition1(bits, baseIndex); - } else if (length <= 62) { - long bits1 = encodeKMerBitsLong(kmerChars, 0, 31); - long bits2 = encodeKMerBitsLong(kmerChars, 31, length - 31); - if (bits1 == -1 || bits2 == -1) { - return null; - } - return new KMerPosition2(bits1, bits2, baseIndex); - } else { - return null; - } - } - - private static int encodeKMerBits(char[] kmerChars, int offset, int length) { - int bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= baseBits << (2*(length-i-1)); - } - return bits; - } - - private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) { - long bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= ((long)baseBits) << (2*(length-i-1)); - } - return bits; - } - - private static String decodeKMer1(long bits) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits, buffer, 0, length); - return new String(buffer); - } - - private static String decodeKMer2(long bits1, long bits2) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits1, buffer, 0, 31); - decodeKMerBits(bits2, buffer, 31, length-31); - return new String(buffer); - } - - private static String decodeKMerN(char[] encoding, boolean reverse) { - int length = mK; - char[] buffer = new char[length]; - int offset = length % 8; - offset = (offset == 0) ? 8 : offset; - decodeKMerBits(encoding[0], buffer, 0, offset); - for (int i = 1; i < encoding.length; i++) { - decodeKMerBits(encoding[i], buffer, offset, 8); - offset += 8; - } - if (reverse) { - reverseComplementInPlace(buffer); - } - return new String(buffer); - } - - private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static char[] reverseComplement(char[] buffer) { - int length = buffer.length; - char[] result = new char[length]; - System.arraycopy(buffer, 0, result, 0, length); - reverseComplementInPlace(result); - return result; - } - - private static void reverseComplementInPlace(char[] buffer) { - int length = buffer.length; - int limit = (length + 1)/2; - for (int i = 0; i < limit; i++) { - char ch1 = reverseComplement(buffer[i]); - char ch2 = reverseComplement(buffer[length-i-1]); - buffer[i] = ch2; - buffer[length-i-1] = ch1; - } - } - - private static char reverseComplement(char base) { - switch (base) { - case 'A': - return 'T'; - case 'C': - return 'G'; - case 'G': - return 'C'; - case 'T': - return 'A'; - } - return base; - } - - private static String formatEncoding(char[] encoding) { - if (encoding == null) { - return null; - } - StringBuilder builder = new StringBuilder(); - builder.append('['); - for (int i = 0; i < encoding.length; i++) { - String hex = Integer.toHexString(encoding[i]); - int length = hex.length(); - while (length < 4) { - builder.append('0'); - length++; - } - builder.append(hex); - } - builder.append(']'); - return builder.toString(); - } - - static class KMerPosition - implements Comparable { - - private int mBaseIndex; - - KMerPosition(int baseIndex) { - mBaseIndex = baseIndex; - } - - public String getKMer() { - return null; - } - - public long getKMerEncoding1() { - return -1; - } - - public long getKMerEncoding2() { - return -1; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public char[] getKMerEncoding() { - return null; - } - - public int compareTo(KMerPosition kmp) { - return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding()); - } - - public boolean equals(Object object) { - if (!(object instanceof KMerPosition)) { - return false; - } - KMerPosition kmp = (KMerPosition) object; - return (getBaseIndex() == kmp.getBaseIndex() && - this.compareTo(kmp) == 0); - } - - public String format() { - return(getKMer() + - " " + formatEncoding(getKMerEncoding()) + - " " + Integer.toHexString(mBaseIndex)); - } - } - - static class KMerPosition1 - extends KMerPosition { - - private long mKMerEncoding1; - - KMerPosition1(long kmer, int baseIndex) { - super(baseIndex); - mKMerEncoding1 = kmer; - } - - public String getKMer() { - return decodeKMer1(getKMerEncoding1()); - } - - public final long getKMerEncoding1() { - return mKMerEncoding1; - } - - public int compareTo(KMerPosition kmp) { - int result = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1()); - if (result == 0) { - result = Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2()); - } - return result; - } - } - - static class KMerPosition2 - extends KMerPosition1 { - - private long mKMerEncoding2; - - KMerPosition2(long encoding1, long encoding2, int baseIndex) { - super(encoding1, baseIndex); - mKMerEncoding2 = encoding2; - } - - public String getKMer() { - return decodeKMer2(getKMerEncoding1(), getKMerEncoding2()); - } - - public final long getKMerEncoding2() { - return mKMerEncoding2; - } - } - - static class KMerPositionN - extends KMerPosition { - - private boolean mReversed; - private char[] mKMerEncoding; - - KMerPositionN(char[] encoding, int baseIndex) { - super(baseIndex); - mReversed = false; - mKMerEncoding = encoding; - } - - public boolean getIsReversed() { - return mReversed; - } - - public void setIsReversed(boolean value) { - mReversed = value; - } - - public String getKMer() { - return decodeKMerN(mKMerEncoding, mReversed); - } - - public final char[] getKMerEncoding() { - return mKMerEncoding; - } - - public String format() { - return(getKMer() + - " " + formatEncoding(getKMerEncoding()) + - " " + (mReversed ? 'R' : 'F') + - " " + Integer.toHexString(getBaseIndex())); - } - } - - static class StringKMerPosition - implements Comparable { - - private String mKMerString = null; - private int mBaseIndex; - - StringKMerPosition(String kmer, int baseIndex) { - mKMerString = kmer; - mBaseIndex = baseIndex; - } - - public final String getKMer() { - return mKMerString; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public int compareTo(StringKMerPosition kmp) { - return mKMerString.compareTo(kmp.mKMerString); - } - - public boolean equals(Object object) { - if (!(object instanceof StringKMerPosition)) { - return false; - } - StringKMerPosition kmp = (StringKMerPosition) object; - return (mBaseIndex == kmp.mBaseIndex && - mKMerString.equals(kmp.mKMerString)); - } - } -} diff --git a/java/lib/edu/mit/broad/cnv/GatherAlignments.java b/java/lib/edu/mit/broad/cnv/GatherAlignments.java deleted file mode 100644 index b0dc2d5af..000000000 --- a/java/lib/edu/mit/broad/cnv/GatherAlignments.java +++ /dev/null @@ -1,399 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import edu.mit.broad.arachne.Alignment; -import edu.mit.broad.arachne.LookAlignReader; - -import java.io.*; -import java.util.*; - -/** - * Utility program to gather CNV alignments from LookAlign files in an I/O efficient manner. - */ -public class GatherAlignments { - - public static void main(String[] args) - throws Exception { - new GatherAlignments().run(args); - } - - private void usage() { - System.out.println("Usage: GatherAlignments ..."); - System.out.println(" -cnpList "); - System.out.println(" -sampleId "); - System.out.println(" -inputFileList "); - System.out.println(" -outputDirectory "); - System.out.println(" -padding "); - System.out.println(" -bestAlignments"); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-cnpList") && argsleft > 1) { - argpos++; - mCnpListPath = args[argpos++]; - } else if (arg.equals("-sampleId") && argsleft > 1) { - argpos++; - mSampleId = args[argpos++]; - } else if (arg.equals("-inputFileList") && argsleft > 1) { - argpos++; - mInputFileListPath = args[argpos++]; - } else if (arg.equals("-outputDirectory") && argsleft > 1) { - argpos++; - mOutputDirectory = args[argpos++]; - } else if (arg.equals("-padding") && argsleft > 1) { - argpos++; - mCnpRegionPadding = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-bestAlignments")) { - argpos++; - mReturnBestHits = true; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - List mInputFileList = parseInputFiles(mInputFileListPath); - Map> mCnpMap = parseCnpFile(mCnpListPath); - for (File inputFile : mInputFileList) { - scanInputFile(inputFile, mCnpMap); - } - } - - private List parseInputFiles(String path) - throws IOException { - List fileList = new ArrayList(); - LineNumberReader reader = new LineNumberReader(new FileReader(path)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - line = line.trim(); - if (line.length() == 0 || line.startsWith("#")) { - continue; - } - String[] fields = line.split("\\s+"); - fileList.add(new File(fields[0])); - } - return fileList; - } - - private Map> parseCnpFile(String path) - throws IOException { - Map> cnpMap = new HashMap>(); - LineNumberReader reader = new LineNumberReader(new FileReader(path)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - line = line.trim(); - if (line.length() == 0 || line.startsWith("#")) { - continue; - } - String[] fields = line.split("\\s+"); - if (fields.length != 4) { - throw new RuntimeException("Invalid CNP line: " + line); - } - if (fields[0].equalsIgnoreCase("CNPID")) { - continue; - } - String cnpId = fields[0]; - String chromosome = fields[1]; - int start = Integer.parseInt(fields[2].replaceAll(",", "")); - int end = Integer.parseInt(fields[3].replaceAll(",", "")); - int sequenceId = chromosomeToSequenceId(chromosome); - if (sequenceId < 0) { - throw new RuntimeException("Unrecognized chromosome: " + chromosome); - } - if (mCnpRegionPadding > 0) { - start = Math.max(1, start - mCnpRegionPadding); - end = end + mCnpRegionPadding; - } - CnpRegion cnp = new CnpRegion(cnpId, sequenceId, start, end); - List cnpList = cnpMap.get(sequenceId); - if (cnpList == null) { - cnpList = new ArrayList(); - cnpMap.put(sequenceId, cnpList); - } - cnpList.add(cnp); - } - return cnpMap; - } - - private int chromosomeToSequenceId(String text) { - if (text == null || text.length() == 0) { - return -1; - } - if (text.matches("\\d+")) { - return Integer.parseInt(text); - } - if (text.startsWith("chr") && text.length() > 3) { - text = text.substring(3); - } - if (text.matches("\\d+") && !text.startsWith("0")) { - return Integer.parseInt(text); - } - if (text.equals("M")) { - return 0; - } else if (text.equals("X")) { - return 23; - } else if (text.equals("Y")) { - return 24; - } else { - return -1; - } - } - - private void scanInputFile(File inputFile, - Map> cnpMap) - throws IOException { - LookAlignReader reader = new LookAlignReader(inputFile); - while (true) { - Alignment alignment = getNextAlignment(reader); - if (alignment == null) { - reader.close(); - break; - } - List cnpList = cnpMap.get(alignment.getBSequenceId()); - if (cnpList == null) { - continue; - } - for (CnpRegion cnp : cnpList) { - if (overlaps(cnp, alignment)) { - saveCnpAlignment(cnp, alignment, inputFile); - } - } - } - flushCnpAlignments(inputFile); - } - - private Alignment getNextAlignment(LookAlignReader reader) - throws IOException { - if (!mReturnBestHits) { - if (reader.hasNext()) { - return reader.next(); - } else { - return null; - } - } - while (true) { - Alignment seed = mPendingAlignment; - mPendingAlignment = null; - if (seed == null && reader.hasNext()) { - seed = reader.next(); - } - if (seed == null) { - return null; - } - List secondaryHits = null; - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (alignment.getASequenceId() != seed.getASequenceId()) { - if (alignment.getASequenceId() < seed.getASequenceId()) { - throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); - } - mPendingAlignment = alignment; - break; - } - if (secondaryHits == null) { - secondaryHits = new ArrayList(); - } - secondaryHits.add(alignment); - } - if (secondaryHits == null) { - return seed; - } - secondaryHits.add(seed); - Alignment result = getUniqueBestAlignment(secondaryHits); - if (result != null) { - return result; - } - } - } - - private Alignment getUniqueBestAlignment(List alignments) { - int bestMismatches = 0; - List best = new ArrayList(); - for (Alignment a : alignments) { - int mismatches = getAlignmentMismatches(a); - if (best.isEmpty()) { - best.add(a); - bestMismatches = mismatches; - } - if (mismatches == bestMismatches) { - best.add(a); - } else if (mismatches < bestMismatches) { - best.clear(); - best.add(a); - bestMismatches = mismatches; - } - } - if (best.size() != 1) { - return null; - } - return best.get(0); - } - - private int getAlignmentMismatches(Alignment alignment) { - int mismatches = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - int gap = blocks[i]; - int duration = blocks[i+1]; - int mm = blocks[i+2]; - if (mm > duration) { - throw new RuntimeException("Invalid alignment? : " + alignment.format()); - } - mismatches += Math.abs(gap); - mismatches += mm; - } - return mismatches; - } - - private boolean overlaps(CnpRegion cnp, Alignment alignment) { - return (cnp.getSequenceId() == alignment.getBSequenceId() && - cnp.getStart() <= alignment.getBEnd() && - cnp.getEnd() >= alignment.getBStart()); - } - - private void saveCnpAlignment(CnpRegion cnp, Alignment alignment, File inputFile) - throws IOException { - if (mCnpAlignmentCount > mCnpAlignmentLimit) { - flushCnpAlignments(inputFile); - } - String cnpId = cnp.getCnpId(); - List alignmentList = mCnpAlignmentMap.get(cnpId); - if (alignmentList == null) { - alignmentList = new ArrayList(); - mCnpAlignmentMap.put(cnpId, alignmentList); - } - alignmentList.add(alignment); - mCnpAlignmentCount++; - } - - private void flushCnpAlignments(File inputFile) - throws IOException { - while (!mCnpAlignmentMap.isEmpty()) { - String cnpId = mCnpAlignmentMap.keySet().iterator().next(); - List alignmentList = mCnpAlignmentMap.get(cnpId); - writeAlignments(cnpId, mSampleId, alignmentList, inputFile); - mCnpAlignmentMap.remove(cnpId); - mCnpAlignmentCount -= alignmentList.size(); - } - if (mCnpAlignmentCount != 0) { - throw new RuntimeException("Unsynchronized alignment count"); - } - } - - private void writeAlignments(String cnpId, String sampleId, List alignmentList, File inputFile) - throws IOException { - File outputDir = new File("."); - if (mOutputDirectory != null) { - outputDir = new File(mOutputDirectory); - } - String cnpSample = cnpId; - if (sampleId != null) { - cnpSample = cnpSample + "_" + sampleId; - } - File cnpSampleDir = new File(outputDir, cnpSample); - if (!cnpSampleDir.exists()) { - if (!cnpSampleDir.mkdir()) { - throw new RuntimeException("Failed to create directory " + cnpSampleDir); - } - } - String fileName = inputFile.getName(); - File alignmentFile = new File(cnpSampleDir, fileName); - PrintWriter writer = new PrintWriter(new FileWriter(alignmentFile, true)); - for (Alignment alignment : alignmentList) { - writer.println(alignment.arachneFormat()); - } - writer.flush(); - writer.close(); - } - - private GatherAlignments() { - } - - private static class CnpRegion { - - private CnpRegion(String cnpId, int sequenceId, int start, int end) { - mCnpId = cnpId; - mSequenceId = sequenceId; - mStart = start; - mEnd = end; - } - - public String getCnpId() { return mCnpId; }; - public int getSequenceId() { return mSequenceId; }; - public int getStart() { return mStart; }; - public int getEnd() { return mEnd; }; - - private String mCnpId; - private int mSequenceId; - private int mStart; - private int mEnd; - } - - private boolean mDebug = false; - private boolean mVerbose = false; - - private boolean mReturnBestHits = false; - private String mCnpListPath = null; - private String mSampleId = null; - private String mInputFileListPath = null; - private String mOutputDirectory = null; - private int mCnpRegionPadding = 0; - - private Alignment mPendingAlignment = null; - private int mCnpAlignmentCount = 0; - private int mCnpAlignmentLimit = 1000000; - private Map> mCnpAlignmentMap = new LinkedHashMap>(); -} - - - diff --git a/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java b/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java deleted file mode 100644 index 23b9d6af4..000000000 --- a/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java +++ /dev/null @@ -1,1494 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.kmer; - - -import edu.mit.broad.cnv.util.SequenceIterator; - -import java.io.*; -import java.util.*; - - -/** - * Tool for counting unique kmers. - */ -public class CountKMers -{ - private static final int NONUNIQUE_MARKER = -1; - - private String mAction = null; - private static int mK = 0; - private int mMinimumK = 0; - private int mMaximumK = 0; - private int mBatchSize = 0; - private List mInputFiles = null; - private File mSearchFile = null; - private String mSequenceName = null; - private File mInputDirectory = null; - private File mOutputDirectory = null; - private boolean mRunDistributed = false; - private int mDistributedWorkerCount = 0; - private boolean mVerbose = false; - private boolean mDebug = false; - - private List mSequenceList = null; - private List mSequenceOffsetList = null; - private List mSpillFileList = null; - private double mSpillFactor = 0.9; - - private long mKMerCount = 0; - private long mUniquePriorCount = 0; - private long mUniqueNewCount = 0; - private long mPriorMapUniqueCount = 0; - - private InputStream mPriorMapStream = null; - private int mPriorMapPosition = -1; - private int mPriorMapValue = 0; - private int mInputFileIndex = 0; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private char[] mKMerBuffer = null; - private int mKMerBufferedCount = 0; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - private int mBaseIndex = -1; - private byte[] mIOBuffer = null; - - /* Design - Inputs: - - One or more fasta files to search (currently one). - - Output directory for the result files. - - Optionally an input k-1-mer file (output from previous pass). - Outputs: - - Unique kmer file: (sorted by kmer) - This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). - - Per chromosome bit map: pos (implicit) new-bit cum-bit - New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. - Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. - - Statistics - Plan: - - Reducing memory footprint is crucial. - - Sequential pass over the input sequences to generate kmers. - - BatchSize kmers are cached in memory, then sorted and uniqified. - - As batch array fills, batches are spilled to disk. - - Batches are reloaded from disk and merged (N-finger algorithm) - - and streamed to a merge file. - - Merge file is read from disk and processed as final results. - */ - - public static void main(String[] args) - throws Exception { - new CountKMers().run(args); - } - - private void usage() { - System.out.println("Usage: CountKMers ..."); - System.out.println(" -action "); - System.out.println(" -genome "); - System.out.println(" -chromosome "); - System.out.println(" -k "); - System.out.println(" -minK "); - System.out.println(" -maxK "); - System.out.println(" -batchSize "); - System.out.println(" -inputDir "); - System.out.println(" -outputDir "); - System.out.println(" -distributed"); - System.out.println(" -workers "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-genome") && argsleft > 1) { - argpos++; - if (mInputFiles == null) { - mInputFiles = new ArrayList(); - } - mInputFiles.add(new File(args[argpos++])); - } else if (arg.equals("-chromosome") && argsleft > 1) { - argpos++; - mSequenceName = args[argpos++]; - } else if (arg.equals("-k") && argsleft > 1) { - argpos++; - mK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-minK") && argsleft > 1) { - argpos++; - mMinimumK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-maxK") && argsleft > 1) { - argpos++; - mMaximumK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-batchSize") && argsleft > 1) { - argpos++; - mBatchSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-inputDir") && argsleft > 1) { - argpos++; - mInputDirectory = new File(args[argpos++]); - } else if (arg.equals("-outputDir") && argsleft > 1) { - argpos++; - mOutputDirectory = new File(args[argpos++]); - } else if (arg.equals("-searchFile") && argsleft > 1) { - argpos++; - mSearchFile = new File(args[argpos++]); - } else if (arg.equals("-distributed")) { - argpos++; - mRunDistributed = true; - } else if (arg.equals("-workers") && argsleft > 1) { - argpos++; - mDistributedWorkerCount = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - if (!parseArguments(args)) { - System.exit(1); - } - if (mAction == null || mAction.equals("mapKMers")) { - if (mRunDistributed) { - mapKMersDistributed(); - } else { - mapKMers(); - } - } else if (mAction.equals("mapGaps")) { - mapGaps(); - } else if (mAction.equals("rollUp")) { - rollUp(); - } else if (mAction.equals("search")) { - search(); - } - } - - private void search() - throws IOException { - char[][] searchStrings = loadSearchFile(mSearchFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int position = 0; - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - position++; - for (int i = 0; i < searchStrings.length; i++) { - if (Arrays.equals(searchStrings[i], kmerChars)) { - String kmer = new String(searchStrings[i]); - String strand = ((i % 2) == 0) ? "F" : "R"; - System.out.println(kmer + "\t" + seqName + "\t" + position + "\t" + strand); - } - } - } - } - } - - private char[][] loadSearchFile(File file) - throws IOException { - List list = new ArrayList(); - LineNumberReader reader = new LineNumberReader(new FileReader(file)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - String text = line.trim(); - if (text.length() == 0 || text.startsWith("#")) { - continue; - } - String[] fields = text.split("\\s+"); - char[] kmer = fields[0].toUpperCase().toCharArray(); - list.add(kmer); - list.add(reverseComplement(kmer)); - } - return list.toArray(new char[0][0]); - } - - // Can be used to scan genome for sequence names/lengths. - private void scanKMers() - throws IOException { - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - openPriorMap(priorMapFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - if (isUniqueInPriorMap(mBaseIndex)) { - continue; - } - } - } - closePriorMap(); - } - - private void mapGaps() - throws IOException { - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int pos = 0; - int gapStart = 0; - while (true) { - char base = getNextBase(); - if (base == 0) { - break; - } - pos++; - if (base == 'N') { - if (gapStart == 0) { - gapStart = pos; - } - } else { - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - - private void rollUp() - throws IOException { - // Roll up based on the middle of the reads. - File[] mapFiles = getAllMapFiles(); - if (mapFiles.length > 127) { - throw new RuntimeException("K to large for byte sized counts"); - } - SequenceIterator seqIterator = new SequenceIterator(mInputFiles); - while (true) { - String seqName = seqIterator.getNextSequence(); - if (seqName == null) { - break; - } - if (mSequenceName != null && !mSequenceName.equals(seqName)) { - continue; - } - log("Rolling up sequence " + seqName + " ..."); - int seqBaseIndex = seqIterator.getBaseIndex() + 1; - char[] seqChars = loadSequence(seqIterator); - int seqLength = seqChars.length; - int seqMapOffset = (seqBaseIndex >> 3) & 0x1FFFFFFF; - int seqMapModulus = (seqBaseIndex & 0x7); - int seqMapLength = (seqMapModulus + seqLength + 7)/8; - // log(" seqLength = " + seqLength); - // log(" baseIndex = " + Integer.toHexString(seqBaseIndex) - // + " (" + (((long)seqBaseIndex) & 0xFFFFFFFFL) + ")"); - // log(" seqMapOffset = " + seqMapOffset); - // log(" seqMapLength = " + seqMapLength); - byte[] counts = new byte[seqLength]; - for (int pos = 1; pos <= seqLength; pos++) { - if (seqChars[pos-1] == 'N') { - counts[pos-1] = -1; - } - } - for (int k = 1; k <= mapFiles.length; k++) { - if (mapFiles[k-1] == null) { - continue; - } - log("Processing map file " + mapFiles[k-1] + " ..."); - byte[] kmerMap = readMapFileRegion(mapFiles[k-1], seqMapOffset, seqMapLength); - for (int pos = 1; pos <= seqLength; pos++) { - if (counts[pos-1] != 0) { - continue; - } else if (isNearContigBoundary(pos, seqChars, k)) { - counts[pos-1] = -1; - } else { - int baseOffset = pos - (k+1)/2; - int mapIndex = seqMapModulus + baseOffset; - if (isUniqueInMap(kmerMap, mapIndex)) { - counts[pos-1] = (byte) k; - } - } - } - } - File outputFile = - new File(mOutputDirectory, "rollup_" + seqName + ".bin"); - writeRollUpFile(outputFile, counts); - } - } - - private boolean isNearContigBoundary(int pos, char[] seqChars, int k) { - int windowStart = pos - (k-1)/2; - int windowEnd = pos + k/2; - if (windowStart < 1 || windowEnd > seqChars.length) { - return true; - } - for (int i = windowStart-1; i < windowEnd; i++) { - if (seqChars[i] == 'N') { - return true; - } - } - return false; - } - - private void writeRollUpFile(File file, byte[] counts) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(counts); - stream.flush(); - stream.close(); - if (mDebug) { - PrintWriter writer = new PrintWriter(file + ".dbg"); - for (int i = 0; i < counts.length; i++) { - writer.println(counts[i]); - } - writer.flush(); - writer.close(); - } - } - - /** - * Returns an array of files, indexed by K, - * where the array index = K-1 (i.e. K=1 is the first file). - * If there is no file for index K, then the array element is null. - */ - private File[] getAllMapFiles() { - int maxK = mMaximumK; - if (maxK == 0) { - // Safe upper bound - maxK = 1000; - } - List fileList = new ArrayList(); - for (int k = 1; k <= maxK; k++) { - if (mMinimumK > 0 && k < mMinimumK) { - continue; - } - File mapFile = - new File(mInputDirectory, "unique_" + k + "_mers_map.bin"); - if (mapFile.exists()) { - while (fileList.size() < k-1) { - fileList.add(null); - } - fileList.add(mapFile); - } else { - if (mMaximumK == 0 && !fileList.isEmpty()) { - break; - } - } - } - File[] result = new File[fileList.size()]; - result = fileList.toArray(result); - if (mDebug) { - for (int i = 0; i < result.length; i++) { - debug("mapFiles[k=" + (i+1) + "] = " + result[i]); - } - } - return result; - } - - private char[] loadSequence(SequenceIterator seqIterator) - throws IOException { - StringBuilder builder = new StringBuilder(); - while (true) { - char ch = seqIterator.getNextBase(); - if (ch == 0) { - break; - } - builder.append(ch); - } - char[] result = new char[builder.length()]; - builder.getChars(0, builder.length(), result, 0); - return result; - } - - private void mapKMersDistributed() - throws Exception { - DistributedKMerCounter algorithm = new DistributedKMerCounter(); - algorithm.setDebug(mDebug); - algorithm.setVerbose(mVerbose); - algorithm.setInputFiles(mInputFiles); - algorithm.setK(mK); - algorithm.setMaximumWorkerCount(mDistributedWorkerCount); - // algorithm.setLsfQueue(mLsfQueue); - // algorithm.setLsfLogDirectory(mLsfLogDirectory); - // algorithm.setEnableGcLogging(mEnableGcLogging); - algorithm.run(); - } - - private void mapKMers() - throws IOException { - - File textKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); - File binaryKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); - File exceptionFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); - File mapFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - File statsFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); - - if (mBatchSize == 0) { - throw new RuntimeException("Batch size not specified"); - } - - int kmerCount = 0; - int batchSize = mBatchSize; - KMerPosition[] kmerArray = new KMerPosition[batchSize]; - List exceptionList = new ArrayList(); - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; - - openPriorMap(priorMapFile); - - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Processing " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - int baseIndex = mBaseIndex; - if (isUniqueInPriorMap(baseIndex)) { - mUniquePriorCount++; - continue; - } - KMerPosition kmp = encodeKMer(kmerChars, baseIndex); - if (kmp == null) { - // Note: We currently do not handle the reverse - // complement of exception characters correctly. - // For hg18, however, this doesn't matter as - // none of the kmers containing non-ACGT characters - // are present on the reverse strand. - String kmer = new String(kmerChars); - exceptionList.add(new StringKMerPosition(kmer, baseIndex)); - continue; - } - kmerArray[kmerCount++] = kmp; - if (kmerCount == batchSize) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (kmerCount > mSpillFactor * batchSize) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - } - } - if (kmerCount > 0) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (mSpillFileList != null) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - - closePriorMap(); - - // Write out the exception kmers (text file). - compactKMers(exceptionList); - writeExceptionFile(exceptionList, exceptionFile); - - // Write out the binary file of unique encoded kmers. - if (mSpillFileList == null) { - kmerCount = removeNonUnique(kmerArray, kmerCount); - writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); - mUniqueNewCount = kmerCount; - } else { - mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); - } - mUniqueNewCount += countUniqueKMers(exceptionList); - - // Write out the text file of (all) unique kmers. - writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); - - // Create map file from prior map plus the new unique kmers. - long mapSize = (mBaseIndex + 1) & 0xFFFFFFFFL; - createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); - - // Write summary statistics file. - writeSummaryStatistics(statsFile); - } - - private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " kmers at index " + - Integer.toHexString(mBaseIndex) + " ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - KMerPosition current = kmerArray[0]; - for (int i = 1; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (current.compareTo(kmp) == 0) { - current.setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - current = kmp; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " string kmers ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - String kmerString = kmerArray[0].getKMer(); - for (int i = 1; i < kmerCount; i++) { - StringKMerPosition kmp = kmerArray[i]; - String ks = kmp.getKMer(); - if (ks.equals(kmerString)) { - kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - kmerString = ks; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private void compactKMers(List kmerList) { - int kmerCount = kmerList.size(); - if (kmerCount <= 1) { - return; - } - StringKMerPosition[] kmerArray = - kmerList.toArray(new StringKMerPosition[kmerCount]); - kmerCount = compactKMers(kmerArray, kmerCount); - kmerList.clear(); - for (int i = 0; i < kmerCount; i++) { - kmerList.add(kmerArray[i]); - } - } - - private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { - int uniqueCount = 0; - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - kmerArray[uniqueCount++] = kmp; - } - } - return uniqueCount; - } - - private int countUniqueKMers(List kmerList) { - int uniqueCount = 0; - for (StringKMerPosition kmp : kmerList) { - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - } - } - return uniqueCount; - } - - private void spillKMers(KMerPosition[] kmerArray, int kmerCount) - throws IOException { - if (mSpillFileList == null) { - mSpillFileList = new ArrayList(); - } - int fileNumber = mSpillFileList.size() + 1; - log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); - File spillFile = new File(mOutputDirectory, - "spill_" + mK + "_" + fileNumber + ".tmp"); - mSpillFileList.add(spillFile); - writeKMerBinaryFile(kmerArray, kmerCount, spillFile); - log("Spill file written"); - } - - private void writeKMerBinaryFile(KMerPosition[] kmerArray, - int kmerCount, - File outputFile) - throws IOException { - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - writeKMerPosition(outputStream, kmerArray[i]); - } - outputStream.flush(); - outputStream.close(); - } - - private void writeExceptionFile(List kmerList, - File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - for (StringKMerPosition kmer : kmerList) { - writeUniqueKMer(kmer, writer); - } - writer.flush(); - writer.close(); - } - - private KMerPosition readKMerPosition(InputStream stream) - throws IOException { - byte[] buffer = mIOBuffer; - int encodingLength = (mK + 7)/8; - int fileLength = 4 + 2*encodingLength; - int count = readFully(stream, buffer, 0, fileLength); - if (count <= 0) { - return null; - } else if (count != fileLength) { - throw new RuntimeException("Unexpected end of file"); - } - char[] encoding = new char[encodingLength]; - int baseIndex = ((buffer[0] & 0xFF) | - (buffer[1] & 0xFF) << 8 | - (buffer[2] & 0xFF) << 16 | - (buffer[3] & 0xFF) << 24); - for (int i = 0; i < encodingLength; i++) { - encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | - ((buffer[2*i+5] & 0xFF) << 8)); - } - return new KMerPosition(encoding, baseIndex); - } - - private int readFully(InputStream stream, byte[] buffer, int offset, int count) - throws IOException { - int readCount = 0; - while (readCount < count) { - int read = stream.read(buffer, offset, count-readCount); - if (read <= 0) { - break; - } - offset += read; - readCount += read; - } - return readCount; - } - - private void skipBytes(InputStream stream, int count) - throws IOException { - - long longCount = count; - long skipCount = 0; - while (skipCount < longCount) { - long skipped = stream.skip(longCount - skipCount); - if (skipped <= 0) { - throw new RuntimeException("Skip failed"); - } - skipCount += skipped; - } - } - - private void writeKMerPosition(OutputStream stream, KMerPosition kmer) - throws IOException { - byte[] buffer = mIOBuffer; - int baseIndex = kmer.getBaseIndex(); - char[] encoding = kmer.getKMerEncoding(); - int offset = 0; - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - for (int i = 0; i < encoding.length; i++) { - buffer[offset++] = (byte) ((encoding[i]) & 0xFF); - buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); - } - stream.write(buffer, 0, offset); - } - - private long mergeSpillFiles(List spillFiles, File outputFile) - throws IOException { - - if (spillFiles == null) { - return 0; - } - - log("Merging spill files ..."); - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - long uniqueCount = 0; - int fileCount = spillFiles.size(); - InputStream[] inputStreams = new InputStream[fileCount]; - KMerPosition[] kmers = new KMerPosition[fileCount]; - for (int i = 0; i < fileCount; i++) { - inputStreams[i] = - new BufferedInputStream(new FileInputStream(spillFiles.get(i))); - } - while (true) { - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == null && inputStreams[i] != null) { - kmers[i] = readKMerPosition(inputStreams[i]); - if (kmers[i] == null) { - inputStreams[i].close(); - inputStreams[i] = null; - } - } - } - int count = 0; - KMerPosition kmer = null; - for (int i = 0; i < fileCount; i++) { - KMerPosition kmp = kmers[i]; - if (kmp == null) { - continue; - } else if (kmer == null) { - kmer = kmp; - count = 1; - } else { - int cmp = kmp.compareTo(kmer); - if (cmp == 0) { - count++; - } else if (cmp < 0) { - kmer = kmp; - count = 1; - } - } - } - if (kmer == null) { - break; - } - for (int i = 0; i < fileCount; i++) { - if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) { - kmers[i] = null; - } - } - if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - writeKMerPosition(outputStream, kmer); - } - } - outputStream.flush(); - outputStream.close(); - for (int i = 0; i < fileCount; i++) { - // spillFiles.get(i).delete(); - } - log("Spill files merged, unique count is " + uniqueCount); - return uniqueCount; - } - - private void writeKMerTextFile(File inputFile, - List exceptionList, - File outputFile) - throws IOException { - - log("Writing kmer file " + outputFile + " ..."); - int exceptionIndex = 0; - StringKMerPosition excKMer = null; - Iterator excIter = null; - if (!exceptionList.isEmpty()) { - excIter = exceptionList.iterator(); - excKMer = excIter.next(); - } - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(inputFile)); - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - KMerPosition kmer = readKMerPosition(inputStream); - while (kmer != null || excKMer != null) { - if (excKMer == null) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else if (kmer == null) { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } - } - inputStream.close(); - writer.flush(); - writer.close(); - log("Wrote kmer file: " + outputFile); - } - - private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { - String chr = getBaseIndexSequenceName(baseIndex); - int pos = getBaseIndexCoordinate(baseIndex); - writer.println(kmer + "\t" + chr + "\t" + pos); - } - - private void createMapFile(long mapSize, - File kmerFile, - List exceptionList, - File priorMapFile, - File mapFile) - throws IOException { - byte[] map = null; - long uniquePriorCount = 0; - long byteSize = (mapSize + 7)/8; - int mapByteSize = (int) byteSize; - if (mapByteSize != byteSize) { - throw new RuntimeException("Map too large: " + mapSize); - } - if (priorMapFile.exists()) { - map = readMapFile(priorMapFile); - if (map.length != mapByteSize) { - throw new RuntimeException("Prior map is wrong size"); - } - // Count the prior unique positions - for (int i = 0; i < mapByteSize; i++) { - uniquePriorCount += Integer.bitCount(map[i] & 0xFF); - } - } else { - map = new byte[mapByteSize]; - } - for (StringKMerPosition kmp : exceptionList) { - addToMap(kmp, map); - } - mPriorMapUniqueCount = uniquePriorCount; - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(kmerFile)); - while (true) { - KMerPosition kmp = readKMerPosition(inputStream); - if (kmp == null) { - inputStream.close(); - break; - } - addToMap(kmp, map); - } - - writeMapFile(map, mapFile); - } - - private void addToMap(KMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(StringKMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(int baseIndex, byte[] map) { - int mod = baseIndex & 0x7; - int offset = (baseIndex >> 3) & 0x1FFFFFFF; - if ((map[offset] & (1 << mod)) != 0) { - throw new RuntimeException("Map entry already set: " + baseIndex); - } - map[offset] |= (1 << mod); - } - - private boolean isUniqueInMap(byte[] map, int baseIndex) { - int mod = baseIndex & 0x7; - int offset = (baseIndex >> 3) & 0x1FFFFFFF; - return ((map[offset] & (1 << mod)) != 0); - } - - private void writeSummaryStatistics(File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; - long uniqueCount = mUniquePriorCount + mUniqueNewCount; - long nonUniqueCount = mKMerCount - uniqueCount; - writer.println("K: " + mK); - writer.println("Sequences: " + mSequenceList.size()); - writer.println("Bases: " + baseCount); - writer.println("KMers: " + mKMerCount); - writer.println("Prior map count: " + mPriorMapUniqueCount); - writer.println("Unique prior: " + mUniquePriorCount + - " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); - writer.println("Unique new: " + mUniqueNewCount + - " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); - writer.println("Unique cumulative: " + uniqueCount + - " (" + formatPercent(uniqueCount, mKMerCount) + ")"); - writer.println("Nonunique: " + nonUniqueCount + - " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); - writer.flush(); - writer.close(); - } - - private String formatPercent(long numerator, long denominator) { - double fraction = 0.0; - if (denominator != 0) { - fraction = numerator / (double) denominator; - } - return String.format("%1.1f%%", fraction * 100.0); - } - - private void openPriorMap(File mapFile) - throws IOException { - if (mapFile.exists()) { - mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - } - - private void closePriorMap() - throws IOException { - if (mPriorMapStream != null) { - mPriorMapStream.close(); - } - mPriorMapStream = null; - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - - private byte[] readMapFile(File file) - throws IOException { - long fileLength = file.length(); - if (fileLength > 1000000000) { - throw new RuntimeException("Prior map too large: " + file); - } - int length = (int) fileLength; - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - /** - * Read just a subset of a map file. - */ - private byte[] readMapFileRegion(File file, int offset, int length) - throws IOException { - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - skipBytes(stream, offset); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - private void writeMapFile(byte[] map, File file) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(map); - stream.flush(); - stream.close(); - } - - private boolean isUniqueInPriorMap(int baseIndex) - throws IOException { - if (mPriorMapStream == null) { - return false; - } - int byteOffset = (baseIndex >> 3) & 0x1FFFFFFF; - if (byteOffset != mPriorMapPosition) { - int delta = byteOffset - mPriorMapPosition; - if (delta < 0) { - throw new RuntimeException("Attempt to seek backwards in prior map"); - } - if (delta > 1) { - skipFully(mPriorMapStream, delta-1); - } - mPriorMapValue = mPriorMapStream.read(); - if (mPriorMapValue < 0) { - throw new RuntimeException("Unexpected end of file in prior map"); - } - mPriorMapPosition += delta; - } - int mod = baseIndex & 0x7; - return (((1 << mod) & mPriorMapValue) != 0); - } - - private void skipFully(InputStream stream, long amount) - throws IOException { - while (amount > 0) { - long skip = stream.skip(amount); - if (skip <= 0 || skip > amount) { - throw new RuntimeException("Skip failed"); - } - amount -= skip; - } - } - - private String getBaseIndexSequenceName(int baseIndex) { - int sequenceCount = mSequenceList.size(); - for (int i = 0; i < sequenceCount-1; i++) { - int nextOffset = mSequenceOffsetList.get(i+1); - if (compareBaseIndex(nextOffset, baseIndex) > 0) { - return mSequenceList.get(i); - } - } - return mSequenceList.get(sequenceCount-1); - } - - private int getBaseIndexCoordinate(int baseIndex) { - Integer sequenceOffset = null; - for (Integer offset : mSequenceOffsetList) { - if (compareBaseIndex(offset, baseIndex) > 0) { - break; - } - sequenceOffset = offset; - } - if (sequenceOffset == null) { - return 0; - } - int coordinate = baseIndex - sequenceOffset + 1; - if (coordinate <= 0) { - dumpSequenceList(); - System.out.println("coordinate: " + coordinate); - System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); - System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); - throw new RuntimeException("Internal error: illegal coordinate " + - coordinate + " for base index " + baseIndex); - } - return coordinate; - } - - private void dumpSequenceList() { - System.out.println("# Sequences:"); - int count = mSequenceList.size(); - for (int i = 0; i < count; i++) { - String seqName = mSequenceList.get(i); - int offset = mSequenceOffsetList.get(i); - System.out.println("# " + seqName + - "\t" + offset + - "\t" + Integer.toHexString(offset)); - } - } - - private int compareBaseIndex(int baseIndex1, int baseIndex2) { - // Implements unsigned comparison, a la compareTo - if (baseIndex1 < 0 ^ baseIndex2 < 0) { - return ((baseIndex1 < 0) ? 1 : -1); - } else { - return (baseIndex1 - baseIndex2); - } - } - - private String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private char[] getNextKMer() - throws IOException { - - if (mKMerBuffer == null) { - mKMerBuffer = new char[mK]; - } - System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); - if (mKMerBufferedCount > 0) { - mKMerBufferedCount--; - } - - while (mKMerBufferedCount < mK) { - char base = getNextBase(); - if (base == 0) { - incrementBaseIndex(mKMerBufferedCount); - mKMerBufferedCount = 0; - return null; - } else if (base == 'N') { - incrementBaseIndex(mKMerBufferedCount+1); - mKMerBufferedCount = 0; - } else { - mKMerBuffer[mKMerBufferedCount++] = base; - } - } - incrementBaseIndex(1); - return mKMerBuffer; - } - - private char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - return mLineBuffer.charAt(mLineBufferIndex++); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } - - private void log(String text) { - if (mVerbose) { - System.out.println("# " + new Date() + " " + text); - } - } - - private void debug(String text) { - if (mDebug) { - System.out.println("# " + new Date() + " " + text); - } - } - - private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { - char[] encoding = encodeKMerChars(kmerChars); - if (encoding == null) { - return null; - } - char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars)); - if (compareEncodings(encoding, reverseEncoding) <= 0) { - return new KMerPosition(encoding, baseIndex); - } else { - KMerPosition kmp = new KMerPosition(reverseEncoding, baseIndex); - kmp.setIsReversed(true); - return kmp; - } - } - - private static char[] encodeKMerChars(char[] kmerChars) { - if (kmerChars == null) { - return null; - } - - int kmerLength = kmerChars.length; - int encodingLength = (kmerLength + 7) / 8; - char[] encoding = new char[encodingLength]; - int offset = kmerLength % 8; - offset = (offset == 0) ? 8 : offset; - int bits = encodeKMerBits(kmerChars, 0, offset); - if (bits < 0) { - return null; - } - encoding[0] = (char) bits; - for (int i = 1; i < encodingLength; i++) { - bits = encodeKMerBits(kmerChars, offset, 8); - if (bits < 0) { - return null; - } - encoding[i] = (char) bits; - offset += 8; - } - return encoding; - } - - private static int compareEncodings(char[] encoding1, char[] encoding2) { - int length = Math.max(encoding1.length, encoding2.length); - for (int i = 0; i < length; i++) { - int result = encoding1[i] - encoding2[i]; - if (result != 0) { - return result; - } - } - return 0; - } - - private static int encodeKMerBits(char[] kmerChars, int offset, int length) { - int bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= baseBits << (2*(length-i-1)); - } - return bits; - } - - private static String decodeKMer(char[] encoding, boolean reverse) { - int length = mK; - char[] buffer = new char[length]; - int offset = length % 8; - offset = (offset == 0) ? 8 : offset; - decodeKMerBits(encoding[0], buffer, 0, offset); - for (int i = 1; i < encoding.length; i++) { - decodeKMerBits(encoding[i], buffer, offset, 8); - offset += 8; - } - if (reverse) { - reverseComplementInPlace(buffer); - } - return new String(buffer); - } - - private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static char[] reverseComplement(char[] buffer) { - int length = buffer.length; - char[] result = new char[length]; - System.arraycopy(buffer, 0, result, 0, length); - reverseComplementInPlace(result); - return result; - } - - private static void reverseComplementInPlace(char[] buffer) { - int length = buffer.length; - int limit = (length + 1)/2; - for (int i = 0; i < limit; i++) { - char ch1 = reverseComplement(buffer[i]); - char ch2 = reverseComplement(buffer[length-i-1]); - buffer[i] = ch2; - buffer[length-i-1] = ch1; - } - } - - private static char reverseComplement(char base) { - switch (base) { - case 'A': - return 'T'; - case 'C': - return 'G'; - case 'G': - return 'C'; - case 'T': - return 'A'; - } - return base; - } - - private static String formatEncoding(char[] encoding) { - if (encoding == null) { - return null; - } - StringBuilder builder = new StringBuilder(); - builder.append('['); - for (int i = 0; i < encoding.length; i++) { - String hex = Integer.toHexString(encoding[i]); - int length = hex.length(); - while (length < 4) { - builder.append('0'); - length++; - } - builder.append(hex); - } - builder.append(']'); - return builder.toString(); - } - - static class KMerPosition - implements Comparable { - - private int mBaseIndex; - private boolean mReversed; - private char[] mKMerEncoding; - - KMerPosition(char[] encoding, int baseIndex) { - mBaseIndex = baseIndex; - mReversed = false; - mKMerEncoding = encoding; - } - - public final String getKMer() { - return decodeKMer(mKMerEncoding, mReversed); - } - - public final boolean getIsReversed() { - return mReversed; - } - - public final void setIsReversed(boolean value) { - mReversed = value; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public final char[] getKMerEncoding() { - return mKMerEncoding; - } - - public int compareTo(KMerPosition kmp) { - return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding()); - } - - public boolean equals(Object object) { - if (!(object instanceof KMerPosition)) { - return false; - } - KMerPosition kmp = (KMerPosition) object; - return (getBaseIndex() == kmp.getBaseIndex() && - this.compareTo(kmp) == 0); - } - - public String format() { - return(getKMer() + - " " + formatEncoding(getKMerEncoding()) + - " " + (mReversed ? 'R' : 'F') + - " " + Integer.toHexString(mBaseIndex)); - } - } - - static class StringKMerPosition - implements Comparable { - - private String mKMerString = null; - private int mBaseIndex; - - StringKMerPosition(String kmer, int baseIndex) { - mKMerString = kmer; - mBaseIndex = baseIndex; - } - - public final String getKMer() { - return mKMerString; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public int compareTo(StringKMerPosition kmp) { - return mKMerString.compareTo(kmp.mKMerString); - } - - public boolean equals(Object object) { - if (!(object instanceof StringKMerPosition)) { - return false; - } - StringKMerPosition kmp = (StringKMerPosition) object; - return (mBaseIndex == kmp.mBaseIndex && - mKMerString.equals(kmp.mKMerString)); - } - } -} diff --git a/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java b/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java deleted file mode 100644 index 90b26d0b1..000000000 --- a/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.kmer; - - -import edu.mit.broad.dcp.DistributedAlgorithm; -import edu.mit.broad.cnv.util.SequenceIterator; - -import java.io.*; -import java.util.*; - - -/** - * Distributed algorithm for counting unique kmers. - */ -public class DistributedKMerCounter - extends DistributedAlgorithm -{ - private boolean mDebug = false; - private boolean mVerbose = false; - private int mK = 0; - private List mInputFiles = null; - private List mSequenceList = null; - private List mSequenceOffsetList = null; - - - public DistributedKMerCounter() { - } - - public boolean getDebug() { - return mDebug; - } - - public void setDebug(boolean value) { - mDebug = value; - } - - public boolean getVerbose() { - return mVerbose; - } - - public void setVerbose(boolean value) { - mVerbose = value; - } - - public int getK() { - return mK; - } - - public void setK(int value) { - mK = value; - } - - public List getInputFiles() { - return mInputFiles; - } - - public void setInputFiles(List value) { - mInputFiles = value; - } - - public void run() - throws Exception { - super.run(); - finish(); - } - - protected void init() - throws Exception { - if (getWorkerId() == MASTER) { - initMaster(); - } else { - initWorker(); - } - } - - private void initMaster() - throws IOException { - // Tasks to be amortized - report("Scanning sequences ..."); - scanSequences(); - report("Scan complete."); - } - - private void initWorker() { - // Tasks to be amortized - } - - protected void start() { - // scan genome, divide into chromosomes and optionally segments, distribute calls - } - - private void finish() { - // merge individual files, write out final results - } - - private void scanSequences() - throws IOException { - List sequenceList = new ArrayList(); - List sequenceOffsetList = new ArrayList(); - SequenceIterator seqIterator = new SequenceIterator(getInputFiles()); - while (true) { - String seqName = seqIterator.getNextSequence(); - if (seqName == null) { - break; - } - int baseIndex = seqIterator.getBaseIndex() + 1; - sequenceList.add(seqName); - sequenceOffsetList.add(baseIndex); - } - mSequenceList = sequenceList; - mSequenceOffsetList = sequenceOffsetList; - } - - // Currently not used - private void loadGenomeOffsets(File file) - throws IOException { - List sequenceList = new ArrayList(); - List sequenceOffsetList = new ArrayList(); - int baseIndex = 0; - LineNumberReader reader = new LineNumberReader(new FileReader(file)); - while (true) { - String line = reader.readLine(); - if (line == null) { - break; - } - String text = line.trim(); - if (text.length() == 0 || text.startsWith("#")) { - continue; - } - String[] fields = text.split("\\s+"); - if (fields.length != 2) { - throw new RuntimeException("Invalid input line: " + line); - } - int length = Integer.parseInt(fields[1]); - sequenceList.add(fields[0]); - sequenceOffsetList.add(baseIndex); - baseIndex += length; - } - mSequenceList = sequenceList; - mSequenceOffsetList = sequenceOffsetList; - } -} diff --git a/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java b/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java deleted file mode 100644 index 7ed22faf3..000000000 --- a/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.util; - - -import java.io.*; -import java.util.*; - - -/** - * Utility class for transforming between a linear base index - * and a chromsome + position coordinate system. - */ -public class GenomeBaseIndex { - - private List mSequenceNames = null; - private int[] mLengths = null; - private long[] mOffsets = null; - - private GenomeBaseIndex() { - } - - public static GenomeBaseIndex read(File file) - throws IOException { - Reader reader = new BufferedReader(new FileReader(file)); - try { - return read(reader); - } finally { - reader.close(); - } - } - - // The input is just a list of space-delimited sequence name and length. - public static GenomeBaseIndex read(Reader reader) - throws IOException { - List sequenceNames = new ArrayList(); - List sequenceLengths = new ArrayList(); - BufferedReader bufferedReader = new BufferedReader(reader); - while (true) { - String line = bufferedReader.readLine(); - if (line == null) { - break; - } - String text = line.trim(); - if (text.length() == 0 || text.startsWith("#")) { - continue; - } - String[] fields = text.split("\\s+"); - if (fields.length < 2) { - throw new RuntimeException("Invalid input line: " + line); - } - int length = Integer.parseInt(fields[1]); - if (length <= 0) { - throw new RuntimeException("Invalid sequence length: " + length); - } - sequenceNames.add(fields[0]); - sequenceLengths.add(length); - } - int count = sequenceLengths.size(); - int[] lengths = new int[count]; - long[] offsets = new long[count]; - long offset = 0; - for (int i = 0; i < count; i++) { - lengths[i] = sequenceLengths.get(i); - offsets[i] = offset; - offset += lengths[i]; - } - GenomeBaseIndex result = new GenomeBaseIndex(); - result.mSequenceNames = sequenceNames; - result.mLengths = lengths; - result.mOffsets = offsets; - return result; - } - - public List getSequenceNames() { - return mSequenceNames; - } - - public boolean contains(String seqName) { - return (getSequenceIndex(seqName) >= 0); - } - - public long getFirstIndex(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - return mOffsets[index]; - } - - public long getLastIndex(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - return (mOffsets[index] + mLengths[index] - 1); - } - - public int getSequenceLength(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return 0; - } - return mLengths[index]; - } - - public long getBaseIndex(String seqName, int position) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - if (position > mLengths[index]) { - return -1; - } - if (position < 1) { - // Zero or negative position means last base index - position = mLengths[index]; - } - return (mOffsets[index] + position - 1); - } - - public String getSequenceName(long baseIndex) { - int index = getSequenceIndex(baseIndex); - if (index < 0) { - return null; - } - return mSequenceNames.get(index); - } - - public int getPosition(long baseIndex) { - if (baseIndex < 0) { - // Catch common sign-extension error when packing indexes as ints. - throw new IllegalArgumentException("Invalid base index: " + baseIndex); - } - int index = getSequenceIndex(baseIndex); - if (index < 0) { - return 0; - } - long offset = mOffsets[index]; - long result = baseIndex - offset + 1; - return (int) result; - } - - // Same as getSequenceName, but treat the argument as an unsigned int. - // This is useful for manipulating/storing indexes for the human - // genome as 4-byte unsigned ints. - public String getSequenceNameUnsigned(int baseIndex) { - return getSequenceName(baseIndex & 0xFFFFFFFFL); - } - - // Same as getPosition, but treat the argument as an unsigned int. - // This is useful for manipulating/storing indexes for the human - // genome as 4-byte unsigned ints. - public int getPositionUnsigned(int baseIndex) { - return getPosition(baseIndex & 0xFFFFFFFFL); - } - - private int getSequenceIndex(String seqName) { - return mSequenceNames.indexOf(seqName); - } - - private int getSequenceIndex(long baseIndex) { - long offset = 0; - if (baseIndex < 0) { - return -1; - } - for (int i = 0; i < mLengths.length; i++) { - int length = mLengths[i]; - if (offset + length > baseIndex) { - return i; - } - offset += length; - } - return -1; - } -} diff --git a/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java b/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java deleted file mode 100644 index 2d1a96f61..000000000 --- a/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.util; - - -import java.io.*; -import java.util.*; - - -/** - * Utility class for transforming between a chromsome + position - * coordinate system and a binned coordinate system where each - * chromosome (separately) is divided into fixed sized bins, - * ragged on the right/upper end. - */ -public class GenomeBinIndex { - - private int mBinSize; - private List mSequenceNames; - private int[] mSequenceLengths; - private int[] mBinOffsets; - - public GenomeBinIndex(GenomeBaseIndex gbi, int binSize) { - if (binSize <= 0) { - throw new IllegalArgumentException("Illegal bin size: " + binSize); - } - mBinSize = binSize; - mSequenceNames = new ArrayList(gbi.getSequenceNames()); - int count = mSequenceNames.size(); - mSequenceLengths = new int[count]; - mBinOffsets = new int[count]; - long binOffset = 0; // long to detect overflow - for (int i = 0; i < count; i++) { - int length = gbi.getSequenceLength(mSequenceNames.get(i)); - int binCount = (length + binSize - 1) / binSize; - mSequenceLengths[i] = length; - mBinOffsets[i] = (int) binOffset; - binOffset += binCount; - } - if (binOffset > Integer.MAX_VALUE) { - // Check for integer overflow. - // This will happen, e.g., with the human genome and a bin size of 1. - throw new RuntimeException("Binsize too small: " + binSize); - } - } - - public int getBinSize() { - return mBinSize; - } - - public int getBinIndex(String seqName, int position) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - if (position > mSequenceLengths[index]) { - return -1; - } - if (position < 1) { - position = mSequenceLengths[index]; - } - int bin = (position - 1) / mBinSize; - return (mBinOffsets[index] + bin); - } - - public String getSequenceName(int binIndex) { - int index = getSequenceIndex(binIndex); - if (index < 0) { - return null; - } - return mSequenceNames.get(index); - } - - public int getStartPosition(int binIndex) { - int index = getSequenceIndex(binIndex); - if (index < 0) { - return -1; - } - int bin = binIndex - mBinOffsets[index]; - return (bin * mBinSize + 1); - } - - public int getEndPosition(int binIndex) { - int index = getSequenceIndex(binIndex); - if (index < 0) { - return -1; - } - int bin = binIndex - mBinOffsets[index]; - int position = (bin+1) * mBinSize; - position = Math.min(position, mSequenceLengths[index]); - return position; - } - - public List getSequenceNames() { - return mSequenceNames; - } - - public int getFirstBin(String seqName) { - return getBinIndex(seqName, 1); - } - - public int getLastBin(String seqName) { - return getBinIndex(seqName, 0); - } - - public int getBinCount() { - if (mBinOffsets.length == 0) { - return 0; - } - int lastIndex = mBinOffsets.length - 1; - int count = mBinOffsets[lastIndex]; - count += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize; - return count; - } - - public int getBinCount(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - return ((mSequenceLengths[index] + mBinSize - 1) / mBinSize); - } - - public int getSequenceLength(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return 0; - } - return mSequenceLengths[index]; - } - - private int getSequenceIndex(String seqName) { - for (int i = 0; i < mSequenceNames.size(); i++) { - if (mSequenceNames.get(i).equals(seqName)) { - return i; - } - } - return -1; - } - - private int getSequenceIndex(int binIndex) { - if (binIndex < 0) { - return -1; - } - for (int i = 1; i < mBinOffsets.length; i++) { - if (mBinOffsets[i] > binIndex) { - return i-1; - } - } - int lastIndex = mBinOffsets.length-1; - int lastBinIndex = mBinOffsets[lastIndex]; - lastBinIndex += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize; - if (binIndex <= lastBinIndex) { - return lastIndex; - } - return -1; - } -} - diff --git a/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java b/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java deleted file mode 100644 index 57bbae7a5..000000000 --- a/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.util; - - -import java.io.*; -import java.util.*; - - -/** - * Utility class for iterating over fasta files. - * Also maintains an unsigned base index over the file set. - */ -public class SequenceIterator -{ - private List mInputFiles = null; - private int mInputFileIndex = 0; - private int mBaseIndex = -1; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - - public SequenceIterator(File inputFile) { - mInputFiles = new ArrayList(); - mInputFiles.add(inputFile); - } - - public SequenceIterator(List inputFiles) { - mInputFiles = inputFiles; - } - - public void close() { - if (mCurrentReader != null) { - try { - mCurrentReader.close(); - } catch (IOException exc) { - throw new RuntimeException("Error closing reader: " + exc.getMessage(), - exc); - } - } - mCurrentReader = null; - mInputFiles = null; - mInputFileIndex = 0; - mBaseIndex = -1; - mNextSequence = null; - mLineBuffer = null; - mLineBufferIndex = 0; - } - - public String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mLineBuffer != null) { - incrementBaseIndex(mLineBuffer.length() - mLineBufferIndex); - mLineBuffer = null; - mLineBufferIndex = 0; - } - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } else { - incrementBaseIndex(line.length()); - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - public char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - if (mNextSequence != null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - char result = mLineBuffer.charAt(mLineBufferIndex++); - incrementBaseIndex(1); - return result; - } - - public int getBaseIndex() { - return mBaseIndex; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } -} - diff --git a/java/lib/edu/mit/broad/dcp/CallStatus.java b/java/lib/edu/mit/broad/dcp/CallStatus.java deleted file mode 100644 index e431b27df..000000000 --- a/java/lib/edu/mit/broad/dcp/CallStatus.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -public enum CallStatus -{ - PENDING, - PROCESSING -} - - diff --git a/java/lib/edu/mit/broad/dcp/CommandRunner.java b/java/lib/edu/mit/broad/dcp/CommandRunner.java deleted file mode 100644 index b93b310dd..000000000 --- a/java/lib/edu/mit/broad/dcp/CommandRunner.java +++ /dev/null @@ -1,309 +0,0 @@ -/** - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2006 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import java.io.*; - - -/** - * Utility class to run system commands synchronously and return the output. - * - * The interface supports the typical case where you want to return a modest - * amount of information from the command's standard output or standard error - * as a string. The caller can override this behavior, however, and provide - * alternative output destinations if necessary. - * - * If setMergeOutput() is true, then this class will attempt to interleave - * the standard output and standard error streams of the command into one - * stream (standard output). This may not produce exactly the same results - * as having the operating system interleave the output, but works well for - * simple executables that do not heavily intermix stdout and stderr. - * - * A typical invocation is: - *
- *  CommandRunner runner = new CommandRunner();
- *  int status = runner.runCommand("ls");
- *  if (status == 0) {
- *      System.out.print(runner.getStandardOutput());
- *  }
- * 
- * - * @author Bob Handsaker - */ -public class CommandRunner { - - private boolean mMergeOutput = false; - private Writer mStandardOutputDestination = null; - private Writer mStandardErrorDestination = null; - private String mStandardOutputString = null; - private String mStandardErrorString = null; - - - /** - * Default constructor. - */ - public CommandRunner() { - } - - /** - * Get the standard output from the last command as a string. - * - * If no command has been run or an explicit output destination - * was set, then this method returns null. - */ - public String getStandardOutputString() { - return mStandardOutputString; - } - - /** - * Get the standard error from the last command as a string. - * - * If no command has been run or an explicit output destination - * was set, then this method returns null. - */ - public String getStandardErrorString() { - return mStandardErrorString; - } - - /** - * If true, the command's standard error stream will be interleaved - * with the command's standard output stream. The standard error - * stream destination will not be used. - */ - public boolean getMergeOutput() { - return mMergeOutput; - } - - /** - * If true, the command's standard error stream will be interleaved - * with the command's standard output stream. - */ - public void setMergeOutput(boolean value) { - mMergeOutput = value; - } - - /** - * The destination for the command's standard output stream. - * If null, the standard output will be captured in a string. - */ - public Writer getStandardOutputDestination() { - return mStandardOutputDestination; - } - - /** - * The destination for the command's standard output stream. - * If set to null, the standard output will be captured in a string. - */ - public void setStandardOutputDestination(Writer writer) { - mStandardOutputDestination = writer; - } - - /** - * The destination for the command's standard error stream. - * If null, the standard error will be captured in a string. - */ - public Writer getStandardErrorDestination() { - return mStandardErrorDestination; - } - - /** - * The destination for the command's standard error stream. - * If set to null, the standard error will be captured in a string. - */ - public void setStandardErrorDestination(Writer writer) { - mStandardErrorDestination = writer; - } - - /** - * Run a command string as a system command. - * - * Returns the exit status of the command. - * - * When this method is called, the standard output string - * and standard error string are updated if no alternative output - * destinations have been set. - * - * This method throws a RuntimeException if running the command fails - * (for example, if there are not enough system resources to spawn - * the process). - * - * @param commmand The command string to run. - * @return Command exit status. - * @throws RuntimeException If command execution fails. - */ - public int runCommand(String command) - throws RuntimeException { - return runCommand(command.split(" "), null, null); - } - - /** - * Run a command string as a system command. - * - * Returns the exit status of the command. - * - * When this method is called, the standard output string - * and standard error string are updated if no alternative output - * destinations have been set. - * - * This method throws a RuntimeException if running the command fails - * (for example, if there are not enough system resources to spawn - * the process). - * - * @param commmand The command string to run. - * @param environment The command environment (or null to inherit). - * @param workingDirectory The working directory (or null to inherit). - * @return Command exit status. - * @throws RuntimeException If command execution fails. - */ - public int runCommand(String command, String[] environment, File workingDirectory) - throws RuntimeException { - return runCommand(command.split(" "), environment, workingDirectory); - } - - /** - * Run a command string as a system command. - * - * Returns the exit status of the command. - * - * When this method is called, the standard output string - * and standard error string are updated if no alternative output - * destinations have been set. - * - * This method throws a RuntimeException if running the command fails - * (for example, if there are not enough system resources to spawn - * the process). - * - * @param commmand The command to run (as a array of arguments). - * @param environment The command environment (or null to inherit). - * @param workingDirectory The working directory (or null to inherit). - * @return Command exit status. - * @throws RuntimeException If command execution fails. - */ - public int runCommand(String[] command, String[] environment, File workingDirectory) - throws RuntimeException { - - Writer stdout = mStandardOutputDestination; - Writer stderr = mStandardErrorDestination; - if (stdout == null) { - stdout = new StringWriter(); - } - if (mMergeOutput) { - stderr = stdout; - } else if (stderr == null) { - stderr = new StringWriter(); - } - - mStandardOutputString = null; - mStandardErrorString = null; - - int commandStatus = 0; - try { - Process process = - Runtime.getRuntime().exec(command, environment, workingDirectory); - StreamHandler stdoutHandler = - new StreamHandler(process.getInputStream(), stdout); - StreamHandler stderrHandler = - new StreamHandler(process.getErrorStream(), stderr); - - commandStatus = process.waitFor(); - - // Wait for the streams to drain. - stdoutHandler.join(); - stderrHandler.join(); - } catch (Exception exc) { - throw new RuntimeException("Command execution failed: " + - exc.getMessage(), - exc); - } - - if (mStandardOutputDestination == null) { - mStandardOutputString = stdout.toString(); - } - if (mStandardErrorDestination == null && !mMergeOutput) { - mStandardErrorString = stderr.toString(); - } - - return commandStatus; - } - - - /** - * Internal class to asynchronously read from the standard output - * and standard error streams of the command being executed. - * - * If you do not handle command output asynchronously, then execution - * of a command may block in some environments if the program produces - * too much output. In this case, the call to run the process will - * never complete. - */ - private static class StreamHandler extends Thread { - - /** - * Constructor. - * Create an instance of this class, which is an asynchronous - * thread that will consume input from the given input stream - * and send the output to the given output destination. - * - * @param input The input stream to read. - * @param output The output destination. - */ - StreamHandler(InputStream input, Writer output) { - m_input = input; - m_output = output; - start(); - } - - - /** - * Standard thread run method. - * Pipe input from the input source to the output destination - * until there is no more input left. - * - * If an IOException occurs, the thread will make sure all - * available output has been flushed to the destination and - * then terminate. The IOException is not propagated. - */ - public void run() { - - char[] buffer = new char[4096]; - Reader reader = - new InputStreamReader(new BufferedInputStream(m_input)); - Writer writer = m_output; - - try { - while (true) { - int count = reader.read(buffer); - if (count <= 0) { - break; - } - if (writer != null) { - synchronized (writer) { - writer.write(buffer, 0, count); - } - } - } - } catch (IOException ignore) { - // Ignore IO exceptions - } finally { - try { - reader.close(); - } catch (Exception ignore) { - } - try { - m_output.flush(); - } catch (Exception ignore) { - } - } - } - - private InputStream m_input; - private Writer m_output; - } -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java b/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java deleted file mode 100644 index a223c0326..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java +++ /dev/null @@ -1,618 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import edu.mit.broad.dcp.message.*; - -import java.io.*; -import java.util.*; -import java.lang.reflect.Method; -import java.net.InetAddress; -import java.net.ServerSocket; -import java.rmi.registry.*; - -/** - * Experimental. - */ -public abstract class DistributedAlgorithm - implements Serializable -{ - public static final Integer ANY = 0; - public static final Integer MASTER = 1; - - public DistributedAlgorithm() { - } - - public String getServerHost() { - return mServerHost; - } - - public void setServerHost(String value) { - mServerHost = value; - } - - public int getServerPort() { - return mServerPort; - } - - public void setServerPort(int value) { - mServerPort = value; - } - - public String getAlgorithmName() { - if (mAlgorithmName != null) { - return mAlgorithmName; - } else { - return getClassName(); - } - } - - public void setAlgorithmName(String value) { - mAlgorithmName = value; - } - - public int getMaximumWorkerCount() { - return mMaximumWorkerCount; - } - - public void setMaximumWorkerCount(int value) { - mMaximumWorkerCount = value; - } - - /** - * Name of LSF queue to use for workers. - */ - public String getLsfQueue() { - return mLsfQueue; - } - - public void setLsfQueue(String value) { - mLsfQueue = value; - } - - /** - * Directory to hold lsf log files. - */ - public String getLsfLogDirectory() { - return mLsfLogDirectory; - } - - public void setLsfLogDirectory(String value) { - mLsfLogDirectory = value; - } - - public boolean getEnableGcLogging() { - return mEnableGcLogging; - } - - public void setEnableGcLogging(boolean value) { - mEnableGcLogging = value; - } - - public Integer getWorkerId() { - return mWorkerId; - } - - public Integer getProcessId() { - return mProcessId; - } - - protected void init() - throws Exception { - } - - protected abstract void start() - throws Exception; - - public void run() - throws Exception { - - if (mIsRunning) { - throw new IllegalStateException("Algorithm is already running"); - } - - mIsRunning = true; - mWorkerId = MASTER; - mProcessId = MASTER; - - try { - startDistributedServer(); - init(); - startWorkerThread(); - startWorkers(); - start(); - waitForCompletion(); - } finally { - // TBD: More cleanup (shutdown threads, etc.) - stopDistributedServer(); - mIsRunning = false; - } - } - - void runWorker(int workerId, int processId) - throws Exception { - - if (mIsRunning) { - throw new IllegalStateException("Algorithm is already running"); - } - - mIsRunning = true; - mWorkerId = workerId; - mProcessId = processId; - - try { - if (openDistributedServer() == null) { - report("Server " + mServerHost + ":" + mServerPort + " not responding"); - return; - } - init(); - startWorkerThread(); - mWorkerThread.join(); - } finally { - closeDistributedServer(); - mIsRunning = false; - } - } - - private void startWorkers() { - int workerCount = getMaximumWorkerCount(); - if (workerCount <= 0) { - // Use single process execution for testing/debugging. - new InProcessWorker().start(); - return; - } - if (workerCount > 1000) { - throw new RuntimeException("Excessive worker count: " + workerCount); - } - for (int i = 0; i < workerCount; i++) { - Integer workerId = (MASTER + i + 1); - Integer processId = workerId; // for now - startWorker(workerId, processId); - } - } - - private void startDistributedServer() { - try { - // Create a server socket to allocate a unique port. - // There is a window of vulnerability where the port - // can get reused, but in practice this works ok. - String serverHost = getCurrentHost(); - ServerSocket socket = new ServerSocket(0); - int serverPort = socket.getLocalPort(); - socket.close(); - Registry registry = LocateRegistry.createRegistry(serverPort); - DistributedCallServer server = new DistributedCallServer(); - server.setAlgorithm(this); - registry.bind("DistributedCallService", server); - mServerHost = serverHost; - mServerPort = serverPort; - mDistributedCallServer = server; - mDistributedCallService = server; - } catch (Exception exc) { - throw wrapException(exc); - } - } - - private void stopDistributedServer() { - if (mDistributedCallServer != null) { - try { - Registry registry = LocateRegistry.getRegistry(mServerPort); - registry.unbind("DistributedCallService"); - mDistributedCallServer.stop(); - } catch (Exception exc) { - throw wrapException(exc); - } - } - mDistributedCallService = null; - mDistributedCallServer = null; - } - - private DistributedCallService openDistributedServer() { - mDistributedCallService = null; - try { - String url = "rmi://" + getServerHost() + ":" + getServerPort() + "/DistributedCallService"; - DistributedCallService server = - (DistributedCallService) java.rmi.Naming.lookup(url); - mDistributedCallService = server; - } catch (java.rmi.NotBoundException exc) { - // Server has exited - } catch (Exception exc) { - throw wrapException(exc); - } - return mDistributedCallService; - } - - private void closeDistributedServer() { - mDistributedCallService = null; - } - - private void startWorker(Integer workerId, Integer processId) { - - String logFile = "worker_" + processId + "_%J.bsub"; - if (mLsfLogDirectory != null) { - logFile = mLsfLogDirectory + "/" + logFile; - } - - List command = new ArrayList(); - command.add("bsub"); - command.add("-o"); - command.add(logFile); - if (mLsfQueue != null) { - command.add("-q"); - command.add(mLsfQueue); - } - command.add("runDistributedWorker"); - command.add("-serverHost"); - command.add(getServerHost()); - command.add("-serverPort"); - command.add(Integer.toString(getServerPort())); - command.add("-workerId"); - command.add(Integer.toString(workerId)); - command.add("-processId"); - command.add(Integer.toString(processId)); - - // Pass our -Xmx setting along to all workers. - Map environment = - new LinkedHashMap(System.getenv()); - long maxMemory = Runtime.getRuntime().maxMemory(); - long maxKbytes = maxMemory / 1024; - String memJavaOpt = "-Xmx" + maxKbytes + "K"; - - // Enable GC logging if requested - String gcJavaOpt = null; - if (mEnableGcLogging) { - String gcLogFile = "worker_" + processId + ".gc.log"; - if (mLsfLogDirectory != null) { - gcLogFile = mLsfLogDirectory + "/" + gcLogFile; - } - gcJavaOpt = "-Xloggc:" + gcLogFile; - } - - String javaOpts = environment.get("JAVAOPTS"); - if (javaOpts == null) { - javaOpts = memJavaOpt; - if (gcJavaOpt != null) { - javaOpts = javaOpts + " " + gcJavaOpt; - } - environment.put("JAVAOPTS", javaOpts); - } - - // Log output ourselves (rather than waiting for bsub). - String workerLogFile = "worker_" + processId + ".log"; - if (mLsfLogDirectory != null) { - workerLogFile = mLsfLogDirectory + "/" + workerLogFile; - } - environment.put("DA_LOG_FILE", workerLogFile); - - CommandRunner runner = new CommandRunner(); - Writer output = new LsfOutputFilter(); - runner.setStandardOutputDestination(output); - runner.setStandardErrorDestination(output); - String[] commandArray = command.toArray(new String[command.size()]); - String[] environmentArray = createEnvironmentArray(environment); - int status = runner.runCommand(commandArray, environmentArray, null); - if (status != 0) { - throw new RuntimeException("Error starting worker: " + status); - } - } - - private String[] createEnvironmentArray(Map map) { - if (map == null) { - return null; - } - int index = 0; - String[] array = new String[map.size()]; - for (Map.Entry entry : map.entrySet()) { - array[index++] = entry.getKey() + "=" + entry.getValue(); - } - return array; - } - - private String getCurrentHost() { - try { - return InetAddress.getLocalHost().getCanonicalHostName(); - } catch (Exception exc) { - throw wrapException(exc); - } - } - - private void waitForCompletion() { - DistributedCallServer server = mDistributedCallServer; - while (true) { - if (server.isQueueEmpty()) { - break; - } - try { - Thread.sleep(1000); - } catch (InterruptedException exc) { - // ignore - } - } - } - - protected void callDistributed(String methodName, Object... methodArgs) { - callDistributed(null, methodName, methodArgs); - } - - protected void callDistributed(Integer workerId, String methodName, Object... methodArgs) { - if (workerId == null) { - workerId = ANY; - } - try { - DistributedCallMessage message = new DistributedCallMessage(); - message.setSenderWorkerId(getWorkerId()); - message.setSenderProcessId(getProcessId()); - message.setReceiverWorkerId(workerId); - message.setMethodName(methodName); - message.setMethodArgs(methodArgs); - mDistributedCallService.writeMessage(message); - } catch (Throwable exc) { - throw wrapException(exc); - } - } - - private void callMethod(String methodName, Object[] methodArgs) { - try { - Object target = this; - Class targetClass = target.getClass(); - Method targetMethod = findMethod(targetClass, methodName); - if (targetMethod == null) { - throw new RuntimeException("Cannot find target method: " + methodName); - } - targetMethod.invoke(target, methodArgs); - } catch (Throwable exc) { - throw wrapException(exc); - } - } - - private Method findMethod(Class clazz, String methodName) throws Exception { - Method result = null; - Method[] methods = clazz.getDeclaredMethods(); - for (int i = 0; i < methods.length; i++) { - if (methods[i].getName().equals(methodName)) { - if (result != null) { - throw new RuntimeException("Duplicate method name: " + methodName); - } - result = methods[i]; - } - } - return result; - } - - private RuntimeException wrapException(Throwable exception) { - if (exception instanceof RuntimeException) { - return (RuntimeException) exception; - } else { - return new RuntimeException(exception.getMessage(), exception); - } - } - - private void startWorkerThread() { - if (mWorkerThread != null) { - throw new IllegalStateException("WorkerThread is running"); - } - mWorkerThread = new WorkerThread(); - mWorkerThread.start(); - } - - private void stopWorkerThread() { - if (mWorkerThread == null) { - throw new IllegalStateException("WorkerThread is running"); - } - mWorkerThread.stopThread(); - } - - private class WorkerThread extends Thread { - - WorkerThread() { - setDaemon(true); - } - - public void run() { - try { - DistributedCallService service = mDistributedCallService; - while (true) { - if (isInterrupted()) { - System.out.println("#DBG: Worker isInterrupted"); - throw new InterruptedException(); - } - DistributedCallMessage message = - service.acceptMessage(getWorkerId(), getProcessId()); - if (message == null) { - Thread.sleep(1000); - } else { - processMessage(message); - } - } - } catch (InterruptedException exc) { - // Interruption terminates this thread. - // System.out.println("#DBG: Worker caught InterruptedException"); - } catch (Throwable exc) { - if (isDisconnectException(exc)) { - report("Server disconnected"); - } else { - reportError("Exception in WorkerThread: " + exc.getMessage(), exc); - System.exit(1); - } - } - report("WorkerThread terminated"); - } - - void stopThread() { - // System.out.println("#DBG: About to interrupt worker..."); - interrupt(); - // System.out.println("#DBG: Joining worker..."); - try { - join(); - } catch (InterruptedException exc) { - // ignore - } - } - - private boolean isDisconnectException(Throwable exc) { - if (exc instanceof java.rmi.ConnectException) { - return true; - } else if (exc instanceof java.rmi.NoSuchObjectException) { - return true; - } else if (exc instanceof java.rmi.UnmarshalException && - exc.getCause() != null && - exc.getCause() instanceof EOFException) { - return true; - } else { - return false; - } - } - } - - private void processMessage(DistributedCallMessage message) { - try { - Integer workerId = message.getReceiverWorkerId(); - if (workerId == null || !workerId.equals(getWorkerId())) { - reportError("Invalid worker ID in message: " + message); - return; - } - callMethod(message.getMethodName(), message.getMethodArgs()); - } catch (Throwable exc) { - reportError("Exception running message: " + message, exc); - } finally { - completeMessage(message); - } - } - - private void completeMessage(DistributedCallMessage message) { - try { - DistributedCallService service = mDistributedCallService; - service.completeMessage(getWorkerId(), getProcessId(), message.getCallId()); - } catch (Throwable exc) { - reportError("Exception completing message: " + message, exc); - } - } - - protected void report(String message) { - String identity = - getAlgorithmName() + " " + - getWorkerId() + "/" + getProcessId(); - System.out.println("# " + identity + " : " + message); - } - - protected void reportError(String message) { - reportError(message, null); - } - - protected void reportError(String message, Throwable exception) { - String identity = - getAlgorithmName() + " " + - getWorkerId() + "/" + getProcessId(); - System.out.println("Error" + - " [" + identity + "]" + - ": " + message); - if (exception != null) { - System.out.println(" with exception: " + exception.getMessage()); - exception.printStackTrace(System.out); - } - } - - private String getClassName() { - String name = getClass().getName(); - return name.substring(name.lastIndexOf('.')+1); - } - - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("DistributedAlgorithm"); - builder.append("("); - builder.append("" + getAlgorithmName()); - builder.append(","); - builder.append("" + getWorkerId()); - builder.append(","); - builder.append("" + getProcessId()); - builder.append(","); - builder.append("" + getMaximumWorkerCount()); - builder.append(","); - builder.append("" + getLsfQueue()); - builder.append(","); - builder.append("" + mIsRunning); - builder.append(")"); - return builder.toString(); - } - - // This class is used only during in-process execution/testing/debugging. - private class InProcessWorker extends Thread { - - InProcessWorker() { - setDaemon(true); - } - - public void run() { - report("InProcessWorker starting"); - try { - String serverAddress = getServerHost() + ":" + getServerPort(); - String url = "rmi://" + serverAddress + "/DistributedCallService"; - DistributedCallService server = - (DistributedCallService) java.rmi.Naming.lookup(url); - DistributedAlgorithm algorithm = server.getAlgorithm(); - algorithm.setServerHost(getServerHost()); - algorithm.setServerPort(getServerPort()); - algorithm.runWorker(2, 1); - } catch (Throwable exc) { - reportError("Exception in InProcessWorker: " + exc.getMessage(), exc); - System.exit(1); - } - report("InProcessWorker terminated"); - } - } - - private static class LsfOutputFilter - extends FilterWriter { - - LsfOutputFilter() { - super(new PrintWriter(System.out, true)); - } - - public void write(int ch) - throws IOException { - if (mAtLineStart) { - out.write("# "); - mAtLineStart = false; - } - out.write(ch); - mAtLineStart = (ch == '\n'); - } - - public void write(String s, int off, int len) - throws IOException { - write(s.toCharArray(), off, len); - } - - public void write(char[] a, int off, int len) - throws IOException { - for (int i = 0; i < len; i++) { - write(a[off+i]); - } - } - - private boolean mAtLineStart = true; - } - - - private transient int mMaximumWorkerCount = 0; - private transient String mLsfQueue = null; - private transient String mLsfLogDirectory = null; - private transient boolean mEnableGcLogging = false; - private transient boolean mIsRunning = false; - private transient int mWorkerId = 0; - private transient int mProcessId = 0; - private transient WorkerThread mWorkerThread = null; - private transient String mAlgorithmName = null; - private transient String mServerHost = null; - private transient int mServerPort = 0; - private transient DistributedCallService mDistributedCallService = null; - private transient DistributedCallServer mDistributedCallServer = null; -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java b/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java deleted file mode 100644 index dcee13eb8..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import java.util.*; - -/** - * Command line driver for distributed worker invocation. - */ -public class DistributedAlgorithmWorker -{ - public static void main(String[] args) - throws Exception { - new DistributedAlgorithmWorker().run(args); - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - System.out.println("# DistributedAlgorithmWorker"); - System.out.println("# Started at " + new Date()); - runDistributedWorker(); - System.out.println("# Ended at " + new Date()); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-serverHost") && argsleft > 1) { - argpos++; - mServerHost = args[argpos++]; - } else if (arg.equals("-serverPort") && argsleft > 1) { - argpos++; - mServerPort = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-workerId") && argsleft > 1) { - argpos++; - mWorkerId = new Integer(args[argpos++]); - } else if (arg.equals("-processId") && argsleft > 1) { - argpos++; - mProcessId = new Integer(args[argpos++]); - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - continue; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - continue; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void usage() { - System.out.println("Usage: DistributedWorkerMain ..."); - System.out.println(" -serverHost "); - System.out.println(" -serverPort "); - System.out.println(" -workerId "); - System.out.println(" -processId "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private void runDistributedWorker() - throws Exception { - - DistributedAlgorithm algorithm = null; - String serverAddress = getServerHost() + ":" + getServerPort(); - try { - String url = "rmi://" + serverAddress + "/DistributedCallService"; - DistributedCallService server = - (DistributedCallService) java.rmi.Naming.lookup(url); - algorithm = server.getAlgorithm(); - } catch (java.rmi.ConnectException exc) { - System.out.println("# Server " + serverAddress + " not responding."); - return; - } - - algorithm.setServerHost(getServerHost()); - algorithm.setServerPort(getServerPort()); - algorithm.runWorker(getWorkerId(), getProcessId()); - } - - private Integer getWorkerId() { - return mWorkerId; - } - - private Integer getProcessId() { - return mProcessId; - } - - private String getServerHost() { - return mServerHost; - } - - private int getServerPort() { - return mServerPort; - } - - - private boolean mDebug = false; - private boolean mVerbose = false; - private String mServerHost = null; - private int mServerPort = 0; - private Integer mWorkerId = null; - private Integer mProcessId = null; -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedCallServer.java b/java/lib/edu/mit/broad/dcp/DistributedCallServer.java deleted file mode 100644 index 995eff571..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedCallServer.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - - -import edu.mit.broad.dcp.message.*; - -import java.rmi.server.UnicastRemoteObject; -import java.util.*; - -public class DistributedCallServer - extends UnicastRemoteObject - implements DistributedCallService -{ - public DistributedCallServer() - throws java.rmi.RemoteException { - } - - public void setAlgorithm(DistributedAlgorithm algorithm) { - mAlgorithm = algorithm; - } - - public DistributedAlgorithm getAlgorithm() { - return mAlgorithm; - } - - public long writeMessage(DistributedCallMessage message) { - message.setCallStatus(CallStatus.PENDING); - message.setCallId(generateCallId()); - if (message.getReceiverWorkerId().equals(0)) { - synchronized (mMessageQueue) { - mMessageQueue.addLast(message); - } - } else { - synchronized (mMessageQueue) { - mMessageQueue.addFirst(message); - } - } - return message.getCallId(); - } - - public DistributedCallMessage acceptMessage(int workerId, int processId) { - if (workerId <= 0) { - throw new IllegalArgumentException("Invalid worker ID: " + workerId); - } - if (processId <= 0) { - throw new IllegalArgumentException("Invalid process ID: " + processId); - } - synchronized (mMessageQueue) { - Iterator iterator = mMessageQueue.iterator(); - while (iterator.hasNext()) { - DistributedCallMessage message = iterator.next(); - if (message.getCallStatus() != CallStatus.PENDING) { - continue; - } - int receiverId = message.getReceiverWorkerId(); - if (receiverId == workerId || - (receiverId == 0 && workerId > 1)) { - message.setCallStatus(CallStatus.PROCESSING); - message.setReceiverWorkerId(workerId); - message.setReceiverProcessId(processId); - return message; - } - } - } - - return null; - } - - public void completeMessage(int workerId, int processId, long callId) { - if (workerId <= 0) { - throw new IllegalArgumentException("Invalid worker ID: " + workerId); - } - if (processId <= 0) { - throw new IllegalArgumentException("Invalid process ID: " + processId); - } - if (callId <= 0) { - throw new IllegalArgumentException("Invalid call ID: " + callId); - } - synchronized (mMessageQueue) { - Iterator iterator = mMessageQueue.iterator(); - while (iterator.hasNext()) { - DistributedCallMessage message = iterator.next(); - if (message.getCallId().longValue() == callId) { - if (message.getCallStatus() != CallStatus.PROCESSING) { - throw new IllegalStateException("Call #" + callId + " not in state PROCESSING"); - } - if (!message.getReceiverWorkerId().equals(workerId)) { - throw new IllegalStateException("Call #" + callId + " assigned to worker " + message.getReceiverWorkerId() + " not worker " + workerId); - } - if (!message.getReceiverProcessId().equals(processId)) { - throw new IllegalStateException("Call #" + callId + " assigned to process " + message.getReceiverProcessId() + " not process " + processId); - } - iterator.remove(); - return; - } - } - } - - throw new IllegalArgumentException("Unrecognized call ID " + callId); - } - - public boolean isQueueEmpty() { - synchronized (mMessageQueue) { - return mMessageQueue.isEmpty(); - } - } - - public void stop() { - try { - UnicastRemoteObject.unexportObject(this, false); - } catch (java.rmi.NoSuchObjectException exc) { - throw new RuntimeException("Exception unexporting object: " + exc.getMessage(), - exc); - } - } - - private synchronized long generateCallId() { - return ++mCallIdGenerator; - } - - private long mCallIdGenerator = 0; - private DistributedAlgorithm mAlgorithm = null; - private LinkedList mMessageQueue = - new LinkedList(); -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedCallService.java b/java/lib/edu/mit/broad/dcp/DistributedCallService.java deleted file mode 100644 index 202b25f42..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedCallService.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import edu.mit.broad.dcp.message.*; - -public interface DistributedCallService - extends java.rmi.Remote -{ - public DistributedAlgorithm getAlgorithm() - throws java.rmi.RemoteException; - public long writeMessage(DistributedCallMessage message) - throws java.rmi.RemoteException; - public DistributedCallMessage acceptMessage(int workerId, int processId) - throws java.rmi.RemoteException; - public void completeMessage(int workerId, int processId, long callId) - throws java.rmi.RemoteException; -} diff --git a/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java b/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java deleted file mode 100644 index 1b0fa0a4d..000000000 --- a/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp.message; - -import edu.mit.broad.dcp.CallStatus; - -public class DistributedCallMessage - extends DistributedMessage -{ - public DistributedCallMessage() { - } - - public Long getCallId() { - return mCallId; - } - - public void setCallId(Long value) { - mCallId = value; - } - - public CallStatus getCallStatus() { - return mCallStatus; - } - - public void setCallStatus(CallStatus value) { - mCallStatus = value; - } - - public String getMethodName() { - return mMethodName; - } - - public void setMethodName(String value) { - mMethodName = value; - } - - public Object[] getMethodArgs() { - return mMethodArgs; - } - - public void setMethodArgs(Object[] value) { - mMethodArgs = value; - } - - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("DistributedCallMessage"); - builder.append("("); - builder.append("" + getSenderWorkerId()); - builder.append(","); - builder.append("" + getSenderProcessId()); - builder.append(","); - builder.append("" + getReceiverWorkerId()); - builder.append(","); - builder.append("" + getReceiverProcessId()); - builder.append(","); - builder.append("" + mCallId); - builder.append(","); - builder.append("" + mCallStatus); - builder.append(","); - builder.append("" + mMethodName); - builder.append(","); - if (mMethodArgs == null) { - builder.append("" + mMethodArgs); - } else { - builder.append("["); - for (int i = 0; i < mMethodArgs.length; i++) { - if (i > 0) { - builder.append(","); - } - builder.append("" + mMethodArgs[i]); - } - builder.append("]"); - } - builder.append(")"); - return builder.toString(); - } - - public Long mCallId; - public CallStatus mCallStatus; - public String mMethodName; - public Object[] mMethodArgs; -} diff --git a/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java b/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java deleted file mode 100644 index a5e837a69..000000000 --- a/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp.message; - - -public class DistributedMessage -{ - public DistributedMessage() { - } - - public Integer getSenderWorkerId() { - return mSenderWorkerId; - } - - public void setSenderWorkerId(Integer value) { - mSenderWorkerId = value; - } - - public Integer getSenderProcessId() { - return mSenderProcessId; - } - - public void setSenderProcessId(Integer value) { - mSenderProcessId = value; - } - - public Integer getReceiverWorkerId() { - return mReceiverWorkerId; - } - - public void setReceiverWorkerId(Integer value) { - mReceiverWorkerId = value; - } - - public Integer getReceiverProcessId() { - return mReceiverProcessId; - } - - public void setReceiverProcessId(Integer value) { - mReceiverProcessId = value; - } - - public Integer mSenderWorkerId; - public Integer mSenderProcessId; - public Integer mReceiverWorkerId; - public Integer mReceiverProcessId; -} diff --git a/java/lib/edu/mit/broad/picard/PicardException.java b/java/lib/edu/mit/broad/picard/PicardException.java deleted file mode 100644 index 4e36ba648..000000000 --- a/java/lib/edu/mit/broad/picard/PicardException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard; - -/** - * Basic Picard runtime exception that, for now, does nothing much - * - * @author Kathleen Tibbetts - */ -public class PicardException extends RuntimeException -{ - public PicardException(String message) { - super(message); - } - - public PicardException(String message, Throwable throwable) { - super(message, throwable); - } - -} diff --git a/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java b/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java deleted file mode 100644 index 54f0ab9aa..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner; - -import edu.mit.broad.picard.io.IoUtil; - -import java.io.File; -import java.io.IOException; -import java.util.Map; -import java.util.List; - -/** - * Abstract base class for use by Aligner implementations. Provides a constructor and - * accessors for common inputs and outputs. - * - * @author Kathleen Tibbetts - */ -public abstract class AbstractBaseAligner implements Aligner { - - private final Stringency stringency; // The stringency of the alignment - private final File readsBamFile; // The BAM file containing the read data - private final String outputPrefix; // The directory and file name prefix for outputs - private final String referenceFileDir; // The directory where the reference file can be found - private final int clipPoints[]; // The clip points to use - private final Integer expectedInsertSize; // Expected insert size; null for non-paired-end lanes - private final Integer readsToAlign; // The number of reads to align (all if null) - private final boolean pairedReads; // Whether this is a paired-end run - private final int readLength; - // Parameters specific to the Aligner implementation being used - private final Map customParametersMap; - - /** - * Constructor that sets every parameter. - * - * @param stringency the stringency of the alignment - * @param readsBamFile the BAM file containing the reads - * @param outputPrefix the directory and filename prefix for output - * @param referenceFileDir the directory where the reference file is located - * @param clipPoints the clip points - * @param expectedInsertSize the expected insert size (null for non-PE lanes) - * @param readsToAlign the number of reads to align - * @param customParametersMap parameters specific to the Aligner implementation - */ - public AbstractBaseAligner(Stringency stringency, File readsBamFile, String outputPrefix, - String referenceFileDir, int clipPoints[], Integer expectedInsertSize, - Integer readsToAlign, Map customParametersMap, - boolean pairedReads, int readLength) { - - // First, a little validation - if (clipPoints != null && clipPoints.length != 4) { - throw new IllegalArgumentException("Length of clipPoints array argument must be 4."); - } - IoUtil.assertFileIsReadable(readsBamFile); - - this.stringency = stringency; - this.readsBamFile = readsBamFile; - this.outputPrefix = outputPrefix; - this.referenceFileDir = referenceFileDir; - this.clipPoints = clipPoints != null ? clipPoints : new int[4]; - this.expectedInsertSize = expectedInsertSize; - this.readsToAlign = readsToAlign; - this.customParametersMap = customParametersMap; - this.pairedReads = pairedReads; - this.readLength = readLength; - } - - /** - * Utility method for deleting a list of files, to be used by the - * cleanup method of sub-classes - * - * @param files the list of files to delete - */ - protected final void deleteFiles(List files) { - for (File f : files) { - f.delete(); - } - } - - // Accessors - protected final Stringency getStringency() { return stringency; } - protected final File getReadsBamFile() { return readsBamFile; } - protected final String getOutputPrefix() { return outputPrefix; } - protected final String getReferenceFileDir() { return referenceFileDir; } - protected final int[] getClipPoints() { return clipPoints; } - protected final Integer getExpectedInsertSize() { return expectedInsertSize; } - protected final Integer getReadsToAlign() { return readsToAlign; } - protected final Map getCustomParametersMap() { return customParametersMap; } - protected final boolean isPairedReads() { return pairedReads; } - protected final int getReadLength() { return readLength; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/Aligner.java b/java/lib/edu/mit/broad/picard/aligner/Aligner.java deleted file mode 100644 index d0fdf47de..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/Aligner.java +++ /dev/null @@ -1,45 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner; - -/** - * API for aligners. Clients must call these methods in order, as each depends on - * the previous one, but they may call them multiple times and need not call them all. - * This allows steps to be rerun and also lets the caller review intermediate files - * when troubleshooting. - * - * @author Kathleen Tibbetts - */ -public interface Aligner { - - public static enum Stringency{ low, high }; - - /** - * Prepares all the necessary inputs for the alignment process from a BAM file of read data. - */ - public void prepareInputs(); - - /** - * Does the alignment and produces output in the underlying form of the aligner. - */ - public void align(); - - /** - * Converts the output of the aligner to BAM format - */ - public void prepareOutput(); - - /** - * Cleans up intermediate files (the files created in by and for the underlying aligner by the - * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file. - */ - public void cleanup(); - -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java b/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java deleted file mode 100644 index 1f3cd55ac..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java +++ /dev/null @@ -1,319 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.filter.*; -import edu.mit.broad.picard.util.PeekableIterator; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.sam.ReservedTagConstants; - -import java.io.File; -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Arrays; - -/** - * Class to take unmapped reads in BAM file format and create Maq binary fastq format file(s) -- - * one or two of them, depending on whether it's a paired-end read. This relies on the unmapped - * BAM file having all paired reads together in order. - */ -public class BamToBfqWriter { - - private final File bamFile; - private final String outputPrefix; - private boolean pairedReads = false; - private int wrote = 0; - private int increment = 1; - private int chunk = 0; - private BinaryCodec codec1; - private BinaryCodec codec2; - private final Log log = Log.getInstance(BamToBfqWriter.class); - - /** - * Constructor - * - * @param bamFile the BAM file to read from - * @param outputPrefix the directory and file prefix for the binary fastq files - * @param total the total number of records that should be written, drawn evenly - * from throughout the file (null for all). - * @param chunk the maximum number of records taht should be written to any one file - * @param pairedReads whether these reads are from a paired-end run - */ - public BamToBfqWriter(File bamFile, String outputPrefix, Integer total, Integer chunk, boolean pairedReads) { - this.bamFile = bamFile; - this.outputPrefix = outputPrefix; - this.pairedReads = pairedReads; - if (total != null) { - double writeable = (double)countWritableRecords(); - this.increment = (int)Math.floor(writeable/total.doubleValue()); - } - if (chunk != null) { - this.chunk = chunk; - } - } - - /** - * Constructor - * - * @param bamFile the BAM file to read from - * @param outputPrefix the directory and file prefix for the binary fastq files - * @param pairedReads whether these reads are from a paired-end run - */ - public BamToBfqWriter(File bamFile, String outputPrefix, boolean pairedReads) { - this(bamFile, outputPrefix, null, null, pairedReads); - } - - /** - * Writes the binary fastq file(s) to the output directory - */ - public void writeBfqFiles() { - - Iterator iterator = (new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator(); - - // Filter out noise reads and reads that fail the quality filter - TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1); - FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter(); - - if (!pairedReads) { - writeSingleEndBfqs(iterator, Arrays.asList(tagFilter, qualityFilter)); - codec1.close(); - } - else { - writePairedEndBfqs(iterator, tagFilter, qualityFilter); - codec1.close(); - codec2.close(); - } - log.info("Wrote " + wrote + " bfq records."); - - } - - /** - * Path for writing bfqs for paired-end reads - * - * @param iterator the iterator witht he SAM Records to write - * @param tagFilter the filter for noise reads - * @param qualityFilter the filter for PF reads - */ - private void writePairedEndBfqs(Iterator iterator, TagFilter tagFilter, - FailsVendorReadQualityFilter qualityFilter) { - // Open the codecs for writing - int fileIndex = 0; - initializeNextBfqFiles(fileIndex++); - - int records = 0; - - while (iterator.hasNext()) { - SAMRecord first = iterator.next(); - if (!iterator.hasNext()) { - throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath()); - } - SAMRecord second = iterator.next(); - if (!second.getReadName().equals(first.getReadName()) || - first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) { - throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() + - ": " + first.getReadName() + ", " + second.getReadName() + "."); - } - - // If both are noise reads, filter them out - if (tagFilter.filterOut(first) && tagFilter.filterOut(second)) { - // skip it - } - // If either fails to pass filter, then exclude them as well - else if (qualityFilter.filterOut(first) || qualityFilter.filterOut(second)) { - // skip it - } - // Otherwise, write them out - else { - records++; - if (records % increment == 0) { - first.setReadName(first.getReadName() + "#0/1"); - writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first); - second.setReadName(second.getReadName() + "#0/2"); - writeFastqRecord(second.getFirstOfPairFlag() ? codec1 : codec2, second); - wrote++; - if (wrote % 1000000 == 0) { - log.info(wrote + " records written."); - } - if (chunk > 0 && wrote % chunk == 0) { - initializeNextBfqFiles(fileIndex++); - } - } - } - } - } - - /** - * Path for writing bfqs for single-end reads - * - * @param iterator the iterator witht he SAM Records to write - * @param filters the list of filters to be applied - */ - private void writeSingleEndBfqs(Iterator iterator, List filters) { - - // Open the codecs for writing - int fileIndex = 0; - initializeNextBfqFiles(fileIndex++); - - int records = 0; - - FilteringIterator it = new FilteringIterator(iterator, new AggregateFilter(filters)); - while (it.hasNext()) { - SAMRecord record = it.next(); - records++; - if (records % increment == 0) { - - writeFastqRecord(codec1, record); - wrote++; - if (wrote % 1000000 == 0) { - log.info(wrote + " records processed."); - } - if (chunk > 0 && wrote % chunk == 0) { - initializeNextBfqFiles(fileIndex++); - } - } - } - } - - /** - * Closes any the open bfq file(s), if any, and opens the new one(s) - * - * @param fileIndex the index (counter) of the files to write - */ - private void initializeNextBfqFiles(int fileIndex) { - // Close the codecs if they were writing before - if (codec1 != null) { - codec1.close(); - if (pairedReads) { - codec2.close(); - } - } - - // Open new file, using the fileIndex. - File bfq1 = getOutputFile(this.outputPrefix , 1, fileIndex); - codec1 = new BinaryCodec(IoUtil.openFileForWriting(bfq1)); - log.info("Now writing to file " + bfq1.getAbsolutePath()); - if (pairedReads) { - File bfq2 = getOutputFile(this.outputPrefix , 2, fileIndex); - codec2 = new BinaryCodec(IoUtil.openFileForWriting(bfq2)); - log.info("Now writing to file " + bfq2.getAbsolutePath()); - } - } - - /** - * Writes out a SAMRecord in Maq fastq format - * - * @param codec the code to write to - * @param rec the SAMRecord to write - */ - private void writeFastqRecord(BinaryCodec codec, SAMRecord rec) { - - // Writes the length of the read name and then the name (null-terminated) - codec.writeString(rec.getReadName(), true, true); - - char seqs[] = rec.getReadString().toCharArray(); - char quals[] = rec.getBaseQualityString().toCharArray(); - - // Write the length of the sequence - codec.writeInt(seqs.length); - - // Calculate and write the sequence and qualities - byte seqsAndQuals[] = new byte[seqs.length]; - - for (int i = 0; i < seqs.length; i++) { - int quality = Math.min(quals[i]-33, 63); - int base; - switch(seqs[i]) { - case 'A': - case 'a': - base = 0; - break; - case 'C': - case 'c': - base = 1; - break; - case 'G': - case 'g': - base = 2; - break; - case 'T': - case 't': - base = 3; - break; - case 'N': - case 'n': - case '.': - base = 0; - quality = 0; - break; - default: - throw new PicardException("Unknown base when writing bfq file: " + seqs[i]); - } - seqsAndQuals[i] = (byte) (base << 6 | quality); - } - codec.writeBytes(seqsAndQuals); - } - - private int countWritableRecords() { - int count = 0; - PeekableIterator it = new PeekableIterator((new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator()); - if (!this.pairedReads) { - // Filter out noise reads and reads that fail the quality filter - List filters = new ArrayList(); - filters.add(new TagFilter(ReservedTagConstants.XN, 1)); - filters.add(new FailsVendorReadQualityFilter()); - FilteringIterator itr = new FilteringIterator(it, new AggregateFilter(filters)); - while (itr.hasNext()) { - itr.next(); - count++; - } - } - else { - while (it.hasNext()) { - SAMRecord first = it.next(); - SAMRecord second = it.next(); - // If both are noise reads, filter them out - if (first.getAttribute(ReservedTagConstants.XN) != null && - second.getAttribute(ReservedTagConstants.XN) != null) { - // skip it - } - // If either fails to pass filter, then exclude them as well - else if (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag() ) { - // skip it - } - // Otherwise, write them out - else { - count++; - } - } - } - it.close(); - return count; - } - - /** - * Constructs the name for the output file and returns the file - * - * @param outputPrefix the directory and file prefix for the output bfq file - * @param read whether this is the file for the first or second read - * @return a new File object for the bfq file. - */ - private File getOutputFile(String outputPrefix, int read, int index) { - File result = new File(outputPrefix + "." + index + "." + read + ".bfq"); - IoUtil.assertFileIsWritable(result); - return result; - } - -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java b/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java deleted file mode 100644 index af5574185..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java +++ /dev/null @@ -1,357 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.sam.*; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.StringUtil; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.util.SamPairUtil; - -import java.io.File; -import java.io.BufferedInputStream; -import java.util.*; - -/** - * Reads a Maq map file and returns an an iterator of SAMRecords and a populated header - * - * IMPORTANT! Even though the reads in the map file are in coordinate order, this iterator - * will not necessarily return them in that order. For paired reads, both will be - * returned only after *both* records have been seen. - * - * @author Kathleen Tibbetts - */ -public class MapFileIterator implements CloseableIterator { - - public static final int MATE_UNMAPPED_FLAG = 64; - public static final int READ_UNMAPPED_FLAG = 192; - - private static final int READ_NAME_LENGTH = 36; - private static final int MAP_FORMAT = -1; - private static final int MAX_READ_LENGTH = 128; - - private static final byte ACGT[] = {'A', 'C', 'G', 'T'}; - - public static final String PROGRAM_RECORD = "0"; - - private long recordCount = 0L; - private int recordsRead = 0; - private BinaryCodec mapCodec; - private final SAMFileHeader header; - private final boolean pairedReads; - private final boolean jumpingLibrary; - private final List next = new ArrayList(); - private final Map pending = new HashMap(); - private final List mapFiles = new LinkedList(); - - /** - * Constructor. Opens the map file, reads the record count and header from it, - * creates the SAMFileHeader, and queues up the first read - * - * @param mapFile The Maq map file to read - * @param commandLine The command line used to invoke Maq (for the header) - * @param pairedReads Whether this is a paired-end run - */ - public MapFileIterator(String commandLine, boolean pairedReads, boolean jumpingLibrary, File... mapFile) { - if (mapFile.length == 0) { - throw new IllegalArgumentException("At least one map file must be provided."); - } - mapFiles.addAll(Arrays.asList(mapFile)); - - this.pairedReads = pairedReads; - this.jumpingLibrary = jumpingLibrary; - - header = new SAMFileHeader(); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - SAMProgramRecord program = new SAMProgramRecord(PROGRAM_RECORD); - program.setProgramVersion(MaqConstants.getProgramVersion()); - program.setCommandLine(commandLine); - header.addProgramRecord(program); - - queueNextMapFile(); - } - - /** - * Queues up the next map file - * - * @return true if there's another map file to iterate over - */ - private boolean queueNextMapFile() { - - // Close the old file - if (mapCodec != null) { - mapCodec.close(); - } - - // If there are no more map files, return fales - if (mapFiles.size() == 0) { - return false; - } - - // Otherwise, open the next file and reset the recordsRead count - mapCodec = new BinaryCodec(new BufferedInputStream(IoUtil.openFileForReading(mapFiles.remove(0)))); - int format = mapCodec.readInt(); - if (format != MAP_FORMAT) { - mapCodec.close(); - throw new PicardException("Unrecognized Maq map file format: " + format); - } - recordsRead = 0; - - - // Read the sequences out of the map file and set them on the header - int sequenceCount = mapCodec.readInt(); - List sequences = new ArrayList(); - for (int i = 0; i < sequenceCount; i++) { - int length = mapCodec.readInt(); - // Write the sequence name, trimming off the null terminator - sequences.add(new SAMSequenceRecord(mapCodec.readString(length).substring(0, length-1))); - } - if (header.getSequences() == null || header.getSequences().size() == 0) { - header.setSequences(sequences); - } - else { - // TODO: Check that the sequences match and throw and exception if they don't - } - recordCount = mapCodec.readLong(); - - readNext(); - return true; - } - - /** - * Closes the BinaryCodec reading the map file - */ - public void close() { - mapCodec.close(); - } - - /** - * @return true if the iteration has more elements - */ - public boolean hasNext() { - return next.size() > 0; - } - - /** - * @return the next SAMRecord in the iteration - * @throws NoSuchElementException if this is called when hasNext() returns false - */ - public SAMRecord next() { - if (!hasNext()) { - throw new NoSuchElementException("No more elements in this iteration"); - } - SAMRecord result = next.remove(0); - readNext(); - return result; - } - - /** - * Reads the next element from the map file. If we are done with it, we put it in the next - * list; if we are waiting to see its mate, we put it in the pending map. Calls itself - * repeatedly until there is at least one element in next. - */ - private void readNext() { - - // If there's already a record queued up, just return - if (next.size() > 0) { - return; - } - - // If we've read all there is, then any remaining records in the pending map should be returned. - // If this is not a PE run, then the pending map will be empty and we're done. - if (recordsRead == recordCount) { - if (pending.size() > 0) { - StringBuffer sb = new StringBuffer(); - for (String item : pending.keySet()) { - sb.append(item).append("\n"); - } - throw new PicardException("MapFileIterator pending map should have been empty but contained " + - "the following records: " + sb.toString()); - } - queueNextMapFile(); - return; - } - - // Otherwise, we read until there is at least one record in the next list - readMapRecord(); - if (next.size() == 0) { - readNext(); - } - } - - /** - * Reads one record from the map file and throws it onto the pending map or the next list, - * depending on whether we have already seen its mate - */ - private void readMapRecord() { - - // Now that we've got all the data from the binary file, write a SAMRecord and add it to - // the new BAM file - SAMRecord record = new SAMRecord(); - record.setAttribute(SAMTag.PG.toString(), PROGRAM_RECORD); - record.setReadPairedFlag(this.pairedReads); - - // the last base is the single-end mapping quality. - byte seqsAndQuals[] = new byte[MAX_READ_LENGTH-1]; - mapCodec.readBytes(seqsAndQuals); - - byte singleEndMappingQualityOrIndelLength = mapCodec.readByte(); - - // the length of the read - int readLength = mapCodec.readUByte(); - setSeqsAndQuals(seqsAndQuals, readLength, record); - - // the final mapping quality (unless flag below is 130, then it is the - // position of the indel (or 0 if no indel) - int mappingQuality = mapCodec.readUByte(); - - // mismatches in the 28bp (higher 4 bits) and mismatches (lower 4 bits) - mapCodec.readUByte(); - // sum of errors of the best hit - mapCodec.readUByte(); - // counts of all 0- and 1-mismatch hits on the reference - mapCodec.readUByte(); - mapCodec.readUByte(); - - // A bitwise flag. See the Maq docs for its full meaning - int flag = mapCodec.readUByte(); - - // the lower mapQ of the two ends (equals map_qual if unpaired); if flag is 130: mapQ of its mate - int altQual = mapCodec.readUByte(); - - // Index of the sequence for this read - record.setReferenceIndex((int)mapCodec.readUInt(), getHeader()); - - // Start position and strand - long pos = mapCodec.readUInt(); - int startPos = ((int)((pos>>1)& 0x7FFFFFFF)) + 1; - record.setAlignmentStart(startPos); - record.setReadNegativeStrandFlag((pos&1) == 1); - - // offset of the mate (zero if unpaired, or two ends mapped to different chr) - mapCodec.readInt(); - - // The read name - byte nameBytes[] = new byte[READ_NAME_LENGTH]; - mapCodec.readBytes(nameBytes); - String name = StringUtil.bytesToString(nameBytes).trim(); - if (this.pairedReads) { - if (name.endsWith("/1")) { - record.setFirstOfPairFlag(true); - record.setSecondOfPairFlag(false); - } - else if (name.endsWith("/2")) { - record.setFirstOfPairFlag(false); - record.setSecondOfPairFlag(true); - } - else { - throw new PicardException("Unrecognized ending for paired read name: " + name); - } - name = name.substring(0, name.length()-2); - } - record.setReadName(name); - - - if (flag != 130 || singleEndMappingQualityOrIndelLength == 0) { // No indel - record.setCigarString(readLength + "M"); - record.setMappingQuality(mappingQuality); - } - else { // Indel - int indelPos = mappingQuality; - String cigar = indelPos + "M" + Math.abs(singleEndMappingQualityOrIndelLength); - int remaining = readLength - indelPos; - if (singleEndMappingQualityOrIndelLength > 0) { - cigar += "I" + (remaining - singleEndMappingQualityOrIndelLength) + "M"; - } - else { - cigar += "D" + remaining + "M"; - } - record.setCigarString(cigar); - // In the docs, it look like there is a mapping quality for the mate, do we use that? - record.setMappingQuality(altQual); - } - - if (!pairedReads) { - record.setProperPairFlag(false); - next.add(record); - } - else { - record.setMateUnmappedFlag(flag == MATE_UNMAPPED_FLAG); - SAMRecord mate = pending.remove(record.getReadName()); - - if (mate != null) { - boolean proper = SamPairUtil.isProperPair(record, mate, jumpingLibrary); - record.setProperPairFlag(proper); - mate.setProperPairFlag(proper); - - SamPairUtil.setMateInfo(record, mate); - - int insertSize = SamPairUtil.computeInsertSize(record, mate); - record.setInferredInsertSize(insertSize); - mate.setInferredInsertSize(insertSize); - - if (!mate.getMateUnmappedFlag()) { - next.add(record); - } - if (!record.getMateUnmappedFlag()) { - next.add(mate); - } - } - else { - pending.put(record.getReadName(), record); - } - } - - // TODO: Figure out what do do about noise reads long-term - // Note that it is possible that we have lost a "Noise read" annotation at this point. Since - // we try to map a pair if only one of the reads is classified as "noise", then for any paired - // reads where one was a noise read and one was not, we will lose the noise annotation on the - // one noisy read. We have discussed either re-doing the noise evaluation here, modifying the - // read name to carry the noise flag through Maq, or changing what reads we give to Maq. - - recordsRead++; - - } - - /** - * Decodes the sequence and the qualities and sets them on the SAMrecords - * - * @param seqsAndQuals the list of seqs and quals - * @param readLength the length of the read - * @param sam the SAMRecord to populate - */ - private void setSeqsAndQuals(byte seqsAndQuals[], int readLength, SAMRecord sam) { - byte sequence[] = new byte[readLength]; - byte qualities[] = new byte[readLength]; - for (int i = 0; i < readLength; i++) { - byte b = seqsAndQuals[i]; - qualities[i] = (byte)(b & 0x3F); - if (b == 0) { - sequence[i] = 'N'; - } - else { - sequence[i] = ACGT[(seqsAndQuals[i] >> 6) & 3]; - } - } - sam.setReadBases(sequence); - sam.setBaseQualities(qualities); - } - - /** - * @throws UnsupportedOperationException -- not implemented - */ - public void remove() { - throw new UnsupportedOperationException("remove() not supported in MapFileIterator"); - } - - public SAMFileHeader getHeader() { return header; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java deleted file mode 100644 index 6c1890818..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java +++ /dev/null @@ -1,211 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.picard.aligner.Aligner; -import edu.mit.broad.picard.aligner.AbstractBaseAligner; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.util.Log; - -import java.io.File; -import java.io.FilenameFilter; -import java.util.*; - -/** - * Maq implementation of the Aligner interface - */ -public class MaqAligner extends AbstractBaseAligner implements Aligner { - - // Constants related to Maq output files - public static final String MAQ_MAP_SUFFIX = ".out.aln.map"; - public static final String MAQ_LOG_SUFFIX = ".out.map.log"; - - // Internal constant for multi-plexing lane data - private static final int READ_CHUNK_SIZE = 2000000; - - public static final String REFERENCE_FILE_SUFFIX = ".bfa"; - - private final Log log = Log.getInstance(MaqAligner.class); - - private String commandLine = null; - - - /** - * Constructor that sets every parameter. All other constructors delegate to this one. - * - * @param stringency the stringency of the alignment - * @param readsBamFile the BAM file containing the reads - * @param outputPrefix the directory and filename prefix for output - * @param referenceFileDir the directory where the reference file is located - * @param clipPoints the clip points - * @param expectedInsertSize the expected insert size (null for non-PE lanes) - * @param readsToAlign the number of reads to align - * @param customParametersMap parameters specific to the Aligner implementation - */ - public MaqAligner(Stringency stringency, File readsBamFile, String outputPrefix, - String referenceFileDir, int clipPoints[], Integer expectedInsertSize, - Integer readsToAlign, Map customParametersMap, - boolean pairedReads, int readLength) { - - super(stringency, readsBamFile, outputPrefix, referenceFileDir, clipPoints, - expectedInsertSize, readsToAlign, customParametersMap, pairedReads, readLength); - } - - /** - * Prepares all the necessary inputs for the alignment process from a BAM file of read data. - */ - public void prepareInputs() { - log.info("Preparing Maq inputs."); - BamToBfqWriter writer = new BamToBfqWriter(this.getReadsBamFile(), this.getOutputPrefix(), - this.getReadsToAlign(), READ_CHUNK_SIZE, isPairedReads()); - writer.writeBfqFiles(); - } - - /** - * Does the alignment and produces output in the underlying form of the aligner. - */ - public void align() { - log.info("Running Maq alignment."); - - // Temporary hack until we get the multi-tasking code from Seva - List mapFileNames = new ArrayList(); // All map files that we will merge together at the end - - String maqParams = MaqConstants.SWITCH_RANDOM_SEED + " " + MaqConstants.DEFAULT_RANDOM_SEED; - - if (this.getStringency() == Stringency.high) { - maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + Math.round( - this.getExpectedInsertSize() * MaqConstants.HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER); - maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " + - MaqConstants.HIGH_STRINGENCY_SUM_MISMATCHES; - } - else { - maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + - MaqConstants.LOW_STRINGENCY_MAX_OUTER_DISTANCE; - // For low stringency, get at least 30 bases and then let half of what's remaining mismatch - int maxMisMatches = (this.getReadLength() - 30)/2; - maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " + - (maxMisMatches * MaqConstants.LOW_STRINGENCY_QUALITY_FOR_MISMATCHES); - } - - String referenceFile = new File(this.getReferenceFileDir()).listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(REFERENCE_FILE_SUFFIX); - } - })[0].getAbsolutePath(); - - ProcessBuilder builder; - - // Map the bfq files, individually or in pairs - SortedSet bfqs = new TreeSet(this.getBfqFiles()); - for (Iterator it = bfqs.iterator(); it.hasNext();) { - - String read1bfq = it.next().getAbsolutePath(); - String read2bfq = (this.isPairedReads()) ? it.next().getAbsolutePath() : ""; - - String outputFileBase = read1bfq.substring(0, read1bfq.lastIndexOf('.')-2); - String mapFile = outputFileBase + MAQ_MAP_SUFFIX; - String logFile = outputFileBase + MAQ_LOG_SUFFIX; - - String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + MaqConstants.MAP_COMMAND + - " " + maqParams + " " + mapFile + " " + referenceFile + " " + read1bfq + " " + read2bfq + - " 2> " + logFile; - setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command); - log.info("Executing command: " + command); - try { - builder = new ProcessBuilder(command.split(" ")); - Process p = builder.start(); - p.waitFor(); - } - catch (Exception e) { - throw new PicardException("Error starting Maq process", e); - } - - mapFileNames.add(mapFile); - } - - // If there's more than one map file, then merge them. - String finalFileName = this.getOutputPrefix() + "." + this.getStringency() + MAQ_MAP_SUFFIX; - if (mapFileNames.size() > 1) { - String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + - MaqConstants.MERGE_COMMAND + " " + finalFileName; - for (String name : mapFileNames) { - command += " " + name; - } - setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command); - log.info("Executing command: " + command); - - try { - builder = new ProcessBuilder(command.split(" ")); - Process p = builder.start(); - p.waitFor(); - } - catch (Exception e) { - throw new PicardException("Error starting Maq process", e); - } - } - else { // Otherwise rename the single map file so we can find it later - File f = new File(mapFileNames.get(0)); - if (!f.renameTo(new File(finalFileName))) { - throw new PicardException("Error renaming " + f.getAbsolutePath() + " to " + finalFileName); - } - } - } - - /** - * Converts the output of the aligner to BAM format - */ - public void prepareOutput() { - log.info("Preparing output from Maq alignment."); - // TODO: MaqToBam - } - - /** - * Cleans up intermediate files (the files created in by and for the underlying aligner by the - * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file. - */ - public void cleanup() { - log.info("Cleaning up Maq intermediate files."); - this.deleteFiles(getBfqFiles()); -// this.deleteFiles(getMaqAlignmentFiles()); - } - - /** - * Returns a list of zero to two BFQ files, depending on whether they are there - * and whether it was a paired-end run or not - * - * @return a list of BFQ files - */ - private List getBfqFiles() { - File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/"))); - return Arrays.asList(dir.listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(".bfq"); - } - })); - } - - /** - * Returns the Maq map files - * - * @return a list of Maq .map files - */ - private List getMaqAlignmentFiles() { - File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/"))); - return Arrays.asList(dir.listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - // TODO: Add the text files if we do not read the binary map files - return name.endsWith(MAQ_MAP_SUFFIX) || name.endsWith(MAQ_LOG_SUFFIX); - } - })); - } - - public String getCommandLine() { return commandLine; } - public void setCommandLine(String commandLine) { this.commandLine = commandLine; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java deleted file mode 100644 index b5e4b9b59..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -/** - * Utility class to hold Maq-related constants (program name, location, switches, etc) - */ -public class MaqConstants { - // General Maq constants - public static final String PROGRAM_NAME = "Maq"; - public static final String PROGRAM_VERSION = "0.7.1"; - public static final String MAQ_HOME = "/seq/dirseq/maq-0.7.1/"; - - // Command-related constants - public static final String MAQ_COMMAND = "maq"; - public static final String MAP_COMMAND = "map"; - public static final String MERGE_COMMAND = "mapmerge"; - - // Constants related to Maq map switches - public static final String SWITCH_SUM_MISMATCHES = "-e"; - public static final int HIGH_STRINGENCY_SUM_MISMATCHES = 100; - public static final int LOW_STRINGENCY_QUALITY_FOR_MISMATCHES = 30; - - public static final String SWITCH_MAX_OUTER_DISTANCE = "-a"; - public static final int LOW_STRINGENCY_MAX_OUTER_DISTANCE = 1500; - public static final double HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER = 1.5d; - - public static final String SWITCH_RANDOM_SEED = "-s"; - public static final int DEFAULT_RANDOM_SEED = 0; - - public static String getProgramVersion() { return PROGRAM_VERSION; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java deleted file mode 100644 index 3b82cc106..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.StringSortingCollectionFactory; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.util.SortingCollection; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.*; - -import java.io.File; -import java.io.BufferedInputStream; -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import java.nio.ByteBuffer; - -/** - * Class to write a BAM file that includes the results from a Maq .map file along with the unaligned - * reads from the original BAM file. - * - * Information on the meaning of the elements of the map file is drawn from the Maq documentation - * on this page: http://maq.sourceforge.net/maqmap_format.shtml - */ -public class MaqMapMerger { - - private final File mapFile; - private final File sourceBamFile; - private final File targetBamFile; - private final boolean pairedReads; - private final Log log = Log.getInstance(MaqMapMerger.class); - private String commandLine = null; - private List sequences = new ArrayList(); - - - /** - * Constructor - * - * @param mapFile The Maq map file to parse - * @param sourceBamFile The BAM file that was used as the input to the Maq aligner, which will - * include info on all the reads that did not map - * @param targetBamFile The file to which to write the merged - */ - public MaqMapMerger(File mapFile, File sourceBamFile, File targetBamFile, boolean pairedReads) { - IoUtil.assertFileIsReadable(mapFile); - IoUtil.assertFileIsReadable(sourceBamFile); - IoUtil.assertFileIsWritable(targetBamFile); - this.mapFile = mapFile; - this.sourceBamFile = sourceBamFile; - this.targetBamFile = targetBamFile; - this.pairedReads = pairedReads; - } - - /** - * Merges the alignment from the map file with the remaining records from the source BAM file. - */ - public void mergeAlignment() { - log.info("Processing map file: " + mapFile.getAbsolutePath()); - // Write the header - MapFileIterator it = new MapFileIterator(getCommandLine(), this.pairedReads, false, this.mapFile); - SAMFileHeader header = it.getHeader(); - SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, targetBamFile); - - // Write the alignments - SortingCollection readNames = writeAlignments(it, writer); - - // We're done with the map file, so close it - it.close(); - writeUnalignedReads(writer, readNames.iterator()); - - // Now close the writer - writer.close(); - } - - - private void writeUnalignedReads(SAMFileWriter writer, CloseableIterator nameIterator) { - - int skipCount = 0; - SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(this.sourceBamFile)); - CloseableIterator bamRecords = reader.iterator(); - - String readName = nameIterator.hasNext() ? nameIterator.next() : null; - while(bamRecords.hasNext()) { - SAMRecord rec = bamRecords.next(); - if (rec.getReadName().equals(readName)) { - // skip it and pull the next name off the name iterator - readName = nameIterator.hasNext() ? nameIterator.next() : null; - skipCount++; - } - else { - writer.addAlignment(rec); - } - } -System.out.println("Skipped " + skipCount + " already-aligned records."); - bamRecords.close(); - nameIterator.close(); - } - - private SortingCollection writeAlignments(MapFileIterator iterator, SAMFileWriter writer) { - -int wrote = 0; - SortingCollection readNames = StringSortingCollectionFactory.newCollection(); - while (iterator.hasNext()) { - SAMRecord record = iterator.next(); - readNames.add(record.getReadName()); - writer.addAlignment(record); -wrote++; - } -System.out.println("Wrote " + wrote + " alignment records."); - return readNames; - } - - public void setCommandLine(String commandLine) { this.commandLine = commandLine; } - public String getCommandLine() { return this.commandLine; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java b/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java deleted file mode 100644 index bc3741b02..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java +++ /dev/null @@ -1,133 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.aligner.Aligner; - -import java.io.File; -import java.util.Map; -import java.util.List; -import java.util.HashMap; -import java.util.ArrayList; - -/** - * CommandLineProgram to generate to invoke BustardToBamWriter - * - * @author Kathleen Tibbetts - */ -public class RunMaq extends CommandLineProgram { - private static final String PROGRAM_VERSION = "1.0"; - - // The following attributes define the command-line arguments - @Usage - public String USAGE = - "Usage: " + getClass().getName() + " [options]\n\n" + - "Invoke the Maq aligner.\n" + - "Version: " + PROGRAM_VERSION +"\n"; - - @Option(shortName="I", doc="The BAM file to parse.", optional=true) - public File INPUT; - @Option(shortName="O", doc="The directory and file prefix for all output.", optional=false) - public String OUTPUT; - @Option(shortName="L", doc="The read length.", optional=false) - public Integer READ_LENGTH; - @Option(shortName="S", doc="Stringency of the alignment.", optional=true) - public Aligner.Stringency STRINGENCY; - @Option(shortName="R", doc="Directory where the reference file is located.", optional=true) - public String REFERENCE; - @Option(shortName="C", doc="Clip points for the alignment.", optional=true, minElements=0, maxElements=4) - public List CLIP_POINT = new ArrayList(); - @Option(shortName="E", doc="Expected insert size.", optional=true) - public Integer EXPECTED_INSERT_SIZE; - @Option(doc="Whether this is a paired-end run.", optional=false) - public Boolean PE; - @Option(shortName="NUM", doc="Number of reads to align (null = all).", optional=true) - public Integer READS_TO_ALIGN; - @Option(shortName="CUSTOM", doc="Custom parameter in the form name=value.", optional=true) - public List CUSTOM_PARAMETER = new ArrayList(); - @Option(shortName="PREP", doc="Whether to prepare inputs for the alignement.", optional=true) - public Boolean PREPARE = true; - @Option(doc="Whether to do the alignement.", optional=true) - public Boolean ALIGN = true; - @Option(shortName="BAM", doc="Whether to generate a BAM file from the alignment output.", optional=true) - public Boolean BAM_OUTPUT = true; - @Option(doc="Whether to clean up intermediate input and output.", optional=true) - public Boolean CLEANUP = true; - - protected int doWork() { - int clipPoints[] = null; - if (CLIP_POINT != null) { - clipPoints = new int[4]; - int index=0; - for (Integer i : CLIP_POINT) { - clipPoints[index++] = i; - } - } - Map params = null; - if (CUSTOM_PARAMETER != null) { - params = new HashMap(); - for (String param : CUSTOM_PARAMETER) { - String nameAndVal[] = param.split("="); - params.put(nameAndVal[0], nameAndVal[1]); - } - } - Aligner aligner = new MaqAligner(STRINGENCY, INPUT, OUTPUT, REFERENCE, clipPoints, - EXPECTED_INSERT_SIZE, READS_TO_ALIGN, params, PE, READ_LENGTH); - if (PREPARE) { - aligner.prepareInputs(); - } - if (ALIGN) { - aligner.align(); - } - if (BAM_OUTPUT) { - aligner.prepareOutput(); - } - if (CLEANUP) { - aligner.cleanup(); - } - return 0; - } - - /** - * This is kind of a mess. Almost everything is optional, since you don't have to do all of the steps in the - * alignement. - * @return - */ - protected boolean customCommandLineValidation() { - if (PREPARE) { - if( INPUT == null) { - System.err.println("ERROR: INPUT must be specified when preparing inputs for the alignment."); - return false; - } - if (CLIP_POINT.size() != 0 && CLIP_POINT.size() != 4) { - System.err.println("ERROR: You must supply either 0 or 4 values for CLIP_POINT: " + CLIP_POINT.size()); - return false; - } - } - if (ALIGN) { - if (STRINGENCY == null) { - System.err.println("ERROR: STRINGENCY must be specified when doing an alignment."); - return false; - } - if (REFERENCE == null) { - System.err.println("ERROR: REFERENCE must be specified when doing an alignment."); - return false; - } - } - return true; - } - - public static void main(String[] argv) { - System.exit(new RunMaq().instanceMain(argv)); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java deleted file mode 100644 index cfe74bbcc..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -public class CommandLineParseException extends RuntimeException{ - public CommandLineParseException() { - } - - public CommandLineParseException(String s) { - super(s); - } - - public CommandLineParseException(String s, Throwable throwable) { - super(s, throwable); - } - - public CommandLineParseException(Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java deleted file mode 100644 index 69b681abb..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java +++ /dev/null @@ -1,638 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.io.*; -import java.lang.reflect.Constructor; -import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.ParameterizedType; -import java.lang.reflect.Type; -import java.util.*; - -import edu.mit.broad.picard.util.StringUtil; -import edu.mit.broad.picard.PicardException; - -/** - * Annotation-driven utility for parsing command-line arguments, checking for errors, and producing usage message. - * - * This class supports options of the form KEY=VALUE, plus positional arguments. Positional arguments must not contain - * an equal sign lest they be mistaken for a KEY=VALUE pair. - * - * The caller must supply an object that both defines the command line and has the parsed options set into it. - * For each possible KEY=VALUE option, there must be a public data member annotated with @Option. The KEY name is - * the name of the data member. An abbreviated name may also be specified with the shortName attribute of @Option. - * If the data member is a List, then the option may be specified multiple times. The type of the data member, - * or the type of the List element must either have a ctor T(String), or must be an Enum. List options must - * be initialized by the caller with some kind of list. Any other option that is non-null is assumed to have the given - * value as a default. If an option has no default value, and does not have the optional attribute of @Option set, - * is required. For List options, minimum and maximum number of elements may be specified in the @Option annotation. - * - * A single List data member may be annotated with the @PositionalArguments. This behaves similarly to a Option - * with List data member: the caller must initialize the data member, the type must be constructable from String, and - * min and max number of elements may be specified. If no @PositionalArguments annotation appears in the object, - * then it is an error for the command line to contain positional arguments. - * - * A single String public data member may be annotated with @Usage. This string, if present, is used to - * construct the usage message. Details about the possible options are automatically appended to this string. - * If @Usage does not appear, a boilerplate usage message is used. - */ -public class CommandLineParser { - // For formatting option section of usage message. - private static final int OPTION_COLUMN_WIDTH = 30; - private static final int DESCRIPTION_COLUMN_WIDTH = 50; - - private static final Boolean[] TRUE_FALSE_VALUES = {Boolean.TRUE, Boolean.FALSE}; - - // Use these if no @Usage annotation - private static final String defaultUsagePreamble = "Usage: program [options...]\n"; - private static final String defaultUsagePreambleWithPositionalArguments = - "Usage: program [options...] [positional-arguments...]\n"; - private static final String OPTIONS_FILE = "OPTIONS_FILE"; - - /** - * A typical command line program will call this to get the beginning of the usage message, - * and then append a description of the program, like this: - * - * \@Usage(programVersion=PROGRAM_VERSION) - * public String USAGE = CommandLineParser.getStandardUsagePreamble(getClass()) + "Frobnicates the freebozzle." - */ - public static String getStandardUsagePreamble(Class mainClass) { - return "USAGE: " + mainClass.getName() + " [options]\n\n"; - } - - // This is the object that the caller has provided that contains annotations, - // and into which the values will be assigned. - private final Object callerOptions; - - private String usagePreamble; - // null if no @PositionalArguments annotation - private Field positionalArguments; - private int minPositionalArguments; - private int maxPositionalArguments; - - // List of all the data members with @Option annotation - private final List optionDefinitions = new ArrayList(); - - // Maps long name, and short name, if present, to an option definition that is - // also in the optionDefinitions list. - private final Map optionMap = new HashMap(); - - // For printing error messages when parsing command line. - private PrintStream messageStream; - - // In case implementation wants to get at arg for some reason. - private String[] argv; - - - /** - * This attribute is here just to facilitate printing usage for OPTIONS_FILE - */ - public File IGNORE_THIS_PROPERTY; - - /** - * Prepare for parsing command line arguments, by validating annotations. - * @param callerOptions This object contains annotations that define the acceptable command-line options, - * and ultimately will receive the settings when a command line is parsed. - */ - public CommandLineParser(final Object callerOptions) { - this.callerOptions = callerOptions; - - for (final Field field : this.callerOptions.getClass().getFields()) { - if (field.getAnnotation(PositionalArguments.class) != null) { - handlePositionalArgumentAnnotation(field); - } - if (field.getAnnotation(Usage.class) != null) { - handleUsageAnnotation(field); - } - if (field.getAnnotation(Option.class) != null) { - handleOptionAnnotation(field); - } - } - - if (usagePreamble == null) { - if (positionalArguments == null) { - usagePreamble = defaultUsagePreamble; - } else { - usagePreamble = defaultUsagePreambleWithPositionalArguments; - } - } - } - - /** - * Print a usage message based on the options object passed to the ctor. - * @param stream Where to write the usage message. - */ - public void usage(final PrintStream stream) { - stream.print(usagePreamble); - if (!optionDefinitions.isEmpty()) { - stream.println("\nOptions:\n"); - for (final OptionDefinition optionDefinition : optionDefinitions) { - printOptionUsage(stream, optionDefinition); - } - } - final Field fileField; - try { - fileField = getClass().getField("IGNORE_THIS_PROPERTY"); - } catch (NoSuchFieldException e) { - throw new PicardException("Should never happen", e); - } - final OptionDefinition optionsFileOptionDefinition = - new OptionDefinition(fileField, OPTIONS_FILE, "", - "File of OPTION_NAME=value pairs. No positional parameters allowed. Unlike command-line options, " + - "unrecognized options are ignored. " + "A single-valued option set in an options file may be overridden " + - "by a subsequent command-line option. " + - "A line starting with '#' is considered a comment.", false, true, 0, Integer.MAX_VALUE, null, new String[0]); - printOptionUsage(stream, optionsFileOptionDefinition); - } - - /** - * Parse command-line options, and store values in callerOptions object passed to ctor. - * @param messageStream Where to write error messages. - * @param args Command line tokens. - * @return true if command line is valid. - */ - public boolean parseOptions(final PrintStream messageStream, final String[] args) { - this.argv = args; - this.messageStream = messageStream; - for (final String arg: args) { - if (arg.equals("-h") || arg.equals("--help")) { - usage(messageStream); - return false; - } - final String[] pair = arg.split("=", 2); - if (pair.length == 2) { - if (pair[0].equals(OPTIONS_FILE)) { - if (!parseOptionsFile(pair[1])) { - messageStream.println(); - usage(messageStream); - return false; - } - } else { - if (!parseOption(pair[0], pair[1], false)) { - messageStream.println(); - usage(messageStream); - return false; - } - } - } else if (!parsePositionalArgument(arg)) { - messageStream.println(); - usage(messageStream); - return false; - } - } - if (!checkNumArguments()) { - messageStream.println(); - usage(messageStream); - return false; - } - return true; - } - - /** - * After command line has been parsed, make sure that all required options have values, and that - * lists with minimum # of elements have sufficient. - * @return true if valid - */ - private boolean checkNumArguments() { - try { - for (final OptionDefinition optionDefinition : optionDefinitions) { - StringBuilder mutextOptionNames = new StringBuilder(); - for (String mutexOption : optionDefinition.mutuallyExclusive) { - OptionDefinition mutextOptionDef = optionMap.get(mutexOption); - if (mutextOptionDef != null && mutextOptionDef.hasBeenSet) { - mutextOptionNames.append(" ").append(mutextOptionDef.name); - } - } - if (optionDefinition.hasBeenSet && mutextOptionNames.length() > 0) { - messageStream.println("ERROR: Option '" + optionDefinition.name + - "' cannot be used in conjunction with option(s)" + - mutextOptionNames.toString()); - return false; - } - if (optionDefinition.isCollection) { - final Collection c = (Collection)optionDefinition.field.get(callerOptions); - if (c.size() < optionDefinition.minElements) { - messageStream.println("ERROR: Option '" + optionDefinition.name + "' must be specified at least " + - optionDefinition.minElements + " times."); - return false; - } - } else if (!optionDefinition.optional && !optionDefinition.hasBeenSet && mutextOptionNames.length() == 0) { - messageStream.print("ERROR: Option '" + optionDefinition.name + "' is required"); - if (optionDefinition.mutuallyExclusive.isEmpty()) { - messageStream.println("."); - } else { - messageStream.println(" unless any of " + optionDefinition.mutuallyExclusive + " are specified."); - } - return false; - } - } - if (positionalArguments != null) { - final Collection c = (Collection)positionalArguments.get(callerOptions); - if (c.size() < minPositionalArguments) { - messageStream.println("ERROR: At least " + minPositionalArguments + - " positional arguments must be specified."); - return false; - } - } - return true; - } catch (IllegalAccessException e) { - // Should never happen because lack of publicness has already been checked. - throw new RuntimeException(e); - } - } - - private boolean parsePositionalArgument(final String stringValue) { - if (positionalArguments == null) { - messageStream.println("ERROR: Invalid argument '" + stringValue + "'."); - return false; - } - final Object value; - try { - value = constructFromString(getUnderlyingType(positionalArguments), stringValue); - } catch (CommandLineParseException e) { - messageStream.println("ERROR: " + e.getMessage()); - return false; - } - final Collection c; - try { - c = (Collection)positionalArguments.get(callerOptions); - } catch (IllegalAccessException e) { - throw new RuntimeException(e); - } - if (c.size() >= maxPositionalArguments) { - messageStream.println("ERROR: No more than " + maxPositionalArguments + - " positional arguments may be specified on the command line."); - return false; - } - c.add(value); - return true; - } - - private boolean parseOption(String key, final String stringValue, final boolean optionsFile) { - key = key.toUpperCase(); - final OptionDefinition optionDefinition = optionMap.get(key); - if (optionDefinition == null) { - if (optionsFile) { - // Silently ignore unrecognized option from options file - return true; - } - messageStream.println("ERROR: Unrecognized option: " + key); - return false; - } - if (!optionDefinition.isCollection) { - if (optionDefinition.hasBeenSet && !optionDefinition.hasBeenSetFromOptionsFile) { - messageStream.println("ERROR: Option '" + key + "' cannot be specified more than once."); - return false; - } - } - final Object value; - try { - value = constructFromString(getUnderlyingType(optionDefinition.field), stringValue); - } catch (CommandLineParseException e) { - messageStream.println("ERROR: " + e.getMessage()); - return false; - } - try { - if (optionDefinition.isCollection) { - final Collection c = (Collection)optionDefinition.field.get(callerOptions); - if (c.size() >= optionDefinition.maxElements) { - messageStream.println("ERROR: Option '" + key + "' cannot be used more than " + - optionDefinition.maxElements + " times."); - return false; - } - c.add(value); - } else { - optionDefinition.field.set(callerOptions, value); - optionDefinition.hasBeenSet = true; - optionDefinition.hasBeenSetFromOptionsFile = optionsFile; - } - } catch (IllegalAccessException e) { - // Should never happen because we only iterate through public fields. - throw new RuntimeException(e); - } - return true; - } - - /** - * Parsing of options from file is looser than normal. Any unrecognized options are - * ignored, and a single-valued option that is set in a file may be overridden by a - * subsequent appearance of that option. - * A line that starts with '#' is ignored. - * @param optionsFile - * @return false if a fatal error occurred - */ - private boolean parseOptionsFile(final String optionsFile) { - try { - final BufferedReader reader = new BufferedReader(new FileReader(optionsFile)); - String line; - while ((line = reader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - final String[] pair = line.split("=", 2); - if (pair.length == 2) { - if (!parseOption(pair[0], pair[1], true)) { - messageStream.println(); - usage(messageStream); - return false; - } - } else { - messageStream.println("Strange line in OPTIONS_FILE " + optionsFile + ": " + line); - usage(messageStream); - return false; - } - } - reader.close(); - return true; - - } catch (IOException e) { - throw new PicardException("I/O error loading OPTIONS_FILE=" + optionsFile, e); - } - } - - private void printOptionUsage(final PrintStream stream, final OptionDefinition optionDefinition) { - final String type = getUnderlyingType(optionDefinition.field).getSimpleName(); - String optionLabel = optionDefinition.name + "=" + type; - stream.print(optionLabel); - if (optionDefinition.shortName.length() > 0) { - stream.println(); - } - if (optionDefinition.shortName.length() > 0) { - optionLabel = optionDefinition.shortName + "=" + type; - stream.print(optionLabel); - } - int numSpaces = OPTION_COLUMN_WIDTH - optionLabel.length(); - if (optionLabel.length() > OPTION_COLUMN_WIDTH) { - stream.println(); - numSpaces = OPTION_COLUMN_WIDTH; - } - printSpaces(stream, numSpaces); - final StringBuilder sb = new StringBuilder(); - if (optionDefinition.doc.length() > 0) { - sb.append(optionDefinition.doc); - sb.append(" "); - } - if (optionDefinition.optional && !optionDefinition.isCollection) { - sb.append("Default value: "); - sb.append(optionDefinition.defaultValue); - sb.append(". "); - } else if (!optionDefinition.isCollection){ - sb.append("Required. "); - } - Object[] enumConstants = getUnderlyingType(optionDefinition.field).getEnumConstants(); - if (enumConstants == null && getUnderlyingType(optionDefinition.field) == Boolean.class) { - enumConstants = TRUE_FALSE_VALUES; - } - if (enumConstants != null) { - sb.append("Possible values: {"); - for (int i = 0; i < enumConstants.length; ++i) { - if (i > 0) { - sb.append(", "); - } - sb.append(enumConstants[i].toString()); - } - sb.append("} "); - } - if (optionDefinition.isCollection) { - if (optionDefinition.minElements == 0) { - if (optionDefinition.maxElements == Integer.MAX_VALUE) { - sb.append("This option may be specified 0 or more times."); - } else { - sb.append("This option must be specified no more than " + optionDefinition.maxElements + "times."); - } - } else if (optionDefinition.maxElements == Integer.MAX_VALUE) { - sb.append("This option must be specified at least " + optionDefinition.minElements + " times."); - } else { - sb.append("This option may be specified between " + optionDefinition.minElements + - " and " + optionDefinition.maxElements + " times."); - } - } - if (!optionDefinition.mutuallyExclusive.isEmpty()) { - sb.append(" Cannot be used in conjuction with option(s)"); - for (String option : optionDefinition.mutuallyExclusive) { - OptionDefinition mutextOptionDefinition = optionMap.get(option); - sb.append(" ").append(mutextOptionDefinition.name); - if (mutextOptionDefinition.shortName.length() > 0) { - sb.append(" (").append(mutextOptionDefinition.shortName).append(")"); - } - } - } - final String wrappedDescription = StringUtil.wordWrap(sb.toString(), DESCRIPTION_COLUMN_WIDTH); - final String[] descriptionLines = wrappedDescription.split("\n"); - for (int i = 0; i < descriptionLines.length; ++i) { - if (i > 0) { - printSpaces(stream, OPTION_COLUMN_WIDTH); - } - stream.println(descriptionLines[i]); - } - stream.println(); - } - - private void printSpaces(final PrintStream stream, final int numSpaces) { - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < numSpaces; ++i) { - sb.append(" "); - } - stream.print(sb); - } - - private void handleOptionAnnotation(final Field field) { - try { - final Option optionAnnotation = field.getAnnotation(Option.class); - final boolean isCollection = isCollectionField(field); - if (isCollection) { - if (optionAnnotation.maxElements() == 0) { - throw new CommandLineParserDefinitionException("@Option member " + field.getName() + - "has maxElements = 0"); - } - if (optionAnnotation.minElements() > optionAnnotation.maxElements()) { - throw new CommandLineParserDefinitionException("In @Option member " + field.getName() + - ", minElements cannot be > maxElements"); - } - } - if (!canBeMadeFromString(getUnderlyingType(field))) { - throw new CommandLineParserDefinitionException("@Option member " + field.getName() + - " must have a String ctor or be an enum"); - } - - final OptionDefinition optionDefinition = new OptionDefinition(field, - field.getName(), - optionAnnotation.shortName(), - optionAnnotation.doc(), optionAnnotation.optional() || (field.get(callerOptions) != null), - isCollection, optionAnnotation.minElements(), - optionAnnotation.maxElements(), field.get(callerOptions), - optionAnnotation.mutex()); - - for (String option : optionAnnotation.mutex()) { - OptionDefinition mutextOptionDef = optionMap.get(option); - if (mutextOptionDef != null) { - mutextOptionDef.mutuallyExclusive.add(field.getName()); - } - } - if (optionMap.containsKey(optionDefinition.name)) { - throw new CommandLineParserDefinitionException(optionDefinition.name + " has already been used"); - } - optionMap.put(optionDefinition.name, optionDefinition); - if (optionDefinition.shortName.length() > 0) { - if (optionMap.containsKey(optionDefinition.shortName)) { - throw new CommandLineParserDefinitionException(optionDefinition.shortName + " has already been used"); - } - optionMap.put(optionDefinition.shortName, optionDefinition); - } - optionDefinitions.add(optionDefinition); - } catch (IllegalAccessException e) { - throw new CommandLineParserDefinitionException(field.getName() + - " must have public visibility to have @Option annotation"); - } - } - - private void handleUsageAnnotation(final Field field) { - if (usagePreamble != null) { - throw new CommandLineParserDefinitionException - ("@Usage cannot be used more than once in an option class."); - } - try { - usagePreamble = (String)field.get(callerOptions); - final Usage usageAnnotation = field.getAnnotation(Usage.class); - if (usageAnnotation.programVersion().length() > 0) { - usagePreamble += "Version: " + usageAnnotation.programVersion() + "\n"; - } - } catch (IllegalAccessException e) { - throw new CommandLineParserDefinitionException("@Usage data member must be public"); - } catch (ClassCastException e) { - throw new CommandLineParserDefinitionException - ("@Usage can only be applied to a String data member."); - } - } - - private void handlePositionalArgumentAnnotation(final Field field) { - if (positionalArguments != null) { - throw new CommandLineParserDefinitionException - ("@PositionalArguments cannot be used more than once in an option class."); - } - positionalArguments = field; - if (!isCollectionField(field)) { - throw new CommandLineParserDefinitionException("@PositionalArguments must be applied to a Collection"); - } - - if (!canBeMadeFromString(getUnderlyingType(field))) { - throw new CommandLineParserDefinitionException("@PositionalParameters member " + field.getName() + - "does not have a String ctor"); - } - - final PositionalArguments positionalArgumentsAnnotation = field.getAnnotation(PositionalArguments.class); - minPositionalArguments = positionalArgumentsAnnotation.minElements(); - maxPositionalArguments = positionalArgumentsAnnotation.maxElements(); - if (minPositionalArguments > maxPositionalArguments) { - throw new CommandLineParserDefinitionException("In @PositionalArguments, minElements cannot be > maxElements"); - } - } - - private boolean isCollectionField(final Field field) { - try { - field.getType().asSubclass(Collection.class); - return true; - } catch (ClassCastException e) { - return false; - } - } - - private Class getUnderlyingType(final Field field) { - if (isCollectionField(field)) { - final ParameterizedType clazz = (ParameterizedType)(field.getGenericType()); - final Type[] genericTypes = clazz.getActualTypeArguments(); - if (genericTypes.length != 1) { - throw new CommandLineParserDefinitionException("Strange collection type for field " + field.getName()); - } - return (Class)genericTypes[0]; - - } else { - return field.getType(); - } - } - - // True if clazz is an enum, or if it has a ctor that takes a single String argument. - private boolean canBeMadeFromString(final Class clazz) { - if (clazz.isEnum()) { - return true; - } - try { - clazz.getConstructor(String.class); - return true; - } catch (NoSuchMethodException e) { - return false; - } - } - - private Object constructFromString(final Class clazz, final String s) { - try { - if (clazz.isEnum()) { - try { - return Enum.valueOf(clazz, s); - } catch (IllegalArgumentException e) { - throw new CommandLineParseException("'" + s + "' is not a valid value for " + - clazz.getSimpleName() + ".", e); - } - } - final Constructor ctor = clazz.getConstructor(String.class); - return ctor.newInstance(s); - } catch (NoSuchMethodException e) { - // Shouldn't happen because we've checked for presence of ctor - throw new CommandLineParseException(e); - } catch (InstantiationException e) { - throw new CommandLineParseException("Abstract class '" + clazz.getSimpleName() + - "'cannot be used for an option value type.", e); - } catch (IllegalAccessException e) { - throw new CommandLineParseException("String constructor for option value type '" + clazz.getSimpleName() + - "' must be public.", e); - } catch (InvocationTargetException e) { - throw new CommandLineParseException("Problem constructing " + clazz.getSimpleName() + " from the string '" + s + "'.", - e.getCause()); - } - } - - public String[] getArgv() { - return argv; - } - - private class OptionDefinition { - final Field field; - final String name; - final String shortName; - final String doc; - final boolean optional; - final boolean isCollection; - final int minElements; - final int maxElements; - final String defaultValue; - boolean hasBeenSet = false; - boolean hasBeenSetFromOptionsFile = false; - Set mutuallyExclusive; - - private OptionDefinition(final Field field, final String name, final String shortName, final String doc, final boolean optional, final boolean collection, - final int minElements, final int maxElements, final Object defaultValue, String[] mutuallyExclusive) { - this.field = field; - this.name = name.toUpperCase(); - this.shortName = shortName.toUpperCase(); - this.doc = doc; - this.optional = optional; - isCollection = collection; - this.minElements = minElements; - this.maxElements = maxElements; - if (defaultValue != null) { - this.defaultValue = defaultValue.toString(); - } else { - this.defaultValue = "null"; - } - this.mutuallyExclusive = new HashSet(Arrays.asList(mutuallyExclusive)); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java deleted file mode 100644 index 088755e2a..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -public class CommandLineParserDefinitionException extends RuntimeException { - public CommandLineParserDefinitionException() { - } - - public CommandLineParserDefinitionException(String s) { - super(s); - } - - public CommandLineParserDefinitionException(String s, Throwable throwable) { - super(s, throwable); - } - - public CommandLineParserDefinitionException(Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java deleted file mode 100644 index 10ee7635f..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.util.StringUtil; -import edu.mit.broad.picard.metrics.Header; -import edu.mit.broad.picard.metrics.StringHeader; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.picard.metrics.MetricBase; - -import java.io.File; -import java.util.Date; -import java.util.List; -import java.util.ArrayList; - -/** - * Abstract class to facilitate writing command-line programs. - * - * To use: - * - * 1. Extend this class with a concrete class that has data members annotated with @Option, @PositionalArguments - * and/or @Usage annotations. - * - * 2. If there is any custom command-line validation, override customCommandLineValidation(). When this method is - * called, the command line has been parsed and set into the data members of the concrete class. - * - * 3. Implement a method doWork(). This is called after successful comand-line processing. The value it returns is - * the exit status of the program. It is assumed that the concrete class emits any appropriate error message before - * returning non-zero. doWork() may throw unchecked exceptions, which are caught and reported appropriately. - * - * 4. Implement the following static method in the concrete class: - * - * public static void main(String[] argv) { - System.exit(new MyConcreteClass().instanceMain(argv)); - } - - - */ -public abstract class CommandLineProgram { - - @Option - public File TMP_DIR = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name")); - - @Option(doc = "Control verbosity of logging") - public Log.LogLevel VERBOSITY = Log.LogLevel.INFO; - - @Option(doc = "Whether to suppress job-summary info on System.out") - public Boolean QUIET = false; - - private final String standardUsagePreamble = CommandLineParser.getStandardUsagePreamble(getClass()); - - /** - * Initialized in parseArgs. Subclasses may want to access this to do - * their own validation, and then print usage using clp. - */ - protected CommandLineParser clp; - - private final List
defaultHeaders = new ArrayList
(); - - /** - * Do the work after command line has been parsed. - * RuntimeException may be thrown by this method, and are reported appropriately. - * @return program exit status. - */ - protected abstract int doWork(); - - public int instanceMain(final String[] argv) { - // Build the default headers - final Date startDate = new Date(); - final String cmdline = getClass().getName() + " " + StringUtil.join(" ", argv); - this.defaultHeaders.add(new StringHeader(cmdline)); - this.defaultHeaders.add(new StringHeader("Started on: " + startDate)); - - if (!parseArgs(argv)) { - return 1; - } - - Log.setGlobalLogLevel(VERBOSITY); - - if (!TMP_DIR.exists()) { - // Intentially not checking the return value, because it may be that the program does not - // need a tmp_dir. If this fails, the problem will be discovered downstream. - TMP_DIR.mkdir(); - } - System.setProperty("java.io.tmpdir", TMP_DIR.getAbsolutePath()); - if (!QUIET) { - System.out.println("[" + new Date() + "] " + cmdline); - } - final int ret = doWork(); - if (!QUIET) { - System.out.println("[" + new Date() + "] " + getClass().getName() + " done."); - System.out.println("Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory()); - } - return ret; - } - - /** - * Put any custom command-line validation in an override of this method. - * clp is initialized at this point and can be used to print usage and access argv. - * Any options set by command-line parser can be validated. - * @return true if command line is valid. - */ - protected boolean customCommandLineValidation() { - return true; - } - - /** - * - * @return true if command line is valid - */ - protected boolean parseArgs(final String[] argv) { - clp = new CommandLineParser(this); - final boolean ret = clp.parseOptions(System.err, argv); - if (!ret) { - return false; - } - return customCommandLineValidation(); - } - - /** Gets a MetricsFile with default headers already written into it. */ - protected MetricsFile getMetricsFile() { - final MetricsFile file = new MetricsFile(); - for (final Header h : this.defaultHeaders) { - file.addHeader(h); - } - - return file; - } - - public String getStandardUsagePreamble() { - return standardUsagePreamble; - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java deleted file mode 100644 index 0702f3bc7..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.io.*; -import java.util.regex.Pattern; - -public class CommandLineUtils { - /** Regex for splitting on spaces. */ - public static final Pattern SPACE_SPLITTER = Pattern.compile(" "); - - // Regexes to split things apart on white space - public static final Pattern TAB_SPLITTER = Pattern.compile("\\t"); - - /** Checks that a file exists and is readable, and then returns a buffered reader for it. */ - public static BufferedReader getReader(File file) throws IOException { - return new BufferedReader(new InputStreamReader(getInputStream(file))); - } - - /** Checks that a file exists and is readable, and then returns a input stream for it. */ - public static InputStream getInputStream(File file) throws IOException { - if (!file.exists()) { - throw new RuntimeException("Specified file does not exist: " + file); - } - - if (!file.canRead()) { - throw new RuntimeException("Specified file is not readable: " + file); - } - - return new FileInputStream(file); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/Option.java b/java/lib/edu/mit/broad/picard/cmdline/Option.java deleted file mode 100644 index b7ffebdd9..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/Option.java +++ /dev/null @@ -1,60 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.lang.annotation.Documented; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -/** - * Used to annotate which fields of a CommandLineProgram are options given at the command line. - * If a command line call looks like "cmd option=foo x=y bar baz" the CommandLineProgram - * would have annotations on fields to handle the values of option and x. All options - * must be in the form name=value on the command line. The java type of the option - * will be inferred from the type of the field or from the generic type of the collection - * if this option is allowed more than once. The type must be an enum or - * have a constructor with a single String parameter. - * - * @author Alec Wysoker - */ -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -@Documented -public @interface Option { - /** The name of the option as it would appear on the command line. */ - String shortName() default ""; - - /** Text that appears for this option in text describing usage of the command line program. */ - String doc() default ""; - - /** - * If set to false, an exception will be thrown if the option is not specified. - * If 2 options are mutually exclusive and both have optional=false it will be - * interpreted as one or the other is required and an exception will only be thrown if - * neither are specified. - */ - boolean optional() default false; - - /** - * Array of option names that cannot be used in conjunction with this one. - * If 2 options are mutually exclusive and both have optional=false it will be - * interpreted as one OR the other is required and an exception will only be thrown if - * neither are specified. - */ - String[] mutex() default {}; - - /** The minimum number of times that this option is required. */ - int minElements() default 0; - - /** The maximum number of times this option is allowed. */ - int maxElements() default Integer.MAX_VALUE; -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java b/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java deleted file mode 100644 index f45301439..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java +++ /dev/null @@ -1,38 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.lang.annotation.Documented; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -/** - * Used to annotate which field of a CommandLineProgram should store parameters given at the - * command line which are not options. Fields with this annotation must be a Collection - * (and probably should be a List if order is important). - * If a command line call looks like "cmd option=foo x=y bar baz" the values "bar" and "baz" - * would be added to the collection with this annotation. The java type of the arguments - * will be inferred from the generic type of the collection. The type must be an enum or - * have a constructor with a single String parameter. - * - * @author Alec Wysoker - */ -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -@Documented -public @interface PositionalArguments { - /** The minimum number of arguments required. */ - int minElements() default 0; - - /** The maximum number of arguments allowed. */ - int maxElements() default Integer.MAX_VALUE; -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/Usage.java b/java/lib/edu/mit/broad/picard/cmdline/Usage.java deleted file mode 100644 index 13aef9467..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/Usage.java +++ /dev/null @@ -1,26 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.lang.annotation.Documented; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -/** - * Annotates the field that contains text to be displayed in a usage message. - */ -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -@Documented -public @interface Usage { - String programVersion() default ""; -} diff --git a/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java b/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java deleted file mode 100644 index 75fb98b16..000000000 --- a/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java +++ /dev/null @@ -1,62 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.BasicTextFileParser; -import edu.mit.broad.picard.util.Interval; -import edu.mit.broad.picard.util.FormatUtil; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; - -import java.io.File; -import java.util.List; - -/** - * Converts an arachne style map file to the new interval list format. - * - * @author Tim Fennell - */ -public class ArachneMapToIntervalList extends CommandLineProgram { - @Option(shortName="M", doc="The path to an archne style map file") public File MAP; - @Option(shortName="SD", doc="A sequence dictionary in SAM or BAM format") public File SEQUENCE_DICTIONARY; - @Option(shortName="O", doc="The output file to write the interval list to") public File OUTPUT; - @Option(shortName="P", doc="Prefix to use when generating names") public String PREFIX; - - /** Stock main method. */ - public static void main(String[] argv) { - System.exit(new ArachneMapToIntervalList().instanceMain(argv)); - } - - protected int doWork() { - IoUtil.assertFileIsReadable(MAP); - IoUtil.assertFileIsReadable(SEQUENCE_DICTIONARY); - IoUtil.assertFileIsWritable(OUTPUT); - - SAMFileReader sam = new SAMFileReader(SEQUENCE_DICTIONARY); - SAMFileHeader header = sam.getFileHeader(); - List seqs = header.getSequences(); - IntervalList list = new IntervalList(header); - - BasicTextFileParser parser = new BasicTextFileParser(true, 3, MAP); - FormatUtil format = new FormatUtil(); - int i=1; - - while (parser.hasNext()) { - String[] fields = parser.next(); - int seqIndex = format.parseInt(fields[0]); - int start = format.parseInt(fields[1]) + 1; - int end = format.parseInt(fields[2]) + 1; - String seq = seqs.get(seqIndex).getSequenceName(); - - Interval interval = new Interval(seq, start, end, false, PREFIX + "_" + i++); - list.add(interval); - } - - list.sort(); - list.write(OUTPUT); - - return 0; - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java b/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java deleted file mode 100644 index d3be86825..000000000 --- a/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java +++ /dev/null @@ -1,51 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.sam.SAMFileReader; - -import java.io.File; - -/** - * Calculates a set of HS metrics from a sam or bam file. - * - * @author Tim Fennell - */ -public class CalculateHsMetrics extends CommandLineProgram { - @Usage public final String USAGE = - "Calculates a set of Hybrid Selection specific metrics from an aligned SAM" + - "or BAM file."; - @Option(shortName="BI") public File BAIT_INTERVALS; - @Option(shortName="TI") public File TARGET_INTERVALS; - @Option(shortName="I") public File INPUT; - @Option(shortName="M") public File METRICS_FILE; - - /** Stock main method. */ - public static void main(String[] argv) { - System.exit(new CalculateHsMetrics().instanceMain(argv)); - } - - /** - * Asserts that files are readable and writable and then fires off an - * HsMetricsCalculator instance to do the real work. - */ - protected int doWork() { - IoUtil.assertFileIsReadable(BAIT_INTERVALS); - IoUtil.assertFileIsReadable(TARGET_INTERVALS); - IoUtil.assertFileIsReadable(INPUT); - IoUtil.assertFileIsWritable(METRICS_FILE); - - HsMetricsCalculator calculator = new HsMetricsCalculator(BAIT_INTERVALS, TARGET_INTERVALS); - SAMFileReader sam = new SAMFileReader(INPUT); - calculator.analyze(sam.iterator()); - - MetricsFile metrics = getMetricsFile(); - metrics.addMetric(calculator.getMetrics()); - - metrics.write(METRICS_FILE); - return 0; - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/GenomeMask.java b/java/lib/edu/mit/broad/picard/directed/GenomeMask.java deleted file mode 100644 index 27be5df71..000000000 --- a/java/lib/edu/mit/broad/picard/directed/GenomeMask.java +++ /dev/null @@ -1,52 +0,0 @@ -package edu.mit.broad.picard.directed; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.BitSet; -import java.util.SortedMap; -import java.util.TreeMap; - -/** - * Utility class to store coordinates of interest in per-sequence bitmasks. - */ -public class GenomeMask { - - // if memory usage becomes a problem... this could be changed to a SparseBitSet - // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html - private SortedMap data = new TreeMap(); - - - public GenomeMask() { - } - - public boolean get(int contig, int position) { - BitSet bits = data.get(contig); - return (bits != null) && bits.get(position); - } - - public BitSet get(int contig) { - return data.get(contig); - } - - /** - * Get an existing BitSet for the given contig, or create one if not already present. This is - * useful when initializing a GenomeMask from an external source. - * @param contig which BitSet - * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size. - * @return the BitSet for the given contig, creating one if necessary - */ - public BitSet getOrCreate(int contig, int numBits) { - BitSet ret = data.get(contig); - if (ret == null) { - ret = new BitSet(numBits); - data.put(contig, ret); - } - return ret; - } - - public int getMaxContig() { - return data.lastKey(); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java b/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java deleted file mode 100644 index ba81a7eb6..000000000 --- a/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.picard.util.Interval; -import edu.mit.broad.picard.io.IoUtil; - -import java.util.List; -import java.util.BitSet; -import java.io.File; - -/** - * Create a GenomeMask from an IntervalList or a file containing an IntervalList - */ -public class GenomeMaskFactory { - - public GenomeMask makeGenomeMaskFromIntervalList(IntervalList intervalList) { - if (intervalList.getHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - intervalList.sort(); - } - List uniqueIntervals = intervalList.getUniqueIntervals(); - GenomeMask ret = new GenomeMask(); - - SAMFileHeader samHeader = intervalList.getHeader(); - - for (Interval interval : uniqueIntervals) { - // TODO: Maybe figure out more intelligently how big the bitset might be? - BitSet bitSet = ret.getOrCreate(samHeader.getSequenceIndex(interval.getSequence()), interval.getEnd() + 1); - bitSet.set(interval.getStart(), interval.getEnd() + 1); - } - return ret; - } - - public GenomeMask makeGenomeMaskFromIntervalList(File intervalListFile) { - IoUtil.assertFileIsReadable(intervalListFile); - IntervalList intervalList = IntervalList.fromFile(intervalListFile); - return makeGenomeMaskFromIntervalList(intervalList); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/HsMetrics.java b/java/lib/edu/mit/broad/picard/directed/HsMetrics.java deleted file mode 100644 index 74817f919..000000000 --- a/java/lib/edu/mit/broad/picard/directed/HsMetrics.java +++ /dev/null @@ -1,108 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.metrics.MetricBase; - -/** - * The set of metrics captured that are specific to a hybrid selection analysis. - * - * @author Tim Fennell - */ -public class HsMetrics extends MetricBase { - /** The name of the bait set used in the hybrid selection. */ - public String BAIT_SET; - - /** The number of bases in the reference genome used for alignment. */ - public long GENOME_SIZE; - - /** The number of bases which have one or more baits on top of them. */ - public long BAIT_TERRITORY; - - /** The unique number of target bases in the experiment where target is usually exons etc. */ - public long TARGET_TERRITORY; - - /** Target terrirtoy / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. */ - public double BAIT_DESIGN_EFFICIENCY; - - /** The total number of reads in the SAM or BAM file examine. */ - public int TOTAL_READS; - - /** The number of reads that pass the vendor's filter. */ - public int PF_READS; - - /** The number of PF reads that are not marked as duplicates. */ - public int PF_UNIQUE_READS; - - /** PF reads / total reads. The percent of reads passing filter. */ - public double PCT_PF_READS; - - /** PF Unique Reads / Total Reads. */ - public double PCT_PF_UQ_READS; - - /** The number of PF reads that are aligned with mapping score > 0 to the reference genome. */ - public int PF_READS_ALIGNED; - - /** PF Reads Aligned / PF Reads. */ - public double PCT_PF_READS_ALIGNED; - - /** The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. */ - public int PF_BASES_ALIGNED; - - /** The number of PF aligned bases that mapped to a baited region of the genome. */ - public long ON_BAIT_BASES; - - /** The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. */ - public long NEAR_BAIT_BASES; - - /** The number of PF aligned bases that mapped to neither on or near a bait. */ - public long OFF_BAIT_BASES; - - /** The number of PF aligned bases that mapped to a targetted region of the genome. */ - public long ON_TARGET_BASES; - - /** On+Near Bait Bases / PF Bases Aligned. */ - public double PCT_SELECTED_BASES; - - /** The percentage of aligned PF bases that mapped neither on or near a bait. */ - public double PCT_OFF_BAIT; - - /** The percentage of on+near bait bases that are on as opposed to near. */ - public double ON_BAIT_VS_SELECTED; - - /** The mean coverage of all baits in the experiment. */ - public double MEAN_BAIT_COVERAGE; - - /** The mean coverage of targets that recieved at least coverage depth = 2 at one base. */ - public double MEAN_TARGET_COVERAGE; - - /** The fold by which the baited region has been amplified above genomic background. */ - public double FOLD_ENRICHMENT; - - /** The number of targets that did not reach coverage=2 over any base. */ - public double ZERO_CVG_TARGETS_PCT; - - /** - * The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to - * the mean coverage level in those targets. - */ - public double FOLD_80_BASE_PENALTY; - - - /** - * Calculates the metrics in this class that can be derived from other metrics in the class. - */ - public void calculateDerivedMetrics() { - BAIT_DESIGN_EFFICIENCY = (double) TARGET_TERRITORY / (double) BAIT_TERRITORY; - - PCT_PF_READS = PF_READS / (double) TOTAL_READS; - PCT_PF_UQ_READS = PF_UNIQUE_READS / (double) TOTAL_READS; - PCT_PF_READS_ALIGNED = PF_READS_ALIGNED / (double) PF_UNIQUE_READS; - - double denominator = (ON_BAIT_BASES + NEAR_BAIT_BASES + OFF_BAIT_BASES); - - PCT_SELECTED_BASES = (ON_BAIT_BASES + NEAR_BAIT_BASES) / denominator; - PCT_OFF_BAIT = OFF_BAIT_BASES / denominator; - ON_BAIT_VS_SELECTED = ON_BAIT_BASES / (double) (ON_BAIT_BASES + NEAR_BAIT_BASES); - MEAN_BAIT_COVERAGE = ON_BAIT_BASES / (double) BAIT_TERRITORY; - FOLD_ENRICHMENT = (ON_BAIT_BASES/ denominator) / ((double) BAIT_TERRITORY / GENOME_SIZE); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java b/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java deleted file mode 100644 index a454642a7..000000000 --- a/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java +++ /dev/null @@ -1,207 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.util.*; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.AlignmentBlock; -import edu.mit.broad.sam.SAMSequenceRecord; - -import java.util.*; -import java.io.*; - -/** - * Calculates HS metrics for a given SAM or BAM file. Requires the input of a list of - * target intervals and a list of bait intervals. Can be invoked either on an entire - * iterator of SAMRecords or be passed SAMRecords one at a time. - * - * @author Tim Fennell - */ -public class HsMetricsCalculator { - // What is considered "near" to the bait - private static final int NEAR_BAIT_DISTANCE = 250; - private static final Log log = Log.getInstance(HsMetricsCalculator.class); - - // Holds file names and other parameter related junk - private SAMFileReader sam; - private File baitFile; - private File targetFile; - private IntervalList baits; - private IntervalList targets; - - // Overlap detector for finding overlaps between reads and the experimental targets - private OverlapDetector targetDetector = new OverlapDetector(0,0); - - // Overlap detector for finding overlaps between the reads and the baits (and the near bait space) - private OverlapDetector baitDetector = new OverlapDetector(-NEAR_BAIT_DISTANCE,0); - - // A Map to accumulate per-bait-region (i.e. merge of overlapping baits) coverage. */ - private Map coverageByTarget = new HashMap(); - - private HsMetrics metrics = new HsMetrics(); - - /** - * Constructor that parses the squashed reference to genome reference file and stores the - * information in a map for later use. - */ - public HsMetricsCalculator(File baits, File targets) { - this.baitFile = baits; - this.targetFile = targets; - this.baits = IntervalList.fromFile(baits); - this.targets = IntervalList.fromFile(targets); - - this.metrics.BAIT_SET = baits.getName(); - int tmp = this.metrics.BAIT_SET.indexOf("."); - if (tmp > 0) { - this.metrics.BAIT_SET = this.metrics.BAIT_SET.substring(0, tmp); - } - - List uniqueBaits = this.baits.getUniqueIntervals(); - this.baitDetector.addAll(uniqueBaits, uniqueBaits); - this.metrics.BAIT_TERRITORY = Interval.countBases(uniqueBaits); - - List uniqueTargets = this.targets.getUniqueIntervals(); - this.targetDetector.addAll(uniqueTargets, uniqueTargets); - this.metrics.TARGET_TERRITORY = Interval.countBases(uniqueTargets); - - for (SAMSequenceRecord seq : this.baits.getHeader().getSequences()) { - this.metrics.GENOME_SIZE += seq.getSequenceLength(); - } - - // Populate the coverage by target map - for (Interval target : this.targets.getIntervals()) { - this.coverageByTarget.put(target, new Coverage(target, 0)); - } - } - - /** Iterates over all records in the file and collects metrics. */ - public void analyze(Iterator records) { - int i = 0; - while (records.hasNext()) { - analyze(records.next()); - - if (++i % 1000000 == 0) { - log.info("Processed " + i + " records so far."); - } - } - } - - /** Adds information about an individual SAMRecord to the statistics. */ - public void analyze(SAMRecord rec) { - // Just plain avoid records that are marked as not-primary - if (rec.getNotPrimaryAlignmentFlag()) return; - - this.metrics.TOTAL_READS += 1; - - // Check for PF reads - if (rec.getReadFailsVendorQualityCheckFlag()) { - return; - } - else { - ++this.metrics.PF_READS; - } - - // Check for reads that are marked as duplicates - if (rec.getDuplicateReadFlag()) { - return; - } - else { - ++this.metrics.PF_UNIQUE_READS; - } - - // Don't bother with reads that didn't align uniquely - if (rec.getReadUnmappedFlag() || rec.getMappingQuality() == 0) { - return; - } - - this.metrics.PF_READS_ALIGNED += 1; - for (AlignmentBlock block : rec.getAlignmentBlocks()) { - this.metrics.PF_BASES_ALIGNED += block.getLength(); - } - - Interval read = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd()); - - // Find the target overlaps - Collection targets = this.targetDetector.getOverlaps(read); - if (targets != null && !targets.isEmpty()) { - for (Interval target : targets) { - Coverage coverage = this.coverageByTarget.get(target); - - for (AlignmentBlock block : rec.getAlignmentBlocks()) { - int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength()); - for (int pos=block.getReferenceStart(); pos<=end; ++ pos) { - if (pos >= target.getStart() && pos <= target.getEnd()) { - ++this.metrics.ON_TARGET_BASES; - coverage.addBase(pos - target.getStart()); - } - } - } - } - } - - // Now do the bait overlaps - int mappedBases = 0; - for (AlignmentBlock block : rec.getAlignmentBlocks()) mappedBases += block.getLength(); - Collection baits = this.baitDetector.getOverlaps(read); - int onBaitBases = 0; - - if (baits != null && !baits.isEmpty()) { - for (Interval bait : baits) { - for (AlignmentBlock block : rec.getAlignmentBlocks()) { - int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength()); - - for (int pos=block.getReferenceStart(); pos<=end; ++pos) { - if (pos >= bait.getStart() && pos <= bait.getEnd()) ++onBaitBases; - } - } - } - - this.metrics.ON_BAIT_BASES += onBaitBases; - this.metrics.NEAR_BAIT_BASES += (mappedBases - onBaitBases); - } - else { - this.metrics.OFF_BAIT_BASES += mappedBases; - } - - } - - /** Calculates a few last summary metrics and then returns the metrics calculated. */ - public HsMetrics getMetrics() { - this.metrics.calculateDerivedMetrics(); - calculateTargetCoverageMetrics(); - return this.metrics; - } - - /** Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane. */ - private void calculateTargetCoverageMetrics() { - short[] depths = new short[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array - int zeroCoverageTargets = 0; - int depthIndex = 0; - double totalCoverage = 0; - int basesConsidered = 0; - - for (Coverage c : this.coverageByTarget.values()) { - if (!c.hasCoverage()) { - ++zeroCoverageTargets; - continue; - } - - final short[] targetDepths = c.getDepths(); - basesConsidered += targetDepths.length; - - for (short depth : targetDepths) { - depths[depthIndex++] = depth; - totalCoverage += depth; - } - } - - this.metrics.MEAN_TARGET_COVERAGE = totalCoverage / basesConsidered; - - // Sort the array (ASCENDING) and then find the base the coverage value that lies at the 80% - // line, which is actually at 20% into the array now - Arrays.sort(depths); - int indexOf80thPercentile = (depths.length - basesConsidered) + (int) (basesConsidered * 0.2); - int coverageAt80thPercentile = depths[indexOf80thPercentile]; - this.metrics.FOLD_80_BASE_PENALTY = this.metrics.MEAN_TARGET_COVERAGE / coverageAt80thPercentile; - this.metrics.ZERO_CVG_TARGETS_PCT = zeroCoverageTargets / (double) this.targets.getIntervals().size(); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/IntervalList.java b/java/lib/edu/mit/broad/picard/directed/IntervalList.java deleted file mode 100644 index 087537c0a..000000000 --- a/java/lib/edu/mit/broad/picard/directed/IntervalList.java +++ /dev/null @@ -1,240 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.util.Interval; -import edu.mit.broad.picard.util.FormatUtil; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.util.StringLineReader; - -import java.util.*; -import java.io.*; - -/** - * Represents a list of intervals against a reference sequence that can be written to - * and read from a file. The file format is relatively simple and reflects the SAM - * alignment format to a degree. - * - * A SAM style header must be present in the file which lists the sequence records - * against which the intervals are described. After the header the file then contains - * records one per line in text format with the following values tab-separated: - * - Sequence name - * - Start position (1-based) - * - End position (1-based, end inclusive) - * - Strand (either + or -) - * - Interval name (an, ideally unique, name for the interval) - * - * @author Tim Fennell - */ -public class IntervalList implements Iterable { - private SAMFileHeader header; - private List intervals = new ArrayList(); - - /** Constructs a new interval list using the supplied header information. */ - public IntervalList(SAMFileHeader header) { - if (header == null) { - throw new IllegalArgumentException("SAMFileHeader must be supplied."); - } - this.header = header; - } - - /** Gets the header (if there is one) for the interval list. */ - public SAMFileHeader getHeader() { return header; } - - /** Returns an iterator over the intervals. */ - public Iterator iterator() { return this.intervals.iterator(); } - - /** Adds an interval to the list of intervals. */ - public void add(Interval interval) { this.intervals.add(interval); } - - /** Sorts the internal collection of intervals by coordinate. */ - public void sort() { - Collections.sort(this.intervals, new IntervalCoordinateComparator(this.header)); - this.header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - } - - /** Gets the set of intervals as held internally. */ - public List getIntervals() { - return Collections.unmodifiableList(this.intervals); - } - - /** - * Merges the list of intervals and then reduces them down where regions overlap - * or are directly adjacent to one another. During this process the "merged" interval - * will retain the strand and name of the 5' most interval merged. - * - * @return the set of unique intervals condensed from the contained intervals - */ - public List getUniqueIntervals() { - List unique = new ArrayList(); - ListIterator iterator = this.intervals.listIterator(); - Interval previous = iterator.next(); - - while (iterator.hasNext()) { - Interval next = iterator.next(); - if (previous.intersects(next) || previous.abuts(next)) { - previous = new Interval(previous.getSequence(), - previous.getStart(), - Math.max(previous.getEnd(), next.getEnd()), - previous.isNegativeStrand(), - previous.getName()); - } - else { - unique.add(previous); - previous = next; - } - } - - if (previous != null) unique.add(previous); - - return unique; - } - - /** Gets the (potentially redundant) sum of the length of the intervals in the list. */ - public long getBaseCount() { - return Interval.countBases(this.intervals); - } - - /** Gets the count of unique bases represented by the intervals in the list. */ - public long getUniqueBaseCount() { - return Interval.countBases(getUniqueIntervals()); - } - - /** - * Parses an interval list from a file. - * @param file the file containing the intervals - * @return an IntervalList object that contains the headers and intervals from the file - */ - public static IntervalList fromFile(File file) { - BufferedReader in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file))); - - try { - // Setup a reader and parse the header - StringBuilder builder = new StringBuilder(4096); - String line = null; - - while ((line = in.readLine()) != null) { - if (line.startsWith("@")) { - builder.append(line).append('\n'); - } - else { - break; - } - } - - if (builder.length() == 0) { - throw new IllegalStateException("Interval list file must contain header: " + file.getAbsolutePath()); - } - - StringLineReader headerReader = new StringLineReader(builder.toString()); - SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - IntervalList list = new IntervalList(codec.decode(headerReader, file)); - - // Then read in the intervals - FormatUtil format = new FormatUtil(); - do { - if (line.trim().length() == 0) continue; // skip over blank lines - - // Make sure we have the right number of fields - String fields[] = line.split("\t"); - if (fields.length != 5) { - throw new PicardException("Invalid interval record contains " + - fields.length + " fields: " + line); - } - - // Then parse them out - String seq = fields[0]; - int start = format.parseInt(fields[1]); - int end = format.parseInt(fields[2]); - - boolean negative; - if (fields[3].equals("-")) negative = true; - else if (fields[3].equals("+")) negative = false; - else throw new IllegalArgumentException("Invalid strand field: " + fields[3]); - - String name = fields[4]; - - Interval interval = new Interval(seq, start, end, negative, name); - list.intervals.add(interval); - } - while ((line = in.readLine()) != null); - - return list; - } - catch (IOException ioe) { - throw new PicardException("Error parsing interval list file: " + file.getAbsolutePath(), ioe); - } - finally { - try { in.close(); } catch (Exception e) { /* do nothing */ } - } - } - - /** - * Writes out the list of intervals to the supplied file. - * @param file a file to write to. If exists it will be overwritten. - */ - public void write(File file) { - try { - BufferedWriter out = new BufferedWriter(new OutputStreamWriter(IoUtil.openFileForWriting(file))); - FormatUtil format = new FormatUtil(); - - // Write out the header - if (this.header != null) { - SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - codec.encode(out, this.header); - } - - // Write out the intervals - for (Interval interval : this) { - out.write(interval.getSequence()); - out.write('\t'); - out.write(format.format(interval.getStart())); - out.write('\t'); - out.write(format.format(interval.getEnd())); - out.write('\t'); - out.write(interval.isPositiveStrand() ? '+' : '-'); - out.write('\t'); - out.write(interval.getName()); - out.newLine(); - } - - out.flush(); - out.close(); - } - catch (IOException ioe) { - throw new PicardException("Error writing out interval list to file: " + file.getAbsolutePath(), ioe); - } - } -} - -/** - * Comparator that orders intervals based on their sequence index, by coordinate - * then by strand and finally by name. - */ -class IntervalCoordinateComparator implements Comparator { - private SAMFileHeader header; - - /** Constructs a comparator using the supplied sequence header. */ - IntervalCoordinateComparator(SAMFileHeader header) { - this.header = header; - } - - public int compare(Interval lhs, Interval rhs) { - int lhsIndex = this.header.getSequenceIndex(lhs.getSequence()); - int rhsIndex = this.header.getSequenceIndex(rhs.getSequence()); - int retval = lhsIndex - rhsIndex; - - if (retval == 0) retval = lhs.getStart() - rhs.getStart(); - if (retval == 0) retval = lhs.getEnd() - rhs.getEnd(); - if (retval == 0) { - if (lhs.isPositiveStrand() && rhs.isNegativeStrand()) retval = -1; - else if (lhs.isNegativeStrand() && rhs.isPositiveStrand()) retval = 1; - } - if (retval == 0) { - retval = lhs.getName().compareTo(rhs.getName()); - } - - return retval; - } -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java b/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java deleted file mode 100644 index 3ee558c99..000000000 --- a/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -import java.util.List; - -/** - * Aggregates multiple filters and provides a method for applying them all to a given record with - * one method call. - */ -public class AggregateFilter implements SamRecordFilter { - - private final List filters; - - /** - * Constructor - * @param filters the list of filters that this Aggregator applies - */ - public AggregateFilter(List filters) { - this.filters = filters; - } - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches at least one filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - for (SamRecordFilter filter : filters) { - if (filter.filterOut(record)) { - return true; - } - } - return false; - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java b/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java deleted file mode 100644 index 3e0c9bb3f..000000000 --- a/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java +++ /dev/null @@ -1,28 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -/** - * Filter for filtering out reads that do not pass the quality filter - */ -public class FailsVendorReadQualityFilter implements SamRecordFilter { - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - return record.getReadFailsVendorQualityCheckFlag(); - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java b/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java deleted file mode 100644 index ddb85b9d7..000000000 --- a/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java +++ /dev/null @@ -1,94 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.picard.util.CloserUtil; - -import java.util.Iterator; -import java.util.NoSuchElementException; - -/** - * Filtering Iterator which takes a filter and an iterator and iterates - * through only those records which are not rejected by the filter. - * - * @author Kathleen Tibbetts - */ -public class FilteringIterator implements CloseableIterator { - - private final Iterator iterator; - private final SamRecordFilter filter; - private SAMRecord next = null; - - /** - * Constructor - * - * @param iterator the backing iterator - * @param filter the filter (which may be a FilterAggregator) - */ - public FilteringIterator(Iterator iterator, SamRecordFilter filter) { - this.iterator = iterator; - this.filter = filter; - next = getNextRecord(); - } - - /** - * Returns true if the iteration has more elements. - * - * @return true if the iteration has more elements. Otherwise returns false. - */ - public boolean hasNext() { - return next != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration - * @throws java.util.NoSuchElementException - */ - public SAMRecord next() { - if (next == null) { - throw new NoSuchElementException("Iterator has no more elements."); - } - SAMRecord result = next; - next = getNextRecord(); - return result; - } - - /** - * Required method for Iterator API. - * - * @throws UnsupportedOperationException - */ - public void remove() { - throw new UnsupportedOperationException("Remove() not supported by FilteringIterator"); - } - - public void close() { - CloserUtil.close(iterator); - } - - /** - * Gets the next record from the underlying iterator that passes the filter - * - * @return SAMRecord the next filter-passing record - */ - private SAMRecord getNextRecord() { - while (iterator.hasNext()) { - SAMRecord record = iterator.next(); - if (!filter.filterOut(record)) { - return record; - } - } - return null; - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java b/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java deleted file mode 100644 index d8936ca8a..000000000 --- a/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java +++ /dev/null @@ -1,26 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -/** - * API for filtering SAMRecords - */ -public interface SamRecordFilter { - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record); -} diff --git a/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java b/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java deleted file mode 100644 index 9969ae2e3..000000000 --- a/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java +++ /dev/null @@ -1,37 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.picard.util.SequenceUtil; -import edu.mit.broad.sam.SAMRecord; - -/** - * Filter to determine whether a read is "noisy" due to a poly-A run that is a sequencing artifact. - * Currently we filter out only reads that are composed entirely of As. - */ -public class SolexaNoiseFilter implements SamRecordFilter { - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - byte sequence[] = record.getReadBases(); - for (byte base : sequence) { - if (base != 'A' && base != 'a' && - !SequenceUtil.isNoCall(base)) { - return false; - } - } - return true; - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/TagFilter.java b/java/lib/edu/mit/broad/picard/filter/TagFilter.java deleted file mode 100644 index f35957ba0..000000000 --- a/java/lib/edu/mit/broad/picard/filter/TagFilter.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -import java.util.List; -import java.util.Arrays; - -/** - * Filter class for matching tag attributes in SAMRecords - */ -public class TagFilter implements SamRecordFilter { - - private final String tag; // The key of the tag to match - private final List values; // The list of matching values - - /** - * Constructor for a single value - * - * @param tag the key of the tag to match - * @param value the value to match - */ - public TagFilter(String tag, Object value) { - this.tag = tag; - this.values = Arrays.asList(value); - } - - /** - * Constructor for multiple values - * - * @param tag the key of the tag to match - * @param values the matching values - */ - public TagFilter(String tag, List values) { - this.tag = tag; - this.values = values; - } - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - return values.contains(record.getAttribute(tag)); - } - } diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliException.java b/java/lib/edu/mit/broad/picard/genotype/GeliException.java deleted file mode 100644 index 5d6fed76c..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliException.java +++ /dev/null @@ -1,30 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.genotype; - -import edu.mit.broad.picard.PicardException; - -/** - * Generic exception thrown by GELI format machinery. - * - * @author Doug Voet - */ -public class GeliException extends PicardException { - - public GeliException(String message, Throwable throwable) { - super(message, throwable); - } - - public GeliException(String message) { - super(message); - } - -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java deleted file mode 100644 index 6f1496251..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java +++ /dev/null @@ -1,20 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.genotype; - -/** - * Misc constants for GELI format - * - * @author Doug Voet - */ -public interface GeliFileConstants { - public static final byte[] GELI_MAGIC = "GELI".getBytes(); -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java deleted file mode 100644 index de72b1639..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.picard.genotype; - - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; - -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.util.BlockCompressedInputStream; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.RuntimeIOException; - - -/** - * Class for reading GELI (GEnotype LIkelihood) files. - * - * @author Doug Voet - */ -public class GeliFileReader implements Iterable -{ - private ReaderImplementation mReader = null; - - /** - * Internal interface for SAM/BAM file reader implementations. - * Implemented as an abstract class to enforce better access control. - */ - static abstract class ReaderImplementation { - abstract SAMFileHeader getFileHeader(); - abstract CloseableIterator getIterator(); - abstract void close(); - } - - - public GeliFileReader(final InputStream stream) { - try { - final BufferedInputStream bufferedStream = toBufferedStream(stream); - if (isValidGELIFile(bufferedStream)) { - mReader = new GeliFileReaderImplementation(bufferedStream); - } else { - throw new GeliException("Unrecognized file format"); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - public GeliFileReader(final File file) { - try { - final BufferedInputStream bufferedStream = - new BufferedInputStream(new FileInputStream(file)); - if (isValidGELIFile(bufferedStream)) { - bufferedStream.close(); - final GeliFileReaderImplementation reader = new GeliFileReaderImplementation(file); - mReader = reader; - } else { - bufferedStream.close(); - throw new GeliException("Unrecognized file format"); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - public void close() { - if (mReader != null) { - mReader.close(); - } - mReader = null; - } - - public SAMFileHeader getFileHeader() { - return mReader.getFileHeader(); - } - - public CloseableIterator iterator() { - return mReader.getIterator(); - } - - private boolean isValidGELIFile(final InputStream stream) - throws IOException { - return BlockCompressedInputStream.isValidFile(stream); - } - - private BufferedInputStream toBufferedStream(final InputStream stream) { - if (stream instanceof BufferedInputStream) { - return (BufferedInputStream) stream; - } else { - return new BufferedInputStream(stream); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java deleted file mode 100644 index 7f544532e..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.picard.genotype; - - -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.LineNumberReader; -import java.io.StringReader; -import java.util.Arrays; - -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.BlockCompressedInputStream; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.StringLineReader; - -/** - * Internal class for reading GELI files. - */ -class GeliFileReaderImplementation extends GeliFileReader.ReaderImplementation { - - private boolean mIsSeekable = false; - private BinaryCodec mStream = null; - private final BlockCompressedInputStream mCompressedInputStream; - private SAMFileHeader mFileHeader = null; - private long mFirstRecordPointer = 0; - private CloseableIterator mCurrentIterator = null; - - - GeliFileReaderImplementation(final InputStream stream) - throws IOException { - mIsSeekable = false; - mCompressedInputStream = new BlockCompressedInputStream(stream); - mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); - readHeader(null); - } - - GeliFileReaderImplementation(final File file) - throws IOException { - mIsSeekable = true; - mCompressedInputStream = new BlockCompressedInputStream(file); - mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); - readHeader(file); - mFirstRecordPointer = mCompressedInputStream.getFilePointer(); - } - - void close() { - if (mStream != null) { - mStream.close(); - } - mStream = null; - mFileHeader = null; - } - - SAMFileHeader getFileHeader() { - return mFileHeader; - } - - CloseableIterator getIterator() { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (mIsSeekable) { - try { - mCompressedInputStream.seek(mFirstRecordPointer); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - mCurrentIterator = new GELIFileIterator(); - return mCurrentIterator; - } - - private void readHeader(final File file) - throws IOException { - - final byte[] buffer = new byte[4]; - mStream.readBytes(buffer); - if (!Arrays.equals(buffer, GeliFileConstants.GELI_MAGIC)) { - throw new IOException("Invalid GELI file header"); - } - - final int headerTextLength = mStream.readInt(); - final String textHeader = mStream.readString(headerTextLength); - mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), - file); - - final int sequenceCount = mStream.readInt(); - if (sequenceCount != mFileHeader.getSequences().size()) { - throw new GeliException("Number of sequences in text header (" + mFileHeader.getSequences().size() + - ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); - } - for (int i = 0; i < sequenceCount; i++) { - readSequenceRecord(file); -// final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); -// if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { -// throw new GELIException("For sequence " + i + ", text and binary have different names in file " + -// file); -// } -// if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { -// throw new GELIException("For sequence " + i + ", text and binary have different lengths in file " + -// file); -// } - } - } - - private SAMSequenceRecord readSequenceRecord(final File file) { - final int nameLength = mStream.readInt(); - if (nameLength <= 1) { - throw new GeliException("Invalid BAM file header: missing sequence name in file " + file); - } - final String sequenceName = mStream.readString(nameLength - 1); - // Skip the null terminator - mStream.readByte(); - final int sequenceLength = mStream.readInt(); - final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName); - record.setSequenceLength(sequenceLength); - return record; - } - - private class GELIFileIterator - implements CloseableIterator { - - private GenotypeLikelihoods mNextRecord = null; - private final GenotypeLikelihoodsCodec likelihoodsCodec = new GenotypeLikelihoodsCodec(); - - - GELIFileIterator() { - this(true); - } - - GELIFileIterator(final boolean advance) { - likelihoodsCodec.setInputStream(mStream.getInputStream()); - if (advance) { - advance(); - } - } - - public void close() { - if (this != mCurrentIterator) { - throw new IllegalStateException("Attempt to close non-current iterator"); - } - mCurrentIterator = null; - } - - public boolean hasNext() { - return (mNextRecord != null); - } - - public GenotypeLikelihoods next() { - final GenotypeLikelihoods result = mNextRecord; - advance(); - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - void advance() { - try { - mNextRecord = getNextRecord(); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - GenotypeLikelihoods getNextRecord() - throws IOException { - return likelihoodsCodec.decode(); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java deleted file mode 100644 index 84196b239..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.genotype; - -import java.io.DataOutputStream; -import java.io.File; -import java.io.StringWriter; - -import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.SAMFileHeader.SortOrder; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.BlockCompressedOutputStream; -import edu.mit.broad.sam.util.SortingCollection; - -/** - * Class for writing GELI (GEnotype LIkelihood) files. - */ -public class GeliFileWriter { - private static final int MAX_RECORDS_IN_RAM = 1000000; - private SAMFileHeader.SortOrder sortOrder = SortOrder.coordinate; - private SAMFileHeader header; - private SortingCollection likelihoodsSorter; - - // These two fields are for validating presorted records. - private GenotypeLikelihoods prevLikelihoods; - private GenotypeLikelihoodsComparator presortedComparator; - - // If true, records passed to addAlignment are already in the order specified by sortOrder - private boolean presorted; - protected final BinaryCodec outputBinaryCodec; - private GenotypeLikelihoodsCodec genotypeLikelihoodsCodec = null; - - public GeliFileWriter(final File path) { - this(path, false); - } - - public GeliFileWriter(final File path, boolean presorted) { - outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path))); - outputBinaryCodec.setOutputFileName(path.toString()); - this.presorted = presorted; - } - - /** - * Must be called before addAlignment. - * @param header - */ - public void setHeader(final SAMFileHeader header) - { - this.header = header; - header.setSortOrder(sortOrder); - final StringWriter headerTextBuffer = new StringWriter(); - new SAMTextHeaderCodec().encode(headerTextBuffer, header); - final String headerText = headerTextBuffer.toString(); - - writeHeader(headerText); - - if (presorted) { - presortedComparator = makeComparator(); - } else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { - likelihoodsSorter = SortingCollection.newInstance(GenotypeLikelihoods.class, - new GenotypeLikelihoodsCodec(), makeComparator(), MAX_RECORDS_IN_RAM); - } - } - - protected SAMFileHeader getHeader() { - return header; - } - - private GenotypeLikelihoodsComparator makeComparator() { - return new GenotypeLikelihoodsComparator(); - } - - public void addGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) - { - if (presorted) { - assertPresorted(genotypeLikelihoods); - writeGenotypeLikelihoods(genotypeLikelihoods); - } else { - likelihoodsSorter.add(genotypeLikelihoods); - } - } - - private void assertPresorted(final GenotypeLikelihoods genotypeLikelihoods) { - if (prevLikelihoods != null) { - if (presortedComparator.compare(prevLikelihoods, genotypeLikelihoods) > 0) { - throw new IllegalArgumentException("GenotypeLikelihoods added out of order in GELIFileWriterImpl.addGenotypeLikelihoods for " + - getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at [" - + prevLikelihoods.getReferenceIndex() + ":" + prevLikelihoods.getPosition() + "] and [" - + genotypeLikelihoods.getReferenceIndex() + ":" + genotypeLikelihoods.getPosition() + "]"); - } - } - prevLikelihoods = genotypeLikelihoods; - } - - public final void close() - { - if (likelihoodsSorter != null) { - for (final GenotypeLikelihoods genotypeLikelihoods : likelihoodsSorter) { - writeGenotypeLikelihoods(genotypeLikelihoods); - } - likelihoodsSorter.cleanup(); - } - finish(); - } - - private void prepareToWriteAlignments() { - if (genotypeLikelihoodsCodec == null) { - genotypeLikelihoodsCodec = new GenotypeLikelihoodsCodec(); - genotypeLikelihoodsCodec.setOutputStream(outputBinaryCodec.getOutputStream()); - } - } - - /** - * Writes the record to disk. Sort order has been taken care of by the time - * this method is called. - * @param alignment - */ - protected void writeGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) { - prepareToWriteAlignments(); - genotypeLikelihoodsCodec.encode(genotypeLikelihoods); - } - - /** - * Write the header to disk. Header object is available via getHeader(). - * @param textHeader for convenience if the implementation needs it. - */ - protected void writeHeader(final String textHeader) { - outputBinaryCodec.writeBytes(GeliFileConstants.GELI_MAGIC); - - // calculate and write the length of the SAM file header text and the header text - outputBinaryCodec.writeInt(textHeader.length()); - outputBinaryCodec.writeBytes(textHeader.getBytes()); - - // write the sequences binarily. This is redundant with the text header - outputBinaryCodec.writeInt(getHeader().getSequences().size()); - for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) { - outputBinaryCodec.writeInt(sequenceRecord.getSequenceName().length() + 1); - outputBinaryCodec.writeBytes(sequenceRecord.getSequenceName().getBytes()); - outputBinaryCodec.writeByte(0); - outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength()); - } - } - - /** - * Do any required flushing here. - */ - protected void finish() { - outputBinaryCodec.close(); - } - - /** - * For producing error messages. - * @return Output filename, or null if there isn't one. - */ - protected String getFilename() { - return outputBinaryCodec.getOutputFileName(); - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java b/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java deleted file mode 100644 index d19a637c4..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java +++ /dev/null @@ -1,164 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.genotype; - -import java.util.Arrays; -import java.util.Comparator; - -/** - * Data object for Genotype Likelihoods. One object represents one row in a GELI file. - * - * @author Doug Voet - */ -public class GenotypeLikelihoods { - /** this is a guess at how much memory an instance of this object occupies */ - public static final int OBJECT_SIZE_BYTES = 150; - - public static final int AA_GENOTYPE = 0; - public static final int AC_GENOTYPE = 1; - public static final int AG_GENOTYPE = 2; - public static final int AT_GENOTYPE = 3; - public static final int CC_GENOTYPE = 4; - public static final int CG_GENOTYPE = 5; - public static final int CT_GENOTYPE = 6; - public static final int GG_GENOTYPE = 7; - public static final int GT_GENOTYPE = 8; - public static final int TT_GENOTYPE = 9; - - private static final char[][] GENOTYPES = { - "AA".toCharArray(), - "AC".toCharArray(), - "AG".toCharArray(), - "AT".toCharArray(), - "CC".toCharArray(), - "CG".toCharArray(), - "CT".toCharArray(), - "GG".toCharArray(), - "GT".toCharArray(), - "TT".toCharArray() - }; - - /** compares first by reference index then by position */ - public static class GenotypeLikelihoodsComparator implements Comparator { - @Override - public int compare(GenotypeLikelihoods thing1, GenotypeLikelihoods thing2) { - long refCompare = thing1.referenceIndex - thing2.referenceIndex; - if (refCompare == 0) { - long posCompare = thing1.position - thing2.position; - return (int) posCompare; - } else { - return (int) refCompare; - } - } - } - - - private long referenceIndex; - private long position; - private byte referenceBase; - private int numReads; - private short maxMappingQuality; - private float[] likelihoods = new float[10]; - private byte bestLikelihoodIndex = -1; // stored as byte to reduce memory footprint - private byte secondBestLikelihoodIndex = -1; // stored as byte to reduce memory footprint - - public static int getLikelihoodIndex(char[] genotype) { - char first = Character.isLowerCase(genotype[0]) ? Character.toUpperCase(genotype[0]) : genotype[0]; - char second = Character.isLowerCase(genotype[1]) ? Character.toUpperCase(genotype[1]) : genotype[1]; - if (first > second) { - char temp = first; - first = second; - second = temp; - } - for (int i=0; i>> 32)); - result = prime * result + referenceBase; - result = prime * result + (int) (referenceIndex ^ (referenceIndex >>> 32)); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - GenotypeLikelihoods other = (GenotypeLikelihoods) obj; - if (!Arrays.equals(likelihoods, other.likelihoods)) - return false; - if (maxMappingQuality != other.maxMappingQuality) - return false; - if (numReads != other.numReads) - return false; - if (position != other.position) - return false; - if (referenceBase != other.referenceBase) - return false; - if (referenceIndex != other.referenceIndex) - return false; - return true; - } - - public long getReferenceIndex() { return referenceIndex; } - public void setReferenceIndex(long sequenceIndex) { this.referenceIndex = sequenceIndex; } - public long getPosition() { return position; } - public void setPosition(long position) { this.position = position; } - public byte getReferenceBase() { return referenceBase; } - public void setReferenceBase(byte referenceBase) { this.referenceBase = referenceBase; } - public int getNumReads() { return numReads; } - public void setNumReads(int numReads) { this.numReads = numReads; } - public short getMaxMappingQuality() { return maxMappingQuality; } - public void setMaxMappingQuality(short maxMappingQuality) { this.maxMappingQuality = maxMappingQuality; } - float[] getLikelihoods() { return likelihoods; } - public int getBestLikelihoodIndex() { return bestLikelihoodIndex; } - public void setBestLikelihoodIndex(int bestLikelihoodIndex) { this.bestLikelihoodIndex = (byte) bestLikelihoodIndex; } - public int getSecondBestLikelihoodIndex() { return secondBestLikelihoodIndex; } - public void setSecondBestLikelihoodIndex(int secondBestLikelihoodIndex) { this.secondBestLikelihoodIndex = (byte) secondBestLikelihoodIndex; } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java b/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java deleted file mode 100644 index aa0679941..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.genotype; - -import java.io.InputStream; -import java.io.OutputStream; - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.RuntimeEOFException; -import edu.mit.broad.sam.util.SortingCollection; - -public class GenotypeLikelihoodsCodec implements SortingCollection.Codec { - private static final int SIG_FIG_MULTIPLIER = 100; - private static final short BLOCK_SIZE = 12 + 10 * 4; - - private OutputStream os; - private InputStream is; - private BinaryCodec binaryCodec; - - /** Returns a new genotype likelihood codec. */ - public SortingCollection.Codec clone() { - return new GenotypeLikelihoodsCodec(); - } - - /** - * Write object to OutputStream. - * - * @param genotypeLikelihoods what to write - */ - public void encode(final GenotypeLikelihoods genotypeLikelihoods) { - this.binaryCodec.writeShort(BLOCK_SIZE); - this.binaryCodec.writeUInt(genotypeLikelihoods.getReferenceIndex()); - this.binaryCodec.writeUInt(genotypeLikelihoods.getPosition()); - this.binaryCodec.writeByte(genotypeLikelihoods.getReferenceBase()); - this.binaryCodec.writeUShort(genotypeLikelihoods.getNumReads()); - this.binaryCodec.writeByte(genotypeLikelihoods.getMaxMappingQuality()); - - for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) { - writeLikelihood(genotypeLikelihoods.getLikelihoods()[i]); - } - } - - /** - * Read the next record from the input stream and convert into a java object. - * - * @return null if no more records. Should throw exception if EOF is encountered in the middle of - * a record. - */ - public GenotypeLikelihoods decode() { - int recordLength = 0; - try { - recordLength = this.binaryCodec.readShort(); - } catch (RuntimeEOFException e) { - return null; - } - if (recordLength != BLOCK_SIZE) { - throw new GeliException("Invalid record length: " + recordLength); - } - - final GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods(); - genotypeLikelihoods.setReferenceIndex(this.binaryCodec.readUInt()); - genotypeLikelihoods.setPosition(this.binaryCodec.readUInt()); - genotypeLikelihoods.setReferenceBase(this.binaryCodec.readByte()); - genotypeLikelihoods.setNumReads(this.binaryCodec.readUShort()); - genotypeLikelihoods.setMaxMappingQuality(this.binaryCodec.readByte()); - - int bestIndex = -1; - int secondBestIndex = -1; - for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) { - float likelihood = readLikelihood(); - genotypeLikelihoods.getLikelihoods()[i] = likelihood; - - if (bestIndex == -1 || genotypeLikelihoods.getLikelihood(bestIndex) < likelihood) { - secondBestIndex = bestIndex; - bestIndex = i; - } else if (secondBestIndex == -1 || genotypeLikelihoods.getLikelihood(secondBestIndex) < likelihood) { - secondBestIndex = i; - } - } - genotypeLikelihoods.setBestLikelihoodIndex(bestIndex); - genotypeLikelihoods.setSecondBestLikelihoodIndex(secondBestIndex); - - return genotypeLikelihoods; - } - - /** - * Where to write encoded output - * - * @param os - */ - public void setOutputStream(final OutputStream os) { - this.os = os; - this.binaryCodec = new BinaryCodec(os); - } - - /** - * Where to read encoded input from - * - * @param is - */ - public void setInputStream(final InputStream is) { - this.is = is; - this.binaryCodec = new BinaryCodec(is); - } - - private void writeLikelihood(float likelihood) { - float shiftedLikelihood = likelihood * SIG_FIG_MULTIPLIER; - this.binaryCodec.writeInt((int) Math.round(shiftedLikelihood)); - } - - /** - * @return - */ - private float readLikelihood() { - float likelihood = (float) this.binaryCodec.readInt() / SIG_FIG_MULTIPLIER; - return likelihood; - } - -} diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java deleted file mode 100644 index 3893e7bd1..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java +++ /dev/null @@ -1,192 +0,0 @@ -package edu.mit.broad.picard.genotype.caller; - -import edu.mit.broad.picard.sam.SamLocusIterator; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; -import edu.mit.broad.picard.reference.ReferenceSequence; -import edu.mit.broad.picard.PicardException; - -import java.io.IOException; -import java.io.BufferedWriter; -import java.io.File; -import java.util.SortedSet; -import java.util.List; - -/** - * Base class for AlleleCallers. Handles efficient access to the reference, output of data to a - * standard file format, and application of priors - */ -public abstract class AbstractAlleleCaller { - // writer for output - private final BufferedWriter writer; - - // for providing access to reference data - private final ReferenceSequenceFile referenceSequenceFile; - private final SAMFileHeader samHeader; - private ReferenceSequence referenceSequence; - - public AbstractAlleleCaller(final File reference, final SAMFileHeader samHeader, final BufferedWriter writer) { - this.writer = writer; - this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference); - this.samHeader = samHeader; - } - - - /** - * emit allele calls to the writer specified in the constructor - * - * @param li Locus to call - */ - public void callAlleles(final SamLocusIterator.LocusInfo li) throws IOException { - - - cacheReferenceSequence(li.getSequenceIndex()); - - final char ref = Character.toUpperCase((char)(referenceSequence.getBases()[li.getPosition() - 1] & 0xff)); - - - // delegate to the specific implementation - final SortedSet likelihoods = call(ref, li.getBasesAsString(), li.getQualities()); - - - final GenotypeTheory bestTheory = likelihoods.first(); - GenotypeTheory nextBestTheory = null; - GenotypeTheory refTheory = null; - final String refString = new String(new char[]{ref,ref}); - final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString); - - - final StringBuilder theoryString = new StringBuilder(); - int k=0; - for(final GenotypeTheory t : likelihoods) { - if (k == 1) { nextBestTheory = t; } - if (t.getGenotype() == refGenotype) { refTheory = t; } - - theoryString.append(t.getGenotype()) - .append(":") - .append(String.format("%.2f",t.getLikelihood())) - .append(" "); - k++; - } - - final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood(); - final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood(); - - final DiploidGenotype gt = likelihoods.first().getGenotype(); - - final String type; - if (!gt.isHet() && gt.getAllele1() == ref) { - type = "homozygous"; - } else if (!gt.isHet() && gt.getAllele1() != ref) { - type = "homozygous-SNP"; - } else { - type = "heterozygous-SNP"; - } - - final String bases = li.getBasesAsString(); - int a = 0,c = 0,g = 0,t = 0; - for(int i=0; i= the arg in the previous - * call to this method. - */ - private void cacheReferenceSequence(int sequenceIndex) { - if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) { - return; - } - referenceSequence = null; - for(referenceSequence = referenceSequenceFile.nextSequence(); - referenceSequence != null; - referenceSequence = referenceSequenceFile.nextSequence()) { - // Sanity check the sequence names against the sequence dictionary while scanning through. - if (!referenceSequence.getName().equals(samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName())) { - throw new PicardException("Sequence name mismatch at sequence index " + referenceSequence.getContigIndex() + - ": " + referenceSequence.getName() + " != " + - samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName()); - } - if (referenceSequence.getContigIndex() == sequenceIndex) { - break; - } - if (referenceSequence.getContigIndex() > sequenceIndex) { - throw new PicardException("Never found reference sequence with index " + sequenceIndex); - } - } - if (referenceSequence == null) { - throw new PicardException("Reference sequence with index " + sequenceIndex + " was not found"); - } - } - - /** - * Override this to implement a concrete genotype caller - * @param ref the reference base - * @param bases each element in the String is the base at current locus for a given read - * @param quals same length as bases. the ith element corresponds to the ith element of bases. - * @return - */ - abstract protected SortedSet call(char ref, String bases, List quals); - - - /** - * Apply a general population-based prior to the likelihood: - *