diff --git a/data/gFFTest.gff b/data/gFFTest.gff new file mode 100644 index 000000000..f458e4fd7 --- /dev/null +++ b/data/gFFTest.gff @@ -0,0 +1,7 @@ +SEQ1 EMBL atg 103 105 . + 0 +SEQ1 EMBL exon 103 172 . + 0 +SEQ1 EMBL splice5 172 173 . + . +SEQ1 netgene splice5 172 173 0.94 + . +SEQ1 genie sp5-20 163 182 2.3 + . +SEQ1 genie sp5-10 168 177 2.1 + . +SEQ2 grail ATG 17 19 2.1 - 0 diff --git a/lib/edu/mit/broad/arachne/Alignment.java b/java/lib/edu/mit/broad/arachne/Alignment.java similarity index 100% rename from lib/edu/mit/broad/arachne/Alignment.java rename to java/lib/edu/mit/broad/arachne/Alignment.java diff --git a/lib/edu/mit/broad/arachne/Fastb2Fasta.java b/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java similarity index 100% rename from lib/edu/mit/broad/arachne/Fastb2Fasta.java rename to java/lib/edu/mit/broad/arachne/Fastb2Fasta.java diff --git a/lib/edu/mit/broad/arachne/FastbReader.java b/java/lib/edu/mit/broad/arachne/FastbReader.java similarity index 100% rename from lib/edu/mit/broad/arachne/FastbReader.java rename to java/lib/edu/mit/broad/arachne/FastbReader.java diff --git a/lib/edu/mit/broad/arachne/GenomeMask.java b/java/lib/edu/mit/broad/arachne/GenomeMask.java similarity index 100% rename from lib/edu/mit/broad/arachne/GenomeMask.java rename to java/lib/edu/mit/broad/arachne/GenomeMask.java diff --git a/lib/edu/mit/broad/arachne/LookAlignReader.java b/java/lib/edu/mit/broad/arachne/LookAlignReader.java similarity index 100% rename from lib/edu/mit/broad/arachne/LookAlignReader.java rename to java/lib/edu/mit/broad/arachne/LookAlignReader.java diff --git a/lib/edu/mit/broad/cnv/AnalyzeCnvs.java b/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java similarity index 100% rename from lib/edu/mit/broad/cnv/AnalyzeCnvs.java rename to java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java diff --git a/lib/edu/mit/broad/cnv/CountAlignments.java b/java/lib/edu/mit/broad/cnv/CountAlignments.java similarity index 100% rename from lib/edu/mit/broad/cnv/CountAlignments.java 
rename to java/lib/edu/mit/broad/cnv/CountAlignments.java diff --git a/lib/edu/mit/broad/cnv/CountKMers.java b/java/lib/edu/mit/broad/cnv/CountKMers.java similarity index 100% rename from lib/edu/mit/broad/cnv/CountKMers.java rename to java/lib/edu/mit/broad/cnv/CountKMers.java diff --git a/lib/edu/mit/broad/cnv/CountKMers3.java b/java/lib/edu/mit/broad/cnv/CountKMers3.java similarity index 100% rename from lib/edu/mit/broad/cnv/CountKMers3.java rename to java/lib/edu/mit/broad/cnv/CountKMers3.java diff --git a/lib/edu/mit/broad/cnv/GatherAlignments.java b/java/lib/edu/mit/broad/cnv/GatherAlignments.java similarity index 100% rename from lib/edu/mit/broad/cnv/GatherAlignments.java rename to java/lib/edu/mit/broad/cnv/GatherAlignments.java diff --git a/lib/edu/mit/broad/cnv/kmer/CountKMers.java b/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java similarity index 100% rename from lib/edu/mit/broad/cnv/kmer/CountKMers.java rename to java/lib/edu/mit/broad/cnv/kmer/CountKMers.java diff --git a/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java b/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java similarity index 100% rename from lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java rename to java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java diff --git a/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java b/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java similarity index 100% rename from lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java rename to java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java diff --git a/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java b/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java similarity index 100% rename from lib/edu/mit/broad/cnv/util/GenomeBinIndex.java rename to java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java diff --git a/lib/edu/mit/broad/cnv/util/SequenceIterator.java b/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java similarity index 100% rename from lib/edu/mit/broad/cnv/util/SequenceIterator.java 
rename to java/lib/edu/mit/broad/cnv/util/SequenceIterator.java diff --git a/lib/edu/mit/broad/dcp/CallStatus.java b/java/lib/edu/mit/broad/dcp/CallStatus.java similarity index 100% rename from lib/edu/mit/broad/dcp/CallStatus.java rename to java/lib/edu/mit/broad/dcp/CallStatus.java diff --git a/lib/edu/mit/broad/dcp/CommandRunner.java b/java/lib/edu/mit/broad/dcp/CommandRunner.java similarity index 100% rename from lib/edu/mit/broad/dcp/CommandRunner.java rename to java/lib/edu/mit/broad/dcp/CommandRunner.java diff --git a/lib/edu/mit/broad/dcp/DistributedAlgorithm.java b/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java similarity index 100% rename from lib/edu/mit/broad/dcp/DistributedAlgorithm.java rename to java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java diff --git a/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java b/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java similarity index 100% rename from lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java rename to java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java diff --git a/lib/edu/mit/broad/dcp/DistributedCallServer.java b/java/lib/edu/mit/broad/dcp/DistributedCallServer.java similarity index 100% rename from lib/edu/mit/broad/dcp/DistributedCallServer.java rename to java/lib/edu/mit/broad/dcp/DistributedCallServer.java diff --git a/lib/edu/mit/broad/dcp/DistributedCallService.java b/java/lib/edu/mit/broad/dcp/DistributedCallService.java similarity index 100% rename from lib/edu/mit/broad/dcp/DistributedCallService.java rename to java/lib/edu/mit/broad/dcp/DistributedCallService.java diff --git a/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java b/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java similarity index 100% rename from lib/edu/mit/broad/dcp/message/DistributedCallMessage.java rename to java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java diff --git a/lib/edu/mit/broad/dcp/message/DistributedMessage.java 
b/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java similarity index 100% rename from lib/edu/mit/broad/dcp/message/DistributedMessage.java rename to java/lib/edu/mit/broad/dcp/message/DistributedMessage.java diff --git a/lib/edu/mit/broad/picard/PicardException.java b/java/lib/edu/mit/broad/picard/PicardException.java similarity index 100% rename from lib/edu/mit/broad/picard/PicardException.java rename to java/lib/edu/mit/broad/picard/PicardException.java diff --git a/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java b/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java rename to java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java diff --git a/lib/edu/mit/broad/picard/aligner/Aligner.java b/java/lib/edu/mit/broad/picard/aligner/Aligner.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/Aligner.java rename to java/lib/edu/mit/broad/picard/aligner/Aligner.java diff --git a/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java b/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java rename to java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java diff --git a/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java b/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java rename to java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java diff --git a/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java rename to java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java diff --git a/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java 
b/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java rename to java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java diff --git a/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java rename to java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java diff --git a/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java b/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java similarity index 100% rename from lib/edu/mit/broad/picard/aligner/maq/RunMaq.java rename to java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java rename to java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/CommandLineParser.java rename to java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java rename to java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java rename to 
java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java diff --git a/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java rename to java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java diff --git a/lib/edu/mit/broad/picard/cmdline/Option.java b/java/lib/edu/mit/broad/picard/cmdline/Option.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/Option.java rename to java/lib/edu/mit/broad/picard/cmdline/Option.java diff --git a/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java b/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/PositionalArguments.java rename to java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java diff --git a/lib/edu/mit/broad/picard/cmdline/Usage.java b/java/lib/edu/mit/broad/picard/cmdline/Usage.java similarity index 100% rename from lib/edu/mit/broad/picard/cmdline/Usage.java rename to java/lib/edu/mit/broad/picard/cmdline/Usage.java diff --git a/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java b/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java rename to java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java diff --git a/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java b/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java rename to java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java diff --git a/lib/edu/mit/broad/picard/directed/GenomeMask.java b/java/lib/edu/mit/broad/picard/directed/GenomeMask.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/GenomeMask.java rename to 
java/lib/edu/mit/broad/picard/directed/GenomeMask.java diff --git a/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java b/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java rename to java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java diff --git a/lib/edu/mit/broad/picard/directed/HsMetrics.java b/java/lib/edu/mit/broad/picard/directed/HsMetrics.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/HsMetrics.java rename to java/lib/edu/mit/broad/picard/directed/HsMetrics.java diff --git a/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java b/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java rename to java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java diff --git a/lib/edu/mit/broad/picard/directed/IntervalList.java b/java/lib/edu/mit/broad/picard/directed/IntervalList.java similarity index 100% rename from lib/edu/mit/broad/picard/directed/IntervalList.java rename to java/lib/edu/mit/broad/picard/directed/IntervalList.java diff --git a/lib/edu/mit/broad/picard/filter/AggregateFilter.java b/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java similarity index 100% rename from lib/edu/mit/broad/picard/filter/AggregateFilter.java rename to java/lib/edu/mit/broad/picard/filter/AggregateFilter.java diff --git a/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java b/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java similarity index 100% rename from lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java rename to java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java diff --git a/lib/edu/mit/broad/picard/filter/FilteringIterator.java b/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java similarity index 100% rename from 
lib/edu/mit/broad/picard/filter/FilteringIterator.java rename to java/lib/edu/mit/broad/picard/filter/FilteringIterator.java diff --git a/lib/edu/mit/broad/picard/filter/SamRecordFilter.java b/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java similarity index 100% rename from lib/edu/mit/broad/picard/filter/SamRecordFilter.java rename to java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java diff --git a/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java b/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java similarity index 100% rename from lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java rename to java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java diff --git a/lib/edu/mit/broad/picard/filter/TagFilter.java b/java/lib/edu/mit/broad/picard/filter/TagFilter.java similarity index 100% rename from lib/edu/mit/broad/picard/filter/TagFilter.java rename to java/lib/edu/mit/broad/picard/filter/TagFilter.java diff --git a/lib/edu/mit/broad/picard/genotype/GeliException.java b/java/lib/edu/mit/broad/picard/genotype/GeliException.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/GeliException.java rename to java/lib/edu/mit/broad/picard/genotype/GeliException.java diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/GeliFileConstants.java rename to java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileReader.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/GeliFileReader.java rename to java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java similarity index 100% rename 
from lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java rename to java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java diff --git a/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/GeliFileWriter.java rename to java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java diff --git a/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java b/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java rename to java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java diff --git a/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java b/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java rename to java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java diff --git a/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java rename to java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java diff --git a/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java b/java/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java rename to java/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java diff --git a/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java b/java/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java rename to 
java/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java diff --git a/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java rename to java/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java diff --git a/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java b/java/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java rename to java/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java diff --git a/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java rename to java/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java diff --git a/lib/edu/mit/broad/picard/illumina/BustardFileParser.java b/java/lib/edu/mit/broad/picard/illumina/BustardFileParser.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/BustardFileParser.java rename to java/lib/edu/mit/broad/picard/illumina/BustardFileParser.java diff --git a/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java b/java/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java rename to java/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java diff --git a/lib/edu/mit/broad/picard/illumina/BustardReadData.java b/java/lib/edu/mit/broad/picard/illumina/BustardReadData.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/BustardReadData.java rename to java/lib/edu/mit/broad/picard/illumina/BustardReadData.java 
diff --git a/lib/edu/mit/broad/picard/illumina/BustardToSam.java b/java/lib/edu/mit/broad/picard/illumina/BustardToSam.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/BustardToSam.java rename to java/lib/edu/mit/broad/picard/illumina/BustardToSam.java diff --git a/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java b/java/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java rename to java/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java diff --git a/lib/edu/mit/broad/picard/illumina/GeraldParser.java b/java/lib/edu/mit/broad/picard/illumina/GeraldParser.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/GeraldParser.java rename to java/lib/edu/mit/broad/picard/illumina/GeraldParser.java diff --git a/lib/edu/mit/broad/picard/illumina/GeraldParserFactory.java b/java/lib/edu/mit/broad/picard/illumina/GeraldParserFactory.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/GeraldParserFactory.java rename to java/lib/edu/mit/broad/picard/illumina/GeraldParserFactory.java diff --git a/lib/edu/mit/broad/picard/illumina/GeraldToSam.java b/java/lib/edu/mit/broad/picard/illumina/GeraldToSam.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/GeraldToSam.java rename to java/lib/edu/mit/broad/picard/illumina/GeraldToSam.java diff --git a/lib/edu/mit/broad/picard/illumina/SimpleMapping.java b/java/lib/edu/mit/broad/picard/illumina/SimpleMapping.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/SimpleMapping.java rename to java/lib/edu/mit/broad/picard/illumina/SimpleMapping.java diff --git a/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java b/java/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java rename to 
java/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java diff --git a/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java b/java/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java similarity index 100% rename from lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java rename to java/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java diff --git a/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java b/java/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java similarity index 100% rename from lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java rename to java/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java diff --git a/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java b/java/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java similarity index 100% rename from lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java rename to java/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java diff --git a/lib/edu/mit/broad/picard/importer/genotype/SNP.java b/java/lib/edu/mit/broad/picard/importer/genotype/SNP.java similarity index 100% rename from lib/edu/mit/broad/picard/importer/genotype/SNP.java rename to java/lib/edu/mit/broad/picard/importer/genotype/SNP.java diff --git a/lib/edu/mit/broad/picard/io/IoUtil.java b/java/lib/edu/mit/broad/picard/io/IoUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/io/IoUtil.java rename to java/lib/edu/mit/broad/picard/io/IoUtil.java diff --git a/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java b/java/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java similarity index 100% rename from lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java rename to java/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java diff --git a/lib/edu/mit/broad/picard/metrics/Header.java b/java/lib/edu/mit/broad/picard/metrics/Header.java similarity index 100% rename from 
lib/edu/mit/broad/picard/metrics/Header.java rename to java/lib/edu/mit/broad/picard/metrics/Header.java diff --git a/lib/edu/mit/broad/picard/metrics/MetricBase.java b/java/lib/edu/mit/broad/picard/metrics/MetricBase.java similarity index 100% rename from lib/edu/mit/broad/picard/metrics/MetricBase.java rename to java/lib/edu/mit/broad/picard/metrics/MetricBase.java diff --git a/lib/edu/mit/broad/picard/metrics/MetricCollector.java b/java/lib/edu/mit/broad/picard/metrics/MetricCollector.java similarity index 100% rename from lib/edu/mit/broad/picard/metrics/MetricCollector.java rename to java/lib/edu/mit/broad/picard/metrics/MetricCollector.java diff --git a/lib/edu/mit/broad/picard/metrics/MetricsFile.java b/java/lib/edu/mit/broad/picard/metrics/MetricsFile.java similarity index 100% rename from lib/edu/mit/broad/picard/metrics/MetricsFile.java rename to java/lib/edu/mit/broad/picard/metrics/MetricsFile.java diff --git a/lib/edu/mit/broad/picard/metrics/StringHeader.java b/java/lib/edu/mit/broad/picard/metrics/StringHeader.java similarity index 100% rename from lib/edu/mit/broad/picard/metrics/StringHeader.java rename to java/lib/edu/mit/broad/picard/metrics/StringHeader.java diff --git a/lib/edu/mit/broad/picard/metrics/VersionHeader.java b/java/lib/edu/mit/broad/picard/metrics/VersionHeader.java similarity index 100% rename from lib/edu/mit/broad/picard/metrics/VersionHeader.java rename to java/lib/edu/mit/broad/picard/metrics/VersionHeader.java diff --git a/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java b/java/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java similarity index 100% rename from lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java rename to java/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java diff --git a/lib/edu/mit/broad/picard/quality/QualityScoreCalibrator.java b/java/lib/edu/mit/broad/picard/quality/QualityScoreCalibrator.java similarity index 100% rename from 
lib/edu/mit/broad/picard/quality/QualityScoreCalibrator.java rename to java/lib/edu/mit/broad/picard/quality/QualityScoreCalibrator.java diff --git a/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java b/java/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java similarity index 100% rename from lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java rename to java/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java diff --git a/lib/edu/mit/broad/picard/reference/FastaSequenceFile.java b/java/lib/edu/mit/broad/picard/reference/FastaSequenceFile.java similarity index 100% rename from lib/edu/mit/broad/picard/reference/FastaSequenceFile.java rename to java/lib/edu/mit/broad/picard/reference/FastaSequenceFile.java diff --git a/lib/edu/mit/broad/picard/reference/ReferenceSequence.java b/java/lib/edu/mit/broad/picard/reference/ReferenceSequence.java similarity index 100% rename from lib/edu/mit/broad/picard/reference/ReferenceSequence.java rename to java/lib/edu/mit/broad/picard/reference/ReferenceSequence.java diff --git a/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java b/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java similarity index 100% rename from lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java rename to java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java diff --git a/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java b/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java similarity index 100% rename from lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java rename to java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java diff --git a/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java b/java/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java rename to 
java/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java diff --git a/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java b/java/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java rename to java/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java diff --git a/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java b/java/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java rename to java/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java diff --git a/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java b/java/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java rename to java/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java diff --git a/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java b/java/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/DuplicationMetrics.java rename to java/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java diff --git a/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java b/java/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java rename to java/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java diff --git a/lib/edu/mit/broad/picard/sam/MarkDuplicates.java b/java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/MarkDuplicates.java rename to java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java diff --git a/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java b/java/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java similarity index 100% rename from 
lib/edu/mit/broad/picard/sam/MarkDuplicates2.java rename to java/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java diff --git a/lib/edu/mit/broad/picard/sam/MergeSamFiles.java b/java/lib/edu/mit/broad/picard/sam/MergeSamFiles.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/MergeSamFiles.java rename to java/lib/edu/mit/broad/picard/sam/MergeSamFiles.java diff --git a/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java b/java/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java rename to java/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java diff --git a/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java b/java/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/ReservedTagConstants.java rename to java/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java diff --git a/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java b/java/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java rename to java/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java diff --git a/lib/edu/mit/broad/picard/sam/SamLocusIterator.java b/java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java similarity index 100% rename from lib/edu/mit/broad/picard/sam/SamLocusIterator.java rename to java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java diff --git a/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java b/java/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java similarity index 100% rename from lib/edu/mit/broad/picard/util/AbstractTextFileParser.java rename to java/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java diff --git a/lib/edu/mit/broad/picard/util/ArrayUtil.java b/java/lib/edu/mit/broad/picard/util/ArrayUtil.java similarity index 100% rename from 
lib/edu/mit/broad/picard/util/ArrayUtil.java rename to java/lib/edu/mit/broad/picard/util/ArrayUtil.java diff --git a/lib/edu/mit/broad/picard/util/BasicTextFileParser.java b/java/lib/edu/mit/broad/picard/util/BasicTextFileParser.java similarity index 100% rename from lib/edu/mit/broad/picard/util/BasicTextFileParser.java rename to java/lib/edu/mit/broad/picard/util/BasicTextFileParser.java diff --git a/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java b/java/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java similarity index 100% rename from lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java rename to java/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java diff --git a/lib/edu/mit/broad/picard/util/CloserUtil.java b/java/lib/edu/mit/broad/picard/util/CloserUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/util/CloserUtil.java rename to java/lib/edu/mit/broad/picard/util/CloserUtil.java diff --git a/lib/edu/mit/broad/picard/util/CoordMath.java b/java/lib/edu/mit/broad/picard/util/CoordMath.java similarity index 100% rename from lib/edu/mit/broad/picard/util/CoordMath.java rename to java/lib/edu/mit/broad/picard/util/CoordMath.java diff --git a/lib/edu/mit/broad/picard/util/Coverage.java b/java/lib/edu/mit/broad/picard/util/Coverage.java similarity index 100% rename from lib/edu/mit/broad/picard/util/Coverage.java rename to java/lib/edu/mit/broad/picard/util/Coverage.java diff --git a/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java b/java/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java similarity index 100% rename from lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java rename to java/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java diff --git a/lib/edu/mit/broad/picard/util/FormatUtil.java b/java/lib/edu/mit/broad/picard/util/FormatUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/util/FormatUtil.java rename to 
java/lib/edu/mit/broad/picard/util/FormatUtil.java diff --git a/lib/edu/mit/broad/picard/util/Histogram.java b/java/lib/edu/mit/broad/picard/util/Histogram.java similarity index 100% rename from lib/edu/mit/broad/picard/util/Histogram.java rename to java/lib/edu/mit/broad/picard/util/Histogram.java diff --git a/lib/edu/mit/broad/picard/util/Interval.java b/java/lib/edu/mit/broad/picard/util/Interval.java similarity index 100% rename from lib/edu/mit/broad/picard/util/Interval.java rename to java/lib/edu/mit/broad/picard/util/Interval.java diff --git a/lib/edu/mit/broad/picard/util/IntervalTree.java b/java/lib/edu/mit/broad/picard/util/IntervalTree.java similarity index 100% rename from lib/edu/mit/broad/picard/util/IntervalTree.java rename to java/lib/edu/mit/broad/picard/util/IntervalTree.java diff --git a/lib/edu/mit/broad/picard/util/ListMap.java b/java/lib/edu/mit/broad/picard/util/ListMap.java similarity index 100% rename from lib/edu/mit/broad/picard/util/ListMap.java rename to java/lib/edu/mit/broad/picard/util/ListMap.java diff --git a/lib/edu/mit/broad/picard/util/Log.java b/java/lib/edu/mit/broad/picard/util/Log.java similarity index 100% rename from lib/edu/mit/broad/picard/util/Log.java rename to java/lib/edu/mit/broad/picard/util/Log.java diff --git a/lib/edu/mit/broad/picard/util/MathUtil.java b/java/lib/edu/mit/broad/picard/util/MathUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/util/MathUtil.java rename to java/lib/edu/mit/broad/picard/util/MathUtil.java diff --git a/lib/edu/mit/broad/picard/util/OverlapDetector.java b/java/lib/edu/mit/broad/picard/util/OverlapDetector.java similarity index 100% rename from lib/edu/mit/broad/picard/util/OverlapDetector.java rename to java/lib/edu/mit/broad/picard/util/OverlapDetector.java diff --git a/lib/edu/mit/broad/picard/util/PasteParser.java b/java/lib/edu/mit/broad/picard/util/PasteParser.java similarity index 100% rename from lib/edu/mit/broad/picard/util/PasteParser.java rename to 
java/lib/edu/mit/broad/picard/util/PasteParser.java diff --git a/lib/edu/mit/broad/picard/util/PeekableIterator.java b/java/lib/edu/mit/broad/picard/util/PeekableIterator.java similarity index 100% rename from lib/edu/mit/broad/picard/util/PeekableIterator.java rename to java/lib/edu/mit/broad/picard/util/PeekableIterator.java diff --git a/lib/edu/mit/broad/picard/util/ProcessExecutor.java b/java/lib/edu/mit/broad/picard/util/ProcessExecutor.java similarity index 100% rename from lib/edu/mit/broad/picard/util/ProcessExecutor.java rename to java/lib/edu/mit/broad/picard/util/ProcessExecutor.java diff --git a/lib/edu/mit/broad/picard/util/RExecutor.java b/java/lib/edu/mit/broad/picard/util/RExecutor.java similarity index 100% rename from lib/edu/mit/broad/picard/util/RExecutor.java rename to java/lib/edu/mit/broad/picard/util/RExecutor.java diff --git a/lib/edu/mit/broad/picard/util/SamPairUtil.java b/java/lib/edu/mit/broad/picard/util/SamPairUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/util/SamPairUtil.java rename to java/lib/edu/mit/broad/picard/util/SamPairUtil.java diff --git a/lib/edu/mit/broad/picard/util/SequenceUtil.java b/java/lib/edu/mit/broad/picard/util/SequenceUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/util/SequenceUtil.java rename to java/lib/edu/mit/broad/picard/util/SequenceUtil.java diff --git a/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java b/java/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java similarity index 100% rename from lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java rename to java/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java diff --git a/lib/edu/mit/broad/picard/util/StringUtil.java b/java/lib/edu/mit/broad/picard/util/StringUtil.java similarity index 100% rename from lib/edu/mit/broad/picard/util/StringUtil.java rename to java/lib/edu/mit/broad/picard/util/StringUtil.java diff --git 
a/lib/edu/mit/broad/picard/util/TabbedTextFileParser.java b/java/lib/edu/mit/broad/picard/util/TabbedTextFileParser.java similarity index 100% rename from lib/edu/mit/broad/picard/util/TabbedTextFileParser.java rename to java/lib/edu/mit/broad/picard/util/TabbedTextFileParser.java diff --git a/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java b/java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java rename to java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java diff --git a/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java b/java/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/DbSnpFileReader.java rename to java/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java diff --git a/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java b/java/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java rename to java/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java diff --git a/lib/edu/mit/broad/picard/variation/KnownVariant.java b/java/lib/edu/mit/broad/picard/variation/KnownVariant.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/KnownVariant.java rename to java/lib/edu/mit/broad/picard/variation/KnownVariant.java diff --git a/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java b/java/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/KnownVariantCodec.java rename to java/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java diff --git a/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java b/java/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/KnownVariantIterator.java 
rename to java/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java diff --git a/lib/edu/mit/broad/picard/variation/VariantType.java b/java/lib/edu/mit/broad/picard/variation/VariantType.java similarity index 100% rename from lib/edu/mit/broad/picard/variation/VariantType.java rename to java/lib/edu/mit/broad/picard/variation/VariantType.java diff --git a/lib/edu/mit/broad/sam/AlignmentBlock.java b/java/lib/edu/mit/broad/sam/AlignmentBlock.java similarity index 100% rename from lib/edu/mit/broad/sam/AlignmentBlock.java rename to java/lib/edu/mit/broad/sam/AlignmentBlock.java diff --git a/lib/edu/mit/broad/sam/BAMFileConstants.java b/java/lib/edu/mit/broad/sam/BAMFileConstants.java similarity index 100% rename from lib/edu/mit/broad/sam/BAMFileConstants.java rename to java/lib/edu/mit/broad/sam/BAMFileConstants.java diff --git a/lib/edu/mit/broad/sam/BAMFileIndex.java b/java/lib/edu/mit/broad/sam/BAMFileIndex.java similarity index 100% rename from lib/edu/mit/broad/sam/BAMFileIndex.java rename to java/lib/edu/mit/broad/sam/BAMFileIndex.java diff --git a/lib/edu/mit/broad/sam/BAMFileReader.java b/java/lib/edu/mit/broad/sam/BAMFileReader.java similarity index 100% rename from lib/edu/mit/broad/sam/BAMFileReader.java rename to java/lib/edu/mit/broad/sam/BAMFileReader.java diff --git a/lib/edu/mit/broad/sam/BAMFileWriter.java b/java/lib/edu/mit/broad/sam/BAMFileWriter.java similarity index 100% rename from lib/edu/mit/broad/sam/BAMFileWriter.java rename to java/lib/edu/mit/broad/sam/BAMFileWriter.java diff --git a/lib/edu/mit/broad/sam/BAMRecord.java b/java/lib/edu/mit/broad/sam/BAMRecord.java similarity index 100% rename from lib/edu/mit/broad/sam/BAMRecord.java rename to java/lib/edu/mit/broad/sam/BAMRecord.java diff --git a/lib/edu/mit/broad/sam/BAMRecordCodec.java b/java/lib/edu/mit/broad/sam/BAMRecordCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/BAMRecordCodec.java rename to java/lib/edu/mit/broad/sam/BAMRecordCodec.java diff --git 
a/lib/edu/mit/broad/sam/BinaryCigarCodec.java b/java/lib/edu/mit/broad/sam/BinaryCigarCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/BinaryCigarCodec.java rename to java/lib/edu/mit/broad/sam/BinaryCigarCodec.java diff --git a/lib/edu/mit/broad/sam/BinaryTagCodec.java b/java/lib/edu/mit/broad/sam/BinaryTagCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/BinaryTagCodec.java rename to java/lib/edu/mit/broad/sam/BinaryTagCodec.java diff --git a/lib/edu/mit/broad/sam/Cigar.java b/java/lib/edu/mit/broad/sam/Cigar.java similarity index 100% rename from lib/edu/mit/broad/sam/Cigar.java rename to java/lib/edu/mit/broad/sam/Cigar.java diff --git a/lib/edu/mit/broad/sam/CigarElement.java b/java/lib/edu/mit/broad/sam/CigarElement.java similarity index 100% rename from lib/edu/mit/broad/sam/CigarElement.java rename to java/lib/edu/mit/broad/sam/CigarElement.java diff --git a/lib/edu/mit/broad/sam/CigarOperator.java b/java/lib/edu/mit/broad/sam/CigarOperator.java similarity index 100% rename from lib/edu/mit/broad/sam/CigarOperator.java rename to java/lib/edu/mit/broad/sam/CigarOperator.java diff --git a/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java b/java/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java similarity index 100% rename from lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java rename to java/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java diff --git a/lib/edu/mit/broad/sam/SAMFileHeader.java b/java/lib/edu/mit/broad/sam/SAMFileHeader.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMFileHeader.java rename to java/lib/edu/mit/broad/sam/SAMFileHeader.java diff --git a/lib/edu/mit/broad/sam/SAMFileReader.java b/java/lib/edu/mit/broad/sam/SAMFileReader.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMFileReader.java rename to java/lib/edu/mit/broad/sam/SAMFileReader.java diff --git a/lib/edu/mit/broad/sam/SAMFileWriter.java b/java/lib/edu/mit/broad/sam/SAMFileWriter.java similarity 
index 100% rename from lib/edu/mit/broad/sam/SAMFileWriter.java rename to java/lib/edu/mit/broad/sam/SAMFileWriter.java diff --git a/lib/edu/mit/broad/sam/SAMFileWriterFactory.java b/java/lib/edu/mit/broad/sam/SAMFileWriterFactory.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMFileWriterFactory.java rename to java/lib/edu/mit/broad/sam/SAMFileWriterFactory.java diff --git a/lib/edu/mit/broad/sam/SAMFileWriterImpl.java b/java/lib/edu/mit/broad/sam/SAMFileWriterImpl.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMFileWriterImpl.java rename to java/lib/edu/mit/broad/sam/SAMFileWriterImpl.java diff --git a/lib/edu/mit/broad/sam/SAMFormatException.java b/java/lib/edu/mit/broad/sam/SAMFormatException.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMFormatException.java rename to java/lib/edu/mit/broad/sam/SAMFormatException.java diff --git a/lib/edu/mit/broad/sam/SAMLocusIterator.java b/java/lib/edu/mit/broad/sam/SAMLocusIterator.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMLocusIterator.java rename to java/lib/edu/mit/broad/sam/SAMLocusIterator.java diff --git a/lib/edu/mit/broad/sam/SAMProgramRecord.java b/java/lib/edu/mit/broad/sam/SAMProgramRecord.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMProgramRecord.java rename to java/lib/edu/mit/broad/sam/SAMProgramRecord.java diff --git a/lib/edu/mit/broad/sam/SAMReadGroupRecord.java b/java/lib/edu/mit/broad/sam/SAMReadGroupRecord.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMReadGroupRecord.java rename to java/lib/edu/mit/broad/sam/SAMReadGroupRecord.java diff --git a/lib/edu/mit/broad/sam/SAMRecord.java b/java/lib/edu/mit/broad/sam/SAMRecord.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMRecord.java rename to java/lib/edu/mit/broad/sam/SAMRecord.java diff --git a/lib/edu/mit/broad/sam/SAMRecordComparator.java b/java/lib/edu/mit/broad/sam/SAMRecordComparator.java similarity index 100% rename from 
lib/edu/mit/broad/sam/SAMRecordComparator.java rename to java/lib/edu/mit/broad/sam/SAMRecordComparator.java diff --git a/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java b/java/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java rename to java/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java diff --git a/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java b/java/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java rename to java/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java diff --git a/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java b/java/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMRecordSetBuilder.java rename to java/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java diff --git a/lib/edu/mit/broad/sam/SAMSequenceRecord.java b/java/lib/edu/mit/broad/sam/SAMSequenceRecord.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMSequenceRecord.java rename to java/lib/edu/mit/broad/sam/SAMSequenceRecord.java diff --git a/lib/edu/mit/broad/sam/SAMTag.java b/java/lib/edu/mit/broad/sam/SAMTag.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMTag.java rename to java/lib/edu/mit/broad/sam/SAMTag.java diff --git a/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java b/java/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMTextHeaderCodec.java rename to java/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java diff --git a/lib/edu/mit/broad/sam/SAMTextReader.java b/java/lib/edu/mit/broad/sam/SAMTextReader.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMTextReader.java rename to java/lib/edu/mit/broad/sam/SAMTextReader.java diff --git a/lib/edu/mit/broad/sam/SAMTextWriter.java b/java/lib/edu/mit/broad/sam/SAMTextWriter.java 
similarity index 100% rename from lib/edu/mit/broad/sam/SAMTextWriter.java rename to java/lib/edu/mit/broad/sam/SAMTextWriter.java diff --git a/lib/edu/mit/broad/sam/SAMTools.java b/java/lib/edu/mit/broad/sam/SAMTools.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMTools.java rename to java/lib/edu/mit/broad/sam/SAMTools.java diff --git a/lib/edu/mit/broad/sam/SAMUtils.java b/java/lib/edu/mit/broad/sam/SAMUtils.java similarity index 100% rename from lib/edu/mit/broad/sam/SAMUtils.java rename to java/lib/edu/mit/broad/sam/SAMUtils.java diff --git a/lib/edu/mit/broad/sam/TextCigarCodec.java b/java/lib/edu/mit/broad/sam/TextCigarCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/TextCigarCodec.java rename to java/lib/edu/mit/broad/sam/TextCigarCodec.java diff --git a/lib/edu/mit/broad/sam/TextTagCodec.java b/java/lib/edu/mit/broad/sam/TextTagCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/TextTagCodec.java rename to java/lib/edu/mit/broad/sam/TextTagCodec.java diff --git a/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java b/java/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/AccumulateCoverage.java rename to java/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java diff --git a/lib/edu/mit/broad/sam/apps/CompareSAMs.java b/java/lib/edu/mit/broad/sam/apps/CompareSAMs.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/CompareSAMs.java rename to java/lib/edu/mit/broad/sam/apps/CompareSAMs.java diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java rename to java/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/AlleleCaller.java 
b/java/lib/edu/mit/broad/sam/apps/allelecaller/AlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/allelecaller/AlleleCaller.java rename to java/lib/edu/mit/broad/sam/apps/allelecaller/AlleleCaller.java diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java rename to java/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java rename to java/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java rename to java/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java diff --git a/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java similarity index 100% rename from lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java rename to java/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java diff --git a/lib/edu/mit/broad/sam/util/AsciiLineReader.java b/java/lib/edu/mit/broad/sam/util/AsciiLineReader.java similarity index 100% rename from lib/edu/mit/broad/sam/util/AsciiLineReader.java rename to java/lib/edu/mit/broad/sam/util/AsciiLineReader.java diff --git a/lib/edu/mit/broad/sam/util/AsciiWriter.java b/java/lib/edu/mit/broad/sam/util/AsciiWriter.java similarity index 100% rename from lib/edu/mit/broad/sam/util/AsciiWriter.java rename 
to java/lib/edu/mit/broad/sam/util/AsciiWriter.java diff --git a/lib/edu/mit/broad/sam/util/BinaryCodec.java b/java/lib/edu/mit/broad/sam/util/BinaryCodec.java similarity index 100% rename from lib/edu/mit/broad/sam/util/BinaryCodec.java rename to java/lib/edu/mit/broad/sam/util/BinaryCodec.java diff --git a/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java b/java/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java similarity index 100% rename from lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java rename to java/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java diff --git a/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java b/java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java similarity index 100% rename from lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java rename to java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java diff --git a/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java b/java/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java similarity index 100% rename from lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java rename to java/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java diff --git a/lib/edu/mit/broad/sam/util/CloseableIterator.java b/java/lib/edu/mit/broad/sam/util/CloseableIterator.java similarity index 100% rename from lib/edu/mit/broad/sam/util/CloseableIterator.java rename to java/lib/edu/mit/broad/sam/util/CloseableIterator.java diff --git a/lib/edu/mit/broad/sam/util/CoordMath.java b/java/lib/edu/mit/broad/sam/util/CoordMath.java similarity index 100% rename from lib/edu/mit/broad/sam/util/CoordMath.java rename to java/lib/edu/mit/broad/sam/util/CoordMath.java diff --git a/lib/edu/mit/broad/sam/util/LineReader.java b/java/lib/edu/mit/broad/sam/util/LineReader.java similarity index 100% rename from lib/edu/mit/broad/sam/util/LineReader.java rename to java/lib/edu/mit/broad/sam/util/LineReader.java diff --git 
a/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java b/java/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java similarity index 100% rename from lib/edu/mit/broad/sam/util/NonDestructiveIterator.java rename to java/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java diff --git a/lib/edu/mit/broad/sam/util/PeekIterator.java b/java/lib/edu/mit/broad/sam/util/PeekIterator.java similarity index 100% rename from lib/edu/mit/broad/sam/util/PeekIterator.java rename to java/lib/edu/mit/broad/sam/util/PeekIterator.java diff --git a/lib/edu/mit/broad/sam/util/RuntimeEOFException.java b/java/lib/edu/mit/broad/sam/util/RuntimeEOFException.java similarity index 100% rename from lib/edu/mit/broad/sam/util/RuntimeEOFException.java rename to java/lib/edu/mit/broad/sam/util/RuntimeEOFException.java diff --git a/lib/edu/mit/broad/sam/util/RuntimeIOException.java b/java/lib/edu/mit/broad/sam/util/RuntimeIOException.java similarity index 100% rename from lib/edu/mit/broad/sam/util/RuntimeIOException.java rename to java/lib/edu/mit/broad/sam/util/RuntimeIOException.java diff --git a/lib/edu/mit/broad/sam/util/SortingCollection.java b/java/lib/edu/mit/broad/sam/util/SortingCollection.java similarity index 100% rename from lib/edu/mit/broad/sam/util/SortingCollection.java rename to java/lib/edu/mit/broad/sam/util/SortingCollection.java diff --git a/lib/edu/mit/broad/sam/util/StringLineReader.java b/java/lib/edu/mit/broad/sam/util/StringLineReader.java similarity index 100% rename from lib/edu/mit/broad/sam/util/StringLineReader.java rename to java/lib/edu/mit/broad/sam/util/StringLineReader.java diff --git a/lib/edu/mit/broad/sam/util/StringUtil.java b/java/lib/edu/mit/broad/sam/util/StringUtil.java similarity index 100% rename from lib/edu/mit/broad/sam/util/StringUtil.java rename to java/lib/edu/mit/broad/sam/util/StringUtil.java diff --git a/src/edu/mit/broad/sting/ValidateSAM.java b/java/src/edu/mit/broad/sting/ValidateSAM.java similarity index 100% rename from 
src/edu/mit/broad/sting/ValidateSAM.java rename to java/src/edu/mit/broad/sting/ValidateSAM.java diff --git a/src/edu/mit/broad/sting/atk/AnalysisTK.java b/java/src/edu/mit/broad/sting/atk/AnalysisTK.java similarity index 100% rename from src/edu/mit/broad/sting/atk/AnalysisTK.java rename to java/src/edu/mit/broad/sting/atk/AnalysisTK.java diff --git a/src/edu/mit/broad/sting/atk/LocusContext.java b/java/src/edu/mit/broad/sting/atk/LocusContext.java similarity index 100% rename from src/edu/mit/broad/sting/atk/LocusContext.java rename to java/src/edu/mit/broad/sting/atk/LocusContext.java diff --git a/src/edu/mit/broad/sting/atk/LocusIterator.java b/java/src/edu/mit/broad/sting/atk/LocusIterator.java similarity index 100% rename from src/edu/mit/broad/sting/atk/LocusIterator.java rename to java/src/edu/mit/broad/sting/atk/LocusIterator.java diff --git a/src/edu/mit/broad/sting/atk/LocusWalker.java b/java/src/edu/mit/broad/sting/atk/LocusWalker.java similarity index 100% rename from src/edu/mit/broad/sting/atk/LocusWalker.java rename to java/src/edu/mit/broad/sting/atk/LocusWalker.java diff --git a/src/edu/mit/broad/sting/atk/ReadWalker.java b/java/src/edu/mit/broad/sting/atk/ReadWalker.java similarity index 100% rename from src/edu/mit/broad/sting/atk/ReadWalker.java rename to java/src/edu/mit/broad/sting/atk/ReadWalker.java diff --git a/src/edu/mit/broad/sting/atk/TraversalEngine.java b/java/src/edu/mit/broad/sting/atk/TraversalEngine.java similarity index 100% rename from src/edu/mit/broad/sting/atk/TraversalEngine.java rename to java/src/edu/mit/broad/sting/atk/TraversalEngine.java diff --git a/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java b/java/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java similarity index 100% rename from src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java rename to java/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java diff --git 
a/src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java b/java/src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java similarity index 100% rename from src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java rename to java/src/edu/mit/broad/sting/atk/modules/EmptyLocusWalker.java diff --git a/src/edu/mit/broad/sting/atk/modules/EmptyReadWalker.java b/java/src/edu/mit/broad/sting/atk/modules/EmptyReadWalker.java similarity index 100% rename from src/edu/mit/broad/sting/atk/modules/EmptyReadWalker.java rename to java/src/edu/mit/broad/sting/atk/modules/EmptyReadWalker.java diff --git a/src/edu/mit/broad/sting/atk/modules/PileupWalker.java b/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java similarity index 100% rename from src/edu/mit/broad/sting/atk/modules/PileupWalker.java rename to java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java diff --git a/src/edu/mit/broad/sting/utils/EndlessIterator.java b/java/src/edu/mit/broad/sting/utils/EndlessIterator.java similarity index 100% rename from src/edu/mit/broad/sting/utils/EndlessIterator.java rename to java/src/edu/mit/broad/sting/utils/EndlessIterator.java diff --git a/src/edu/mit/broad/sting/utils/Predicate.java b/java/src/edu/mit/broad/sting/utils/Predicate.java similarity index 100% rename from src/edu/mit/broad/sting/utils/Predicate.java rename to java/src/edu/mit/broad/sting/utils/Predicate.java diff --git a/src/edu/mit/broad/sting/utils/PushbackIterator.java b/java/src/edu/mit/broad/sting/utils/PushbackIterator.java similarity index 100% rename from src/edu/mit/broad/sting/utils/PushbackIterator.java rename to java/src/edu/mit/broad/sting/utils/PushbackIterator.java diff --git a/src/edu/mit/broad/sting/utils/ReferenceIterator.java b/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java similarity index 100% rename from src/edu/mit/broad/sting/utils/ReferenceIterator.java rename to java/src/edu/mit/broad/sting/utils/ReferenceIterator.java diff --git 
a/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java similarity index 100% rename from src/edu/mit/broad/sting/utils/ReferenceOrderedData.java rename to java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java diff --git a/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java similarity index 100% rename from src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java rename to java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java diff --git a/src/edu/mit/broad/sting/utils/Utils.java b/java/src/edu/mit/broad/sting/utils/Utils.java similarity index 100% rename from src/edu/mit/broad/sting/utils/Utils.java rename to java/src/edu/mit/broad/sting/utils/Utils.java diff --git a/src/scripts/TraverseTest.sh b/java/src/scripts/TraverseTest.sh similarity index 100% rename from src/scripts/TraverseTest.sh rename to java/src/scripts/TraverseTest.sh diff --git a/src/scripts/TraverseTestProf.sh b/java/src/scripts/TraverseTestProf.sh similarity index 100% rename from src/scripts/TraverseTestProf.sh rename to java/src/scripts/TraverseTestProf.sh diff --git a/python/EvalMapping.py b/python/EvalMapping.py new file mode 100755 index 000000000..0137c4d5f --- /dev/null +++ b/python/EvalMapping.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python + +# Evaluates mapping results produced my RunMapper.py by comparig +# the result to the original simulated reads in a sam file + +import getopt, sys, os, re +from operator import attrgetter, itemgetter +from subprocess import Popen, STDOUT, PIPE +from SAM import SAMIO +from pushback_file import pushback_file +from qltout import qltout_file +from aln_file import aln_file + + +#!/usr/bin/env python + +import string, sys + +class SAMRecordWrapper: + def __init__(self, rec): + #print 'Rec', rec + self.rec = rec + + def id(self): return self.rec.getQName() + def contig(self): return self.rec.getRname() + def 
pos(self): return self.rec.getPos() + def offset(self): return self.pos()-1 + def map_qual(self): return self.rec.getMapq() + + def __str__(self): return str(self.rec) + +def SAMIOWrapper( file ): + print 'SAMIOWrapper', file + return SAMIO( file, debugging=True, func=SAMRecordWrapper) + +class mapper_info: + def __init__(self, name, ext, file_class): + self.name = name.lower() + self.ext = ext + self.file_class = file_class + +mapper_defs = [ mapper_info("ilt", "ilt.qltout", qltout_file), + mapper_info("merlin", "qltout", qltout_file), + mapper_info("swmerlin", "qltout", qltout_file), + mapper_info("bwa", "sam", SAMIOWrapper), + mapper_info("bwa32", "sam", SAMIOWrapper), + mapper_info("maq", "aln.txt", aln_file) ] +mapper_defs = dict( [(m.name, m) for m in mapper_defs] ) + +def Comp_sam_map(comp_head, mapper, file_output, sam_indel): + + # sam_indel is a +/- number corresponding to the size of insertions / deletions + # in generated SAM file that are fixed (1,2,4,8,16...), this doesn't always + # translate to the SAM's cigar, but the full value should be used + + file_output_head = comp_head+"."+mapper.name + if file_output: + # if non-false, output to filename in file_output_head + # if "" or False, leave output going to stdout + #saveout = sys.stdout + feval = open(file_output_head+".eval", "w") + #sys.stdout = feval + + # create a file with a one-line tabular output of stats + # (for merging into Excel / R input) + fevaltab = open(file_output_head+".tab", "w") + + # if maq file, convert aln.map to aln.txt + aln_txt_filename = file_output_head+"."+mapper.ext + if mapper.name == "maq" and not os.path.exists(aln_txt_filename): + maq_exe = "/seq/dirseq/maq-0.7.1/maq" + + # SHOULD BE THIS + #cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map" + + cmd_str = maq_exe+" mapview "+file_output_head+".out.aln.map" + + print >> sys.stderr, "Executing "+cmd_str + fout = open(aln_txt_filename, "w") + p = Popen( cmd_str , shell=True, stdout=fout) + + map_filename = 
comp_head+"."+mapper.name+"."+mapper.ext + print 'map_filename', map_filename + map_file = mapper.file_class(map_filename) + + SAM_filename = comp_head+".sam" + sam_file = SAMIO(SAM_filename, debugging=True) + + #sams = [s for s in sam_file] + print "Reading map file..." + maps = [q for q in map_file] + + print "Reading SAM file..." + sams = {} + if mapper.name in ["maq", 'bwa', 'bwa32']: + for s in sam_file: + sams[s.getQName()] = s + else: + for i, s in enumerate(sam_file): + sams[i] = s + + #maps = {} + #for m in map_file: + # maps[m.id()] = m + #sams = [s for s in sam_file] + + class smq_t: + def __init__(self, sam, map, qual): + self.sam = sam # SAM record (exists for all reads) + self.map = map # Mapping record, may be None + self.qual = qual # Quality of mapping + + print "Making SMQ records..." + SMQ = [] + for maprec in maps: + samrec = sams.get(maprec.id()) # get mapping otherwise None + SMQ.append( smq_t(samrec, maprec, maprec.map_qual()) ) + + print "Sorting..." + SMQ.sort(key=attrgetter('qual'), reverse=True) + + print "Evaluating placements..." + placed = 0 + incorrectly_placed = 0 + correctly_placed = 0 + fevaltab.write( "\t".join( ["last_qual", "total_reads", "placed", "correctly_placed", "incorrectly_placed", "not_placed"] ) + "\n" ) + if len(SMQ): + last_qual = SMQ[0].qual # grab 1st qual + for smq in SMQ: + if smq.sam == None: + continue + + if smq.qual != last_qual: + total_reads = len(sams)#placed + not_placed + not_placed = len(sams) - placed + fevaltab.write( "\t".join( map(str, [last_qual, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" ) + #print "Wrote all reads with qual >= "+str(last_qual) + last_qual = smq.qual + + placed += 1 + + # Convert the CIGAR string - e.g. 75M1I, 74M2D - into an int corresponding to bases + # inserted / deleted - e.g. 
+1, -2 + #cigar = smq.sam.getCigar() + #indelled_bases = 0 + #match = re.search(r'(\d+)I', cigar) + #if match: + # indelled_bases += int(match.groups()[0]) + #match = re.search(r'(\d+)D', cigar) + #if match: + # indelled_bases -= int(match.groups()[0]) + + # We consider a read properly aligned if + # - both start positions are the same OR + # - the end position of the mapping = end position of the samq + #if smq.sam.getPos() != smq.map.pos() and smq.sam.getPos() - indelled_bases != smq.map.pos(): + + try: + map_indel_bases = smq.map.indelled_bases() # total number of indels in mapped seq + # are generally at the beginning of an alignment spanning the anchor base + except AttributeError: + map_indel_bases = 0 # Since MAQ doesn't do local alignment, we don't care + # if its sam.py interface doesn't have access to indel data since there are + # no indels in its alignmnets + + if smq.sam.getPos() != smq.map.pos() and smq.map.pos() + sam_indel + map_indel_bases != smq.sam.getPos(): + #print >> feval, "Indelled SAM bases: "+str(indelled_bases) + try: + print >> feval, "Indelled map bases: "+str(map_indel_bases) + except AttributeError: + pass + print >> feval, "SAM ref: %5s %s" % (smq.sam.getRname(), smq.sam.getPos()) + print >> feval, "Mapping: %5s %s" % (smq.map.contig(), smq.map.pos()) + print >> feval, "SAM record:\n"+str(smq.sam) + print >> feval, "Mapping record:\n"+str(smq.map)+"\n" + incorrectly_placed += 1 + else: + correctly_placed +=1 + + + #indexed_maps = [None] * len(sams) + #for q in maps: + # indexed_maps[q.id()] = q + + #def f(s,q): + # if q == None: + # return s + # else: + # return False + + #missing = filter( None, map(f, sams, indexed_maps))#, range(len(indexed_maps))) ) + + + #for miss in missing: + # print miss + #not_placed = len(missing) + #not_placed = len(sams) - len(maps) + total_reads = len(sams) #placed + not_placed + not_placed = len(sams) - placed + + print >> feval, "Total reads : "+str(total_reads) + print >> feval, "+-Placed : 
"+str(placed) + print >> feval, "| +-Correct : "+str(correctly_placed) + print >> feval, "| +-Incorrect: "+str(incorrectly_placed) + print >> feval, "+-Not placed : "+str(not_placed) + + fevaltab.write( "\t".join( map(str, [0, total_reads, placed, correctly_placed, incorrectly_placed, not_placed]) ) + "\n" ) + print "Wrote all reads with all quals" + + #if file_output_head: + # sys.stdout = saveout + +def remove_escape_codes(s): + import re + return re.sub(r'\x1b\[(\d\d)?\w', "", s) + +def usage(): + print "EvalMapping.py\n" + print "Required arguments:\n" + print " -h HEAD All input and output files start with HEAD" + print " --OR--" + print " " + print " i.e. ls *.sam | EvalMapping.py -m MAPPER" + print + print " -m MAPPER Mapping program output to compare (e.g. merlin, ILT)" + print + print "Optional arguments:\n" + print + print " -o Output to screen instead of HEAD.MAPPER.eval" + print + print "Input files:" + print " HEAD.sam" + print + print "Result files expected for Merlin and ILT:" + print " HEAD.MAPPER.qltout" + print + print "Result files expected for MAQ:" + print + sys.exit(2) + +if __name__ == "__main__": + opts = None + try: + opts, args = getopt.getopt(sys.argv[1:], "h:m:o", ["head","mapper","file-output"]) + except getopt.GetoptError: + usage() + + comp_heads = [] + mapper_str = "" + file_output = True + for opt, arg in opts: + if opt in ("-h", "--head"): + comp_heads = [arg] + if opt in ("-m", "--mapper"): + mapper_str = arg + possible_mappers = mapper_defs.keys() + # Check it later now... 
+ #if mapper_str not in possible_mappers: + # print "--mapper argument was \""+mapper_str+"\"; expected one of the following: "+", ".join(possible_mappers) + # sys.exit(2) + if opt in ("-o", "--file-output"): + file_output = False + + # Check for required args + if (mapper_str == ""): + usage() + elif (mapper_str == "all"): + mappers = mapper_defs.values() + print mappers + else: + try: + mapper_str_items = mapper_str.split(",") + mappers = [mapper_defs.get(mapper.lower()) for mapper in mapper_str_items] + except KeyError: + print "Don't know this mapper given in -m option: "+mapper + print "Try: ilt, merlin, maq, bwa" + sys.exit(1) + + # if we didn't already get a single file from -h + if len(comp_heads) == 0: + # and there's a list on STDIN, + if not sys.stdin.isatty(): # redirected from file or pipe + comp_heads = [ os.path.splitext( remove_escape_codes(file).rstrip() )[0] for file in sys.stdin.readlines() ] + comp_heads = [ f for f in comp_heads if len(f) > 0 ] + else: + # no single file AND no STDIN file list + usage() + + print "\nFile heads to evaluate:" + print "\n".join(comp_heads)+"\n" + + comp_heads.reverse() + for comp_head in comp_heads: #[:1] + for mapper in mappers: + # Guess the original size of simulated indel from filename + m = re.search(r'_den\.(\d+)_', comp_head) + if m: + if ".DELETION_" in comp_head: + indel_size = -1 * int(m.groups()[0]) + elif ".INSERTION_" in comp_head: + indel_size = int(m.groups()[0]) + else: + indel_size = 0 + else: + sys.exit("Couldn't guess simulated read indel size from filename") + + print "Evaluating "+mapper.name+" results with indel size: "+str(indel_size)+" and file header: "+comp_head + Comp_sam_map(comp_head, mapper, file_output, indel_size) + diff --git a/python/FastaQuals2Fastq.py b/python/FastaQuals2Fastq.py new file mode 100755 index 000000000..7bc929153 --- /dev/null +++ b/python/FastaQuals2Fastq.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +# Utility script to convert FASTA + QUALS file into FASTQ 
required for MAQ +# MAQ needs to convert fastq into bfq file using fastq2bfq in order to use +# this data + +import sys, fasta, os +from itertools import * + +def usage(): + print "Usage: fastq.py fasta_in quals_in fastq_out" + print " fasta_in: *.fasta file" + print " quals_in: *.quals.txt OR *.quala" + sys.exit() + +def output_fastq_record(file_handle, fasta, quals): + #print 'fasta, quals', fasta, quals + #for fasta, qual in zip(fastas, quals): + + qualsVector = map(int, quals.seq.split()) + + print >> fqout, "@"+fasta.id + print >> fqout, fasta.seq + print >> fqout, "+" + print >> fqout, "".join( map(lambda q: chr(q+33), qualsVector ) ) + +def qualStr2fasta( line ): + elts = line.split() + id = elts[0] + quals = ' '.join(elts[1:]) + return fasta.fasta_record(id, quals) + +if __name__ == "__main__": + if len(sys.argv[1:]) != 3: + usage() + else: + fasta_in_file, quals_in_file, fastq_out_file = sys.argv[1:] + + print "Quals input file: "+quals_in_file + print "Fasta input file: "+fasta_in_file + + if os.path.splitext(quals_in_file)[1] == ".txt": + qual_factory = imap( qualStr2fasta, file(quals_in_file) ) + else: + if os.path.splitext(quals_in_file)[1] == ".quala": + # Create fasta factory from the quala file and treat it + # as if it was a fasta file and pull the numbers + qual_factory = fasta.fasta_file(quals_in_file, cleanup=False) + else: + print "Qual input file should be a *.quals.txt or *.quala file" + + print "Fastq ouptut file: "+fastq_out_file + fqout = file(fastq_out_file, "w") + + for qual_record, fasta_record in izip_longest(qual_factory, fasta.fasta_file(fasta_in_file)): + if qual_record == None or fasta_record == None: + # We ran out of one record type, so warn the user! + print "WARNING: Different number of quals and fastas in input files!" 
+ sys.exit(1) + #print 'qual_record', qual_record + #print 'fasta_record', fasta_record + output_fastq_record(fqout, fasta_record, qual_record) + + + diff --git a/python/MergeEvalMapTabs.py b/python/MergeEvalMapTabs.py new file mode 100755 index 000000000..06af40298 --- /dev/null +++ b/python/MergeEvalMapTabs.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python + +# merges .tab files produced by EvalMapping.py into a tabular format + +#from __future__ import print_function # python 2.6 / 3.0 print as a function +import os,sys,re +from rpy2 import * + +def ordered_eval_files(aligner, mutation_type, file_ext): + tab_files = [f for f in os.listdir(".") if f.endswith("."+aligner+"."+file_ext) and (mutation_type in f or "None_den." in f)] + + tabs = [] + for f in tab_files: + num = int(re.search(r'den\.(\d\d?)_', f).groups()[0]) + if "None_den" in f: + num = 0 + tabs.append( (num,f) ) + tabs.sort() + return tabs + +def anchor_hist(filename): + fin = open(filename) + offsets = [] + read_poses = [] + + for line in fin: + id = None + fields = line.split() + if len(fields) > 0 and ".read." 
in fields[0]: + id = line.split()[0] + read_poses.append(id.split(".")[-1]) + offsets.append(id.split(":")[1].split(".")[0]) + + #if "read name :" in line: + # id = line.split()[3] + + import rpy2.robjects as robj + if len(read_poses): + print len(read_poses) + return robj.r.hist(robj.FloatVector(read_poses), breaks=77, plot=False) + else: + print 0 + return robj.IntVector([]) + +def estimateCPUTime(filename): + times = [] + if os.path.exists(filename): + regex = re.compile('CPU time\s*:\s+([\d\.]+) sec') + #print ('Filename', filename) + def extractTime1(line): + sobj = regex.search(line) + if sobj <> None: + time = float(sobj.group(1)) + #print (line, sobj, sobj.groups(), time) + return time + else: + return None + times = [time for time in map(extractTime1, open(filename)) if time <> None] + #print('times', times) + + return max(times) + +def print_eval_tables(aligner, mut_type, num_var, filename, min_qual_cutoff=0): + filebase = os.path.splitext(filename)[0] + cpuTime = estimateCPUTime(filebase + '.stdout') + + lines = open(filename).readlines() + data = None + if len(lines) > 1: + da_line = lines[0] + for line in lines: + line = line.rstrip() + fields = line.split("\t") + try: + qual_cutoff = int(fields[0]) + except: + qual_cutoff = 1000 + if qual_cutoff <= min_qual_cutoff: + break + else: + da_line = line + + #print ("%2d\t" % num_var, file=sys.stderr), + data = [aligner, mut_type, min_qual_cutoff, num_var] + map(int, da_line.split()) + [cpuTime] + print "%s\t%s\t%2d\t%s\t%.2f" % (aligner, mut_type, num_var, da_line, cpuTime) + return data + +def plot_anchor_hists(): + import rpy2.robjects as robj + #robj.r.par( mfrow = robj.RVector([2,2]) ) + #robj.r('X11(width=24, height=15)') + robj.r('png("lotto_histo.png", width=1500, height=900)') + robj.r('par(mfrow=c(4,6 ), cin=c(3,3))') + + for (aligner, cutoff) in (("maq", 30), ("ilt",-1), ("merlin",-1), ("swmerlin",-1)): + print aligner, ">"+str(cutoff) + + num_file = ordered_eval_files(aligner, mut_type, 
file_ext="eval") + #print (*num_file, sep="\n") + for num, file in num_file: + + h = anchor_hist( file ) + title = "{aligner} {num} bp insert".format(aligner=aligner, num=num) + robj.r.plot(h, xlab="Anchor position", main=title, col="blue", ylim=robj.FloatVector([0,40]), xlim=robj.FloatVector([0,90])) + + robj.r("dev.off()") + #raw_input("Press any key to exit...") + + +def analyzeData( data, mut_type ): + import rpy2.robjects as robj + # print name / data name + aligners = [('MAQ', 'maq>30'), ('Merlin', 'swmerlin>-1'), ('ILT', 'ilt>-1'), ('BWA', 'bwa32>30')] + + def selectData( key ): + def keepDataP( data ): + return data[0] == key and data[1] == mut_type + return filter( keyDataP, data ) + + def placeRatePlot(): + robj.r('png("x.png", width=1500, height=900)') + x = robj.FloatVector(mutDensity) + y = robj.FloatVector(placeRate) + robj.r.plot(x, y, xlab="Mutation density / size", col="blue") + robj.r("dev.off()") + + return 0 + +if __name__ == "__main__": + + if len(sys.argv) < 2: + print "merge_tabs.py MUTATION_TYPE (e.g. INSERTION, DELETION, SNP)" + sys.exit(1) + else: + mut_type = sys.argv[1] + + if False: + plot_anchor_hists() + else: + data = [] + for (aligner, cutoff) in (("maq", 30), ("maq", 0), ("maq",-1), ("ilt",-1), ("merlin",-1), ("swmerlin",-1), ("bwa", 30), ("bwa", 0), ("bwa", -1), ("bwa32", 30), ("bwa32", 0), ("bwa32", -1)): + name = aligner + ">" + str(cutoff) + #print (aligner, ">"+str(cutoff)) + + num_file = ordered_eval_files(aligner, mut_type, file_ext="tab") + #num_file = ordered_eval_files(aligner, mut_type, file_ext="eval") + for num, file in num_file: + datum = print_eval_tables( name, mut_type, num, file, cutoff ) + if datum <> None: + data.append(datum) + #print + + analyzeData( data, mut_type ) + diff --git a/python/SAM.py b/python/SAM.py new file mode 100644 index 000000000..4a2e03a3c --- /dev/null +++ b/python/SAM.py @@ -0,0 +1,218 @@ +import string +import operator +from pushback_file import pushback_file + +# Type TAG Req? 
Default Description +SAMHeaderFields = \ + [['HD', 'VN', True, '0.1.0', 'File format version'], \ + ['HD', 'SO', False, None, 'Sort order. Valid values are: unsorted, queryname or coordinate'], \ + ['SQ', 'SN', True, None, 'Sequence name. Unique among all sequence records in the file. The value of this field is used in alignment records.'], \ + ['SQ', 'LN', True, None, 'Sequence length'], \ + ['SQ', 'AS', False, None, 'Genome assembly identifier. Refers to the reference genome assembly in an unambiguous form. Example: HG18.'], \ + ['SQ', 'M5', False, None, 'MD5 checksum of the sequence in the uppercase (gaps and space are removed)'], \ + ['SQ', 'UR', False, None, 'URI of the sequence'], \ + ['SQ', 'SP', False, None, 'Species'], \ + ['RG', 'ID', True, 'ReadGroup1', 'Unique read group identifier. The value of the ID field is used in the RG tags of alignment records.'], \ + ['RG', 'SM', True, 'SampleA', 'Sample (use pool name where a pool is being sequenced)'], \ + ['RG', 'LB', False, None, 'Library'], \ + ['RG', 'DS', False, None, 'Description'], \ + ['RG', 'PU', False, None, 'Platform unit (e.g. lane for Illumina or slide for SOLiD)'], \ + ['RG', 'PI', False, None, 'Predicted median insert size (maybe different from the actual median insert size)'], \ + ['RG', 'CN', False, None, 'Name of sequencing center producing the read'], \ + ['RG', 'DT', False, None, 'Date the run was produced (ISO 8601 date or date/time)'], \ + ['RG', 'PL', False, None, 'Platform/technology used to produce the read'], \ + ['PG', 'ID', True, 'ProgramA','Program name'], \ + ['PG', 'VN', False, None, 'Program version'], \ + ['PG', 'CL', False, None, 'Command line']] + +def SAMHeaderEncode( type, tag ): + return type + '.' 
+ tag + +SAMHeaderDict = dict() +for record in SAMHeaderFields: + type, tag, req, default, desc = record + SAMHeaderDict[SAMHeaderEncode(type, tag)] = [req, default, desc] + +# ----------------------------------------------------------------------------------------------- +# +# A SAM header is a potentially complex object, so we just punt on it. The only really required +# outputs that *must* be user specified are the SQ: SN and SQ: LN fields. +# Everything else is just split up and stored in our hash table by reading the SAMHeaderFields +# data structure (defined above). All required fields are initialized to their default values +# +# ----------------------------------------------------------------------------------------------- +class SAMHeader(dict): + def __init__(self, seqName, seqLen, **keys): + # setup initial header + self.fields = dict() + for record in SAMHeaderFields: + type, tag, req, default, desc = record + self.setField( type, tag, default ) + + self.setField('SQ', 'SN', seqName ) + self.setField('SQ', 'LN', seqLen ) + + # add keyword arguments + for key, value in keys.iteritems(): + type, tag = key.split('.') + self.setField( type, tag, value ) + + def isReq( self, type, tag ): + return SAMHeaderDict[SAMHeaderEncode(type,tag)][0] + + def setField( self, type, tag, value ): + #print 'Setting', type, tag, value + self.fields[SAMHeaderEncode(type, tag)] = value + + def getField( self, type, tag ): + return self.fields[SAMHeaderEncode(type, tag)] + + def __str__( self ): + types = ['HD', 'SQ', 'RG'] + + def formatType( type ): + s = '@' + type + '\t' + for record in SAMHeaderFields: + qtype, tag, req, default, desc = record + if type == qtype and self.isReq(type, tag): + value = self.getField(type, tag) + if value == None: + raise Error('Unset required SAM header ' + type + ' ' + tag) + s += tag + ':' + str(value) + '\t' + return s + return string.join(map( formatType, types ),'\n') + +SAM_SEQPAIRED = 0x0001 # the read is paired in sequencing, no 
matter whether it is mapped in a pair +SAM_MAPPAIRED = 0x0002 # the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) 1 +SAM_UNMAPPED = 0x0004 # the query sequence itself is unmapped +SAM_MATEUNMAPPED = 0x0008 # the mate is unmapped 1 +SAM_QUERYSTRAND = 0x0010 # strand of the query (0 for forward; 1 for reverse strand) +SAM_MATESTRAND = 0x0020 # strand of the mate 1 +SAM_ISFIRSTREAD = 0x0040 # the read is the first read in a pair 1,2 +SAM_ISSECONDREAD = 0x0080 # the read is the second read in a pair 1,2 +SAM_NOTPRIMARY = 0x0100 # the alignment is not primary (a read having split hits may have multiple primary alignment records) + +def SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ): + r = SAMRecord() + r.setValuesFromArgs( qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig, pairPos, insertSize ) + return r + +def SAMRecordFromString( str ): + r = SAMRecord() + r.setValuesFromString( str ) + return r + +def SAMFlagValue( flags, testFlag ): + return testFlag & flags +def SAMFlagIsSet( flags, testFlag ): + return SAMFlagValue(flags, testFlag) <> 0 + +# ----------------------------------------------------------------------------------------------- +# +# This is really the meat of the SAM I/O system. +# +# setValuesFromArgs takes the required arguments for a SAM record and stores them in a list +# (in SAM ordering) so that writing the values to a string is trivial. Note the conversion +# of int quals to ASCII encoded quals. 
+# +# ----------------------------------------------------------------------------------------------- +class SAMRecord: + def __init__( self, vals = []): + self.vals = [] + + def setValuesFromArgs( self, qname, flags, rname, pos, mapq, cigar, seq, quals, pairContig = '*', pairPos = 0, insertSize = 0 ): + self.vals = [qname, self.formatFlags(flags), rname, pos, mapq, \ + cigar, pairContig, pairPos, insertSize, seq.lower(), self.toASCII33(quals) ] + + def setValuesFromString( self, line ): + #print 'line is', line + formats = [str, int, str, int, int, str, str, int, int, str, str] + self.vals = map( lambda f, v: f(v), formats, line.split()[0:len(formats)] ) + + def formatFlags( self, flags ): + b = reduce( operator.__or__, flags, 0 ) + #print 'FormatFlags', flags, b + return b + + def getQName(self): return self.vals[0] + def getFlags(self): return self.vals[1] + def getRname(self): return self.vals[2] # Reference sequence name (can be chr1, chr2, etc...) + def getPos(self): return self.vals[3] + def getMapq(self): return self.vals[4] + def getCigar(self): return self.vals[5] # Returns CIGAR which gives match, insertion, and other info as "75M2I" + def getSeq(self): return self.vals[9] + def getQuals(self): return self.fromASCII33(self.vals[10]) + + def toASCII33( self, quals ): + return string.join( map( lambda q: chr(q+33), quals ), '' ) + + def fromASCII33( self, qualStr ): + return map( lambda q: ord(q)-33, qualStr) + + def __str__( self ): + return string.join( map( str, self.vals ), '\t') + +# ----------------------------------------------------------------------------------------------- +# +# Wrapper class for reading and writing records to a SAM file +# +# ----------------------------------------------------------------------------------------------- +class SAMIO: + def __init__( self, fileName, header = None, debugging = False, func = None ): + self.header = header + self.debugging = debugging + + if func == None: + func = lambda x: x + self.func = func + + 
if self.header == None: + self.fileHandle = pushback_file(fileName, "r") + self.readHeader() + else: + self.fileHandle = file(fileName, "w") + self.writeHeader() + + def close(self): + self.fileHandle.close() + + def readHeader(self): + self.header = "" + while True: + line = self.fileHandle.readline() + if line.startswith("@"): + self.header += line + else: + self.fileHandle.pushback(line) + if self.debugging: + pass #print str(self.header) + return True + + return self.header + + def RecordGenerator( self ): + for line in self.fileHandle: + #print 'Reading line', line + line = line.rstrip("\n") + yield self.func(SAMRecordFromString(line)) + raise StopIteration + return + + def __iter__(self): + return self.RecordGenerator() + + def writeRecord( self, recordIterator ): + for record in recordIterator: + self.writeRecord(record) + + def writeRecord( self, record ): + if self.debugging: + print record + print >> self.fileHandle, record + return True + + def writeHeader( self ): + if self.debugging: + print str(self.header) + print >> self.fileHandle, str(self.header) + return True diff --git a/python/SamWalk.py b/python/SamWalk.py new file mode 100755 index 000000000..5d3f71a60 --- /dev/null +++ b/python/SamWalk.py @@ -0,0 +1,304 @@ +#!/util/bin/python +# +# +# This file really needs to be cleaned up and functions need to be improved +# +# (1) utility operations should be moved to a utility file +# (2) Need to add support for windows paired reads, information about where +# reads come from (which file), and n-base pair loci +# (3) Move to C++ based SAM/BAM i/o system +# (4) Figure out a way to get the reference to stream, and use a automatically pickled version +# (5) Add capability to walk over all reference bases, regardless of coverage. Currently +# we only walk over covered bases. 
+# +# +from optparse import OptionParser +import os +import sys +import random +import string +import os.path +import heapq +from itertools import * + +from SAM import * +import SimpleSAM +#import SimulateReads # utility functions should be imported + +# Global variable containing command line arguments +OPTIONS = None + +# +# Trivial class that added pushback support to a stream +# +class peakStream(object): + """A simple wrapper around a stream that adds unget operation""" + def __init__( self, stream ): + self.ungot = [] + self.s = stream + + def unget( self, elt ): + """Put elt at the front of the underlying stream, making the next get call return it""" + + #print 'unget', self.ungot + self.ungot.append(elt) + #print ' -> unget', self.ungot + + # we hold the next() operation + def __iter__( self ): return self + + def next(self): + """For python iterator support""" + #print 'ungot', self.ungot + if self.ungot <> []: + elt = self.ungot.pop(0) + else: + elt = self.s.next() + + #print ' -> ungot', self.ungot + #print ' elt', elt + return elt + +# +# Returns a single, combined stream of SAM records in chromosome order from a list of +# input streams, each in reference order +# +def rawReadStream( ref, inputs ): + """Returns a single iterable that returns records in reference order + from each of the input streams""" + def includePriorities( stream ): + return imap( lambda x: (x.getPos(), x), stream ) + def prunePriorities( stream ): + return imap( lambda p: SimpleSAM.MappedReadFromSamRecord(ref, p[1]), stream ) + + with_priorities = map( includePriorities, inputs ) + return prunePriorities( heapq.merge( *with_priorities ) ) + +# +# Just wraps the raw read stream objects in a list as they come out +# +def readStream( ref, inputs ): + """Returns a single iterable that returns SAM records in reference order + from each of the input streams""" + rs = rawReadStream( ref, inputs ) + return imap( lambda x: [x], rs ) + +# +# More complex stream object. 
Takes a list of input streams and creates a stream +# returning successive loci covered by the reads in the combined input stream. +# +class lociStream(): + """A streaming data structure that returns reads spanning each covered loci in + the input reads, offsets into them + where the bases are equivalent, and the position of the locus. + """ + def __init__( self, ref, inputs ): + self.ref = ref + self.rs = peakStream(rawReadStream( ref, inputs )) + self.window = [] + + self.chr = None + self.pos = None + + def __iter__(self): + return self + + def pushRead( self, read ): + self.window.append(read) + + def cleanWindow( self, chr, pos ): + """Walk over the window of reads, deleting those who no longer cover the current pos""" + if OPTIONS.debug: + print 'cleanWindow start:', chr, pos, self.window + + def keepReadP( read ): + return read.chr == chr and pos >= read.start and pos <= read.end + self.window = filter( keepReadP, self.window ) + + if OPTIONS.debug: + print 'cleanWindow stop:', chr, pos, self.window + + def expandWindow( self, chr, pos ): + """Keep reading from the read stream until we've added all reads from the stream covering pos""" + if OPTIONS.debug: + print 'expandWindow start:', chr, pos, self.window + for read in self.rs: + #print 'read', read, pos + if read.chr == chr and read.start <= pos and read.end >= pos: + self.pushRead(read) + else: + self.rs.unget( read ) + #self.rs = chain( [read], self.rs ) + break + if OPTIONS.debug: + print 'expandWindow stop:', chr, pos, self.window + + + # + # This is the workhorse routine. It constructs a window of reads covering the + # next locus in the genome, and returns the reference, the contig, its position + # in the genome, along with a vector of offsets into the list of reads that denote + # the equivalent base among all the reads and reference. 
+ # + def next(self): + if self.pos <> None: + self.cleanWindow(self.chr, self.pos) # consume reads not covering pos + self.expandWindow(self.chr, self.pos) # add reads to window covering pos + + if self.window == []: + # the window is empty, we need to jump to the first pos of the first read in the stream: + nextRead = self.rs.next() + self.pushRead( nextRead ) + self.chr = nextRead.chr + self.pos = nextRead.start + return self.next() + else: + # at this point, window contains all reads covering the pos, we need to return them + # and the offsets into each read for this loci + def calcOffset( read ): + offset = self.pos - read.start + return offset + + offsets = map(calcOffset, self.window) + currPos = self.pos + self.pos += 1 # we are going to try to get the next position + return self.ref, self.chr, currPos, offsets, self.window + +# +# Reference reader +# +def readRef(referenceFasta): + ref = {} + + header = None + seq = '' + for line in open(referenceFasta): + if line[0] == '>': + if header <> None: + ref[header] = seq.lower() + seq = '' + header = line[1:].strip() + else: + seq += line.strip() + + ref[header] = seq.lower() + #print ref + return ref + +# +# Main() procedure +# +def main(): + global OPTIONS, ROOT + + # ------------------------------------------------------------ + # + # Setup command line options + # + # ------------------------------------------------------------ + usage = "usage: %prog [options] Walker [sam or bam file or file list]+" + parser = OptionParser(usage=usage) + parser.add_option("-r", "--ref", dest="reference", + type="string", default=None, + help="Reference DNA seqeunce in FASTA format") + parser.add_option("-m", "--maxRecords", dest="maxRecords", + type=int, default=None, + help="Max number of SAM records to process") + parser.add_option("-q", "--quiet", dest="quiet", + action='store_true', default=False, + help="Be quiet when generating output") + parser.add_option("-d", "--debug", dest="debug", + action='store_true', 
default=False, + help="Verbose debugging output?") + + (OPTIONS, args) = parser.parse_args() + print 'args', args + if len(args) < 2: + parser.error("Incorrect number of arguments") + + + # ------------------------------------------------------------ + # + # Dynamically load the walker class (the first cmd line arg) + # and initialize a walker class object + # + # + # load walkers from standard location + # + # ------------------------------------------------------------ + walkerName = args[0] + walkerModule = __import__(walkerName, globals(), locals(), [], -1) + walkerClass = walkerModule.__dict__[walkerName] + walker = walkerClass() + + print walkerName + print walkerModule + print walkerClass + print walker + + print '------------------------------------------------------------' + print 'program:', sys.argv[0] + print '' + print ' ref: ', OPTIONS.reference + print ' walker: ', walker + if walker.hasOption('name'): + print ' -> ', walker.getOption('name') + if walker.hasOption('desc'): + print ' -> ', walker.getOption('desc') + print '------------------------------------------------------------' + + + # ------------------------------------------------------------ + # + # Initialize the engine + # + # ------------------------------------------------------------ + + # read the reference + refs = readRef(OPTIONS.reference) + + # create the low-level SAMIO streams + files = args[1:] + inputs = map( lambda file: SAMIO( file, debugging=OPTIONS.debug ), files ) + + # build the higher level stream object, either by record or by loci + if walker.isRecordWalker(): + stream = readStream( refs, inputs ) + if walker.isLociWalker(): + stream = lociStream( refs, inputs ) + + # ------------------------------------------------------------ + # + # Move the walker object over the stream + # + # For each element in the record or loci stream, invoke the walker + # object on it. 
Use filter, map, and reduce to construct the sum + # + # ------------------------------------------------------------ + sum = walker.reduceDefault + counter = 0 + for elt in stream: + counter += 1 + #print 'processing elt', counter, elt, OPTIONS.maxRecords + if OPTIONS.maxRecords <> None: + if (counter * 10) % OPTIONS.maxRecords == 0: + print ' => ', (counter * 100.0) / OPTIONS.maxRecords, '% done' + + if counter > OPTIONS.maxRecords: + break + if walker.filterfunc( *elt ): + x = walker.mapfunc( *elt ) + sum = walker.reducefunc( x, sum ) + print 'Result is', sum + +if __name__ == "__main__": + if True: + import cProfile, pstats + cProfile.run('main()', 'prof') + stats = pstats.Stats('prof') + stats.sort_stats('time') + stats.print_stats(20) + else: + main() + + diff --git a/python/SamWalkTest.py b/python/SamWalkTest.py new file mode 100755 index 000000000..fc2412d85 --- /dev/null +++ b/python/SamWalkTest.py @@ -0,0 +1,31 @@ +#!/util/bin/python + +#from SamWalk import * +from SimpleSAM import * +import Walker + +class SamWalkTest(object, Walker.Walker): + def __init__(self): + Walker.Walker.__init__( self, 'byRecord', {}, name = 'SamWalkTest' ) + + def filterfunc( self, read ): + """Return true if you want to keep the read for mapping""" + return True + #return datum.getPos() % 2 == 0 + + def mapfunc( self, read ): + """Do something to the read, and return a result for reduction""" + #print datum + return 1 + #return len(datum.getSeq()) + #return datum.getSeq() + '/' + #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq()) + + #reduceDefault = '' + def reducefunc( self, x, sum ): + """Take the result of mapping, and previous reduce results in sum, and combine them""" + #print x + return x + sum + + + diff --git a/python/SeparateQltout.cc b/python/SeparateQltout.cc new file mode 100644 index 000000000..7644c9603 --- /dev/null +++ b/python/SeparateQltout.cc @@ -0,0 +1,70 @@ +#include "MainTools.h" +#include "Basevector.h" +#include "lookup/LookAlign.h" +#include 
"lookup/SerialQltout.h" + +unsigned int MatchingEnd(look_align &la, vecbasevector &candidates, vecbasevector &ref) { + //la.PrintParseable(cout); + + for (int i = 0; i < candidates.size(); i++) { + look_align newla = la; + + if (newla.rc1) { candidates[i].ReverseComplement(); } + newla.ResetFromAlign(newla.a, candidates[i], ref[la.target_id]); + + //newla.PrintParseable(cout, &candidates[i], &ref[newla.target_id]); + //cout << newla.Errors() << " " << la.Errors() << endl; + + if (newla.Errors() == la.Errors()) { + return i; + } + } + + //FatalErr("Query id " + ToString(la.query_id) + " had no matches."); + + return candidates.size() + 1; +} + +int main(int argc, char **argv) { + RunTime(); + + BeginCommandArguments; + CommandArgument_String(ALIGNS); + CommandArgument_String(FASTB_END_1); + CommandArgument_String(FASTB_END_2); + CommandArgument_String(REFERENCE); + + CommandArgument_String(ALIGNS_END_1_OUT); + CommandArgument_String(ALIGNS_END_2_OUT); + EndCommandArguments; + + vecbasevector ref(REFERENCE); + vecbasevector reads1(FASTB_END_1); + vecbasevector reads2(FASTB_END_2); + + ofstream aligns1stream(ALIGNS_END_1_OUT.c_str()); + ofstream aligns2stream(ALIGNS_END_2_OUT.c_str()); + + basevector bv; + + SerialQltout sqltout(ALIGNS); + look_align la; + while (sqltout.Next(la)) { + vecbasevector candidates(2); + candidates[0] = reads1[la.query_id]; + candidates[1] = reads2[la.query_id]; + + unsigned int matchingend = MatchingEnd(la, candidates, ref); + if (matchingend < 2) { + bv = (matchingend == 0) ? reads1[la.query_id] : reads2[la.query_id]; + + //la.PrintParseable(cout, &bv, &ref[la.target_id]); + la.PrintParseable(((matchingend == 0) ? 
# NOTE(review): the project-local SAM module supplies SAMFlagIsSet and the
# SAM_* flag constants used by MappedRead; guarded so this module can also be
# imported where SAM is unavailable.
try:
    from SAM import *
except ImportError:
    pass

def read2align( refSeq, readSeq, chr, pos, cigar ):
    """Converts the read bases, a reference sequence, and a cigar string into a pair of
    alignment strings for refSeq and readSeq.  The alignment strings have base
    equivalents between the reference and the read bases.

    refSeq  -- mapping of contig name -> reference sequence
    readSeq -- the read's bases
    chr/pos -- contig name and 0-based offset of the alignment
    cigar   -- currently ignored; only gapless (pure-M) reads are supported
    """
    #
    # only works for non-indel containing reads right now!
    #
    contig = refSeq[chr]
    refAlign = contig[pos:pos+len(readSeq)]
    readAlign = readSeq
    return refAlign, readAlign

def MappedReadFromSamRecord(ref, r):
    """Converts a SAM record into a MappedRead object."""
    #
    # This is a temp. function while we wait for Ted's C++ SAM/BAM io system to come online
    #
    # BUGFIX: r.getCigar was previously passed as a bound method object
    # (missing call parentheses); call it so the CIGAR string is used.
    refAlign, readAlign = read2align( ref, r.getSeq(), r.getRname(), r.getPos(), r.getCigar() )
    return MappedRead( r.getQName(), r.getFlags(), r.getRname(), r.getPos(), r.getMapq(),
                       r.getCigar(), r.getSeq(), refAlign, readAlign, r.getQuals() )

class MappedRead(object):
    """Higher level object representing a mapped read.

    Attempts to be more accessible than a raw SAM record, doing a lot of processing
    on the record w.r.t. the reference to make analysis of the reads easier.
    This is the fundamental data structure used to represent reads through the
    walker engine."""

    def __init__( self, qname, flags, chr, pos, mapq, cigar, readBases, refAlign, readAlign, qualsByBase ):
        # basic information about the read
        self.qname = qname
        self.mapq = mapq
        self.cigar = cigar
        self.bases = readBases # actual bases of reads, unaligned, may be fw or reverse
        self.quals = qualsByBase # as a vector of floats, by base position

        # dealing with position information
        self.chr = chr
        self.start = pos
        self.readlen = len(readBases)
        self.end = pos + len(refAlign) - 1

        # dealing with strand
        self.flags = flags
        self.isForwardStrand = SAMFlagIsSet(flags, SAM_QUERYSTRAND)
        self.isReverseStrand = not SAMFlagIsSet(flags, SAM_QUERYSTRAND)

        # NOTE(review): isPrimary is set to the NOT-primary flag value; this
        # looks inverted (should probably be `not SAMFlagIsSet(...)`) — confirm
        # against the SAM module's flag semantics before relying on it.
        self.isPrimary = SAMFlagIsSet(flags, SAM_NOTPRIMARY)
        self.isUnmapped = SAMFlagIsSet(flags, SAM_UNMAPPED)
        self.isPaired = SAMFlagIsSet(flags, SAM_SEQPAIRED)
        self.isMapPaired = SAMFlagIsSet(flags, SAM_MAPPAIRED)
        self.isFirstReadInPair = SAMFlagIsSet(flags, SAM_ISFIRSTREAD)
        self.isSecondReadInPair = SAMFlagIsSet(flags, SAM_ISSECONDREAD)

        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATEUNMAPPED)
        #self.isMapPaired = SAMFlagIsSet(flags, SAM_MATESTRAND)

        # everything will be reverse complemented already
        self.refAlign = refAlign   # reference bases over the aligned span
        self.readAlign = readAlign # all should be forward strand

    def __str__(self):
        # BUGFIX: the original returned '' % (...) which raises TypeError
        # ("not all arguments converted"); restore a descriptive template.
        return '<MappedRead %s:%d-%d>' % (self.chr, self.start, self.end)

    __repr__ = __str__

#!/util/bin/python
# --- head of SimulateReads.py ---

from optparse import OptionParser
import os
import sys
#import set
import random
import string

def all(iterable):
    """Backport of the builtin all() for pre-2.5 Pythons."""
    for element in iterable:
        if not element:
            return False
    return True
def any(iterable):
    """Backport of the builtin any() for pre-2.5 Pythons."""
    for element in iterable:
        if element:
            return True
    return False

# Populated by the OptionParser in main().
OPTIONS = None

# Canonical lower-case DNA alphabet used when generating mutations.
bases = set(['a', 't', 'g', 'c'])

class Mutation:
    """Representation of a change from one sequence to another."""

    SNP = "SNP"
    INSERTION = "Insertion"
    DELETION = "Deletion"
    TYPES = set([SNP, INSERTION, DELETION])

    # CIGAR operation character for each mutation type.
    CIGARChars = { SNP : 'M', INSERTION : 'I', DELETION : 'D' }

    def __init__( self, contig, pos, type, original, mutated ):
        # example: chr20 123 SNP "a" "t" <- a to t at base 123 on chr20
        # example: chr20 123 INSERTION "" "atg" <- insertion of 3 bases starting at pos 123
        # example: chr20 123 DELETION "atg" "" <- deletion of 3 bases starting at pos 123
        self.contig = contig     # contig of the mutation
        self._pos = pos          # position of the change in contig coordinates
        self._type = type        # one of the TYPES
        assert(self._type in Mutation.TYPES)
        self.original = original # String of the original genotype
        self.mutated = mutated   # String of the mutated genotype

    def pos(self, offset=0): return self._pos - offset
    def type(self): return self._type

    def __len__(self):
        # Event length: the longer of the two allele strings.
        return max( len(self.mutated), len(self.original) )

    def mutString( self ):
        """Compact description: 'P:a->t', 'I:atg' or 'D:atg'."""
        if self._type == Mutation.SNP:
            return 'P:%s->%s' % (self.original, self.mutated)
        elif self._type == Mutation.INSERTION:
            return 'I:%s' % (self.mutated)
        elif self._type == Mutation.DELETION:
            return 'D:%s' % (self.original)
        else:
            raise Exception('Unexpected mutation type ' + self._type)

    def CIGARChar( self ):
        return Mutation.CIGARChars[self.type()]

    def __str__( self ):
        return '%s:%d %s' % (self.contig, self._pos, self.mutString())

    def __repr__( self ):
        return 'Mutation(%s, %d, %s)' % (self.contig, self._pos, self.mutString())

    def execute( self, offset, refseq, mutseq ):
        """Apply this mutation in place to the mutable refseq/mutseq pair.

        offset is the contig coordinate of refseq[0]; gaps are written as
        '-' so the two sequences stay aligned.  Returns (refseq, mutseq).
        """
        p = self.pos(offset)
        if self.type() == Mutation.SNP:
            mutseq[p] = self.mutated
        elif self.type() == Mutation.INSERTION:
            refseq[p:p] = '-' * len(self)
            mutseq[p:p] = self.mutated
        elif self.type() == Mutation.DELETION:
            mutseq[p:p+len(self)] = '-' * len(self)
            # NOTE(review): extending both sequences with `mutated` pads the
            # alignment after a deletion; mutateReference currently passes
            # the deleted bases in `mutated` (not `original`) — confirm this
            # pairing is intentional.
            refseq.extend(self.mutated)
            mutseq.extend(self.mutated)

        #print refseq, mutseq
        return refseq, mutseq

# ---------------------------------------------------------------------------
#
# Code for dealing with CIGAR strings
#
# ---------------------------------------------------------------------------
def flatten(l):
    """Recursively flatten nested lists/tuples into one flat list."""
    out = []
    for item in l:
        if isinstance(item, (list, tuple)):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out

def mutationsCIGAR( readStart, alignStart, readLen, mutations ):
    """Build the CIGAR string for a read of readLen bases.

    readStart is the read's offset within the alignment window whose
    reference coordinate is alignStart.  Supports exactly one indel OR any
    number of SNPs; raises otherwise.
    """
    # Single indel: emit M / (I|D) / M segments clipped to the read.
    if len(mutations) == 1 and mutations[0].type() != Mutation.SNP:
        mut = mutations[0]
        internalPos = mut.pos() - alignStart

        leftLen = max(internalPos - readStart, 0)
        mutLen = min(internalPos + len(mut) - readStart, readLen - leftLen, len(mut))
        rightLen = readLen - leftLen - mutLen

        segments = [ (leftLen, 'M'), (mutLen, mut.CIGARChar()), (rightLen, 'M') ]
        segments = [seg for seg in segments if seg[0] > 0]
        return ''.join(map(str, flatten(segments)))

    if not all( map( lambda mut: mut.type() == Mutation.SNP, mutations ) ):
        # bail
        raise Exception('Currently only supports multiple SNPs CIGARS')

    # SNPs (or no mutations at all) are a single match run.
    return str(readLen) + 'M'
# ---------------------------------------------------------------------------
#
# mutating the reference
#
# ---------------------------------------------------------------------------
def executeMutations( mutations, seqIndex, refseq, mutseq ):
    """Apply each Mutation to the mutable refseq/mutseq pair.

    seqIndex is the contig coordinate of refseq[0]; Mutation.execute uses it
    to convert contig positions into local offsets.  Returns the updated pair.
    """
    for mutation in mutations:
        internalSite = mutation.pos() - seqIndex  # NOTE(review): unused
        refseq, mutseq = mutation.execute( seqIndex, refseq, mutseq )
    return refseq, mutseq

def isBadSequenceForSim(seq):
    """True when seq contains any character outside a/t/g/c (e.g. N or gaps).

    NOTE(review): assumes a Biopython Seq-like object (has .tostring()).
    """
    if any( map( lambda b: b not in bases, seq.tostring().lower() ) ):
        return True
    else:
        return False


def mutateReference(ref, contig, mutSite, mutType, mutParams, nBasesToPad):
    """Cut a window around mutSite out of ref and plant mutations in it.

    ref         -- whole reference sequence (Biopython Seq)
    contig      -- contig name recorded in each Mutation
    mutSite     -- contig coordinate of the primary mutation
    mutType     -- 'NONE', 'SNP', 'INSERTION' or 'DELETION'
    mutParams   -- NOTE(review): currently unused
    nBasesToPad -- half-width of the window (typically the read length)

    Returns (good, mutations, refsub, mutseq): good is False when the window
    falls off the contig or contains non-ACGT bases; otherwise refsub/mutseq
    are mutable before/after sequences with the Mutation list applied.
    """
    #print 'mutateReference'

    # start and stop of the window, in contig coordinates
    refStartIndex = mutSite - nBasesToPad + 1
    internalStartIndex = nBasesToPad - 1  # NOTE(review): unused

    # SNP/NONE windows are 2*pad wide; indel windows one base shorter.
    if OPTIONS.mutationType == 'SNP' or OPTIONS.mutationType == 'NONE':
        refStopIndex = mutSite + nBasesToPad
    else:
        refStopIndex = mutSite + nBasesToPad - 1

    refsub = ref[refStartIndex : refStopIndex]
    mutseq = refsub.tomutable()
    refsub = refsub.tomutable()
    mutations = []

    # Closures fixing the two possible return shapes.
    def success():
        return True, mutations, refsub, mutseq
    def fail():
        return False, [], None, None

    if refStartIndex < 0:
        return fail()
    if isBadSequenceForSim(refsub):
        #print 'rejecting seq', refsub.tostring()
        #print map( lambda b: b not in bases, refsub.tostring().lower() )
        return fail()

    if not OPTIONS.quiet:
        print ref
        print refsub

    # Primary site first, then mutationDensity-1 extra sites drawn without
    # replacement from the rest of the window.
    otherSites = set(range(refStartIndex, refStopIndex)) - set([mutSite])
    # print 'otherSites', otherSites
    for site in [mutSite] + random.sample(otherSites, max(OPTIONS.mutationDensity-1,0)):
        internalSite = site - refStartIndex  # NOTE(review): unused
        if mutType == 'NONE' or OPTIONS.mutationDensity < 1:
            pass
        elif mutType == 'SNP':
            mutBase = ref[site].lower()
            otherBases = bases - set(mutBase)
            otherBase = random.choice(list(otherBases))
            assert mutBase <> otherBase
            if not OPTIONS.quiet:
                print mutBase, ' => ', otherBase
            mutations.append( Mutation( contig, site, Mutation.SNP, mutBase, otherBase ) )
        elif mutType == 'INSERTION':
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.INSERTION, '', inSeq )
            mutations.append( mutation )
        elif mutType == 'DELETION':
            # NOTE(review): inSeq is computed but never used here, and the
            # deleted bases are passed as `mutated` with '' as `original` —
            # the opposite of the contract documented in Mutation.__init__.
            # Confirm against Mutation.execute's DELETION handling.
            inSeq = string.join([ random.choice(list(bases)) for i in range( OPTIONS.indelSize ) ], '')
            mutation = Mutation( contig, site, Mutation.DELETION, '', ref[refStopIndex:refStopIndex+OPTIONS.indelSize])
            mutations.append( mutation )
        else:
            raise Exception('Unexpected mutation type', mutType)

    # process the mutations
    refsub, mutseq = executeMutations( mutations, refStartIndex, refsub, mutseq )

    return success()
# ---------------------------------------------------------------------------
#
# sample reads from alignments
#
# ---------------------------------------------------------------------------
def accumulateBases( mutSeq, start, readLen ):
    """Collect readLen non-gap bases from mutSeq, beginning at offset start.

    Walks forward from start counting characters that are not the '-'
    alignment-gap marker.  Returns (read, stop) where read is the gap-free
    base string and stop is the exclusive end index in mutSeq.  If fewer
    than readLen real bases remain, the read is truncated at the sequence
    end (matching the original behavior).
    """
    if start >= len(mutSeq):
        # Guard: the original code left `stop` unbound here and raised a
        # NameError on an empty/exhausted sequence.
        return '', start
    count = 0
    for stop in range(start, len(mutSeq)):
        if mutSeq[stop] != '-':
            count += 1
        #print stop, count
        if count == readLen:
            break

    stop += 1
    # Strip gap characters from the sampled window to get the read itself.
    read = ''.join(c for c in mutSeq[start:stop] if c != '-')

    return read, stop
class refSite:
    """Bundle describing one alignment window: before/after sequences, its
    reference start coordinate, and the mutations planted in it."""
    def __init__( self, refSeq, mutSeq, alignStart, mutations ):
        self.refSeq = refSeq          # ungapped reference window (mutable seq)
        self.mutSeq = mutSeq          # mutated window, may contain '-' gaps
        self.alignStart = alignStart  # contig coordinate of refSeq[0]
        self.mutations = mutations    # Mutation list applied to mutSeq

def sample1Read( refSite, start, readLen, reverseComplementP = False ):
    """Sample one read of readLen bases from refSite.mutSeq at offset start.

    Returns (ok, read, refPos, cigar, prettyString); ok is False when the
    start offset lands on a gap character.
    NOTE(review): the parameter name shadows the refSite class above.
    """
    if refSite.mutSeq[start] <> '-':
        read, stop = accumulateBases( refSite.mutSeq, start, readLen )

        mutSubSeq = refSite.mutSeq[start:stop]
        if reverseComplementP:
            s = Seq( read, IUPAC.unambiguous_dna )
            #print 'reverseComplementP', read, s.reverse_complement().tostring(), mutSubSeq
            read = s.reverse_complement().tostring()
            # NOTE(review): MutableSeq.complement() mutates in place; its
            # effect is only visible in the pretty-print string below — confirm.
            mutSubSeq.complement()

        remaining = len(refSite.mutSeq) - stop
        refForRead = refSite.refSeq[start:stop]  # NOTE(review): unused
        #print 'XXXX', start, stop, refForRead, read
        cigar = mutationsCIGAR( start, refSite.alignStart, readLen, refSite.mutations )
        # Padded, gap-aligned representation of the read for debug output.
        leftPads = string.join(start * ['-'], '')
        rightPads = string.join(remaining * ['-'], '')
        ppReadStr = leftPads + mutSubSeq.tostring() + rightPads # + '(' + read + ')'
        return True, filter(notDash, read), refSite.alignStart + start, cigar, ppReadStr
    else:
        return False, None, None, None, None

def sampleReadsFromAlignment(wholeRef, refSeq, mutSeq, alignStart, readLen, nReads, mutations, pairedOffset, mutations2, refSeq2, mutSeq2):
    """Sample reads (and, in paired mode, their mates) from a mutated window.

    nReads is either the string 'tile' (one read per possible start) or an
    integer count.  Returns a list of [(read1, pos1, cigar1),
    (read2, pos2, cigar2)] pairs; in unpaired mode the second member is
    (None, 0, None).  Reads whose start landed on a gap are filtered out.
    """
    #print 'REF:', refSeq.tostring()
    #print 'MUT:', mutSeq.tostring()
    #print ''

    # pairedSite, mutations2, refSeq2, mutSeq2 can all be None

    refSite1 = refSite( refSeq, mutSeq, alignStart, mutations )
    refSite2 = refSite( refSeq2, mutSeq2, alignStart + pairedOffset, mutations2 )

    #print refSite1.alignStart, refSite2.alignStart, pairedOffset

    # we are assuming that mutSeq is exactly 1 + 2 * readLen in size, with the
    # mutation right in the middle
    lastGoodStartSite = readLen - 1
    if not OPTIONS.quiet:
        if OPTIONS.generatePairedEnds:
            r = wholeRef.seq[refSite1.alignStart:refSite2.alignStart + 2*readLen - 1]
            print r.tostring(), 'ref'
            print r.complement().tostring(), 'ref, complement'
        else:
            print refSeq.tostring(), 'ref'
            print mutSeq.tostring(), 'mut'

    # we can potentially start at any site in mutSeq
    nPotentialStarts = len(mutSeq) - readLen + 1
    potentialStarts = range(nPotentialStarts)
    #print 'potentialStarts', potentialStarts

    # 'tile' or oversized coverage: use every start (or sample with
    # replacement when duplicates are allowed); otherwise sample nReads
    # starts without replacement.
    if nReads.lower() == 'tile' or int(nReads) >= nPotentialStarts:
        if OPTIONS.uniqueReadsOnly:
            starts = potentialStarts
        else:
            starts = map( lambda x: random.choice(potentialStarts), range(nPotentialStarts))
    else:
        starts = random.sample(potentialStarts, int(nReads))

    #print 'Starts', starts

    def sampleRead( start ):
        # Sample the forward read, plus a reverse-complemented mate in
        # paired mode.
        good1, read1, pos1, cigar1, ppstr1 = sample1Read( refSite1, start, readLen )

        if OPTIONS.generatePairedEnds:
            good2, read2, pos2, cigar2, ppstr2 = sample1Read( refSite2, start, readLen, True )
            if not OPTIONS.quiet:
                offsetDashes = string.join(['-'] * (pairedOffset), '')
                print
                print ppstr1 + offsetDashes, pos1, cigar1
                print offsetDashes + ppstr2, pos2, cigar2
            return [(read1, pos1, cigar1), (read2, pos2, cigar2)]
        else:
            if not OPTIONS.quiet:
                print ppstr1, pos1, cigar1
            return [(read1, pos1, cigar1), (None, 0, None)]

    reads = map( sampleRead, starts )
    # Drop entries whose first read failed (started on a gap).
    return [read for read in reads if read[0][0] <> None]

def notDash( c ):
    """True for any character except the '-' alignment gap marker."""
    return c <> '-'

def fakeQuals( seq ):
    """Constant placeholder quality (Q30) for every base of seq."""
    return len(seq) * [30]
    #return range(len(seq))

def alignedRead2SAM( siteID, readID, fastaID, read, pos, cigar, read2, pos2, cigar2 ):
    """Convert one sampled read (pair) into SAM record objects.

    Returns [record1, record2]; record2 is None in unpaired mode.  The qname
    encodes contig, site and read index for traceability.
    """
    rname = fastaID
    mapq = 40
    quals = fakeQuals( read )

    qname = '%s:%d.read.%d' % (fastaID, siteID, readID)

    if not OPTIONS.generatePairedEnds:
        # not in paired mode
        flags = []
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals )
        record2 = None
    else:
        flags = [SAM_SEQPAIRED, SAM_MAPPAIRED]
        insertSize = pos2 - pos - len(read)
        record1 = SAMRecordFromArgs( qname, flags, rname, pos, mapq, cigar, read, quals, pairContig = rname, pairPos = pos2, insertSize = insertSize )
        record2 = SAMRecordFromArgs( qname + 'p', flags, rname, pos2, mapq, cigar2, read2, quals, pairContig = rname, pairPos = pos, insertSize = insertSize )

    #print 'read1, read2', record1.getSeq(), record2.getSeq()

    return [record1, record2]

# ---------------------------------------------------------------------------
#
# paired end reads
#
# ---------------------------------------------------------------------------
def pairedReadSite( ref, leftMutSite, readLen ):
    """Pick the mate's mutation site, insert-size away from leftMutSite.

    The offset is drawn from N(OPTIONS.insertSize, OPTIONS.insertDev).
    Returns (True, site) or (False, None) when the offset is negative, the
    mate window runs off the reference, or it contains non-ACGT bases.
    """
    pairedOffset = int(round(random.normalvariate(OPTIONS.insertSize, OPTIONS.insertDev)))

    def fail(): return False, None

    if pairedOffset < 0:
        return fail()

    pairedSite = pairedOffset + leftMutSite + readLen
    #print 'pairedSite', pairedOffset, leftMutSite, pairedSite

    if pairedSite + readLen >= len(ref):
        return fail()
    refsub = ref[pairedSite - readLen:pairedSite + readLen + 1]
    if isBadSequenceForSim( refsub ):
        return fail()

    return True, pairedSite



# ---------------------------------------------------------------------------
#
# build allowed regions and sampling starts from region
#
# ---------------------------------------------------------------------------
def parseSamplingRange( arg, ref, readLen ):
    """Return the list of [start, end] regions allowed for sampling.

    arg may be None (whole reference), a single 'x:y' string, or the path of
    a file with one 'x:y' range per line.
    """
    def one(x):
        # Parse a single 'x:y' token into [x, y]; False for blank lines.
        elts = x.split()
        #print 'elts', elts
        if len(elts) > 0:
            elts1 = elts[0].split(':')
            #print 'elts1', elts1
            return map( int, elts1 )
        else:
            return False

    if arg <> None:
        try:
            return [ one(arg) ]
        except:
            # try to read it as a file
            return filter( None, map( one, open(arg).readlines() ) )
    else:
        return [[0, len(ref.seq) - readLen]]

def sampleStartSites( sampleRanges, nsites ):
print 'sampleStartSites', sampleRanges, nsites + + # build a set of potential sites + if len(sampleRanges) > 1: + potentialSites = set() + for start, end in sampleRanges: + #print 'potentialSites', start, end, potentialSites + potentialSites = potentialSites.union(set(xrange(start, end))) + else: + start, end = sampleRanges[0] + potentialSites = xrange(start, end) + + #print 'potentialSites', potentialSites + + # choose sites from potentials + if len(potentialSites) <= nsites: + # tile every site + sites = potentialSites + else: + # we need to sample from the potential sites + sites = random.sample( potentialSites, nsites ) + + # print 'Sites', sites + print 'Number of potential start sites:', len(potentialSites) + print 'Number of start sites that will be generated:', len(sites) + return sites + +def readRef(referenceFasta): + handle = open(referenceFasta) + for seq_record in SeqIO.parse(handle, "fasta"): + print seq_record.id + print seq_record.name + #print repr(seq_record.seq) + print len(seq_record.seq) + yield seq_record + handle.close() + +import os.path +def outputFilename(): + root = os.path.splitext(os.path.split(OPTIONS.reference)[1])[0] + if OPTIONS.generatePairedEnds: + pairedP = 'Yes' + else: + pairedP = 'No' + + params = [['mut',OPTIONS.mutationType], ['den', max(OPTIONS.mutationDensity, OPTIONS.indelSize)], [OPTIONS.readLen,'bp'], \ + ['nsites', OPTIONS.nSites], ['cov', OPTIONS.coverage], ['range', OPTIONS.range], ['paired', pairedP]] + filename = root + '__' + string.join(map( lambda p: string.join(map(str, p),'.'), params), '_') + root = os.path.join(OPTIONS.outputPrefix, filename) + return root, root + '.sam' + +def main(): + global OPTIONS, ROOT + + usage = "usage: %prog [options]" + parser = OptionParser(usage=usage) + parser.add_option("-r", "--ref", dest="reference", + type="string", default=None, + help="Reference DNA seqeunce in FASTA format") + parser.add_option("-o", "--outputPrefix", dest="outputPrefix", + type="string", default='', + 
help="Output Prefix SAM file") + parser.add_option("-c", "--coverage", dest="coverage", + type="string", default='1', + help="Number of reads to produce that cover each mutated region for the reference, or tile for all possible reads.") + parser.add_option("-n", "--nsites", dest="nSites", + type=int, default=1, + help="Number of sites to generate mutations in the reference") + parser.add_option("-l", "--readlen", dest="readLen", + type=int, default=36, + help="Length of reads to generate") + parser.add_option("-s", "--seed", dest="seed", + type=int, default=123456789, + help="Random number seed. If 0, then system clock is used") + parser.add_option("-t", "--muttype", dest="mutationType", + type="string", default='None', + help="Type of mutation to introduce from reference. Can be None, SNP, insertion, or deletion") + parser.add_option("-e", "--mutdensity", dest="mutationDensity", + type=int, default=1, + help="Number of mutations to introduce from the reference") + parser.add_option("-x", "--range", dest="range", + type="string", default=None, + help="Sampling range restriction, in the form of x:y without a space") + parser.add_option("-u", "--unique", dest="uniqueReadsOnly", + action='store_true', default=True, + help="Assumes that the user wants unique reads generated. 
If the coverage is greater than the number of unique reads at the site, the region is just tiled") + parser.add_option("-i", "--indelsize", dest="indelSize", + type=int, default=0, + help="Size in bp of the insertion or deletion to introduce") + + # Paired end options + parser.add_option("", "--paired", dest="generatePairedEnds", + action='store_true', default=False, + help="Should we generate paired end reads?") + parser.add_option("", "--insertSize", dest="insertSize", + type=int, default=280, + help="Size in bp of the paired library insertion") + parser.add_option("", "--insertDev", dest="insertDev", + type=int, default=5, + help="Standard deviation in the size in bp of the paired library insertion") + + parser.add_option("-q", "--quiet", dest="quiet", + action='store_true', default=False, + help="Be quiet when generating output") + parser.add_option("-d", "--debug", dest="debug", + action='store_true', default=False, + help="Verbose debugging output?") + (OPTIONS, args) = parser.parse_args() + if len(args) != 0: + parser.error("incorrect number of arguments") + + root, outputSAM = outputFilename() + + if OPTIONS.seed == 0: + random.seed(None) + else: + random.seed(OPTIONS.seed) + + OPTIONS.mutationType = OPTIONS.mutationType.upper() + + print '------------------------------------------------------------' + print 'program:', sys.argv[0] + print '' + print ' ref: ', OPTIONS.reference + print ' output: ', outputSAM + print '' + print ' mutation type: ', OPTIONS.mutationType + print ' mutation density: ', OPTIONS.mutationDensity + print ' indel size: ', OPTIONS.indelSize + print ' readLen: ', OPTIONS.readLen + print ' nSites: ', OPTIONS.nSites + print ' coverage: ', OPTIONS.coverage + print ' range restriction: ', OPTIONS.range + print '' + print ' paired ends?: ', OPTIONS.generatePairedEnds + print ' insert size: ', OPTIONS.insertSize + print ' insert stddev: ', OPTIONS.insertDev + print '' + print ' Debugging?: ', OPTIONS.debug + print 
'------------------------------------------------------------' + + if OPTIONS.mutationType <> 'SNP' and OPTIONS.mutationDensity > 1: + raise Exception('Does not support mutation density > 1 for mutations of class', OPTIONS.mutationType) + + readLen = OPTIONS.readLen + header = SAMHeader( "MarkD", readLen ) + SAMout = SAMIO( outputSAM, header, debugging=OPTIONS.debug ) + + mutationsout = open(root + '.mutations.txt', 'w') + qualsout = open(root + '.quals.txt', 'w') + refout = open(root + '.ref', 'w') + + fastaout = open(root + '.fasta', 'w') + if OPTIONS.generatePairedEnds: + fastaout2 = open(root + '.pairs.fasta', 'w') + qualsout2 = open(root + '.pairs.quals.txt', 'w') + + counter = 0 + refLen = 0 + for ref in readRef(OPTIONS.reference): + refLen = len(ref.seq) + + # write the crazy ref file info needed by samtools + print >> refout, ref.id + '\t' + str(refLen) + + sampleRanges = parseSamplingRange( OPTIONS.range, ref, readLen ) + sites = sampleStartSites( sampleRanges, OPTIONS.nSites ) + + for mutSite, siteCounter in zip( sites, range(1, len(sites)+1) ): + #print 'Sampling site:', mutSite, refLen + #mutSite = readLen-1 + nSitesSelected = max(int(round(len(sites)/100.0)), 1) + #print siteCounter, len(sites), nSitesSelected) + if siteCounter % nSitesSelected == 0: + print 'Sampled site %d of %d (%.2f%%)' % ( siteCounter, len(sites), (100.0*siteCounter) / len(sites)) + + good, mutations, refSeq, mutSeq = mutateReference(ref.seq, ref.id, mutSite, OPTIONS.mutationType, [], readLen) + + pairedSite, mutations2, refSeq2, mutSeq2 = 0, None, None, None + if good and OPTIONS.generatePairedEnds: + if OPTIONS.mutationType == 'SNP': + pairedMutType = 'SNP' + else: + pairedMutType = 'NONE' + + good, pairedSite = pairedReadSite( ref.seq, mutSite, readLen ) + print 'pairedReadSite', good, pairedSite, good + if good: + good, mutations2, refSeq2, mutSeq2 = mutateReference(ref.seq, ref.id, pairedSite, pairedMutType, [], readLen) + + #print 'Good', good, mutations, refSeq, mutSeq + if 
good: + print >> mutationsout, ref.id + ':' + str(mutSite) + '\t' + string.join(map(str, mutations), '\t') + #print 'Mutations', mutations + + refSeqPos = mutSite - readLen + 1 + readPairs = sampleReadsFromAlignment(ref, refSeq, mutSeq, refSeqPos, readLen, OPTIONS.coverage, mutations, pairedSite - mutSite, mutations2, refSeq2, mutSeq2) + for i in range(len(readPairs)): + counter += 1 + + read1, read2 = readPairs[i] + seq, pos, cigar = read1 + seq2, pos2, cigar2 = read2 + + # write out the sam and fasta files + def write1( record, recordi ): + if record <> None: + SAMout.writeRecord( record ) + sequences = [SeqRecord(Seq(record.getSeq()), id = record.getQName(), description='')] + #print sequences + + if recordi % 2 == 0: + localFastaOut, localQualsOut = fastaout, qualsout + else: + localFastaOut, localQualsOut = fastaout2, qualsout2 + + SeqIO.write(sequences, localFastaOut, "fasta") + print >> localQualsOut, record.getQName(), string.join(map( str, record.getQuals() ), ' ') + + #print i, read1, read2 + records = alignedRead2SAM( mutSite, i+1, ref.id, seq, pos + 1, cigar, seq2, pos2 + 1, cigar2 ) + map( write1, records, range( len(records) ) ) + + SAMout.close() + qualsout.close() + fastaout.close() + refout.close() + mutationsout.close() + + if OPTIONS.generatePairedEnds: + qualsout2.close() + fastaout2.close() + +# for record in SAMIO( outputSAM ): +# print record + +if __name__ == "__main__": + import Bio + from Bio import SeqIO + from Bio.Seq import Seq + from Bio.SeqRecord import SeqRecord + from Bio.Alphabet import IUPAC, Gapped + main() + + diff --git a/python/SpawnMapperJobs.py b/python/SpawnMapperJobs.py new file mode 100755 index 000000000..ca3045585 --- /dev/null +++ b/python/SpawnMapperJobs.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python + +import getopt, sys, os, string +from farm_commands import * + +#FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py" +FastaQuals2Fastq_exe = "FastaQuals2Fastq.py" + +def 
isFastaB(filename): + """Is the file a fastb file already?""" + #print os.path.splitext(filename) + return os.path.splitext(filename)[1] == '.fastb' + +def readListOfLanes( listFile ): + """Simply reads a list of files to process from a file""" + lines = map( string.split, map( string.strip, open(listFile).readlines() ) ) + return map( lambda x: x[0], lines ) + #return map( lambda x: x[0], lines ), map( lambda x: x[1], lines ) + + +def run_swmerlin(input_file, input_head, farm=""): + run_merlin(input_file, input_head, farm, sw=True) + +def run_merlin(input_file, input_head, farm="", sw=False): + "sw = Merlin Smith-Waterman option" + if isFastaB(input_file): + input_fastb = input_file + else: + input_fastb = input_head+".fastb" + if regenExistingFiles or not os.path.exists(input_fastb): + cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands) + if sw: + output_head = input_head+".swmerlin" + else: + output_head = input_head+".merlin" + cmd_str = "Merlin REF_FASTB= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.fastb REF_MERLIN= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.merlinref.bin FASTB= "+input_fastb+" OUT_HEAD="+output_head + if sw: + cmd_str += " SW=True" + cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) + +USE_BATCH = False + +def run_ilt(input_file, input_head, farm=""): + #print 'isFastaB', input_file, isFastaB(input_file) + if isFastaB(input_file): + input_fastb = input_file + else: + input_fastb = input_head+".fastb" + if regenExistingFiles or not os.path.exists(input_fastb): + cmd("Fasta2Fastb IN= "+input_file, just_print_commands=justPrintCommands) + + output_head = input_head+".ilt" + + if USE_BATCH: + cmd_str = "~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="+input_fastb+" --L=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup --MAX_FREQ=1000 --O= "+output_head + 
cmd(cmd_str, False, input_head, just_print_commands=justPrintCommands) + else: + cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head + cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) + +def run_BWA32(input_file, input_head, farm=""): + run_BWA(input_file, input_head, farm, 32) + +def run_BWA(fasta, input_head, farm="", seed='inf'): + output_head = input_head+".bwa" + quals = input_head+".quals.txt" + fastq = input_head+".fastq" + if regenExistingFiles or not os.path.exists(fastq) : + cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq + cmd(cmd_str) + if seed <> 'inf': + output_head += str(seed) + output_sam = output_head + ".sam" + cmd_str = "bwahuman samse "+str(seed)+" "+fastq+" "+output_sam + cmd(cmd_str, farm, output_head, just_print_commands=justPrintCommands) + +def run_MAQ(input_fasta, head, farm=""): + maq_exe = "/seq/dirseq/maq-0.7.1/maq" + bfa_ref="/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa" + + fasta = input_fasta + quals = head+".quals.txt" + fastq = head+".fastq" + if regenExistingFiles or not os.path.exists(fastq) : + cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq + cmd(cmd_str) + + bfq = head+".bfq" + if regenExistingFiles or not os.path.exists(bfq): + cmd( maq_exe+" fastq2bfq "+fastq+" "+bfq, just_print_commands=justPrintCommands ) + + out_head = head+".maq" + maq_out = out_head+".out.aln.map" + cmd_str = maq_exe+" map -e 100 -a 600 -s 0 "+maq_out+" "+bfa_ref+" "+bfq + cmd(cmd_str, farm, out_head, just_print_commands=justPrintCommands) + +def usage(): + print "Required arguments:" + print " -i Input FASTA head (*.fasta, *.qualb)" + print " OR" + print " -d Directory to grab all FASTA files from" + print " OR" + print " -l List of FASTA/FASTB files to process" + print + print "Optional arguments:" + print " -f QUEUE Farm jobs to QUEUE on LSF" + 
print + print " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)" + print + print " -x Don't execute commands, just print them" + print + print " -w Output files to current directory (strip path from input file/dir/list" + print + + +def get_all_fasta_files(fasta_dir): + files = os.listdir(fasta_dir) + if not fasta_dir.endswith("/"): fasta_dir += "/" + fasta_files = [fasta_dir+f for f in files if f.endswith(".fasta") and os.path.getsize(fasta_dir+f) > 0] + #print fasta_files + return fasta_files + +justPrintCommands = False +regenExistingFiles = False + +if __name__ == "__main__": + opts = None + try: + opts, args = getopt.getopt(sys.argv[1:], "i:d:f:m:l:xw:r", ["input","fasta_dir","farm","mapper","listOfLanes", "dontexe", "regenExistingFiles", "outputInWorkingDirectory"]) + except getopt.GetoptError: + print sys.argv + usage() + sys.exit(2) + + input_head = "" + fasta_dir = "" + mapper_str = "all" + farm_sub = False + listOfLanes = None + outputInWorkingDirectory = False + + for opt, arg in opts: + print opt, arg + if opt in ("-i", "--input"): + input_head = arg + if opt in ("-l", "--listOfLanes"): + listOfLanes = arg + if opt in ("-d", "--fasta_dir"): + fasta_dir = arg + if opt in ("-f", "--farm"): + farm_sub = arg + if opt in ("-r", "--regenExistingFiles"): + regenExistingFiles = True + if opt in ("-m", "--mapper"): + mapper_str = arg + if opt in ("-x", "--dontexe"): + justPrintCommands = True + if opt in ("-w", "--outputInWorkingDirectory"): + outputInWorkingDirectory = True + + if (input_head == "") and (fasta_dir == "") and (listOfLanes == None): + print input_head, fasta_dir, listOfLanes + usage() + sys.exit(2) + + # Select function(s) for mapper + mapper_func_list = {"ilt":run_ilt, "merlin":run_merlin, "swmerlin":run_swmerlin, "maq":run_MAQ, "bwa":run_BWA, "bwa32":run_BWA32} + if mapper_str.lower() == "all": + mapper_list = mapper_func_list.values() + else: + mapper_list = 
[mapper_func_list.get(mapper_str.lower())] + if mapper_list == [None]: + sys.exit("Don't know of mapper argument: "+mapper_str) + + if input_head: + input_heads = [None] + input_files = [input_head + 'fasta'] + elif listOfLanes <> None: + input_files = readListOfLanes(listOfLanes) + input_heads = [None] * len(input_files) + else: + input_files = [file for file in get_all_fasta_files(fasta_dir)] + input_heads = [None] * len(input_files) + + for input_file, input_head in zip(input_files, input_heads): + if input_head == None: + file_head = os.path.splitext(input_file)[0] + if outputInWorkingDirectory: + file_head = os.path.split(file_head)[1] + else: + file_head = input_head + for mapper in mapper_list: + mapper( input_file, file_head, farm=farm_sub ) + print diff --git a/python/SpawnMapperJobs.py.old b/python/SpawnMapperJobs.py.old new file mode 100755 index 000000000..53b9e453a --- /dev/null +++ b/python/SpawnMapperJobs.py.old @@ -0,0 +1,202 @@ +#!/usr/bin/env python + +import getopt, sys, os, string + +FastaQuals2Fastq_exe = "/wga/dev/andrewk/Arachne/AlignerEvaluation/FastaQuals2Fastq.py" + +def cmd(cmd_str, farm_queue=False, output_head=""): + # if farm_queue is non-False, submits to queue, other + + if farm_queue: + farm_stdout = output_head+".stdout" + cmd_str = "bsub -q "+farm_queue+" -o "+farm_stdout+" "+cmd_str #+" TMP_DIR=/wga/scr1/andrewk/tmp" + print "### Farming via "+cmd_str + else: + print "### Executing "+cmd_str + + if not justPrintCommands: + # Actually execute the command if we're not just in debugging output mode + os.system(cmd_str) + +def isFastaB(filename): + """Is the file a fastb file already?""" + #print os.path.splitext(filename) + return os.path.splitext(filename)[1] == '.fastb' + +def readListOfLanes( listFile ): + """Simply reads a list of files to process from a file""" + lines = map( string.split, map( string.strip, open(listFile).readlines() ) ) + return map( lambda x: x[0], lines ), map( lambda x: x[1], lines ) + + +def 
run_swmerlin(input_file, input_head, farm=""): + run_merlin(input_file, input_head, farm, sw=True) + +def run_merlin(input_file, input_head, farm="", sw=False): + "sw = Merlin Smith-Waterman option" + if isFastaB(input_file): + input_fastb = input_file + else: + input_fastb = input_head+".fastb" + if not os.path.exists(input_fastb): + cmd("Fasta2Fastb IN= "+input_file) + if sw: + output_head = input_head+".swmerlin" + else: + output_head = input_head+".merlin" + cmd_str = "Merlin REF_FASTB= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.fastb REF_MERLIN= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.merlinref.bin FASTB= "+input_fastb+" OUT_HEAD="+output_head + if sw: + cmd_str += " SW=True" + cmd(cmd_str, farm, output_head) + #if farm: + # farm_stdout = output_head+".stdout" + # cmd("bsub -q long -o "+farm_stdout+" "+cmd_str) + #else: + # cmd(cmd_str) + +<<<<<<< SpawnMapperJobs.py +USE_BATCH = True + +def run_ILT(input_file, input_head, farm=""): + print 'isFastaB', input_file, isFastaB(input_file) +======= +def run_ilt(input_file, input_head, farm=""): + #print 'isFastaB', input_file, isFastaB(input_file) +>>>>>>> 1.5 + if isFastaB(input_file): + input_fastb = input_file + else: + input_fastb = input_head+".fastb" + if not os.path.exists(input_fastb): + cmd("Fasta2Fastb IN= "+input_file) + +<<<<<<< SpawnMapperJobs.py + output_head = input_head+".ILT" + + if USE_BATCH: + cmd_str = "~depristo/bin/batchShortQueryLookup2.pl --NUMPROCS=10 --BATCHQUEUE=long --SEQS="+input_fastb+" --L=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup --MAX_FREQ=1000 --O= "+output_head + cmd(cmd_str, False, input_head) + else: + cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head + cmd(cmd_str, farm, input_head) + +======= + output_head = 
input_head+".ilt" + cmd_str = "ImperfectLookupTable SEQS= "+input_fastb+" L= /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.lookuptable.lookup MAX_FREQ=1000 OUT_PREFIX= "+output_head + cmd(cmd_str, farm, output_head) +>>>>>>> 1.5 + +def run_MAQ(input_fasta, head, farm=""): + maq_exe = "/seq/dirseq/maq-0.7.1/maq" + bfa_ref="/seq/dirseq/ktibbett/maq-0.7.1-test/Homo_sapiens_assembly18.bfa" + + fasta = input_fasta + quals = head+".quals.txt" + fastq = head+".fastq" + if not os.path.exists(fastq) : + cmd_str = FastaQuals2Fastq_exe+" "+fasta+" "+quals+" "+fastq + cmd(cmd_str) + + bfq = head+".bfq" + if not os.path.exists(bfq): + cmd( maq_exe+" fastq2bfq "+fastq+" "+bfq ) + + out_head = head+".maq" + maq_out = out_head+".out.aln.map" + cmd_str = maq_exe+" map -e 100 -a 600 -s 0 "+maq_out+" "+bfa_ref+" "+bfq + cmd(cmd_str, farm, out_head) + +def usage(): + print "Required arguments:" + print " -i Input FASTA head (*.fasta, *.qualb)" + print " OR" + print " -d Directory to grab all FASTA files from" + print " OR" + print " -l List of FASTA/FASTB files to process" + print + print "Optional arguments:" + print " -f QUEUE Farm jobs to QUEUE on LSF" + print + print " -m MAPPER Compare output from MAPPER which can be: ilt, merlin, swmerlin, maq, all (default: all)" + print + print " -x Don't execute commands, just print them" + print + print " -w Output files to current directory (strip path from input file/dir/list" + print + + +def get_all_fasta_files(fasta_dir): + files = os.listdir(fasta_dir) + if not fasta_dir.endswith("/"): fasta_dir += "/" + fasta_files = [fasta_dir+f for f in files if f.endswith(".fasta") and os.path.getsize(fasta_dir+f) > 0] + #print fasta_files + return fasta_files + +justPrintCommands = False + +if __name__ == "__main__": + opts = None + try: + opts, args = getopt.getopt(sys.argv[1:], "i:d:f:m:l:xw", ["input","fasta_dir","farm","mapper","listOfLanes", "dontexe", "outputInWorkingDirectory"]) + except getopt.GetoptError: + 
print sys.argv + usage() + sys.exit(2) + + input_head = "" + fasta_dir = "" + mapper_str = "all" + farm_sub = False + listOfLanes = None + outputInWorkingDirectory = False + + for opt, arg in opts: + print opt, arg + if opt in ("-i", "--input"): + input_head = arg + if opt in ("-l", "--listOfLanes"): + listOfLanes = arg + if opt in ("-d", "--fasta_dir"): + fasta_dir = arg + if opt in ("-f", "--farm"): + farm_sub = arg + if opt in ("-m", "--mapper"): + mapper_str = arg + if opt in ("-x", "--dontexe"): + justPrintCommands = True + if opt in ("-w", "--outputInWorkingDirectory"): + outputInWorkingDirectory = True + + if (input_head == "") and (fasta_dir == "") and (listOfLanes == None): + print input_head, fasta_dir, listOfLanes + usage() + sys.exit(2) + + # Select function(s) for mapper + mapper_func_list = {"ilt":run_ilt, "merlin":run_merlin, "swmerlin":run_swmerlin, "maq":run_MAQ} + if mapper_str.lower() == "all": + mapper_list = mapper_func_list.values() + else: + mapper_list = [mapper_func_list.get(mapper_str.lower())] + if mapper_list == [None]: + sys.exit("Don't know of mapper argument: "+mapper_str) + + if input_head: + input_heads = [None] + input_files = [input_head + 'fasta'] + elif listOfLanes <> None: + input_heads, input_files = readListOfLanes(listOfLanes) + else: + input_files = [file for file in get_all_fasta_files(fasta_dir)] + input_heads = [None] * len(input_files) + + for input_file, input_head in zip(input_files, input_heads): + if input_head == None: + file_head = os.path.splitext(input_file)[0] + if outputInWorkingDirectory: + file_head = os.path.split(file_head)[1] + else: + file_head = input_head + for mapper in mapper_list: + mapper( input_file, file_head, farm=farm_sub ) + print diff --git a/python/WalkLociTest.py b/python/WalkLociTest.py new file mode 100755 index 000000000..1423cac26 --- /dev/null +++ b/python/WalkLociTest.py @@ -0,0 +1,58 @@ +#!/util/bin/python + +import Walker +from SimpleSAM import * + +class WalkLociTest(object, 
Walker.Walker): + def __init__(self): + Walker.Walker.__init__( self, 'byLoci', {}, name = 'WalkLociTest' ) + self.debug = False + + def filterfunc( self, *data ): + return True + #return datum.getPos() % 2 == 0 + + def mapfunc( self, ref, chrom, pos, offsets, reads ): + + if self.debug: + print '------------------------------' + print ' locus', chrom, pos + print ' offsets = ', offsets + print ' reads = ', reads + + # + # This is the heart of the test walker. You've got the reference, contig, + # pos, offsets, and reads. The line below generate pile-ups + # + def getField(field): + return map( lambda r, o: r.__getattribute__(field)[o], reads, offsets ) + + def toASCII33( quals ): + return ''.join( map( lambda q: chr(q+33), quals )) + + if False: + # actually do some useful work + refbase = ref[chrom][pos-1] + bases = ''.join(getField('bases')) + quals = toASCII33(getField('quals')) + print chrom, pos, refbase, len(reads), all(map(lambda b: b == refbase, bases)), bases, quals + + if self.debug: + for offset, read in zip( offsets, reads ): + if self.debug: + print ' offset, read', offset, read + print read.bases[offset], read.quals[offset] + + #print datum + return 1 + #return len(datum.getSeq()) + #return datum.getSeq() + '/' + #return "\n>%s\n%s" % (datum.getQName(), datum.getSeq()) + + #reduceDefault = '' + def reducefunc( self, x, sum ): + #print x + return x + sum + + + diff --git a/python/Walker.py b/python/Walker.py new file mode 100755 index 000000000..7be6150d5 --- /dev/null +++ b/python/Walker.py @@ -0,0 +1,88 @@ +class Walker: + """Represents a functional walking object to be applied to SAM records. + + Actually useful programs will inherit from this walker object to implement + one or more of the blank operators provided by this module. 
Effectly, this + engine applies a walker object in a standard order to each record in a SAM/BAM + file: + + walker = MyWalker(args) + samRecords = parseReadsInFiles( files ): + results = [] + + foreach record in files: + if walker.filterfunc( record ): + x = walker.mapfunc( record ) + results.append(x) + reduced = reduce( walker.reducefunc, results, walker.reduceDefault ) + + + where: + + filterfunc( data ) -> returns true or false if the data should be + included or excluded from mapping + + mapfunc( data ) -> applied to each record, locus, or pair of records, depending + on the walker type + + reducefunc( x, sum ) -> combines partial results x with the accumulating + results sum across all the data + + """ + def __init__(self, walkerType, options, name = None, desc = None ): + self.options = options + self.setOption( 'walkerType', walkerType ) + self.setOption( 'name', name ) + self.setOption( 'desc', desc ) + + print self.options + + reduceDefault = 0 + + # + # walker types + # + def isRecordWalker( self ): + return self.getOption('walkerType') == 'byRecord' + def isPairedRecordWalker( self ): + return self.getOption('walkerType') == 'byRecordPairs' + def isLociWalker( self ): + return self.getOption('walkerType') == 'byLoci' + + # + # Option processing + # + def getOption( self, flag ): + if self.hasOption( flag ): + return self.options[flag] + else: + return None + + def setOption( self, flag, value ): + self.options[flag] = value + + def hasOption(self, flag): + return flag in self.options + + def optionEnabled(self, flag): + return flag in self.options and self.options[flag] <> False + + def optionDisabled(self, flag): + return not self.optionEnabled(flag) + + # + # Default functions + # + def filterfunc( self, datum ): + return True + + def mapfunc( self, datum ): + return datum + + def reducefunc( self, mapResult, sum ): + return sum + 1 + +if __name__ == "__main__": + main() + + diff --git a/python/aln_file.nocvs.py b/python/aln_file.nocvs.py new file 
mode 100755 index 000000000..57d8e318d --- /dev/null +++ b/python/aln_file.nocvs.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +import string, sys + +class aln_record: + "Stores one record of data from a MAQ .aln.txt file" + field_names = ( + "read name", + "chromosome", + "position", + "strand", + "insert size from the outer coorniates of a pair", + "paired flag", + "mapping quality", + "single-end mapping quality", + "alternative mapping quality", + "number of mismatches of the best hit", + "sum of qualities of mismatched bases of the best hit", + "number of 0-mismatch hits of the first 24bp", + "number of 1-mismatch hits of the first 24bp on the reference", + "length of the read", + "read sequence", + "sequence quality" + ) + max_field_name_len = max(map(len, field_names)) + + def __init__(self, obj, parse = True): + self.fields = [] + if type(obj) == str: + self.setValuesFromString( obj, parse ); + else: + raise TypeError("aln_record did not recognize type: "+str(type(obj))) + + def setValuesFromString( self, line, parse = True ): + if parse: + formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str] + self.fields = map( lambda f, v: f(v), formats, line.split() ) + else: + self.fields = line.split() + + def __str__(self): + s = "" + for n,v in zip(aln_record.field_names, self.fields): + s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v)) + return s + + #return string.join( map( str, self.fields ), ' ') + + def id(self): return self.fields[0] + def contig(self): return self.fields[1] + + def offset(self): return self.fields[2]-1 + def pos(self): return self.fields[2] + + # Quality of read mapping (only maq gives this field) + def map_qual(self): return self.fields[6] + + #def offset_end(self): return self.fields[8] + #def pos_end(self): return self.fields[8]+1 + + #def linear_start(self): return self.fields[9] + + +class aln_file: + def __init__(self, filename, parse=True): + self.filename = filename + 
self.parse = parse + self.faln = open(self.filename) + + def RecordGenerator(self): + for line in self.faln: + yield aln_record(line, self.parse) + raise StopIteration + + def __iter__(self): + return self.RecordGenerator() + +if __name__ == "__main__": + if len(sys.argv) != 2: + print "To test aln_file class:\naln_file.py ALN_FILE" + else: + count = 0 + for aln in aln_file(sys.argv[1]): + print aln + count += 1 + #if count > 5: + # break diff --git a/python/aln_file.py b/python/aln_file.py new file mode 100755 index 000000000..57d8e318d --- /dev/null +++ b/python/aln_file.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +import string, sys + +class aln_record: + "Stores one record of data from a MAQ .aln.txt file" + field_names = ( + "read name", + "chromosome", + "position", + "strand", + "insert size from the outer coorniates of a pair", + "paired flag", + "mapping quality", + "single-end mapping quality", + "alternative mapping quality", + "number of mismatches of the best hit", + "sum of qualities of mismatched bases of the best hit", + "number of 0-mismatch hits of the first 24bp", + "number of 1-mismatch hits of the first 24bp on the reference", + "length of the read", + "read sequence", + "sequence quality" + ) + max_field_name_len = max(map(len, field_names)) + + def __init__(self, obj, parse = True): + self.fields = [] + if type(obj) == str: + self.setValuesFromString( obj, parse ); + else: + raise TypeError("aln_record did not recognize type: "+str(type(obj))) + + def setValuesFromString( self, line, parse = True ): + if parse: + formats = [str, str, int, str, int, int, int, int, int, int, int, int, int, int, str, str] + self.fields = map( lambda f, v: f(v), formats, line.split() ) + else: + self.fields = line.split() + + def __str__(self): + s = "" + for n,v in zip(aln_record.field_names, self.fields): + s += ("%"+str(aln_record.max_field_name_len)+"s : %s\n") % (n, str(v)) + return s + + #return string.join( map( str, self.fields ), ' ') + + def id(self): 
return self.fields[0] + def contig(self): return self.fields[1] + + def offset(self): return self.fields[2]-1 + def pos(self): return self.fields[2] + + # Quality of read mapping (only maq gives this field) + def map_qual(self): return self.fields[6] + + #def offset_end(self): return self.fields[8] + #def pos_end(self): return self.fields[8]+1 + + #def linear_start(self): return self.fields[9] + + +class aln_file: + def __init__(self, filename, parse=True): + self.filename = filename + self.parse = parse + self.faln = open(self.filename) + + def RecordGenerator(self): + for line in self.faln: + yield aln_record(line, self.parse) + raise StopIteration + + def __iter__(self): + return self.RecordGenerator() + +if __name__ == "__main__": + if len(sys.argv) != 2: + print "To test aln_file class:\naln_file.py ALN_FILE" + else: + count = 0 + for aln in aln_file(sys.argv[1]): + print aln + count += 1 + #if count > 5: + # break diff --git a/python/compSNPCalls.py b/python/compSNPCalls.py new file mode 100644 index 000000000..63f857e3e --- /dev/null +++ b/python/compSNPCalls.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python + +import sys, string +import os +import re +from itertools import * +from optparse import OptionParser +from memo import DiskMemoize, time_func + +class ref_genome: + """Reads reference genome in FASTA format into a dict""" + + def __init__(self, ref_genome_file): + ref_genome.chr_offset = [[] for i in range(45)] + chr_id = 0 + seq = "" + for line in open(ref_genome_file): + if line.startswith(">"): + print line[1:], + if line.startswith(">chrM"): # skip first > line + continue + ref_genome.chr_offset[chr_id] = seq + chr_id += 1 + seq = " " # make it 1 indexed instead of 0 indexed + #if chr_id > 2: + # break + else: + seq += line.rstrip().upper() + ref_genome.chr_offset[chr_id] = seq + + def __getitem__(self, key): + return ref_genome.chr_offset[key] + +AffyChr2Index = dict() +for i in range(1,23): + AffyChr2Index[str(i)] = i +AffyChr2Index['MT'] = 0 
+AffyChr2Index['X'] = 23 +AffyChr2Index['Y'] = 24 + +class GenotypeCall: + #ref = time_func(ref_genome)("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") + def __init__( self, chr, pos, genotype, snpP, lod ): + self.chr = chr + self.pos = int(pos) + self._site = chr + ':' + str(self.pos) + self._isSNP = snpP + self.genotype = string.join(map(string.upper, sorted(genotype)), '/') # sorted list of bases at position + self.lod = lod + + def refbase(self): + return GenotypeCall.ref[AffyChr2Index[self.chr]][self.pos] + + def __hash__(self): + return hash(self._site) + + def __eq__(self, other): + return self._site == other._site + + def site(self): return self._site + def isSNP(self): return self._isSNP + + def ref_het_hom(self): + if self.genotype[0] <> self.genotype[2]: + return 1 # het(erozygous non-ref) + else: + # homozygous something + if self.genotype[0] == self.refbase: + return 0 # ref + else: + return 2 # hom(ozygous non-ref) + + def isHET(self): return self.genotype[0] <> self.genotype[2] + def isHOM(self): return self.genotype[0] == self.genotype[2] + + def __str__(self): + return "%s:%s %s %s" % ( self.chr, self.pos, self.genotype, self.lod) + +MAQGenotypeEncoding = { + 'A' : ['A', 'A'], + 'C' : ['C', 'C'], + 'T' : ['T', 'T'], + 'G' : ['G', 'G'], + "M" : ['A', 'C'], + 'K' : ['G', 'T'], + 'Y' : ['C', 'T'], + 'R' : ['A', 'G'], + 'W' : ['A', 'T'], + 'S' : ['C', 'G'], + 'D' : ['A', 'G', 'T'], + 'B' : ['C', 'G', 'T'], + 'H' : ['A', 'C', 'T'], + 'V' : ['A', 'C', 'G'], + 'N' : ['A', 'C', 'G', 'T'] } + +MAQ2STDChr = dict() +for i in range(1,23): + MAQ2STDChr['chr'+str(i)] = str(i) +MAQ2STDChr['chrM'] = 'MT' +MAQ2STDChr['chrX'] = 'X' +MAQ2STDChr['chrY'] = 'Y' + +def convertMAQChr(maqChr): + #print 'convertMAQChr:', maqChr, MAQ2STDChr[maqChr] + if maqChr in MAQ2STDChr: + return MAQ2STDChr[maqChr] + else: + return '?' 
+ +def convertMAQGenotype( oneBaseCode ): + return MAQGenotypeEncoding[oneBaseCode] + +def internalReadSNPFile( parse1, filename ): + result = [] + snps_extracted = 0 + for snp in imap( parse1, open(filename) ): + if snp: + result.append(snp) + snps_extracted += 1 + if snps_extracted > OPTIONS.debug_lines: + break + + print len(result),"genotypes extracted" + return result + +def snpMAP( snps ): + #d = dict( map( lambda x: [x.site(), x], snps ) ) + d = dict() + for snp in snps: + d[snp.site()] = snp#d + + #print 'snps', snps, d + return d + +def overlappingSites( snps1, snps2 ): + map1 = snpMAP(snps1) + map2 = snpMAP(snps2) + shared = set(map1.keys()) & set(map2.keys()) + print 'Number of snp1 records', len(map1) + print 'Number of snp2 records', len(map2) + print 'Number of shared sites', len(shared) + print "\n".join(map(str,snps1)) + return shared + +def readMAQSNPs(filename): + # Each line consists of: + # chromosome + # position + # reference base + # consensus base + # Phred-like consensus quality + # read depth + # the average number of hits of reads covering this position + # the highest mapping quality of the reads covering the position + # the minimum consensus quality in the 3bp flanking regions at each side of the site (6bp in total) + # the second best call + # log likelihood ratio of the second best and the third best call + # and the third best call. + # + # Also, note that: + # + # What do those "S", "M" and so on mean in the cns2snp output? + # They are IUB codes for heterozygotes. 
Briefly: + # + # M=A/C, K=G/T, Y=C/T, R=A/G, W=A/T, S=G/C, D=A/G/T, B=C/G/T, H=A/C/T, V=A/C/G, N=A/C/G/T + def read1(line): + formats = [str, int, str, str, int, int] + vals = map( lambda f, x: f(x), formats, line.split()[0:6] ) + alignQual = vals[4] + if alignQual >= (10*OPTIONS.lod): + return GenotypeCall( convertMAQChr(vals[0]), vals[1], convertMAQGenotype(vals[3]), vals[2] <> vals[3], alignQual/10.0 ) + else: + #print 'Filtering', alignQual, vals + return False + + return internalReadSNPFile( read1, filename ) + +OPTIONS = None + +def MerlinChr( index ): + if index == 0: + return 'MT' + elif index == 23: + return 'X' + elif index == 24: + return 'Y' + else: + return str(index) + +def readMerlinSNPs(filename): + # 0:72 G GG 155.337967 0.000000 homozygous A:0 C:2 G:510 T:2 514 0 1 1 GG:-5.59 CG:-160.92 GT:-161.51 AG:-162.11 CT:-1293.61 CC:-1293.61 TT:-1294.19 AC:-1294.21 AT:-1294.80 AA:-1295.40 + # 0:149 T CC 118.595886 1131.024696 homozygous-SNP A:2 C:442 G:1 T:7 452 0 1 1 CC:-24.21 CT:-142.81 AC:-156.33 CG:-156.96 TT:-1155.23 AT:-1159.41 GT:-1160.04 AA:-1173.26 AG:-1173.56 GG:-1174.20 + # chr:pos ref genotype bestVsRef bestVsNextBest class ... 
+ def read1(line): + formats = [lambda x: x.split(':'), str, sorted, float, float, str] + vals = map( lambda f, x: f(x), formats, line.split()[0:6] ) + bestVsRef, bestVsNext = vals[3:5] + isSNP = vals[5].find('-SNP') <> -1 + if bestVsRef >= OPTIONS.lod and isSNP: + return GenotypeCall( MerlinChr(int(vals[0][0])), int(vals[0][1]) + 1, vals[2], isSNP, bestVsRef ) + else: + return False + + return internalReadSNPFile( read1, filename ) + +def readSNPfile( filename, format ): + formats = { 'merlin' : readMerlinSNPs, 'maq' : readMAQSNPs } + if format.lower() in formats: + return list(formats[format.lower()](filename)) + else: + raise Exception('Unknown SNP file format ' + format) + +def readAffyFile(filename): + # chrom position genotype probe_set_id dbsnp_id + # 1 84647761 TC SNP_A-1780419 rs6576700 + # 5 156323558 GG SNP_A-1780418 rs17054099 + def read1(line): + formats = [str, int, sorted, str, str] + vals = map( lambda f, x: f(x), formats, line.split() ) + + try: + chr = str(int(vals[0])) + except: + chr = convertMAQChr(vals[0]) + #print 'CHR', chr, vals[0] + return GenotypeCall( chr, vals[1], vals[2], False, 100 ) + + file = open(filename) + file.readline() # skip header + #affyData = map( read1, file ) + affyData = [] + for index, line in enumerate(file): + affyData.append(read1(line)) + if index > OPTIONS.debug_lines: + break + if index % 10000 == 0: + print index + # Give a chance to use list before creating dictionary + return affyData + + #print "1111111" + #return dict( zip( map( GenotypeCall.site, affyData ), affyData ) ) + +def equalSNPs( snp1, snp2 ): + return snp1.genotype == snp2.genotype + +# def concordance( truthSet, testSet, includeVector = None ): +# # calculates a bunch of useful stats about the two +# # data genotype call sets above +# +# states = [[x,0] for x in ['tested', 'shared', 'shared-equal', 'test-snp', 'hom-ref', 'hom-snp', 'het-snp', 'het-ref']] +# counts = dict(states) +# +# def incShared(state, equalP ): +# counts[state] += 1 +# if 
equalP: +# counts[state + '-equal'] += 1 +# +# nTestSites = 0 +# for i, testSNP in izip( count(), testSet ): +# if includeVector <> None and not includeVector[i]: +# # we are skiping this site +# continue +# +# nTestSites += 1 +# +# if testSNP.isSNP(): +# counts['test-snp'] += 1 +# +# #print testSNP.site() +# if testSNP.site() in truthSet: +# truth = truthSet[testSNP.site()] +# eql = equalSNPs( testSNP, truth ) +# +# incShared( 'shared', eql ) +# if testSNP.isSNP(): +# if truth.isHOM(): incShared( 'hom-snp', eql ) +# else: incShared( 'het-snp', eql ) +# else: +# if truth.isHOM(): incShared( 'hom-ref', eql ) +# else: incShared( 'het-ref', eql ) +# +# if OPTIONS.verbose and nTestSites % 100 == 0 and nSharedSites > 0: +# #print nTestSites, nSharedSites, nEqualSites +# print nTestSites, counts +# +# counts['tested'] = nTestedSites +# +# return counts + +class ConcordanceData: + def __init__(self, name, file1count, file2count): + self.name = name + self.nFile1Sites = file1count # num sites in file 1 + self.nFile2Sites = file2count # num sites in file 1 + self.nSharedSites = 0 # num SNP pairs that map to same position on the genome + self.nEqualSites = 0 # num SNPs pars with the same genotype + + def inc( self, truthSNP, testSNP ): + self.nSharedSites += 1 + if equalSNPs( testSNP, truthSNP ): # if the genotypes are equal + self.nEqualSites += 1 + + def rate(self): + return (100.0 * self.nEqualSites) / max(self.nSharedSites,1) + + def __str__(self): + return '%d %d %.2f' % ( self.nSharedSites, self.nEqualSites, self.rate() ) + +def concordance( truthSet, testSet, sharedSites = None ): + # calculates a bunch of useful stats about the two + # data genotype call sets above + # + # The 2 calls in main work like this: + # affy, snp1, snp1_snp2_shared + # affy, snp2, snp1_snp2_shared + + nTestSites = 0 + + # Now for each of the calls to concordance, we generate 3 sets: + # - allData: all SNP1 sites that are also in Affy + allData = ConcordanceData('all', len(truthSet), 
len(testSet)) + # - sharedData: SNP1 sites that are also SNP2 sites that are alse in Affy + sharedData = ConcordanceData('shared', len(truthSet), len(testSet)) + # - uniqueData: SNP1 sites that are not SNP2 sites but that are in Affy + uniqueData = ConcordanceData('unique', len(truthSet), len(testSet)) + for i, testSNP in izip( count(), testSet ): + nTestSites += 1 + if testSNP.site() in truthSet: + truthSNP = truthSet[testSNP.site()] + + allData.inc( truthSNP, testSNP ) + if sharedSites <> None: + if testSNP.site() in sharedSites: + sharedData.inc( truthSNP, testSNP ) + else: + uniqueData.inc( truthSNP, testSNP ) + + if OPTIONS.verbose and nTestSites % 100000 == 0: + #print nTestSites, nSharedSites, nEqualSites + print nTestSites, allData, sharedData, uniqueData + + return nTestSites, allData, sharedData, uniqueData + +# def concordance( truthSet, testSet, includeVector = None ): +# # calculates a bunch of useful stats about the two +# # data genotype call sets above +# +# states = [[x,0] for x in ['tested', 'shared', 'test-snp', 'shared-hom-ref', 'shared-het-snp', 'shared-hom-snp']] +# counts = dict(states) +# +# nTestSites = 0 +# nSharedSites = 0 +# nEqualSites = 0 +# for i, testSNP in izip( count(), testSet ): +# nTestSites += 1 +# #print testSNP.site() +# if testSNP.site() in truthSet: +# nSharedSites += 1 +# if equalSNPs( testSNP, truthSet[testSNP.site()] ): +# nEqualSites += 1 +# #else: +# # print '~', testSNP, truthSet[testSNP.site()] +# if OPTIONS.verbose and nTestSites % 100000 == 0 and nSharedSites > 0: +# #print nTestSites, nSharedSites, nEqualSites +# print nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites +# +# return [nTestSites, nSharedSites, nEqualSites, (100.0 * nEqualSites) / nSharedSites] + + +def printConcordanceResults( filename1, filename2, results, hasSharedSites = False ): + nTestSites, allData, sharedData, uniqueData = results + + def print1(data): + print 
'------------------------------------------------------------' + print 'Concordance results', data.name, 'sites' + print ' -> Genotype file1', filename1 + print ' -> Genotype file2', filename2 + print ' -> Number of tested sites (%s %s): %d' % (filename2, data.name, nTestSites) + print ' -> Number of sites shared between files (%s %s): %d' % (filename2, data.name, data.nSharedSites) + print ' -> Percent sites in file1 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile1Sites)) + print ' -> Percent sites in file2 shared (%s %s): %.2f' % (filename1, data.name, data.nSharedSites / (0.01 * data.nFile2Sites)) + print ' -> Number of genotypically equivalent sites (%s %s): %d' % (filename2, data.name, data.nEqualSites) + print ' -> Concordance rate (%s %s): %.2f' % (filename2, data.name, data.rate()) + + print1(allData) + if hasSharedSites: + print1( sharedData ) # shared between SNP1 and SNP2 + print1( uniqueData ) # unique to SNP1 or to SNP2 only + +def dump_shared_snps( affys, snp_list1, snp_list2 ): + print len(affys), len(snp_list1), len(snp_list2) + snps1 = snpMAP(snp_list1) + snps2 = snpMAP(snp_list2) + snp1_sites = set(snps1.keys()); print "SNP1s:",len(snp1_sites) + snp2_sites = set(snps2.keys()); print "SNP2s:",len(snp2_sites) + affy_sites = set(affys.keys()); print "Affys:",len(affy_sites) + snp1or2_affy_sites = (snp1_sites | snp2_sites) & affy_sites + snp1and2_affy_sites = (snp1_sites & snp2_sites) & affy_sites + print "SNP 1 or 2 and Affy: ",len(snp1or2_affy_sites) + print "SNP 1 and 2 and Affy:",len(snp1and2_affy_sites) + + fsnp = open ("snp.tab","w") + print >>fsnp, "site lod1 lod2 lod1v2 gen1 gen2 genaff inc1 inc2 inc12 lodm genm incm ref_het_hom refbase" + for site in snp1and2_affy_sites: + snp1 = snps1[site] + snp2 = snps2[site] + affy = affys[site] + + print >>fsnp, "%-11s %5.2f %5.2f" % (site, snp1.lod, snp2.lod), + try: + snp1div2 = snp1.lod / snp2.lod + except ZeroDivisionError: + snp1div2 = 1000 + print >>fsnp, 
"%5.2f" % snp1div2, + print >>fsnp, snp1.genotype, snp2.genotype, affy.genotype, + print >>fsnp, "%1d %1d %1d" % (not equalSNPs(snp1, affy), not equalSNPs(snp2,affy), not equalSNPs(snp1,snp2)), + + # Calculte meta_lod from the two lods + if snp1.genotype == snp2.genotype: + meta_lod = snp1.lod + snp2.lod + meta_genotype = snp1.genotype + else: + if snp1.lod > snp2.lod: + meta_lod = snp1.lod + meta_genotype = snp1.genotype + else: + meta_lod = snp2.lod + meta_genotype = snp2.genotype + meta_inc = meta_genotype != affy.genotype + print >>fsnp, "%5.2f %3s %1d" % (meta_lod, meta_genotype, meta_inc), + print >>fsnp, affy.ref_het_hom(), + print >>fsnp, affy.refbase() + +def intersection_union_snps( affy, snps1, snps2 ): + map1 = snpMAP(snps1) + map2 = snpMAP(snps2) + shared_nonaffy_sites = (set(map1.keys()) & set(map2.keys())).difference(affy.keys()) + nonaffy_shared = [(map1[site].lod, map2[site].lod) for site in shared_nonaffy_sites] + shared_affy_sites = set(map1.keys()) & set(map2.keys()) & set(affy.keys()) + + #shared = [] + #for site in shared_sites: + # shared.append((map1[site], map2[site])) + #print "Shared:",len( shared ) + #shared_x = [s[0].lod for s in shared] + #shared_y = [s[1].lod for s in shared] + + both_corr = [] + snp1_corr = [] + snp2_corr = [] + neither_corr = [] + # given two bools telling whether snp1 and snp2 are correct, + # return the correct object + # + # snp2 incorrect, snp2 correct + truth_list = [[neither_corr, snp2_corr], # snp1 incorrect + [snp1_corr, both_corr]] # snp1 correct + + for site in shared_affy_sites: + snp1_true = equalSNPs(map1[site], affy[site]) + snp2_true = equalSNPs(map2[site], affy[site]) + truth_list[snp1_true][snp2_true].append( (map1[site].lod, map2[site].lod) ) + + print "Beginning plot..." 
+ import rpy2.robjects as robj + robj.r('X11(width=15, height=15)') + XY_MAX = 25 + plots = ((nonaffy_shared, "gray45"),(both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue")) + plots = ((both_corr, "black"), (snp1_corr, "red"), (snp2_corr, "green"), (neither_corr, "blue")) + robj.r.plot([0, XY_MAX], [0, XY_MAX], \ + xlab = os.path.splitext(OPTIONS.snp1)[0].capitalize()+" LOD", \ + ylab = os.path.splitext(OPTIONS.snp2)[0].capitalize()+" LOD", \ + main = "Shared SNP site LODs", \ + xlim = robj.FloatVector([0,XY_MAX]), \ + ylim = robj.FloatVector([0,min(XY_MAX,25)]), \ + pch = 19, \ + cex = 0.0) + for xy, color in plots: + print "X" + robj.r.points([pt[0] for pt in xy], [pt[1] for pt in xy], col=color, pch=19, cex=.3) + print "Color:",color + print "Len:",len(xy) + #print "\n".join(["%25s %25s" % pt for pt in xy]) + + #print "\n".join(["%25s %25s" % shared_snp for shared_snp in shared]) + #robj.r.plot(shared_x, shared_y, xlab=OPTIONS.format1.capitalize()+" LOD", ylab=OPTIONS.format2.capitalize()+" LOD", main="Shared SNP site LODs", col="black", xlim=robj.FloatVector([0,XY_MAX]), ylim=robj.FloatVector([0,min(XY_MAX,25)]), pch=19, cex=0.3) + raw_input("Press enter to continue") + + return + + ss1 = set(snps1) + ss2 = set(snps2) + print "Shared:",len( ss1.intersection(ss2) ) + print "Snp1 only:",len( ss1.difference(ss2) ) + print "Snp2 only:",len( ss2.difference(ss1) ) + print "Snp1 total:",len( ss1 ) + print "Snp2 total:",len( ss2 ) + +def count_het_sites(snp_list): + hets = 0 + for snp in snp_list: + if snp.isHET(): + hets += 1 + print hets,"hets,",len(snp_list),"total,", + print "%.1f" % (float(hets)/len(snp_list)*100) + +def main(argv): + global OPTIONS, ROOT + + usage = "usage: %prog --truth affy-truth-file --snp1 snpfile1 --snp2 snpfile2" + parser = OptionParser(usage=usage) + parser.add_option("-t", "--truth", dest="affy", + type="string", default=None, + help="Affy truth file") + parser.add_option("-1", "--snp1", dest="snp1", + 
type="string", default=None, + help="Affy truth file") + parser.add_option("-2", "--snp2", dest="snp2", + type="string", default=None, + help="Affy truth file") + parser.add_option("", "--f1", dest="format1", + type="string", default=None, + help="File type of snpfile1") + parser.add_option("", "--f2", dest="format2", + type="string", default=None, + help="File type of snpfile2") + parser.add_option("-l", "--lod", dest="lod", + type="float", default=5.0, + help="Minimum LOD of confident SNP call in Merlin or Q (10x) in MAQ") + parser.add_option("-v", "--verbose", dest="verbose", + action='store_true', default=False, + help="Verbose output") + parser.add_option("-d", "--debug_lines", dest="debug_lines", + type='float', default=sys.maxint, + help="Number of input data lines to process for debugging") + + (OPTIONS, args) = parser.parse_args() + if len(args) != 0: + parser.error("incorrect number of arguments") + + if OPTIONS.affy == None: + parser.error("No affy data specified") + + if OPTIONS.format1 == None: + parser.error("First format cannot be none") + + if OPTIONS.snp2 <> None and OPTIONS.format2 == None: + parser.error("snp2 file given but format was not specified") + + # Load reference genome + #ref = ref_genome("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") + #sys.exit() + + if 1: + print "Reading Affy truth data..." + #readAffyFile2 = DiskMemoize( readAffyFile, "readAffyFile", global_deps = ["OPTIONS.lod"] ) + readAffyFile2 = time_func(readAffyFile) + affy_list = readAffyFile2( filename=OPTIONS.affy ) + #count_het_sites(affy_list) + #sys.exit() + affy = dict( zip( map( GenotypeCall.site, affy_list ), affy_list ) ) + print 'Read affy truth data:' + print ' -> number of genotyped loci', len(affy) + + #readSNPfile2 = DiskMemoize( readSNPfile, "readSNPfile", global_deps = ["OPTIONS.lod"] ) + readSNPfile2 = time_func(readSNPfile) + print "Reading SNPs 1 file..." 
+ snps1 = readSNPfile2( filename=OPTIONS.snp1, format=OPTIONS.format1 ) + if OPTIONS.snp2 <> None: + print "Reading SNPs 2 file..." + snps2 = readSNPfile2( filename=OPTIONS.snp2, format=OPTIONS.format2 ) + + dump_shared_snps( affy, snps1, snps2 ) + #intersection_union_snps( affy, snps1, snps2 ) + #sys.exit() + + sharedSites = None + if OPTIONS.snp2 <> None: + sharedSites = overlappingSites( snps1, snps2 ) + results1 = concordance( affy, snps1, sharedSites ) + printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results1, True ) + results2 = concordance( affy, snps2, sharedSites ) + printConcordanceResults( OPTIONS.affy, OPTIONS.snp2, results2, True ) + + else: + results = concordance( affy, snps1 ) + printConcordanceResults( OPTIONS.affy, OPTIONS.snp1, results, sharedSites ) + +if __name__ == "__main__": + main(sys.argv) diff --git a/python/countCoverageWithSamtools.py b/python/countCoverageWithSamtools.py new file mode 100644 index 000000000..70d3c0cb9 --- /dev/null +++ b/python/countCoverageWithSamtools.py @@ -0,0 +1,46 @@ +import farm_commands +import os.path +import sys + +for line in open(sys.argv[1]): + fastb = line.strip() + head, fastb_filename = os.path.split(fastb) + filebase = os.path.splitext(fastb_filename)[0] + fasta = filebase + '.fasta' + + # convert the fasta + if not os.path.exists(fasta): + cmd = "Fastb2Fasta IN="+fastb+" OUT="+fasta + farm_commands.cmd(cmd) + + qualb = os.path.join(head, filebase + '.new.qualb') + quala = filebase + '.quala' + if not os.path.exists(quala): + cmd = "Qualb2Quala IN="+qualb+" OUT="+quala + farm_commands.cmd(cmd) + + fastq = filebase + '.fastq' + if not os.path.exists(fastq): + cmd = "FastaQuals2Fastq.py "+fasta+" "+quala+ " "+fastq + farm_commands.cmd(cmd) + + filteredFastq = filebase + '.filtered.fastq' + if not os.path.exists(filteredFastq): + cmd = "/seq/dirseq/maq-0.7.1/maq catfilter "+fastq+" > "+filteredFastq + farm_commands.cmd(cmd) + + filteredFasta = filebase + '.filtered.fasta' + print 'Looping' + if not 
os.path.exists(filteredFasta): + out = open(filteredFasta,'w') + iter = open(filteredFastq).__iter__(); + for line in iter: + if line[0] == '@': + print >> out, '>%s' % line[1:].strip() + print >> out, iter.next().strip() + + sam = filebase + '.sam' + cmd = "bwahuman samse 32 " + fastq + " " + sam + print cmd + +samtools view tcga-freeze3-tumor.rev_2.bam chr1:15,000,000-15,002,000 | wc --lines \ No newline at end of file diff --git a/python/farm_commands.py b/python/farm_commands.py new file mode 100644 index 000000000..8e9ad0cab --- /dev/null +++ b/python/farm_commands.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import os +#justPrintCommands = False + +def cmd(cmd_str, farm_queue=False, output_head="", just_print_commands=False): + # if farm_queue is non-False, submits to queue, other + + if farm_queue: + farm_stdout = output_head+".stdout" + cmd_str = "bsub -q "+farm_queue+" -o "+farm_stdout+" "+cmd_str #+" TMP_DIR=/wga/scr1/andrewk/tmp" + print ">>> Farming via "+cmd_str + else: + print ">>> Executing "+cmd_str + + if just_print_commands or (globals().has_key("justPrintCommands") and globals().justPrintCommands): + return -1 + else: + # Actually execute the command if we're not just in debugging output mode + status = os.system(cmd_str) + if not farm_queue: + print "<<< Exit code:", status,"\n" + return status + diff --git a/python/fasta.py b/python/fasta.py new file mode 100755 index 000000000..231c94dff --- /dev/null +++ b/python/fasta.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +import string + +class fasta_record: + "Record containing one FASTA sequence" + def __init__(self, id, seq): + self.id = id + self.seq = seq + + def __str__(self): + return '['+self.id+" "+self.seq+']' + +class fasta_file: + "Iterable object based on FASTA file format" + def __init__(self, filename, cleanup=True): + "cleanup removes spaces from fasta text (default: True)" + self.filename = filename + self.fin = open(self.filename) + self._cleanup = cleanup + + def 
RecordGenerator(self): + line = self.fin.readline().rstrip() + assert line[0] == ">" + id = line[1:] + seq = "" + for line in self.fin: + line = line.rstrip() + if line[0] == ">": + yield fasta_record(id, seq) + id = line[1:] + seq = "" + else: + if self._cleanup: + seq += line.replace(" ","") + else: + seq += line + + yield fasta_record(id, seq) # Yield last seq + raise StopIteration # No more lines + + def __iter__(self): + return self.RecordGenerator() + + +if __name__ == "__main__": + print "Testing fasta.py on file 5seqs.fa..." + for fasta_rec in fasta_file("5seqs.fa"): + print fasta_rec + diff --git a/python/memo.py b/python/memo.py new file mode 100755 index 000000000..561d0e364 --- /dev/null +++ b/python/memo.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python + +import time +class timer: + def __init__(self): + self.start() + def start(self): + self.start_time = time.time() + def stop(self): + return time.time() - self.start_time + +class time_func: + def __init__(self, fn): + self._fn = fn + def __call__(self, *args, **keywords): + tmr = timer() + result = self._fn(*args, **keywords) + elapsed = tmr.stop() + print "%.5fs elapsed." % elapsed + return result + +# DiskMemoize caches the results of function calls on disk for later, speedy retrieval +import cPickle, os, sys, math +class DiskMemoize: + """Memoize(fn) - an instance which acts like function fn given at instantiation + but memoizes the returned result of fn according to the function's arguments. 
+ Memoized results are stored on disk + """ + # Supposedly will only work on functions with non-mutable arguments + def __init__(self, fn, fn_name, global_deps): + """fn - name of function to call when object is called as a function + fn_name - string corresponding to fn that is used to name memoization disk file + global_deps - global variables that affect the outcome of the function call provided + as a list of variable names as strings""" + self.fn = fn + self.fn_name = fn_name + self.global_deps = global_deps + #self.memo = {} + def __call__(self, global_deps = [], volatile_keywords = {}, skip_cache=False, verbose=True, *args, **keywords): + """Calls the function assigned at initialization - self.fn - attempting to + use a memoized result if it has been stored on disk. + + The function arguments are (in order of most likely use): + args - non-keyword arguments are DISABLED and will cause an exception to be raised + keywords - keyword arguments provided in standard function call style + volatile_keywords - hash of keywords (var_name, value) pairs to be passed to the + function call that do NOT affect the cached result in a meaningful way (e.g. 
verbose flag) + skip_cache - skips checking the cache for a memoized result, only memoize a new result + to disk (default: False)""" + + #print "BONUS:",BONUS + #print globals()["BONUS"] + + # Raise an exception if any non-keyword arguments were provided + if len(args) > 0: + raise Exception("DiskMemoize.__call__ encountered non-keyword arguments (args); it can only be called with keyword arguments (keywords)") + + # Pickling filename specifying all dependencies + pkl_filename = self.fn_name+"__" + pkl_filename += "__".join([str(arg)+"."+str(val) for arg,val in keywords.items()] + \ + ["global_"+arg_name+"."+str(globals()[arg_name]) for arg_name in global_deps]) + pkl_filename += ".pkl" + + if os.path.exists(pkl_filename) and not skip_cache: + # Just read the memoized result + pkl_file = open(pkl_filename, "rb") + print "DiskMemoize: Loading result from",pkl_filename, ;sys.stdout.flush() + tmr = timer() + result = cPickle.load(pkl_file) + calc_time = cPickle.load(pkl_file) + elapsed = tmr.stop() + speedup = calc_time/elapsed if elapsed != 0 else 9999.9999 + print ": %.5fs loading, %.5fs original calc. 
time, %.2f X speedup" % (elapsed, calc_time, speedup) + return result + else: + # Calculate the result and memoize it to disk + pkl_file = open(pkl_filename, "wb") + + print "DiskMemoize: Calculating result for:", + #print self.fn_name+" ("+(", ".join(map(str,args))) + ")" + keywords.update(volatile_keywords) + tmr = timer() + print self.fn_name+" ("+(", ".join(["=".join(map(str, it)) for it in keywords.items()])) + ")", + sys.stdout.flush() + result = self.fn(*args,**keywords) + calc_time = tmr.stop() + print ": %.5fs calculating" % calc_time + + tmr = timer() + print "DiskMemoize: Writing result to",pkl_filename, ;sys.stdout.flush() + cPickle.dump(result, pkl_file, protocol=2) + cPickle.dump(calc_time, pkl_file, protocol=2) + print ": %.5fs writing" % tmr.stop() + + return result + +def primes(first_prime, last_prime, verbose=False): + """Generates lists of prime numbers from first_prime to last_prime. + Used to test DiskMemoize""" + + #if prime < 2: return + primes = range(first_prime, last_prime) + if verbose: + print "I'm being verbose" + print "Prime range is",first_prime,"to",last_prime + last_div = int(math.sqrt(last_prime)+1) + for div in range(2, last_div): # (len(primes)/2.0)+1): + next_primes = [] + for x in primes: + if x % div != 0 or x == div: + #primes[x] = 0 + next_primes.append(x) + #primes.pop(i) + #pop_list.append(i) + primes = next_primes + if "BONUS" in globals(): + primes += [0] * BONUS + return primes * 10000 + +BONUS = 0 + +if __name__ == "__main__": + + print_checks = False + + print "Running DiskMemoize test suite." 
+ primes = DiskMemoize(primes, "primes", global_deps = ['BONUS']) + + prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True, volatile_keywords = {'verbose' : True}) + if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 20, last_prime = 100) + if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + print "skip_check = True" + prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True) + if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 20, last_prime = 100) + if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True) + if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + BONUS = 5 + print "BONUS:",BONUS + prime_list = primes(first_prime = 20, last_prime = 100, skip_cache = True) + if print_checks: print "primes:", prime_list, "\nLength:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 20, last_prime = 10000, skip_cache = True) + if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 20, last_prime = 10000) + if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 1, last_prime = 30000, skip_cache = True) + if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + prime_list = primes(first_prime = 1, last_prime = 30000) + if print_checks: print "Length:", len(prime_list), "\nHash:", hash(str(prime_list)), "\n" + + + + + +#INTJ???? +#INTP???? 
+#extreme P + +# cookies with oatmeal, coconut, chocolate chip + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# For functions taking mutable arguments, use the cPickle module, as +# in class MemoizeMutable: + +#class MemoizeMutable: +# """Memoize(fn) - an instance which acts like fn but memoizes its arguments +# Will work on functions with mutable arguments (slower than Memoize) +# """ +# def __init__(self, fn): +# self.fn = fn +# self.memo = {} +# def __call__(self, *args): +# import cPickle +# str = cPickle.dumps(args) +# if not self.memo.has_key(str): +# self.memo[str] = self.fn(*args) +# return self.memo[str] diff --git a/python/pushback_file.py b/python/pushback_file.py new file mode 100644 index 000000000..7b1914265 --- /dev/null +++ b/python/pushback_file.py @@ -0,0 +1,17 @@ +class pushback_file(file): + """Opens a file using the standard file interface adding the ability +to pushback or unread some section of the file that was read from the file.""" + + def __init__(self, fname, mode='r', bufsize=0): + file.__init__(self, fname, mode, bufsize) + self.pushed_back = [] + + def next(self): + if len(self.pushed_back): + return self.pushed_back.pop() + else: + return file.next(self) + + def pushback(self, item): + """Put some item (bytes, line, etc.) 
back on the \"front\" of the file""" + self.pushed_back.append(item) diff --git a/python/qltout.py b/python/qltout.py new file mode 100755 index 000000000..4964d64cf --- /dev/null +++ b/python/qltout.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import string + +class align_block: + def __init__(self, indel_count, length, mismatches): + self.indel_count = int(indel_count) + self.length = int(length) + self.mismatches = int(mismatches) + def __str__(self): + return string.join( map( str, [self.indel_count, self.length, self.mismatches]), ' ') + +class qltout_record: + + def __init__(self, obj): + self.fields = [] + self.align_blocks = [] + if type(obj) == str: + self.setValuesFromString( obj ); + else: + raise TypeError("qltout_record did not recognize type: "+str(type(obj))) + + def setValuesFromString( self, line ): + formats = [str, int, int, int, int, int, int, int, int, int, int] + + split_line = line.split() + self.fields = map( lambda f, v: f(v), formats, split_line[:11] ) + + next_align_block = 11 + while next_align_block < len(split_line): + self.align_blocks.append( align_block(*split_line[next_align_block:next_align_block+3]) ) + next_align_block += 3 + #print self.__str__() + + def indelled_bases(self): + return sum([ab.indel_count for ab in self.align_blocks]) + + def __str__(self): + return string.join( map( str, self.fields ), ' ')+' '+string.join( map( str, self.align_blocks ), ' ') + + def id(self): return self.fields[1] + def contig(self): return self.fields[6] + + def offset(self): return self.fields[7] + def pos(self): return self.fields[7]+1 + + def offset_end(self): return self.fields[8] + def pos_end(self): return self.fields[8]+1 + + def linear_start(self): return self.fields[9] + def map_qual(self): return 100 # This only exists in maq files + # for ILT and Merlin, we assume the same confidence for all mappings + + +class qltout_file: + def __init__(self, filename): + self.filename = filename + self.fqlt = open(self.filename) + + def 
RecordGenerator(self): + + for line in self.fqlt: + #fields = line.rstrip("\n").split("\t") + if not line.startswith("QUERY"): + continue + yield qltout_record(line) #fields) + raise StopIteration + + def __iter__(self): + return self.RecordGenerator() + + diff --git a/python/samtooltest.sh b/python/samtooltest.sh new file mode 100755 index 000000000..0c23d9f16 --- /dev/null +++ b/python/samtooltest.sh @@ -0,0 +1,20 @@ +#!/bin/tcsh + +setenv refroot ~/work/refmut/ +setenv refname test2 +setenv ref "${refroot}/${refname}" +setenv cov 100 +setenv sites 1 +setenv x '' # '-x 6696745:6708910' + +# testing samtools +./SimulateReads.py --ref ${ref}.fasta --coverage ${cov} -n ${sites} -l 10 -t None ${x} + +# running samtools +samtools import ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.ref ${refname}__mut.None_10.bp_nsites.${sites}_cov.${cov}.sam samtest.bam + +samtools sort samtest.bam samtestSorted + +samtools index samtestSorted.bam + +samtools pileup -f ${ref}.fasta samtestSorted.bam diff --git a/python/semantic.cache b/python/semantic.cache new file mode 100644 index 000000000..9ec61ef45 --- /dev/null +++ b/python/semantic.cache @@ -0,0 +1,15 @@ +;; Object AlignerEvaluation/ +;; SEMANTICDB Tags save file +(semanticdb-project-database-file "AlignerEvaluation/" + :tables (list + (semanticdb-table "SimulateReads.py" + :major-mode 'python-mode + :tags '(("Bio" include nil (dependency-file none) [21 31]) ("optparse" include nil (dependency-file none) [32 65]) ("Bio" include nil (dependency-file none) [66 87]) ("Bio.Seq" include nil (dependency-file none) [88 111]) ("Bio.SeqRecord" include nil (dependency-file none) [112 147]) ("Bio.Alphabet" include nil (dependency-file none) [148 186]) ("os" include nil (dependency-file none) [187 196]) ("sys" include nil (dependency-file none) [197 207]) ("random" include nil (dependency-file none) [220 233]) ("string" include nil (dependency-file none) [234 247]) ("SAM" include nil (dependency-file none) [249 266]) 
("mutateReference" function (:arguments (("ref" variable nil (reparse-symbol function_parameters) [288 291]) ("mutSite" variable nil (reparse-symbol function_parameters) [293 300]) ("mutType" variable nil (reparse-symbol function_parameters) [302 309]) ("mutParams" variable nil (reparse-symbol function_parameters) [311 320]) ("nBasesToPad" variable nil (reparse-symbol function_parameters) [322 333]))) nil [268 1475]) ("sampleReadsFromAlignment" function (:arguments (("refSeq" variable nil (reparse-symbol function_parameters) [1517 1523]) ("mutSeq" variable nil (reparse-symbol function_parameters) [1525 1531]) ("alignStart" variable nil (reparse-symbol function_parameters) [1533 1543]) ("readLen" variable nil (reparse-symbol function_parameters) [1545 1552]) ("nReads" variable nil (reparse-symbol function_parameters) [1554 1560]))) nil [1488 2391]) ("fakeQuals" function (:arguments (("seq" variable nil (reparse-symbol function_parameters) [2407 2410]))) nil [2392 2441]) ("alignedRead2SAM" function (:arguments (("readID" variable nil (reparse-symbol function_parameters) [2491 2497]) ("fastaID" variable nil (reparse-symbol function_parameters) [2499 2506]) ("read" variable nil (reparse-symbol function_parameters) [2508 2512]) ("pos" variable nil (reparse-symbol function_parameters) [2514 2517]) ("cigar" variable nil (reparse-symbol function_parameters) [2519 2524]))) nil [2470 2783]) ("readRef" function (:arguments (("referenceFasta" variable nil (reparse-symbol function_parameters) [2796 2810]))) nil [2784 3033]) ("OPTIONS" variable nil nil [3034 3048]) ("os.path" include nil (dependency-file none) [3050 3064]) ("outputFilename" function nil nil [3065 3481]) ("main" function nil nil [3482 8118]) ("main" code nil nil [8120 8126])) + :file "SimulateReads.py" + :pointmax 8129 + ) + ) + :file "semantic.cache" + :semantic-tag-version "2.0pre4" + :semanticdb-version "2.0pre4" + ) diff --git a/python/tgtc2sam.py b/python/tgtc2sam.py new file mode 100755 index 
000000000..009d992f8 --- /dev/null +++ b/python/tgtc2sam.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python + +"""Starts from TGTC list of fastb, qualb, and qltout files and creates merged SAM/BAM files""" + +import os, commands, sys +from farm_commands import cmd + +# Global list of failed conversions to be reported at end of execution +failed_list = [] + +class FailedConversion: + def __init__(self, lane, error_str): + self.lane = lane + self.error_str = error_str + print self + def __str__(self): + return self.error_str+" while attempting conversion of "+str(self.lane) + +class lane: + bam_ref_list = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.bam.ref_list"; + pipe_dirs_head = "/slxa/pipelineOutput/farm" + pipe_dirs = os.listdir(pipe_dirs_head) + pipe_dirs_dict = dict( [(dir.split("_")[1], dir) for dir in pipe_dirs if "_" in dir] ) + + def __init__(self, line): + fields = line.split(" ") + if len(fields) == 5: + self.fdl, self.fastb, self.qualb, self.qltout, group = fields + else: + self.fdl, self.fastb, self.qualb, self.qltout = fields + self.qltout = self.qltout.rstrip() + + self.fqltout = os.path.splitext(self.qltout)[0]+".fais_filtered.qltout" + self.flowcell, self.date, self.lane = self.fdl.split(".") + self.flowlane = self.flowcell+"."+self.lane + self.path_head = os.path.splitext(self.qltout)[0] + self.sam = self.path_head+".sam" + self.bam = self.path_head+".bam" + self.head = ".".join(os.path.basename(self.path_head).split(".")[:2]) # should amount to FLOWCELL.LANE + + #self.refdict_metrics() + + def pipe_dir(self): + dir = lane.pipe_dirs_dict[self.flowcell] + return lane.pipe_dirs_head+"/"+dir + + def set_default_refdict(self): + self.refdict = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict"; + #print "WARNING: Using standard Hg18 reference dictionary (refdict) for",self + + def set_default_metrics(self): + self.metrics = "/broad/1KG/legacy_data/tgtc2sam/unknown.metrics" + #print "WARNING: Using default 
unknown.metrics file for",self + + def refdict_metrics(self): + "Ensures that a reference dictionary and metrics are available" + try: + pipe_lane_dir = self.pipe_dir() + except KeyError: + self.set_default_refdict() + self.set_default_metrics() + return True + + self.metrics = pipe_lane_dir+"/"+str(self.flowlane)+".metrics" + if os.path.exists(self.metrics): + line = commands.getoutput("grep '^reference=' "+pipe_lane_dir+"/"+self.flowlane+".metrics") + if line != "": + reference_fasta = line.split("=")[1] + self.refdict = os.path.splitext(reference_fasta)[0]+'.dict' + if "PhiX" in self.refdict: + failed_list.append(FailedConversion(self, "PhiX dictionary missing")) + return False + else: + self.set_default_refdict() + return True # We were able to generate refdict and metrics files + else: + self.set_default_refdict() + self.set_default_metrics() + return True # We were able to generate refdict file and substituted unknown.metrics as the metrics file + + #failed_list.append(FailedConversion(self, "Could not find metrics file")) + #return False + + def __str__(self): + return self.fdl + +def usage(): + print "Usage: tgtc_sam.py ..." 
+ print + print " Works from tgtc files which contain fastb, qualb, qltout filenames corresponding to one lane of data" + + print " Commands and command arguments:" + print " convert tgtc_list - given a tgtc file converts files to SAM and BAM" + print " tgtc - file with lines of fastb, qualb, qltout filenames" + print " [clean] - optional 3rd argument that will create SAM/BAM files" + print " regardless of whether they already exist" + print + print " merge tgtc output_file - merges converted BAM files" + print " tgtc - file with lines of fastb, qualb, qltout filenames" + print " output_file - path and name of merged BAM file" + print + print " lists list_of_tgtc_lists - run over a list of tgtc lists given in a file" + print " list_of_tgtc_files - file with tgtc filenames in it" + print + print " Global options:" + print " print_only: do not run commands, just print what they would be " + sys.exit(1) + +def make_lane_list(tgtc_file): + all_lanes = [lane(l) for l in open(tgtc_file) if not l.startswith("#")] + lanes = [] + for ln in all_lanes: + if ln.refdict_metrics(): # if we were able to generate refdict and metrics + # append this lane to the lanes list that we will work with + lanes.append(ln) + unpaired = [l for l in lanes if ".end1." not in l.qualb and ".end2." not in l.qualb] + paired1 = [l for l in lanes if ".end1." in l.qualb] + paired2 = [l for l in lanes if ".end2." 
in l.qualb] + + print "Unpaired lanes:",len(unpaired) + print "Paired 1 lanes:",len(paired1) + print "Paired 2 lanes:",len(paired2) + assert len(paired1) == len(paired2) + + return [(u, None) for u in unpaired] + zip(paired1, paired2) + +def convert_lanes(tgtc_file, recreate_all=False, just_print_commands=False): + lanes_done = 0 + lane_list = make_lane_list(tgtc_file) + print "Beginning processing of these lanes (unpaired and paired):" + print "\n".join(["%s %s" % (p1,p2) for p1,p2 in lane_list]) + + for p1, p2 in lane_list: + if recreate_all or not os.path.exists(p1.sam) or not os.path.getsize(p1.sam): + + # Filer reads with > 4 errors + cmd_str = "\"FilterAlignsILTStyle QLTIN="+p1.qltout+" QLTOUT="+p1.fqltout + + if p2 == None: + # This is an UNPAIRED LANE + print "Processing unpaired lane",p1 + cmd_str += (" && Qltout2SAM"+ + " HEAD="+p1.head+ + " SAM="+p1.sam+ + " QLTOUT="+p1.fqltout+ + " FASTB="+p1.fastb+ + " QUALB="+p1.qualb+ + " REFDICT="+p1.refdict+ + " METRICS="+p1.metrics+ + " ALIGNER=PairFinder"+ + " TMPDIR=/broad/hptmp/andrewk") + else: + # This is a PAIRED LANE + print "Processing paired lanes",p1,p2 + if p1.flowcell != p2.flowcell or p1.lane != p2.lane: + print "### WARNING ### paired lanes ",p1,p2,"do not match! Skipping." 
+ continue + # Filter 2nd Qltout file too + cmd_str += " && FilterAlignsILTStyle QLTIN="+p2.qltout+" QLTOUT="+p2.fqltout + cmd_str += (" && QltoutPair2SAM"+ + " HEAD="+p1.head+ + " SAM="+p1.sam+ + " QLTOUT1="+p1.fqltout+" QLTOUT2="+p2.fqltout+ + " FASTB1="+p1.fastb+" FASTB2="+p2.fastb+ + " QUALB1="+p1.qualb+" QUALB2="+p2.qualb+ + " REFDICT="+p1.refdict+ + " METRICS="+p1.metrics+ + " ALIGNER=PairFinder"+ + " TMPDIR=/broad/hptmp/andrewk") + + # Make the BAM file + cmd_str += " && /seq/dirseq/samtools/current/samtools import "+lane.bam_ref_list+" "+p1.sam+" "+p1.bam+"\"" + # Now remove one or both SAM files if all previous commands returned 0 for success + cmd_str += " && rm -f "+p1.sam + if p2 != None: cmd_str += " && rm -f "+p2.sam + + status = cmd(cmd_str, "long", output_head=p1.path_head, just_print_commands=just_print_commands) + if status == 0: + lanes_done += 1 + else: + failed_list.append("Unkown failure") + #if lanes_done + len(failed_list) > 0: + #break + + failed = len(failed_list) + lanes_total = lanes_done+failed + percent_done = lanes_done / lanes_total if lanes_done != 0 else 0 + print "Lane conversion success: %d/%d (%.1f%%)" % (lanes_done, lanes_total, percent_done) + +def merge_bams(tgtc_file, outfile, just_print_commands=False): + input_lane_list = make_lane_list(tgtc_file) + #lane_list = [lp for lp in lane_list if lp[0].flowcell =="30LLT"] + missing_files = False + lane_list = [] + for p1,p2 in input_lane_list: + if "adapted" not in p1.qualb or (p2 != None and "adapted" not in p2.qualb): + print "NOT ADAPTED", p1.qualb, p2.qualb + if not os.path.exists(p1.bam): + #print "Missing file:",p1.bam + missing_files = True + else: + lane_list.append( (p1,p2) ) + + #return + if not missing_files: + if not os.path.exists(outfile): + print len(lane_list),"lanes / lane pairs to merge" + if not os.path.exists(os.path.dirname(outfile)): + os.makedirs(os.path.dirname(outfile)) + + cmd("java -cp /home/radon01/depristo/dev/workspace/Libraries/bin"+ + " 
edu.mit.broad.picard.sam.MergeSamFiles"+ + " "+" ".join(["I="+p1.bam for p1,p2 in lane_list])+ + " O="+outfile+ + " TMP_DIR=/broad/hptmp/andrewk/"+ + " && /seq/dirseq/samtools/current/samtools index "+outfile+ + " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100", + "long", outfile, just_print_commands=just_print_commands) + + # Original samtools merge command below that was used for rev. 1 merging + + #cmd("/seq/dirseq/samtools/current/samtools merge "+outfile+ + # " "+" ".join([p1.bam for p1,p2 in lane_list])+ + # " && /seq/dirseq/samtools/current/samtools index "+outfile+ + # " && /xchip/tcga/gbm/visualization/sam/IGVAlignmentProcessor/bin/run_linux-64.sh "+outfile+" "+os.path.dirname(outfile)+" hg18 -w 100", + # "long", outfile) + + else: + # Convert the lanes that had missing outfiles + #convert_lanes(tgtc_file) + pass + +def process_tgtc_lists(tgtc_list_file): + "Works through a list of tgtc files and merges them to produce the output" + lists_done = 0 + + for line in open(tgtc_list_file): + if line[0] == '#': + continue + fields = line.split() + if len(fields) != 2: + continue + head, tgtc_list = fields + bam = head+".bam" + if True: #not os.path.exists(bam): + print "Processing tgtc_list_file",tgtc_list,"with goal of producing",bam + merge_bams(tgtc_list, bam) + print + else: + print bam,"already exists\n" + + lists_done += 1 + #if lists_done > 0: + # break + +if __name__ == "__main__": + if len(sys.argv) < 2: + usage() + operation = sys.argv[1] + + # Check for print commands only option + if "print_only" in sys.argv: + just_print_commands = True + sys.argv.remove("print_only") + else: + just_print_commands = False + + if operation == "convert": + if len(sys.argv) < 3: + usage() + tgtc_file = sys.argv[2] + # Run convert_lanes and recreate_all == True if clean option was given as 3rd arg + convert_lanes(tgtc_file, len(sys.argv) > 3 and sys.argv[3] == "clean", 
just_print_commands=just_print_commands) + elif operation == "merge": + if len(sys.argv) < 4: + usage() + tgtc_file = sys.argv[2] + outfile = sys.argv[3] + merge_bams(tgtc_file, outfile, just_print_commands=just_print_commands) + elif operation == "lists": + tgtc_list_file = "/broad/1KG/legacy_data/tgtc2sam/tgtc_lists_2_legacy_bams" + if len(sys.argv) >= 3: + tgtc_list_file = sys.argv[2] + process_tgtc_lists(tgtc_list_file) + else: + print "Didn't understand operation value: "+operation + usage()