gatk-3.8/python/ucscRepeatMaskToIntervalLis...

147 lines
5.9 KiB
Python
Executable File

import farm_commands
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import faiReader
import math
import shutil
IS_OFFSET = True
CHR_OFFSET = 5
START_OFFSET = 6
END_OFFSET = 7
TYPE_OFFSET = 11
class RepeatInfo:
def __init__(self, name):
self.name = name
self.count = 0
self.coveredBases = 0
def badChr(excludes, chr):
if excludes == None:
return False
for exclude in excludes:
if chr.find(exclude) != -1:
return True
return False
def main():
global OPTIONS
usage = "usage: %prog stage [options]"
parser = OptionParser(usage=usage)
# parser.add_option("-q", "--farm", dest="farmQueue",
# type="string", default=None,
# help="Farm queue to send processing jobs to")
parser.add_option("", "--header", dest="header",
type='string', default=None,
help="interval_list file")
parser.add_option("-o", "--output", dest="output",
type='string', default=None,
help="output interval_list filename")
parser.add_option("-r", "--ref", dest="ref",
type='string', default=None,
help="reference name -- either hg18 or b36")
parser.add_option("-x", "--exclude", dest="excludes",
action="append", type='string',
help="If provided, only run pipeline for this sample")
parser.add_option("-m", "--maxRecords", dest="maxRecords",
type='int', default=None,
help="If provided, max. number of records to process")
parser.add_option("-s", "--skip", dest="skip",
type='int', default=None,
help="If provided, only process every skip records")
parser.add_option("", "--excludeChr", dest="excludeChrs",
action="append", type='string',
help="If provided, don't include chr matching this string")
(OPTIONS, args) = parser.parse_args()
if len(args) != 1:
parser.error("incorrect number of arguments")
repeatFile = args[0]
# open the output file
out = open(OPTIONS.output, 'w')
# write out the header
for line in open(OPTIONS.header):
out.write(line)
info = dict()
nRecords = 1
i = 1
for line in open(repeatFile):
if OPTIONS.maxRecords != None and nRecords > OPTIONS.maxRecords:
break
if len(line) == 0 or line[0] == '#':
continue
parts = line.split()
type = parts[TYPE_OFFSET]
start = int(parts[START_OFFSET]) + 1
end = int(parts[END_OFFSET]) + 1
chr = parts[CHR_OFFSET]
if OPTIONS.ref == 'b36':
chr = chr.replace('chrM', 'chrMT')
chr = chr.replace('chr', '')
if (OPTIONS.excludes == None or type not in OPTIONS.excludes) and not badChr(OPTIONS.excludeChrs, chr):
name = 'repeat_' + str(i) + '_' + type
strand = '+'
if OPTIONS.skip == None or i % OPTIONS.skip == 0:
print >> out, '\t'.join(map(str, [chr, start, end, strand, name]))
nRecords += 1
i += 1
if type not in info:
info[type] = RepeatInfo(type)
typeInfo = info[type]
typeInfo.count += 1
typeInfo.coveredBases += end - start
out.close()
for typeInfo in info.values():
print "%20s\t%20d\t%20d" % ( typeInfo.name, typeInfo.count, typeInfo.coveredBases )
if __name__ == "__main__":
main()
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A
# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered
# if ( $1 == 3.1 ) then
# cat ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf | cut -f 1-10 > ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only.filtered.vcf
# endif
#
# if ( $1 == 4 ) then
# foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only )
# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf
# cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > ${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf
# end
# endif
#
# if ( $1 == 5 ) then
# mkdir /humgen/gsa-scr1/pub/1000Pilot2_010710
#
# foreach file ( NA12878.allTechs.mmq10_mbq10_q200.filtered.vcf NA12878.SLX.mmq10_mbq10_q50.filtered.vcf NA12891.calls.mmq10_mbq10_q50.filtered.vcf NA12892.calls.mmq10_mbq10_q50.filtered.vcf ceu.trio.calls.allTechs.mmq10_mbq10_q200.filtered.vcf )
# echo $file
# cp $file /humgen/gsa-scr1/pub/1000Pilot2_010710
# end
#
# endif