From 7b3c34d21020aeb0d67328f60d0145d0caaf7236 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 31 Jan 2010 15:36:25 +0000 Subject: [PATCH] keeping a backup git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2750 348d0f76-0448-11de-a6fe-93d51630548a --- python/pilot2CallingPipeline.py | 274 ++++++++++++++++++++++++-------- 1 file changed, 210 insertions(+), 64 deletions(-) diff --git a/python/pilot2CallingPipeline.py b/python/pilot2CallingPipeline.py index 5ca83a5bc..6b02c62ad 100755 --- a/python/pilot2CallingPipeline.py +++ b/python/pilot2CallingPipeline.py @@ -9,6 +9,10 @@ import faiReader import math import shutil +GATK_STABLE = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta ' +GATK_DEV = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta ' +GATK = GATK_STABLE + class CallTarget: def __init__(self, name, hetero, minQ = 50, depth = 120, truthGFFName = None, variantEvalArgs = '', otherVCF = None): self.name = name @@ -24,6 +28,8 @@ class CallTarget: self.unionVCF = None if otherVCF != None: self.unionVCF = self.name + '.gatk.glftrio.union.filtered.vcf' + self.listFile = None + self.releaseVCF = None CEU_HET = 0.79e-3 YRI_HET = 1.0e-3 @@ -36,11 +42,14 @@ targets = [ CallTarget('NA19239', YRI_HET), CallTarget('NA19240', YRI_HET), - CallTarget('NA19240.alltechs', YRI_HET, 50, 120, 'NA19240'), - CallTarget('NA12878.alltechs', CEU_HET, 50, 120, 'NA12878'), - CallTarget('ceu.trio', CEU_HET, 50, 360, 'NA12878', '--sampleName NA12878', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf'), CallTarget('yri.trio', YRI_HET, 50, 360, 'NA19240', '--sampleName NA19240', '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/YRI_1kg_pilot2.vcf'), + +# CallTarget('NA19240.alltechs.solid_original', YRI_HET, 50, 120, 'NA19240'), +# CallTarget('NA12878.alltechs.solid_original', CEU_HET, 50, 120, 'NA12878'), + + CallTarget('NA19240.alltechs.solid_recal', YRI_HET, 50, 120, 'NA19240'), + CallTarget('NA12878.alltechs.solid_recal', CEU_HET, 50, 120, 'NA12878'), ] sets = ['Intersection', 'filteredInBoth', 'gatk', 'gatk-filteredInOther', 'glftrio', 'glftrio-filteredInOther', 'Intersection', ['gatk-unique', 'gatk.*'], ['glftrio-unique', 'glftrio.*']] @@ -55,6 +64,12 @@ def main(): parser.add_option("", "--dry", dest="dry", action='store_true', default=False, help="If provided, nothing actually gets run, just a dry run") + parser.add_option("-s", "--sample", dest="useSample", + type='string', default=None, + help="If provided, only run pipeline for this sample") + parser.add_option("-c", "--minQ", dest="minQ", + type='float', default=None, + help="If provided, will actually use this Q threshold for calls") parser.add_option("-d", "--dir", dest="dir", type='string', default="", help="If provided, this is the root where files are read and written") @@ -66,100 +81,231 @@ def main(): if len(args) != 1: parser.error("incorrect number of arguments") - stage = args[0] + stages = args[0].split(",") + allJobs = [] for callTarget in targets: - target = callTarget.name - listFile = os.path.join("lists", target + ".list") - unfilteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.vcf') - filteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.filtered.vcf') - tmpdir = os.path.join(OPTIONS.dir, "intermediates", unfilteredVCF + ".scatter") - if not os.path.exists(tmpdir): - os.makedirs(tmpdir) + if callTarget.name == OPTIONS.useSample or OPTIONS.useSample == None: + lastJobs = None + + if OPTIONS.minQ != None: + callTarget.minQ = OPTIONS.minQ + + for stage in stages: + print 'STAGE', stage + target = callTarget.name + callTarget.listFile = os.path.join("lists", target + ".list") + unfilteredVCFBaseName = target + '.gatk.ug.vcf' + unfilteredVCF = os.path.join(OPTIONS.dir, unfilteredVCFBaseName) + filteredVCF = os.path.join(OPTIONS.dir, target + '.gatk.ug.filtered.vcf') + + tmpdir = os.path.join(OPTIONS.dir, "intermediates", unfilteredVCFBaseName + ".scatter") + if not os.path.exists(tmpdir): + os.makedirs(tmpdir) + + if callTarget.unionVCF != None: + callTarget.releaseVCF = os.path.join(OPTIONS.dir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf") + + print 'Heading into stage', stage, lastJobs + + newJobs = [] + if stage == 'CALL': + newJobs = callSNPs(target, callTarget, callTarget.listFile, tmpdir) + + if stage == 'MERGE': + newJobs = mergeSNPs(target, lastJobs, unfilteredVCF, tmpdir) + + if stage == 'FILTER': + newJobs = filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF) + + if stage == 'UNION': + newJobs = unionSNPs(callTarget, lastJobs, filteredVCF ) + + if stage == 'SELECT_INTERSECT': + if ( callTarget.unionVCF != None ): + # we are one fo the release targets + newJobs = finalize1KGRelease(callTarget, lastJobs, os.path.join(OPTIONS.dir, callTarget.unionVCF ), callTarget.releaseVCF) + + if stage == 'EVAL': + newJobs = evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF) + + if stage == 'RELEASE': + dir = '/humgen/gsa-scr1/pub/1000GenomesPilot2' + subdir = '1000GenomesPilot2SNPs_GATK_glftrio_' + date.today().strftime("%m%d%y") + releaseSNPs(os.path.join(dir, subdir), callTarget, filteredVCF ) + + if stage == 'CLEAN': + shutil.rmtree("intermediates", True) + for file in [filteredVCF, unfilteredVCF]: + if os.path.exists(file): os.remove(file) + + print 'New jobs' + for job in newJobs: + print ' ', job + allJobs.append(newJobs) + if newJobs != []: + lastJobs = newJobs - if stage == 'CALL': - callSNPs(target, callTarget, listFile, tmpdir) + print 'EXECUTING JOBS' + farm_commands.executeJobs(allJobs, farm_queue = OPTIONS.farmQueue, just_print_commands = OPTIONS.dry) - if stage == 'MERGE': - mergeSNPs(target, unfilteredVCF, tmpdir) +hg18 = ['chr' + str(i) for i in range(1,23)] + ['chrX'] +b36 = [str(i) for i in range(1,23)] + ['X'] - if stage == 'FILTER': - filterSNPs(target, callTarget.depth, unfilteredVCF, filteredVCF) +def autosomePlusX(name): + return name in b36 or name in hg18 - if stage == 'UNION': - unionSNPs(callTarget, filteredVCF ) - - if stage == 'RELEASE': - dir = '/humgen/gsa-scr1/pub/1000GenomesPilot2' - subdir = '1000GenomesPilot2SNPs_GATK_glftrio_' + date.today().strftime("%m%d%y") - releaseSNPs(os.path.join(dir, subdir), callTarget, filteredVCF ) - - if stage == 'CLEAN': - shutil.rmtree("intermediates", True) - for file in [filteredVCF, unfilteredVCF]: - if os.path.exists(file): os.remove(file) - - if stage == 'EVAL': - evalSNPs(callTarget, unfilteredVCF, filteredVCF) - -GATK_STABLE = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /broad/1KG/reference/human_b36_both.fasta ' -GATK_DEV = 'java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /broad/1KG/reference/human_b36_both.fasta ' -GATK = GATK_DEV +def partition(faiRecords, maybeChrom, n): +# 1 247249719 3 60 61 +# 2 242951149 251370554 60 61 + chromAndSize = [[x[0], int(x[1])] for x in faiRecords if autosomePlusX(x[0])] + #print chromAndSize + if maybeChrom <> None: + chromAndSize = filter(lambda x: x[0] == maybeChrom, chromAndSize) + + def r(chrom, chrStart, chrEnd): + outputVCF = 'calls.chr%s.good.%s.vcf' % (chrom, chrStart) + L = '-L %s:%d-%d' % (chrom, chrStart, chrEnd) + return outputVCF, chrom, chrStart, chrEnd, L + + for chrom, size in chromAndSize: + #print 'SIZE', chrom, size + if n == 1: + yield r(chrom, 1, size) + else: + idealBp = float(size-1) / n + chrEnd = None + chrStart = 1 + for i in range(1, n): + chrEnd = 1 + int(round(idealBp * (i + 1))) + #print chrom, chrStart, chrEnd, idealBp + result = r(chrom, chrStart, chrEnd) + chrStart = chrEnd + 1 + yield result + if chrEnd <> size: + raise Exception('X') def callSNPs(target, callTarget, listFile, tmpdir): - cmd = "python ./runmeCalls.py -q %s -N 5 -d %s -I %s /broad/1KG/reference/human_b36_both.fasta.fai -e \"-mmq 10 -mbq 10 -pl SOLID --heterozygosity %e\" -Q %s" % (OPTIONS.farmQueue, tmpdir, listFile, callTarget.hetero, callTarget.minQ) - print 'Enqueuing job for', target, callTarget, listFile, tmpdir - jobid = farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) + extras = "-mmq 10 -mbq 10 -pl SOLID --heterozygosity %e" % (callTarget.hetero) + fai = "/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai" + NWaysParallel = 5 -def mergeSNPs(target, snpFile, tmpdir): - cmd = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /broad/1KG/reference/human_b36_both.fasta.fai %s/*.vcf > %s" % (tmpdir, snpFile) - jobid = farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) + bins = partition(faiReader.readFAI(fai), None, NWaysParallel) + jobs = list() + for outputVCF, chrom, chrStart, chrEnd, L in bins: + outputVCF = os.path.join(tmpdir, outputVCF) + #print outputVCF, L, os.path.exists(outputVCF) + #if not os.path.exists(outputVCF): + print 'Enqueuing job for', outputVCF, L + cmd = 'java -Xmx2048m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar ' + \ + '-T UnifiedGenotyper -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -mrl 500000 ' + \ + '-I %s -confidence %d %s -varout %s -vf VCF -L %s -gm JOINT_ESTIMATE' % (listFile, callTarget.minQ, extras, outputVCF, L) + #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) + jobs.append(farm_commands.FarmJob(cmd, jobName = "CALL_%s_c%s_s%s" % (target, str(chrom), str(chrStart)))) -def filterSNPs(target, depth, unfilteredVCF, filteredVCF): + return jobs + +def mergeSNPs(target, lastJobs, snpFile, tmpdir): + #print 'LastJobs = ', lastJobs + cmd = "python ~/dev/GenomeAnalysisTK/trunk/python/mergeVCFs.py -a -f /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta.fai %s/*.vcf > %s" % (tmpdir, snpFile) + return [farm_commands.FarmJob(cmd, jobName = "MERGE_%s" % (target), dependencies = lastJobs)] + +def filterSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF): + target = callTarget.name #expression = "AB > 0.75 || DP > %s" % depth - expression = "AB > 0.75 || DP > %s || MQ0 > 40 || SB > -0.10" % depth - cmd = GATK + '-T VariantFiltration -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B variant,VCF,%s --clusterWindowSize 10 -o %s --filterExpression "%s"' % (unfilteredVCF, filteredVCF, expression) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) + expression1 = ['GATK_STANDARD', "AB > 0.75 || DP > %s || MQ0 > 40 || SB > -0.10" % callTarget.depth] + expression2 = ['HARD_TO_VALIDATE', "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)"] + cmd = GATK + '-T VariantFiltration -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B variant,VCF,%s --clusterWindowSize 10 -o %s ' % (unfilteredVCF, filteredVCF) + + for name, exp in [expression1, expression2]: + cmd += '--filterName %s --filterExpression "%s" ' % ( name, exp ) + + #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) + return [farm_commands.FarmJob(cmd, jobName = "FILTER_%s" % (target), dependencies = lastJobs)] -def evalSNPs(callTarget, unfilteredVCF, filteredVCF): - def eval1(vcf, namePostfix = "", args = ""): +def evalSNPs(callTarget, lastJobs, unfilteredVCF, filteredVCF): + evalRoot = os.path.join(OPTIONS.dir, "eval") + if not os.path.exists(evalRoot): + os.makedirs(evalRoot) + + def eval1(vcfFullPath, namePostfix = "", args = ""): + vcf = os.path.split(vcfFullPath)[1] out = os.path.join(OPTIONS.dir, "eval", vcf + namePostfix + ".eval") - cmd = GATK + "-T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B 1kg_ceu,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/CEU.2and3_way.annotated.vcf -B 1kg_yri,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/YRI.2and3_way.annotated.vcf -B eval,VCF,%s -G -A -o %s -L %s" % ( vcf, out, '\;'.join(map(str, range(1,23))) ) + cmd = GATK + "-T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B 1kg_ceu,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/CEU.2and3_way.annotated.vcf -B 1kg_yri,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot1/SNPCalls/Joint/RC1/YRI.2and3_way.annotated.vcf -B eval,VCF,%s -G -A -o %s -L %s" % ( vcfFullPath, out, '\;'.join(map(str, b36)) ) hapmap3 = os.path.join("../../hapmap3Genotypes", callTarget.truthGFFName + ".b36.gff") if os.path.exists(hapmap3): cmd += " -B hapmap-chip,GFF,%s %s %s" % (hapmap3, callTarget.variantEvalArgs, args) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) - eval1(filteredVCF) - eval1(unfilteredVCF) + #jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) + return farm_commands.FarmJob(cmd, jobName = "EVAL_%s_%s" % (callTarget.name, namePostfix), dependencies = lastJobs) + + jobs = [] + jobs.append(eval1(filteredVCF)) + jobs.append(eval1(unfilteredVCF)) if callTarget.unionVCF != None: - eval1(callTarget.unionVCF, ".union") + jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), ".union")) for set in sets: if type(set) == list: name, selector = set else: name, selector = set, set - eval1(callTarget.unionVCF, "." + name, " -vcfInfoSelector set=\"" + selector + "\"") + jobs.append(eval1(os.path.join(OPTIONS.dir, callTarget.unionVCF), "." + name, " -vcfInfoSelector set=\"" + selector + "\"")) -def unionSNPs(target, filteredVCF ): - if target.otherVCF != None: - cmd = GATK + "-T VCFCombine -B GATK,VCF,%s -B glfTrio,VCF,%s -O %s -type UNION -priority GATK,glfTrio -A" % ( filteredVCF, target.otherVCF, target.unionVCF ) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) + if callTarget.releaseVCF != None: + jobs.append(eval1(callTarget.releaseVCF, ".release")) + + + return jobs + +def unionSNPs(callTarget, lastJobs, filteredVCF ): + if callTarget.otherVCF != None: + cmd = GATK + "-T VCFCombine -B GATK,VCF,%s -B glfTrio,VCF,%s -O %s -type UNION -priority GATK,glfTrio -A" % ( filteredVCF, callTarget.otherVCF, os.path.join(OPTIONS.dir, callTarget.unionVCF) ) + return [farm_commands.FarmJob(cmd, jobName = "UNION_%s" % (callTarget.name), dependencies = lastJobs)] + else: + return [] +# jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, None, just_print_commands = OPTIONS.dry) + +def finalize1KGRelease(callTarget, lastJobs, inputVCF, outputVCF): + commands = [] + lastJob = lastJobs + +# subdir = os.path.join(OPTIONS.dir, "1kg_release") +# if not os.path.exists(subdir): +# os.makedirs(subdir) + + # first, filter out the intersection + cmd = GATK + '-T VCFSelect -B variant,VCF,%s -o %s -match "(set eq \'Intersection\' || set eq \'filteredInBoth\')" ' % (inputVCF, outputVCF) + lastJob = farm_commands.FarmJob(cmd, jobName = "SELECT_%s" % (callTarget.name), dependencies = lastJob) + commands.append(lastJob) + + # call variant annotator to annotate all of the calls + # annotatedVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.vcf") +# cmd = GATK + '-T VariantAnnotator -I %s -standard -B variant,VCF,%s -vcf %s -L 1:1-1,000,000 ' % (callTarget.listFile, intersectVCF, annotatedVCF) +# lastJob = farm_commands.FarmJob(cmd, jobName = "ANNOTATE_%s" % (callTarget.name), dependencies = lastJob) +# commands.append(lastJob) +# +# # filter the calls +# filteredVCF = os.path.join(subdir, callTarget.name + ".gatk_glftrio.intersection.annotated.filtered.vcf") +# commands = commands + filterSNPs(callTarget, lastJob, annotatedVCF, filteredVCF) +# + return commands def releaseSNPs(dir, callTarget, filteredVCF ): if not os.path.exists(dir): os.makedirs(dir) - print dir, filteredVCF - #shutil.copy(filteredVCF, dir) + print 'Copying files into ', dir, filteredVCF, callTarget.unionVCF, callTarget.releaseVCF + if not OPTIONS.dry: shutil.copy(filteredVCF, dir) if callTarget.unionVCF != None: - shutil.copy(callTarget.unionVCF, dir) + if not OPTIONS.dry: shutil.copy(os.path.join(OPTIONS.dir, callTarget.unionVCF), dir) + if callTarget.releaseVCF != None: + if not OPTIONS.dry: shutil.copy(callTarget.releaseVCF, dir) if __name__ == "__main__": main() -# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /broad/1KG/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A -# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /broad/1KG/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered +# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -T VCFCombine -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,ceu.trio.gatk.ug.filtered.vcf -B glfTrio,VCF,/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls/CEU_1kg_pilot2.vcf -O test.vcf -type UNION -priority GATK,glfTrio -l INFO -A +# java -ea -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar -l INFO -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -T VariantEval -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod -B eval,VCF,test.vcf -B hapmap-chip,GFF,../../hapmap3Genotypes/NA12878.b36.gff --sampleName NA12878 -vcfInfoSelector set=gatk-filtered # if ( $1 == 3.1 ) then @@ -168,7 +314,7 @@ if __name__ == "__main__": # # if ( $1 == 4 ) then # foreach callset ( NA12878.allTechs.mmq10_mbq10_q200 ceu.trio.calls.allTechs.mmq10_mbq10_q200.NA12878only ) -# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /broad/1KG/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO +# java -Xmx4096m -jar /home/radon01/depristo/dev/GenomeAnalysisTKStable/trunk/dist/GenomeAnalysisTK.jar -T CallsetConcordance -R /humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta -B GATK,VCF,$callset.filtered.vcf -B glfTrio,VCF,CEU_1kg_pilot2.na12878.vcf -CT SimpleVenn -CO ${callset}_v_CEU_1kg_pilot2.filtered.vcf -l INFO # cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset2_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.CEU_1kg_pilot2Unique.vcf # cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "callset1_only"' > ${callset}_v_CEU_1kg_pilot2.filtered.${callset}Unique.vcf # cat ${callset}_v_CEU_1kg_pilot2.filtered.vcf | awk '$1 ~ "#" || $8 ~ "concordant"' > ${callset}_v_CEU_1kg_pilot2.filtered.concordant.vcf