diff --git a/perl/reports/1KG/AssembleSlides.pl b/perl/reports/1KG/AssembleSlides.pl new file mode 100755 index 000000000..d33d7b4a6 --- /dev/null +++ b/perl/reports/1KG/AssembleSlides.pl @@ -0,0 +1,16 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; + +my %args = &getCommandArguments("DIR" => undef, "OUT" => undef); + +my $outfile = $args{'OUT'}; +chomp(my @files = qx(find $args{'DIR'} -path \*slide\*.pdf | sort -n)); + +print join("\n", @files) . "\n"; + +my $cmd = "gs -q -sPAPERSIZE=letter -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dAutoRotatePages=/All -sOutputFile=$outfile " . join(' ', @files); + +system($cmd); diff --git a/perl/reports/1KG/GenerateDataProcessingReport.pl b/perl/reports/1KG/GenerateDataProcessingReport.pl new file mode 100755 index 000000000..4e837935a --- /dev/null +++ b/perl/reports/1KG/GenerateDataProcessingReport.pl @@ -0,0 +1,64 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; +use DistributedMake; +use DataTable; +use Data::Dumper; + +my %args = &getCommandArguments( + "OUT_DIR" => undef, + "SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index", + "ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index" +); + +my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t"); +my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t"); + +my %results; + +foreach my $sequenceEntry (@sequenceIndex) { + my %sequenceEntry = %{$sequenceEntry}; + + if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) { + ${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; + + ${$results{'all'}}{'population'} = "all"; + ${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'}; + + ${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); + ${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); + + ${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); + ${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); + + if ($sequenceEntry{'WITHDRAWN'} == 0) { + ${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; + ${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; + } else { + ${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; + } + + ${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; + + ${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; + + ${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; + } +} + +foreach my $studyid (keys(%results)) { + my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}}; + my @samples = keys(%study); + my $numSamples = scalar(@samples); + + ${$results{$studyid}}{'num_samples'} = $numSamples; +} + +print Dumper(\%results); diff --git a/perl/reports/1KG/GenerateDataProcessingReport2.pl b/perl/reports/1KG/GenerateDataProcessingReport2.pl new file mode 100755 index 000000000..83943afe6 --- /dev/null +++ b/perl/reports/1KG/GenerateDataProcessingReport2.pl @@ -0,0 +1,94 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; +use DistributedMake; +use DataTable; +use Data::Dumper; +use File::Basename; + +my %args = &getCommandArguments( + "LIST" => undef, + "OUT_DIR" => undef, + + "REFERENCE" => "/broad/1KG/reference/human_b36_both.fasta", + "DBSNP" => "/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod", + + "SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index", + "ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index", + + "DRY_RUN" => 1, + "QUEUE" => "gsa", + "NUM_JOBS" => "", + "KEEP_GOING" => 1, +); + +my $dm = new DistributedMake("dryRun" => $args{'DRY_RUN'}, "queue" => $args{'QUEUE'}, "numJobs" => "", "keepGoing" => 1, "outputFile" => "$args{'OUT_DIR'}/GenerateDataProcessingReport2.log"); + +my @entries; + +open(LIST, $args{'LIST'}); +while (my $entry = ) { + chomp($entry); + + my %entry; + ($entry{'POP'}, $entry{'UG'}, $entry{'QCALL'}) = split(/\s+/, $entry); + + push(@entries, \%entry); +} +close(LIST); + +my @slides; + +foreach my $entry (@entries) { + my %entry = %{$entry}; + + # Combine VCFs + my $vcfCombineOut = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}.combined.vcf"; + my $vcfCombineOutCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VCFCombine -R $args{'REFERENCE'} -type UNION -B UG,VCF,$entry{'UG'} -B QCALL,VCF,$entry{'QCALL'} -A -priority 'UG,QCALL' -o $vcfCombineOut -l INFO -L 1"; + $dm->addRule($vcfCombineOut, [$entry{'UG'}, $entry{'QCALL'}], $vcfCombineOutCmd); + + # Evaluate variants + my $evalRoot = "$vcfCombineOut.eval/eval"; + my $evalCounts = "$evalRoot.Count_Variants.csv"; + my $evalCmd = "java -Xmx2048m -jar $ENV{'STING_DIR'}/dist/GenomeAnalysisTK.jar -T VariantEval -R $args{'REFERENCE'} -D $args{'DBSNP'} -B eval,VCF,$vcfCombineOut -select 'set==\"Intersection\"' -selectName Intersection -select 'set==\"UG-filteredInOther\"' -selectName InUG-FilteredInQCALL -select 'set==\"QCALL-filteredInOther\"' -selectName InQCALL-FilteredInUG -select 'set==\"UG\"' -selectName UG -select 'set==\"QCALL\"' -selectName QCALL -reportType R -reportLocation $evalRoot -l INFO -L 1"; + $dm->addRule($evalCounts, $vcfCombineOut, $evalCmd); + + # Get SNPs per sample + my $snpsPerSample = "$evalRoot.persample.table"; + my $snpsPerSampleCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/GetSNPsPerSample.pl VCF=$vcfCombineOut OUT=$snpsPerSample"; + $dm->addRule($snpsPerSample, $vcfCombineOut, $snpsPerSampleCmd); + + # Create topsheet slide + my $slideDir = "$args{'OUT_DIR'}/intermediate/$entry{'POP'}/slides"; + my $topSheetRoot = "$slideDir/$entry{'POP'}.000.topsheet"; + my $topSheetSlide = "$topSheetRoot.pdf"; + my $topSheetSlideCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeTopSheetSlide.pl VCF=$vcfCombineOut SEQUENCE_INDEX=$args{'SEQUENCE_INDEX'} ALIGNMENT_INDEX=$args{'ALIGNMENT_INDEX'} EVAL_ROOT=$evalRoot OUT=$topSheetRoot"; + #$dm->addRule($topSheetSlide, [$vcfCombineOut, $args{'SEQUENCE_INDEX'}, $args{'ALIGNMENT_INDEX'}, $evalCounts], $topSheetSlideCmd); + + # Create Venn diagram slide of variants + my $vennRoot = "$slideDir/$entry{'POP'}.010.venn"; + my $vennPdf = "$vennRoot.pdf"; + my $vennPdfCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeVennSlide.pl EVAL_ROOT=$evalRoot OUT_ROOT=$vennRoot POP=$entry{'POP'}"; + $dm->addRule($vennPdf, $evalCounts, $vennPdfCmd); + + # Create plots for other slides + my $plotRoot = "$evalRoot.plot"; + my $plotDummy = "$plotRoot.venn_per_ac.norm.pdf"; + my $plotCmd = "Rscript $ENV{'STING_DIR'}/perl/reports/1KG/MakeDataProcessingReportPlots.R $evalRoot $plotRoot $entry{'POP'}"; + $dm->addRule($plotDummy, [$evalCounts, $snpsPerSample], $plotCmd, 'queue' => ""); + + # Create other slides + my $dprSlideDummy = "$slideDir/$entry{'POP'}.050.eval.plot.titv.pdf"; + my $dprSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/MakeDPRSlides.pl PLOT_ROOT=$plotRoot OUT_DIR=$slideDir POP=$entry{'POP'}"; + $dm->addRule($dprSlideDummy, $plotDummy, $dprSlidesCmd); + + push(@slides, $dprSlideDummy); +} + +my $assembledSlides = "$args{'OUT_DIR'}/dpr.pdf"; +my $assembledSlidesCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/AssembleSlides.pl DIR=$args{'OUT_DIR'} OUT=$assembledSlides"; +$dm->addRule($assembledSlides, \@slides, $assembledSlidesCmd); + +$dm->execute(); diff --git a/perl/reports/1KG/GetSNPsPerSample.pl b/perl/reports/1KG/GetSNPsPerSample.pl new file mode 100755 index 000000000..8fa430649 --- /dev/null +++ b/perl/reports/1KG/GetSNPsPerSample.pl @@ -0,0 +1,67 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; + +my %args = &getCommandArguments("VCF" => undef, "OUT" => "/dev/stdout"); + +my %table; +my %sets; + +open(VCF, $args{'VCF'}); + +my @header; +while (my $line = ) { + chomp($line); + if ($line =~ /#/) { + if ($line =~ /CHROM/) { + $line =~ s/#//g; + @header = split(/\s+/, $line); + } + } else { + my @columns = split(/\s+/, $line); + + if ($columns[0] eq '1') { + my %entry; + for (my $i = 0; $i <= $#columns; $i++) { + $entry{$header[$i]} = $columns[$i]; + } + + my ($set) = $entry{'INFO'} =~ /set=(\w+)/; + $sets{$set} = 1; + + for (my $i = 9; $i <= $#header; $i++) { + if ($entry{$header[$i]} =~ /0[\\\|\/]1/ || + $entry{$header[$i]} =~ /1[\\\|\/]0/ || + $entry{$header[$i]} =~ /1[\\\|\/]1/) { + + ${$table{$header[$i]}}{$set}++; + $sets{$set} = 1; + } + } + } + } +} +close(VCF); + +open(OUT, ">$args{'OUT'}"); + +print OUT "sample\t" . join("\t", sort { $a cmp $b } keys(%sets)) . "\n"; + +foreach my $sample (sort { $a cmp $b } keys(%table)) { + print OUT "$sample"; + + foreach my $set (sort { $a cmp $b } keys(%sets)) { + my $value = 0; + if (exists(${$table{$sample}}{$set})) { + $value = ${$table{$sample}}{$set}; + } + + print OUT "\t$value"; + } + + print OUT "\n"; +} + +close(OUT); diff --git a/perl/reports/1KG/MakeDPRSlides.pl b/perl/reports/1KG/MakeDPRSlides.pl new file mode 100755 index 000000000..71bb0c472 --- /dev/null +++ b/perl/reports/1KG/MakeDPRSlides.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; +use File::Basename; +use DistributedMake; + +my %args = &getCommandArguments("PLOT_ROOT" => undef, "OUT_DIR" => undef, "POP" => undef); + +my $dm = new DistributedMake("dryRun" => 0, "numJobs" => "1"); + +my $timestamp = gmtime(); + +my @items = ( + { 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac", 'title' => 'Callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp }, + { 'plotprefix' => "$args{'PLOT_ROOT'}.venn_per_ac.norm", 'title' => 'Normalized callset concordance per allele count', 'population' => $args{'POP'}, 'caption' => "Concordance of the UnifiedGenotyper and QCALL callsets, per allele count, normalized by the number of variants in the callset union for that allele count bin. Bottom and top sections of each bar represent the calls unique to each caller's callset, while the middle portion represents the intersection.", 'timestamp' => $timestamp }, + { 'plotprefix' => "$args{'PLOT_ROOT'}.acs", 'title' => 'Allele count spectrum', 'population' => $args{'POP'}, 'caption' => 'Allele count spectrum for all known, novel, and all variants. Individual contributions from separate callers (thin, dotted or dashed lines) as well as the callset intersection (bold, solid lines) are shown.', 'timestamp' => $timestamp }, + { 'plotprefix' => "$args{'PLOT_ROOT'}.titv", 'title' => 'Ti/Tv per allele count', 'population' => $args{'POP'}, 'caption' => 'Ratio of transition mutations to transversions, per allele count. Whole-genome expectation is approximately 2.1. Fluctuation at higher allele counts is typically due to fewer available variants for the calculation at those frequencies.', 'timestamp' => $timestamp }, + { 'plotprefix' => "$args{'PLOT_ROOT'}.counts_per_sample", 'title' => 'SNP calls per sample', 'population' => $args{'POP'}, 'caption' => 'Number of SNPs per sample. As samples in plot are all from the same or similar populations, with presumably similar heterozygosity, all samples should have roughly consistent performance. Outliers may indicate problematic samples - perhaps in coverage, identity QC (sample mixup), or quality of sequencing.', 'timestamp' => $timestamp }, +); + +my $idnum = 1; + +foreach my $item (@items) { + $idnum++; + my $id = "$args{'POP'}." . sprintf("%03d", $idnum*10); + + my %item = %$item; + + my $basename = "$id." . basename($item{'plotprefix'}); + my $table = "$args{'OUT_DIR'}/$basename.kvp"; + + open(TABLE, ">$table"); + foreach my $key (sort { $a cmp $b } keys(%item)) { + print TABLE "$key\t$item{$key}\n"; + } + close(TABLE); + + my $render = "$args{'OUT_DIR'}/$basename.tex"; + my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$table TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/plot_and_text_template.tex TEMPLATE_OUT=$render"; + $dm->addRule($render, "$item{'plotprefix'}.pdf", $renderCmd); + + my $pdf = "$args{'OUT_DIR'}/$basename.pdf"; + my $pdfdir = dirname($pdf); + my $pdfCmd = "pdflatex -output-directory $pdfdir $render"; + $dm->addRule($pdf, $render, $pdfCmd); +} + +$dm->execute(); diff --git a/perl/reports/1KG/MakeDataProcessingReportPlots.R b/perl/reports/1KG/MakeDataProcessingReportPlots.R new file mode 100644 index 000000000..119b07958 --- /dev/null +++ b/perl/reports/1KG/MakeDataProcessingReportPlots.R @@ -0,0 +1,146 @@ +args = commandArgs(TRUE); + +evalRoot = args[1]; +if (is.na(evalRoot)) { evalRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval"; } + +plotRoot = args[2]; +if (is.na(plotRoot)) { plotRoot = "results/v2/intermediate/CEU.combined.vcf.eval/eval.plot"; } + +population = args[3]; +if (is.na(population)) { population = "unknown population"; } + +fileAlleleCountStats = paste(evalRoot, ".AlleleCountStats.csv", sep=""); +fileCompOverlap = paste(evalRoot, ".Comp_Overlap.csv", sep=""); +fileCountVariants = paste(evalRoot, ".Count_Variants.csv", sep=""); +fileGenotypeConcordance = paste(evalRoot, ".Genotype_Concordance.csv", sep=""); +fileMetricsByAc = paste(evalRoot, ".MetricsByAc.csv", sep=""); +fileQuality_Metrics_by_allele_count = paste(evalRoot, ".Quality_Metrics_by_allele_count.csv", sep=""); +fileQualityScoreHistogram = paste(evalRoot, ".QualityScoreHistogram.csv", sep=""); +fileSampleStatistics = paste(evalRoot, ".Sample_Statistics.csv", sep=""); +fileSampleSummaryStatistics = paste(evalRoot, ".Sample_Summary_Statistics.csv", sep=""); +fileTi_slash_Tv_Variant_Evaluator = paste(evalRoot, ".Ti_slash_Tv_Variant_Evaluator.csv", sep=""); +fileTiTvStats = paste(evalRoot, ".TiTvStats.csv", sep=""); +fileVariant_Quality_Score = paste(evalRoot, ".Variant_Quality_Score.csv", sep=""); +filePerSample = paste(evalRoot, ".persample.table", sep=""); + +# Allele count spectrum +if (!is.na(plotRoot)) { + pdf(paste(plotRoot, ".acs.pdf", sep="")); +} else { + x11(); +} + +dataMetricsByAc = read.csv(fileMetricsByAc, header=TRUE, comment.char="#"); + +dataMetricsByAc.none.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "none" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),]; + +dataMetricsByAc.int.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),]; +dataMetricsByAc.int.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),]; +dataMetricsByAc.int.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "Intersection" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),]; + +dataMetricsByAc.UG.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all; +dataMetricsByAc.UG.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known; +dataMetricsByAc.UG.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "UG" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel; + +dataMetricsByAc.QCALL.all = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "all"),] + dataMetricsByAc.int.all; +dataMetricsByAc.QCALL.known = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "known"),] + dataMetricsByAc.int.known; +dataMetricsByAc.QCALL.novel = dataMetricsByAc[which(dataMetricsByAc$jexl_expression == "QCALL" & dataMetricsByAc$filter_name == "called" & dataMetricsByAc$novelty_name == "novel"),] + dataMetricsByAc.int.novel; + + +plot(0, 0, type="n", xlab="Allele count", ylab="Number of variants", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(min(dataMetricsByAc.none.all$n), max(dataMetricsByAc.none.all$n)), main=paste("Allele count spectrum (", population, ")", sep=""), cex=1.2, cex.axis=1.2, cex.lab=1.2, bty="n"); + +points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$n, type="l", lwd=3, col="black"); +points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$n, type="l", lwd=3, col="blue"); +points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$n, type="l", lwd=3, col="red"); + +points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.UG.all$n, type="l", lwd=1, lty=2, col="black"); +points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.UG.known$n, type="l", lwd=1, lty=2, col="blue"); +points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.UG.novel$n, type="l", lwd=1, lty=2, col="red"); + +points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.QCALL.all$n, type="l", lwd=1, lty=3, col="black"); +points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.QCALL.known$n, type="l", lwd=1, lty=3, col="blue"); +points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.QCALL.novel$n, type="l", lwd=1, lty=3, col="red"); + +legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2); + +if (!is.na(plotRoot)) { dev.off(); } + +# Ti/Tv per allele count +if (!is.na(plotRoot)) { + pdf(paste(plotRoot, ".titv.pdf", sep="")); +} else { + x11(); +} + +plot(0, 0, type="n", xlab="Allele count", ylab="Transition/transversion ratio", xlim=c(0, max(dataMetricsByAc.none.all$AC)), ylim=c(0, max(dataMetricsByAc$Ti.Tv)), main=paste("Ti/Tv per allele count (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2, bty="n"); + +points(dataMetricsByAc.int.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=3, lty=1, col="black"); +points(dataMetricsByAc.int.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=3, lty=1, col="blue"); +points(dataMetricsByAc.int.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=3, lty=1, col="red"); + +points(dataMetricsByAc.UG.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=2, col="black"); +points(dataMetricsByAc.UG.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=2, col="blue"); +points(dataMetricsByAc.UG.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=2, col="red"); + +points(dataMetricsByAc.QCALL.all$AC, dataMetricsByAc.int.all$Ti.Tv, type="l", lwd=1, lty=3, col="black"); +points(dataMetricsByAc.QCALL.known$AC, dataMetricsByAc.int.known$Ti.Tv, type="l", lwd=1, lty=3, col="blue"); +points(dataMetricsByAc.QCALL.novel$AC, dataMetricsByAc.int.novel$Ti.Tv, type="l", lwd=1, lty=3, col="red"); + +legend("topright", c("Intersection: all", "Intersection: known (present in dbSNP 129)", "Intersection: novel (absent in dbSNP 129)", "UnifiedGenotyper: all", "UnifiedGenotyper: known (present in dbSNP 129)", "UnifiedGenotyper: novel (absent in dbSNP 129)", "QCALL: all", "QCALL: known (present in dbSNP 129)", "QCALL: novel (absent in dbSNP 129)"), lwd=c(3, 3, 3, 1, 1, 1, 1, 1, 1), lty=c(1, 1, 1, 2, 2, 2, 3, 3, 3), col=c("black", "blue", "red"), cex=1.2); + +if (!is.na(plotRoot)) { dev.off(); } + +# Barplot (unnormalized) +if (!is.na(plotRoot)) { + pdf(paste(plotRoot, ".venn_per_ac.pdf", sep="")); +} else { + x11(); +} + +norm = dataMetricsByAc.UG.all$n + dataMetricsByAc.int.all$n + dataMetricsByAc.QCALL.all$n; +mat = rbind(dataMetricsByAc.QCALL.all$n, dataMetricsByAc.int.all$n, dataMetricsByAc.UG.all$n); +matnorm = rbind(dataMetricsByAc.QCALL.all$n/norm, dataMetricsByAc.int.all$n/norm, dataMetricsByAc.UG.all$n/norm); + +qcallcolor = "#FF5555"; +intcolor = "#5582C6"; +ugcolor = "#55BBFF"; + +barplot(mat, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="counts", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC); +legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor)); + +if (!is.na(plotRoot)) { dev.off(); } + +# Barplot (normalized) +if (!is.na(plotRoot)) { + pdf(paste(plotRoot, ".venn_per_ac.norm.pdf", sep="")); +} else { + x11(); +} + +barplot(matnorm, col=c(qcallcolor, intcolor, ugcolor), xlab="Allele count", ylab="Fraction", main=paste("Callset concordance per allele count (", population, ")", sep=""), names.arg=dataMetricsByAc.UG.all$AC, ylim=c(0, 1.3)); +legend("topright", c("UG-only", "Intersection", "QCALL-only"), fill=c(ugcolor, intcolor, qcallcolor)); + +if (!is.na(plotRoot)) { dev.off(); } + +# Per-sample +if (!is.na(plotRoot)) { + pdf(paste(plotRoot, ".counts_per_sample.pdf", sep=""), width=10); +} else { + x11(width=10); +} + +dataPerSample = read.table(filePerSample, header=TRUE); + +par(mar=c(7, 5, 2, 1)); + +plot(0, 0, type="n", xaxt="n", xlim=c(1, length(dataPerSample$sample)), ylim=c(0, max(dataPerSample$Intersection + dataPerSample$UG, dataPerSample$Intersection, dataPerSample$Intersection + dataPerSample$QCALL)), xlab="", ylab="Counts", main=paste("SNPs per sample (", population, ")", sep=""), cex=1.2, cex.lab=1.2, cex.axis=1.2); + +points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$UG, col=ugcolor); +points(dataPerSample$sample, dataPerSample$Intersection, col=intcolor); +points(dataPerSample$sample, dataPerSample$Intersection + dataPerSample$QCALL, col=qcallcolor); + +legend("bottomleft", c("UG", "Intersection", "QCALL"), fill=c(ugcolor, intcolor, qcallcolor)); + +axis(1, at=c(1:length(dataPerSample$sample)), labels=dataPerSample$sample, las=2, cex.axis=0.7); + +if (!is.na(plotRoot)) { dev.off(); } diff --git a/perl/reports/1KG/MakeTopSheetSlide.pl b/perl/reports/1KG/MakeTopSheetSlide.pl new file mode 100755 index 000000000..db46ed841 --- /dev/null +++ b/perl/reports/1KG/MakeTopSheetSlide.pl @@ -0,0 +1,69 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; +use DistributedMake; +use DataTable; +use Data::Dumper; + +my %args = &getCommandArguments( + "VCF" => undef, + "EVAL_ROOT" => undef, + "OUT" => undef, + + "POP" => undef, + "SRP_IDS" => undef, + "SEQUENCE_INDEX" => "/broad/1KG/DCC/ftp/sequence.index", + "ALIGNMENT_INDEX" => "/broad/1KG/DCC/ftp/alignment.index" +); + +my @sequenceIndex = &readTable($args{'SEQUENCE_INDEX'}, "header" => 1, "delimiter" => "\t"); +my @alignmentIndex = &readTable($args{'ALIGNMENT_INDEX'}, "header" => 1, "delimiter" => "\t"); + +my %results; + +foreach my $sequenceEntry (@sequenceIndex) { + my %sequenceEntry = %{$sequenceEntry}; + + if ($sequenceEntry{'STUDY_NAME'} =~ /1000/ && $sequenceEntry{'ANALYSIS_GROUP'} =~ /low coverage/ && $sequenceEntry{'FASTQ_FILE'} !~ /_\d.filt.fastq.gz/) { + ${${$results{'all'}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'sequencer'}}{$sequenceEntry{'INSTRUMENT_PLATFORM'}}++; + + ${$results{'all'}}{'population'} = "all"; + ${$results{$sequenceEntry{'STUDY_ID'}}}{'population'} = $sequenceEntry{'POPULATION'}; + + ${$results{'all'}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); + ${$results{$sequenceEntry{'STUDY_ID'}}}{'used_lanes'} += ($sequenceEntry{'WITHDRAWN'} == 0); + + ${$results{'all'}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); + ${$results{$sequenceEntry{'STUDY_ID'}}}{'unused_lanes'} += ($sequenceEntry{'WITHDRAWN'} != 0); + + if ($sequenceEntry{'WITHDRAWN'} == 0) { + ${${${$results{'all'}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; + ${${${$results{$sequenceEntry{'STUDY_ID'}}}{'lanes_per_samples'}}}{$sequenceEntry{'SAMPLE_NAME'}}++; + } else { + ${${$results{'all'}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'withdrawn_comment'}}{$sequenceEntry{'COMMENT'}}++; + } + + ${${$results{'all'}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'lane_parities'}}{$sequenceEntry{'LIBRARY_LAYOUT'}}++; + + ${${$results{'all'}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'submission_date'}}{$sequenceEntry{'SUBMISSION_DATE'}}++; + + ${${$results{'all'}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; + ${${$results{$sequenceEntry{'STUDY_ID'}}}{'center_names'}}{$sequenceEntry{'CENTER_NAME'}}++; + } +} + +foreach my $studyid (keys(%results)) { + my %study = %{${${$results{$studyid}}{'lanes_per_samples'}}}; + my @samples = keys(%study); + my $numSamples = scalar(@samples); + + ${$results{$studyid}}{'num_samples'} = $numSamples; +} + +print Dumper(\%results); diff --git a/perl/reports/1KG/MakeVennSlide.pl b/perl/reports/1KG/MakeVennSlide.pl new file mode 100755 index 000000000..216ebc357 --- /dev/null +++ b/perl/reports/1KG/MakeVennSlide.pl @@ -0,0 +1,93 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; +use DistributedMake; +use Data::Dumper; +use File::Basename; + +sub getCSV { + my ($file) = @_; + + my @header; + my @csv; + + open(CSV, $file); + while (my $csvline = ) { + if ($csvline !~ /#/) { + chomp(my @columns = split(/,/, $csvline)); + + if (scalar(@columns) > 1) { + if (scalar(@header) < 2) { + @header = @columns; + } else { + my %entry; + for (my $column = 0; $column <= $#header; $column++) { + $entry{$header[$column]} = $columns[$column]; + } + + push(@csv, \%entry); + } + } + } + } + close(CSV); + + return @csv; +} + +my %args = &getCommandArguments("EVAL_ROOT" => undef, "OUT_ROOT" => undef, "POP" => undef); + +my %values; + +my @header; +my @counts = &getCSV("$args{'EVAL_ROOT'}.Count_Variants.csv"); +my @titv = &getCSV("$args{'EVAL_ROOT'}.Ti_slash_Tv_Variant_Evaluator.csv"); + +my $varout = "$args{'OUT_ROOT'}.kvp"; +open(VAROUT, ">$varout"); +foreach my $entry (@counts) { + my %entry = %$entry; + + print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.nVariantLoci\t$entry{'nVariantLoci'}\n"; + + if ($entry{'jexl_expression'} eq 'Intersection') { $values{'Intersection'} = $entry{'nVariantLoci'}; } + if ($entry{'jexl_expression'} eq 'UG') { $values{'UG-only'} = $entry{'nVariantLoci'}; } + if ($entry{'jexl_expression'} eq 'QCALL') { $values{'QCALL-only'} = $entry{'nVariantLoci'}; } +} + +foreach my $entry (@titv) { + my %entry = %$entry; + + print VAROUT "$entry{'jexl_expression'}.$entry{'filter_name'}.$entry{'novelty_name'}.ti/tv_ratio\t$entry{'ti/tv ratio'}\n"; +} + +my $outpng = "$args{'OUT_ROOT'}.png"; +print VAROUT "venndiagram\t$args{'OUT_ROOT'}\n"; +print VAROUT "population\t$args{'POP'}\n"; +print VAROUT "timestamp\t" . gmtime() . "\n"; +close(VAROUT); + +my $ug = $values{'UG-only'} + $values{'Intersection'}; +my $qcall = $values{'QCALL-only'} + $values{'Intersection'}; + +my $scaledug = ($ug > $qcall) ? 100 : int($ug*100/$qcall); +my $scaledqcall = ($qcall > $ug) ? 100 : int($qcall*100/$ug); +my $scaledint = ($ug > $qcall) ? int($values{'Intersection'}*100/$ug) : int($values{'Intersection'}*100/$qcall); + +my $dm = new DistributedMake("dryRun" => 0); + +my $vennCmd = "wget -O $outpng \"http://chart.apis.google.com/chart?cht=v&chd=t:$scaledqcall,$scaledug,0,$scaledint&chco=FF3355,00BBFF&chs=600x480&chdl=QCALL|UnifiedGenotyper&chtt=Callset+concordance+($args{'POP'})&chts=000000,20&chma=100,0,0,0\""; +$dm->addRule($outpng, $varout, $vennCmd); + +my $render = "$args{'OUT_ROOT'}.tex"; +my $renderCmd = "$ENV{'STING_DIR'}/perl/reports/1KG/RenderTemplate.pl KVP=$varout TEMPLATE=$ENV{'STING_DIR'}/perl/reports/1KG/venn_template.tex TEMPLATE_OUT=$render"; +$dm->addRule($render, $outpng, $renderCmd); + +my $pdf = "$args{'OUT_ROOT'}.pdf"; +my $pdfdir = dirname($args{'OUT_ROOT'}); +my $pdfCmd = "pdflatex -output-directory $pdfdir $render"; +$dm->addRule($pdf, $render, $pdfCmd); + +$dm->execute(); diff --git a/perl/reports/1KG/RenderTemplate.pl b/perl/reports/1KG/RenderTemplate.pl new file mode 100755 index 000000000..19f9fb30d --- /dev/null +++ b/perl/reports/1KG/RenderTemplate.pl @@ -0,0 +1,37 @@ +#!/usr/bin/perl -w + +use strict; +use lib "$ENV{'STING_DIR'}/perl"; +use StingArgs; +use Data::Dumper; + +my %args = &getCommandArguments("KVP" => undef, "TEMPLATE" => undef, "TEMPLATE_OUT" => undef); + +my %vars; +open(KVP, $args{'KVP'}); +while (my $kvpline = ) { + chomp($kvpline); + + my ($key, $value) = split(/\t+/, $kvpline); + $vars{$key} = $value; +} +close(KVP); + +open(TEMPLATE, $args{'TEMPLATE'}); +open(TEMPLATE_OUT, ">$args{'TEMPLATE_OUT'}"); +while (my $line =