fast-bwa/bwakit/run-gen-ref

40 lines
1.7 KiB
Plaintext
Raw Permalink Normal View History

2014-11-16 13:46:02 +08:00
#!/bin/bash
#
# Download a human reference genome and assemble one of the analysis sets
# listed in the usage text (hs38, hs38a, hs38DH, hs37, hs37d5) as <build>.fa
# in the current directory.

# Directory holding this script; the resource-GRCh38/ files are looked up
# relative to it. Quoted so that an install path containing whitespace works.
root=$(dirname -- "$0")

# GRCh38 full analysis set (UCSC-style sequence names), hosted by NCBI.
url38="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz"

# hs37d5 reference, hosted on the 1000 Genomes FTP area at NCBI.
url37d5="ftp://ftp.ncbi.nlm.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
# With no arguments, print the list of supported analysis sets and stop with
# status 1. Each printf argument becomes one output line.
if (( $# == 0 )); then
  printf '%s\n' \
    "Usage: $0 <hs38|hs38a|hs38DH|hs37|hs37d5>" \
    "Analysis sets:" \
    " hs38 primary assembly of GRCh38 (incl. chromosomes, unplaced and unlocalized contigs) and EBV" \
    " hs38a hs38 plus ALT contigs" \
    " hs38DH hs38a plus decoy contigs and HLA genes (recommended for GRCh38 mapping)" \
    " hs37 primary assembly of GRCh37 (used by 1000g phase 1) plus the EBV genome" \
    " hs37d5 hs37 plus decoy contigs (used by 1000g phase 3)" \
    "" \
    "Note: This script downloads human reference genomes. For hs38a and hs38DH, it needs additional" \
    " sequences and ALT-to-REF mapping included in the bwa.kit package."
  exit 1
fi
# Print "ERROR: <message>" on stderr and abort the script with status 1.
die() {
  echo "ERROR: $*" >&2
  exit 1
}

# Delete the partially written file $1 and abort, so that a truncated
# reference is never left behind to be indexed by mistake.
discard_and_die() {
  rm -f -- "$1"
  die "failed to generate $1"
}

# Stream the decompressed contents of the gzip archive at URL $1 to stdout.
#   $1 - URL to download
#   $2 - optional; when non-empty, gzip's stderr is discarded
# Returns 0 on success, 1 if the download or the decompression failed.
fetch_fasta() {
  local url=$1 quiet=${2:-}
  local -a st
  # PIPESTATUS is captured immediately after each pipeline, before any other
  # command can overwrite it.
  if [[ -n "$quiet" ]]; then
    wget -O- "$url" | gzip -dc 2>/dev/null
    st=("${PIPESTATUS[@]}")
  else
    wget -O- "$url" | gzip -dc
    st=("${PIPESTATUS[@]}")
  fi
  if (( st[0] != 0 )); then
    echo "ERROR: download failed: $url" >&2
    return 1
  fi
  # gzip reports warnings with status 2 while still producing its output;
  # only other non-zero statuses are treated as failures.
  # NOTE(review): presumably the hs37d5 archive triggers such a warning,
  # which would explain why gzip's stderr is silenced for it - confirm.
  if (( st[1] != 0 && st[1] != 2 )); then
    echo "ERROR: decompression failed: $url" >&2
    return 1
  fi
  return 0
}

# Make a failing fetch_fasta visible even when it feeds an awk filter.
set -o pipefail

build=$1
fa="$build.fa"
res="$root/resource-GRCh38"

case "$build" in
  hs38DH)
    # Full GRCh38 analysis set followed by the extra sequences from the kit.
    { fetch_fasta "$url38" && cat -- "$res/hs38DH-extra.fa"; } > "$fa" \
      || discard_and_die "$fa"
    # An existing .alt file is kept as is.
    if [[ ! -f "$fa.alt" ]]; then
      cp -- "$res/hs38DH.fa.alt" "$fa.alt" || die "failed to create $fa.alt"
    fi
    ;;
  hs38a)
    fetch_fasta "$url38" > "$fa" || discard_and_die "$fa"
    # Only the lines mentioning "_alt" are taken from the hs38DH mapping.
    if [[ ! -f "$fa.alt" ]]; then
      grep _alt "$res/hs38DH.fa.alt" > "$fa.alt" || discard_and_die "$fa.alt"
    fi
    ;;
  hs38)
    # Drop every record whose header line contains "_alt".
    fetch_fasta "$url38" | awk '/^>/{f=/_alt/?0:1}f' > "$fa" \
      || discard_and_die "$fa"
    ;;
  hs37d5)
    fetch_fasta "$url37d5" quiet > "$fa" || discard_and_die "$fa"
    ;;
  hs37)
    # Drop every record whose header line contains ">hs37d5".
    fetch_fasta "$url37d5" quiet | awk '/^>/{f=/>hs37d5/?0:1}f' > "$fa" \
      || discard_and_die "$fa"
    ;;
  *)
    # Stop here: no file was produced, so the indexing hint must not appear.
    die "unknown genome build"
    ;;
esac

# Written as an if statement so that an already existing index does not turn
# into a non-zero exit status for the whole script.
if [[ ! -f "$fa.bwt" ]]; then
  printf '\nPlease run '\''bwa index %s'\''...\n\n' "$fa"
fi