Commit d1ae404e authored by Gonzalo S Nido's avatar Gonzalo S Nido
Browse files

Changed lambda chr name, added automatic reference donwload/build

parent dc255d67
......@@ -21,3 +21,21 @@ Also included are the indices that are built using the
```bash
bash wgbs-pipeline.sh /folder/with/fastqfiles/sample_id.R1.fastq.gz /folder/with/fastqfiles/sample_id.R2.fastq.gz
```
Creating references
-------------------
The `wgbs-pipeline.sh` pipeline should take care of downloading and indexing
references if the appropriate files cannot be found in the ./ref folder. This
can also be done beforehand if so desired using the `generate_bs_indices.sh`
script. You need to provide the folder in which the reference files will be
stored. By default (so that you don't need to change anything in the pipeline)
you can run the following:
```bash
bash ./apps/generate_bs_indices.sh ./ref
```
The script requires samtools, bismark and bowtie2 to be installed in very
specific directories. To that end, the `prepare_vm.sh` script needs to be run.
#!/bin/bash
# Programs
samtools="/usr/local/bin/samtools-1.10"
bismark_index="/usr/local/bin/bismark_genome_preparation-0.22.3"
bowtie_path="/opt/bowtie2-2.4.41/"
# Check arguments
if [[ $# -lt 1 ]]; then
echo "ERROR: Provide output folder for the references"
exit 1
fi
# Directories where the references will be stored
mkdir -p $1
output_dir=$( realpath $1 )
echo "Reference files will be stored in \"${output_dir}\""
# === #
grch38_dir=${output_dir}/GRCh38
mkdir -p ${grch38_dir}
grch38_no_mt_dir=${output_dir}/GRCh38_no_mt
mkdir -p ${grch38_no_mt_dir}
mt_dir=${output_dir}/chrM
mkdir -p ${mt_dir}
lambda_dir=${output_dir}/lambda
mkdir -p ${lambda_dir}
# Fasta filenames
grch38_fasta=${grch38_dir}/GRCh38_no_alt_with_lambda.fasta
grch38_no_mt_fasta=${grch38_no_mt_dir}/GRCh38_no_alt_no_mt.fasta
mt_fasta=${mt_dir}/chrM.fasta
lambda_fasta=${lambda_dir}/lambda.fasta
# URL with the files from ENCODE
GRCH38_url="https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz"
LAMBDA_url="https://www.encodeproject.org/files/lambda.fa/@@download/lambda.fa.fasta.gz"
# Get number of processors in the machine
N_proc=`expr $( grep -c processor /proc/cpuinfo ) / 2`
# 1) Obtain GRCh38 human genome from ENCODE
echo "1/8 - Downloading GRCh38 reference..."
wget -o wget.log --show-progress -O ${grch38_fasta}.gz ${GRCH38_url}
echo " Decompressing GRCh38 reference..."
gunzip -f ${grch38_fasta}.gz
# 2) Obtain Lambda genome
echo "2/8 - Downloading Lambda genome..."
wget -a wget.log --show-progress -O ${lambda_fasta}.gz ${LAMBDA_url}
echo " Decompressing Lambda reference..."
gunzip -f ${lambda_fasta}.gz
# 3) Generate chrM genome fomr GRCh38
echo "3/8 - Extracting mtDNA from GRCh38 reference..."
${samtools} faidx ${grch38_fasta} chrM > ${mt_fasta}
# 4) Generate GRCh38 without chrM
echo "4/8 - Removing mtDNA from GRCh38 reference..."
all_chr=$( grep -v '^chrM' ${grch38_fasta}.fai | cut -f 1 )
${samtools} faidx ${grch38_fasta} ${all_chr} > ${grch38_no_mt_fasta}
# 5) Add Lambda to GRCh38
echo "5/8 - Adding Lambda genome to GRCh38 reference..."
cat ${lambda_fasta} >> ${grch38_fasta}
rm ${grch38_fasta}.fai
# 6) Generate index for GRCh38
echo "6/8 - Generating indices for GRCh38..."
${bismark_index} --parallel ${N_proc} --path_to_aligner ${bowtie_path} ${grch38_dir}
# 7) Generate index for GRCh38 without MT
echo "7/8 - Generating indices for GRCh38 without mtDNA..."
${bismark_index} --parallel ${N_proc} --path_to_aligner ${bowtie_path} ${grch38_no_mt_dir}
# 8) Generate index for chrM
echo "8/8 - Generating indices for chrM..."
${bismark_index} --parallel ${N_proc} --path_to_bowtie ${bowtie_path} ${mt_dir}
......@@ -2,13 +2,15 @@
# Some variables
MT_chr_name="chrM"
LAMBDA_chr_name="NC_001416.1"
LAMBDA_chr_name="chrL"
# Reference genome (with Bismark indices and stuff
b38_ref=$( realpath ./ref/GRCh38/ )
nuc_ref=$( realpath ./ref/GRCh38_no_mt/ )
mt_ref=$( realpath ./ref/chrM/ )
ref_dir="./ref"
b38_ref=$( realpath ${ref_dir}/GRCh38/ )
nuc_ref=$( realpath ${ref_dir}/GRCh38_no_mt/ )
mt_ref=$( realpath ${ref_dir}/chrM/ )
# Programs
......@@ -21,6 +23,7 @@ samtools="/usr/local/bin/samtools-1.10"
samtools_path="/opt/samtools-1.10"
ext_chr_srt=$( realpath ./apps/ext_chr_srt.py )
meth_prop=$( realpath ./apps/methylation_proportion.py )
generate_bs_indices="./apps/generate_bs_indices.sh"
psrecord="~/.local/bin/psrecord"
......@@ -29,12 +32,19 @@ MONITOR=false
#MONITOR=true
# exit when any command fails
set -e
#trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
#trap 'echo "\"${last_command}\" command filed with exit code $?"' EXIT
# Ensure reference files/indices are there, download and index otherwise
if [[ ! -s ${b38_ref}/Bisulfite_Genome/GA_conversion/genome_mfa.GA_conversion.fa ]]; then
echo "Reference files not found. Downloading and indexing..."
bash ${generate_bs_indices} ${ref_dir}
fi
if [[ $# -lt 3 ]]; then
echo "ERROR: Provide fastq files (R1 and R2) and the output folder!"
exit 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment