BaseCalling Dorado + Demultiplexage

Téléchargement des modèles

$ /home/grid/dorado-0.7.2-linux-x64/bin/dorado download

Lancement du basecalling et du démultiplexage en même temps

# Basecall the reads in pod5/ on device cuda:0 (minimum Q-score 7) and stream
# the FASTQ output straight into "dorado demux", which writes its FASTQ
# results under demultiplexed/.
DORADO_HOME="/home/grid/dorado-0.7.2-linux-x64/bin"
BASECALL_MODEL="${DORADO_HOME}/dna_r10.4.1_e8.2_400bps_hac@v4.2.0"

"${DORADO_HOME}/dorado" basecaller -x "cuda:0" --min-qscore 7 --no-trim --emit-fastq \
    "${BASECALL_MODEL}" pod5/ \
  | "${DORADO_HOME}/dorado" demux --kit-name SQK-RBK114-24 --emit-fastq --output-dir demultiplexed

Bash permettant l'automatisation de plusieurs basecalling et démultiplexage à la suite

#!/bin/bash
# Run Dorado basecalling + demultiplexing for several pod5 folders in a row.
# Each section handles one input folder; inside a section the pipeline is
# repeated for every minimum Q-score listed in QS_SCORES.
#
# NOTE: OUTPUT_DIR depends only on the Q-score, so two sections that use the
# same Q-score write into the same folder.

# Make a pipeline fail when the basecaller fails, not only when demux fails,
# so the completion message is never printed after a failed run.
set -o pipefail

# Processing C:/path/to/pod5/1
DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
# REF_GENOME is recorded here but no command in this script reads it.
REF_GENOME="C:/path/to/References/hg38.mmi"
INPUT_DIR="C:/path/to/pod5/1"
QS_SCORES=(10)

for qscore in "${QS_SCORES[@]}"; do
    OUTPUT_DIR="demultiplexed_q${qscore}"
    mkdir -p "${OUTPUT_DIR}"
    # --no-trim: per the Dorado documentation, the basecaller must not trim
    # the reads when demultiplexing is done by a separate "dorado demux" step.
    # All expansions are quoted so that paths containing spaces stay intact.
    if "${DORADO_BIN}" basecaller -x "cuda:0" --min-qscore "${qscore}" --no-trim --emit-fastq "${MODEL_PATH}" "${INPUT_DIR}" | \
        "${DORADO_BIN}" demux --kit-name "SQK-NBD114-24" --emit-fastq --output-dir "${OUTPUT_DIR}"; then
        echo "Processing complete for ${INPUT_DIR} with Q-score ${qscore}"
    else
        echo "Processing FAILED for ${INPUT_DIR} with Q-score ${qscore}" >&2
    fi
done

# Processing C:/path/to/pod5/2
DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
# REF_GENOME is recorded here but no command in this script reads it.
REF_GENOME="C:/path/to/References/hg38.mmi"
INPUT_DIR="C:/path/to/pod5/2"
QS_SCORES=(40)

for qscore in "${QS_SCORES[@]}"; do
    OUTPUT_DIR="demultiplexed_q${qscore}"
    mkdir -p "${OUTPUT_DIR}"
    if "${DORADO_BIN}" basecaller -x "cuda:0" --min-qscore "${qscore}" --no-trim --emit-fastq "${MODEL_PATH}" "${INPUT_DIR}" | \
        "${DORADO_BIN}" demux --kit-name "SQK-NBD114-24" --emit-fastq --output-dir "${OUTPUT_DIR}"; then
        echo "Processing complete for ${INPUT_DIR} with Q-score ${qscore}"
    else
        echo "Processing FAILED for ${INPUT_DIR} with Q-score ${qscore}" >&2
    fi
done

Script Python permettant de créer le bash via une interface graphique

import tkinter as tk
from tkinter import filedialog, messagebox
import os

# Values written into every generated script.
_DORADO_BIN = "/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
_MODEL_PATH = "/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
_SCRIPT_PATH = "all_configurations_processing.sh"

# Pre-filled values of the device and kit entries.
_DEFAULT_CUDA_DEVICE = "cuda:0"
_DEFAULT_KIT_NAME = "SQK-NBD114-24"

# Bash pipeline appended after the variable assignments of each configuration.
# It is a plain raw string (not an f-string): every user value reaches it
# through a bash variable, so the ${...} expansions need no brace escaping.
# The basecaller receives "${INPUT_DIR}/" (with a trailing slash).
_DEMUX_PIPELINE = r"""
for qscore in "${QS_SCORES[@]}"; do
    OUTPUT_DIR="demultiplexed_q${qscore}"
    mkdir -p "${OUTPUT_DIR}"
    if "${DORADO_BIN}" basecaller -x "${CUDA_DEVICE}" --min-qscore "${qscore}" --no-trim --emit-fastq "${MODEL_PATH}" "${INPUT_DIR}/" | \
        "${DORADO_BIN}" demux --kit-name "${KIT_NAME}" --emit-fastq --output-dir "${OUTPUT_DIR}"; then
        echo "Processing complete for ${INPUT_DIR} with Q-score ${qscore}"
    else
        echo "Processing FAILED for ${INPUT_DIR} with Q-score ${qscore}" >&2
    fi
done
"""


def _is_qscore(token):
    """Return True when *token* is a plain non-negative number such as 10 or 9.5."""
    digits = token.replace(".", "", 1)
    return digits != "" and all(ch in "0123456789" for ch in digits)


def _render_demux_section(config):
    """Return the bash text (variable assignments + pipeline) for one configuration.

    *config* is one of the dicts built by ``add_configuration``: string values
    under ``ref_genome``, ``input_dir``, ``cuda_device`` and ``kit_name``, and
    a list of validated Q-score strings under ``qs_scores``.
    """
    import shlex  # function-scope import keeps the block self-contained

    quote = shlex.quote  # protects spaces and shell metacharacters in user values
    qs_array = " ".join(config["qs_scores"])
    assignments = [
        "",
        f"# Processing {config['input_dir']}",
        f"DORADO_BIN={quote(_DORADO_BIN)}",
        f"MODEL_PATH={quote(_MODEL_PATH)}",
        # REF_GENOME is recorded in the script but the pipeline does not read it.
        f"REF_GENOME={quote(config['ref_genome'])}",
        f"INPUT_DIR={quote(config['input_dir'])}",
        f"CUDA_DEVICE={quote(config['cuda_device'])}",
        f"KIT_NAME={quote(config['kit_name'])}",
        f"QS_SCORES=({qs_array})",
    ]
    return "\n".join(assignments) + "\n" + _DEMUX_PIPELINE


def launch_config_ui():
    """Open the Tk window used to build a batch basecalling/demultiplexing script.

    The user fills in a reference genome, an input folder, one or more
    Q-scores, a CUDA device and a kit name, and adds that set as a
    configuration.  "Generate Script" writes ``all_configurations_processing.sh``
    in the current working directory with one section per configuration; the
    script is only written, never executed.  The call blocks in the Tk main
    loop until the window is closed.

    Note: the generated output folder name depends only on the Q-score, so two
    configurations using the same Q-score share one output folder.
    """
    root = tk.Tk()
    root.title("Batch Configuration for Genomic Processing")

    configurations = []

    def browse_into(entry, chooser, **options):
        """Run a file dialog and replace the entry content with the selection."""
        chosen = chooser(**options)
        # A cancelled dialog returns an empty value: keep what the user had.
        if chosen:
            entry.delete(0, tk.END)
            entry.insert(0, chosen)

    def add_configuration():
        """Validate the form, store it as a configuration and reset the form."""
        ref_genome = ref_genome_entry.get().strip()
        input_dir = input_dir_entry.get().strip()
        qs_scores = qs_score_entry.get().strip()
        cuda_device = cuda_device_entry.get().strip()
        kit_name = kit_name_entry.get().strip()

        if not all([ref_genome, input_dir, qs_scores, cuda_device, kit_name]):
            messagebox.showerror("Error", "Please fill all fields before adding a configuration.")
            return

        # The tokens end up unquoted inside a bash array, so only plain
        # numbers are accepted.
        qs_tokens = qs_scores.split()
        invalid = [token for token in qs_tokens if not _is_qscore(token)]
        if invalid:
            messagebox.showerror(
                "Error",
                f"Q-scores must be numbers separated by spaces (invalid: {', '.join(invalid)}).",
            )
            return

        configurations.append({
            "ref_genome": ref_genome,
            "input_dir": input_dir,
            "qs_scores": qs_tokens,
            "cuda_device": cuda_device,
            "kit_name": kit_name,
        })

        listbox.insert(tk.END, input_dir)
        for entry in (ref_genome_entry, input_dir_entry, qs_score_entry,
                      cuda_device_entry, kit_name_entry):
            entry.delete(0, tk.END)
        # Put the pre-filled values back so they need not be retyped each time.
        cuda_device_entry.insert(0, _DEFAULT_CUDA_DEVICE)
        kit_name_entry.insert(0, _DEFAULT_KIT_NAME)
        messagebox.showinfo("Success", "Configuration added successfully.")

    def generate_script():
        """Write the bash script for all stored configurations."""
        if not configurations:
            messagebox.showerror("Error", "Add at least one configuration before generating the script.")
            return
        try:
            # newline="\n": bash cannot run a script with CRLF line endings,
            # which text mode would produce on Windows.
            with open(_SCRIPT_PATH, "w", encoding="utf-8", newline="\n") as script_file:
                # pipefail makes the success check in the pipeline cover the
                # basecaller as well as demux.
                script_file.write("#!/bin/bash\n\nset -o pipefail\n")
                script_file.writelines(_render_demux_section(config) for config in configurations)
        except OSError as error:
            messagebox.showerror("Error", f"Could not write {_SCRIPT_PATH}: {error}")
            return
        messagebox.showinfo("Done", f"All configurations have been written to {_SCRIPT_PATH}. Please run the script manually.")

    tk.Label(root, text="Select the genome file REF_GENOME (.mmi):").pack()
    ref_genome_entry = tk.Entry(root, width=50)
    ref_genome_entry.pack(padx=20, pady=5)
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(
            ref_genome_entry, filedialog.askopenfilename, filetypes=[("MMI files", "*.mmi")]
        ),
    ).pack()

    tk.Label(root, text="Select the folder for INPUT_DIR:").pack()
    input_dir_entry = tk.Entry(root, width=50)
    input_dir_entry.pack(padx=20, pady=5)
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(input_dir_entry, filedialog.askdirectory),
    ).pack()

    tk.Label(root, text="Enter Q-scores separated by spaces:").pack()
    qs_score_entry = tk.Entry(root, width=50)
    qs_score_entry.pack(padx=20, pady=5)

    tk.Label(root, text="Specify the CUDA device (e.g., cuda:0):").pack()
    cuda_device_entry = tk.Entry(root, width=50)
    cuda_device_entry.insert(0, _DEFAULT_CUDA_DEVICE)
    cuda_device_entry.pack(padx=20, pady=5)

    tk.Label(root, text="Enter the kit name (e.g., SQK-NBD114-24):").pack()
    kit_name_entry = tk.Entry(root, width=50)
    kit_name_entry.insert(0, _DEFAULT_KIT_NAME)
    kit_name_entry.pack(padx=20, pady=5)

    tk.Button(root, text="Add Configuration", command=add_configuration).pack(pady=10)

    listbox = tk.Listbox(root, height=6, width=50)
    listbox.pack(pady=10)

    tk.Button(root, text="Generate Script", command=generate_script).pack(pady=20)

    root.mainloop()

# Open the configuration window only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    launch_config_ui()

Basecalling et démultiplexage, suivis de l'alignement et de la conversion en BAM

#!/bin/bash
# Basecall and demultiplex one pod5 folder with Dorado, then align every
# FASTQ found in the demultiplexing output folder with minimap2 and store the
# result as a sorted, indexed BAM file next to it.

source ~/miniconda3/etc/profile.d/conda.sh
conda activate genomics

# Report a pipeline as failed when any of its stages fails, not only the last.
set -o pipefail

BASE_OUTPUT_DIR="C:/Users/aleks/OneDrive/Bureau/CHU/Test1/BAM"
mkdir -p "${BASE_OUTPUT_DIR}"

DORADO_BIN="/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
MODEL_PATH="/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
REF_GENOME="C:/Users/aleks/OneDrive/Bureau/CHU/References/hg38.mmi"
INPUT_DIR="C:/Users/aleks/OneDrive/Bureau/CHU/Test1/pod51"
QSCORE="17"
OUTPUT_DIR="${BASE_OUTPUT_DIR}/demultiplexed_q${QSCORE}"
mkdir -p "${OUTPUT_DIR}"

# --no-trim: per the Dorado documentation, the basecaller must not trim the
# reads when demultiplexing is done by a separate "dorado demux" step.
# All expansions are quoted so that paths containing spaces stay intact.
if ! "${DORADO_BIN}" basecaller -x "cuda:0" --min-qscore "${QSCORE}" --no-trim --emit-fastq "${MODEL_PATH}" "${INPUT_DIR}" | \
    "${DORADO_BIN}" demux --kit-name "SQK-NBD114-24" --emit-fastq --output-dir "${OUTPUT_DIR}"; then
    # Do not align the leftovers of a failed basecalling/demultiplexing run.
    echo "Processing FAILED for ${INPUT_DIR} with Q-score ${QSCORE}" >&2
    exit 1
fi
echo "Processing complete for ${INPUT_DIR} with Q-score ${QSCORE}"

for fastq_file in "${OUTPUT_DIR}"/*.fastq; do
    # Without any match the glob is left unexpanded; skip that literal pattern.
    [ -e "${fastq_file}" ] || continue
    bam_file="${fastq_file%.fastq}.bam"
    echo "Aligning ${fastq_file} to reference genome..."
    # Use the REF_GENOME variable so the reference is defined in one place only.
    if minimap2 -ax map-ont "${REF_GENOME}" "${fastq_file}" | samtools sort -o "${bam_file}"; then
        samtools index "${bam_file}"
        echo "Alignment and BAM conversion completed for ${bam_file}"
    else
        echo "Alignment FAILED for ${fastq_file}" >&2
    fi
done
echo "All processes are complete."

Et en python via interface graphique :

import tkinter as tk
from tkinter import filedialog, messagebox

# Values written into every generated script.
_DORADO_BIN = "/home/grid/dorado-0.7.2-linux-x64/bin/dorado"
_MODEL_PATH = "/home/grid/dorado-0.7.2-linux-x64/bin/dna_r10.4.1_e8.2_400bps_hac@v5.0.0"
_SCRIPT_PATH = "all_configurations_processing.sh"

# Pre-filled values of the device and kit entries.
_DEFAULT_CUDA_DEVICE = "cuda:0"
_DEFAULT_KIT_NAME = "SQK-NBD114-24"

# First lines of the generated script.  pipefail makes the success checks in
# the pipelines cover every stage, not only the last one.
_SCRIPT_HEADER = (
    "#!/bin/bash\n\n"
    "source ~/miniconda3/etc/profile.d/conda.sh\n"
    "conda activate genomics\n\n"
    "set -o pipefail\n"
)

# Bash pipeline appended after the variable assignments of each configuration.
# It is a plain raw string (not an f-string): every user value reaches it
# through a bash variable, so the ${...} expansions need no brace escaping.
_ALIGN_PIPELINE = r"""
mkdir -p "${BASE_OUTPUT_DIR}"

for qscore in "${QS_SCORES[@]}"; do
    OUTPUT_DIR="${BASE_OUTPUT_DIR}/demultiplexed_q${qscore}"
    mkdir -p "${OUTPUT_DIR}"
    if ! "${DORADO_BIN}" basecaller -x "${CUDA_DEVICE}" --min-qscore "${qscore}" --no-trim --emit-fastq "${MODEL_PATH}" "${INPUT_DIR}" | \
        "${DORADO_BIN}" demux --kit-name "${KIT_NAME}" --emit-fastq --output-dir "${OUTPUT_DIR}"; then
        echo "Processing FAILED for ${INPUT_DIR} with Q-score ${qscore}" >&2
        continue
    fi
    echo "Processing complete for ${INPUT_DIR} with Q-score ${qscore}"

    # Alignment and conversion to BAM
    for fastq_file in "${OUTPUT_DIR}"/*.fastq; do
        # Without any match the glob is left unexpanded; skip that literal pattern.
        [ -e "${fastq_file}" ] || continue
        bam_file="${fastq_file%.fastq}.bam"
        echo "Aligning ${fastq_file} to reference genome..."
        if minimap2 -ax map-ont "${REF_GENOME}" "${fastq_file}" | samtools sort -o "${bam_file}"; then
            samtools index "${bam_file}"
            echo "Alignment and BAM conversion completed for ${bam_file}"
        else
            echo "Alignment FAILED for ${fastq_file}" >&2
        fi
    done
done
"""


def _is_qscore(token):
    """Return True when *token* is a plain non-negative number such as 10 or 9.5."""
    digits = token.replace(".", "", 1)
    return digits != "" and all(ch in "0123456789" for ch in digits)


def _render_alignment_section(config):
    """Return the bash text (variable assignments + pipeline) for one configuration.

    *config* is one of the dicts built by ``add_configuration``: string values
    under ``base_output_dir``, ``input_dir``, ``ref_genome``, ``cuda_device``
    and ``kit_name``, and a list of validated Q-score strings under
    ``qs_scores``.
    """
    import shlex  # function-scope import keeps the block self-contained

    quote = shlex.quote  # protects spaces and shell metacharacters in user values
    qs_array = " ".join(config["qs_scores"])
    assignments = [
        "",
        f"# Processing {config['input_dir']}",
        f"BASE_OUTPUT_DIR={quote(config['base_output_dir'])}",
        f"DORADO_BIN={quote(_DORADO_BIN)}",
        f"MODEL_PATH={quote(_MODEL_PATH)}",
        f"REF_GENOME={quote(config['ref_genome'])}",
        f"INPUT_DIR={quote(config['input_dir'])}",
        f"CUDA_DEVICE={quote(config['cuda_device'])}",
        f"KIT_NAME={quote(config['kit_name'])}",
        f"QS_SCORES=({qs_array})",
    ]
    return "\n".join(assignments) + "\n" + _ALIGN_PIPELINE


def launch_config_ui():
    """Open the Tk window used to build a basecalling + alignment batch script.

    The user fills in a base output folder, an input folder, a reference
    genome, one or more Q-scores, a CUDA device and a kit name, and adds that
    set as a configuration.  "Generate Script" writes
    ``all_configurations_processing.sh`` in the current working directory; for
    every configuration and Q-score the script runs basecalling and
    demultiplexing, then aligns each resulting FASTQ and converts it to an
    indexed BAM.  The script is only written, never executed.  The call blocks
    in the Tk main loop until the window is closed.
    """
    root = tk.Tk()
    root.title("Batch Configuration for Genomic Processing")

    configurations = []

    def browse_into(entry, chooser, **options):
        """Run a file dialog and replace the entry content with the selection."""
        chosen = chooser(**options)
        # A cancelled dialog returns an empty value: keep what the user had.
        if chosen:
            entry.delete(0, tk.END)
            entry.insert(0, chosen)

    def add_configuration():
        """Validate the form, store it as a configuration and reset the form."""
        base_output_dir = base_output_dir_entry.get().strip()
        input_dir = input_dir_entry.get().strip()
        ref_genome = ref_genome_entry.get().strip()
        qs_scores = qs_score_entry.get().strip()
        cuda_device = cuda_device_entry.get().strip()
        kit_name = kit_name_entry.get().strip()

        if not all([base_output_dir, input_dir, ref_genome, qs_scores, cuda_device, kit_name]):
            messagebox.showerror("Error", "Please fill all fields before adding a configuration.")
            return

        # The tokens end up unquoted inside a bash array, so only plain
        # numbers are accepted.
        qs_tokens = qs_scores.split()
        invalid = [token for token in qs_tokens if not _is_qscore(token)]
        if invalid:
            messagebox.showerror(
                "Error",
                f"Q-scores must be numbers separated by spaces (invalid: {', '.join(invalid)}).",
            )
            return

        configurations.append({
            "base_output_dir": base_output_dir,
            "input_dir": input_dir,
            "ref_genome": ref_genome,
            "qs_scores": qs_tokens,
            "cuda_device": cuda_device,
            "kit_name": kit_name,
        })

        listbox.insert(tk.END, f"Input Dir: {input_dir}, Output Dir: {base_output_dir}, Q-Scores: {qs_scores}")
        for entry in (base_output_dir_entry, input_dir_entry, ref_genome_entry,
                      qs_score_entry, cuda_device_entry, kit_name_entry):
            entry.delete(0, tk.END)
        # Put the pre-filled values back so they need not be retyped each time.
        cuda_device_entry.insert(0, _DEFAULT_CUDA_DEVICE)
        kit_name_entry.insert(0, _DEFAULT_KIT_NAME)
        messagebox.showinfo("Success", "Configuration added successfully.")

    def generate_script():
        """Write the bash script for all stored configurations."""
        if not configurations:
            messagebox.showerror("Error", "Add at least one configuration before generating the script.")
            return
        try:
            # newline="\n": bash cannot run a script with CRLF line endings,
            # which text mode would produce on Windows.
            with open(_SCRIPT_PATH, "w", encoding="utf-8", newline="\n") as script_file:
                script_file.write(_SCRIPT_HEADER)
                script_file.writelines(_render_alignment_section(config) for config in configurations)
                script_file.write("echo \"All processes are complete.\"\n")
        except OSError as error:
            messagebox.showerror("Error", f"Could not write {_SCRIPT_PATH}: {error}")
            return
        messagebox.showinfo("Done", f"All configurations have been written to {_SCRIPT_PATH}. Please run the script manually.")

    # GUI layout settings
    tk.Label(root, text="Set the base output directory BASE_OUTPUT_DIR:").pack()
    base_output_dir_entry = tk.Entry(root, width=50)
    base_output_dir_entry.pack(padx=20, pady=5)
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(base_output_dir_entry, filedialog.askdirectory),
    ).pack()

    tk.Label(root, text="Select the folder for INPUT_DIR:").pack()
    input_dir_entry = tk.Entry(root, width=50)
    input_dir_entry.pack(padx=20, pady=5)
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(input_dir_entry, filedialog.askdirectory),
    ).pack()

    tk.Label(root, text="Select the genome file REF_GENOME (.mmi):").pack()
    ref_genome_entry = tk.Entry(root, width=50)
    ref_genome_entry.pack(padx=20, pady=5)
    # The filter matches *.mmi files, so it is labelled accordingly.
    tk.Button(
        root,
        text="Browse",
        command=lambda: browse_into(
            ref_genome_entry, filedialog.askopenfilename, filetypes=[("MMI files", "*.mmi")]
        ),
    ).pack()

    tk.Label(root, text="Enter Q-scores separated by spaces:").pack()
    qs_score_entry = tk.Entry(root, width=50)
    qs_score_entry.pack(padx=20, pady=5)

    tk.Label(root, text="Specify the CUDA device (e.g., cuda:0):").pack()
    cuda_device_entry = tk.Entry(root, width=50)
    cuda_device_entry.insert(0, _DEFAULT_CUDA_DEVICE)
    cuda_device_entry.pack(padx=20, pady=5)

    tk.Label(root, text="Enter the kit name (e.g., SQK-NBD114-24):").pack()
    kit_name_entry = tk.Entry(root, width=50)
    kit_name_entry.insert(0, _DEFAULT_KIT_NAME)
    kit_name_entry.pack(padx=20, pady=5)

    tk.Button(root, text="Add Configuration", command=add_configuration).pack(pady=10)

    listbox = tk.Listbox(root, height=6, width=50)
    listbox.pack(pady=10)

    tk.Button(root, text="Generate Script", command=generate_script).pack(pady=20)

    root.mainloop()

# Open the configuration window only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    launch_config_ui()

Last updated