Skip to content

API

API Documentation

This package facilitates the download of data from the ENA in fastq format. To use it, you need to provide the accession number of the data you want to download.

cli()

Entry point for the command line interface. This function is called when the package is called from the command line. It uses the argparse package to parse the arguments passed to the command line.

Returns:

Type Description
None
Source code in ena_download/__init__.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def cli():
    """
    Command line entry point of the package.

    Builds an argparse parser with a positional ``accession`` argument and an
    optional ``--timeout`` argument (integer seconds, default 300), parses the
    command line and passes both values on to ``main``.

    Returns
    -------
    None
    """
    parser = argparse.ArgumentParser(description='ENA Download')
    parser.add_argument(
        'accession',
        type=str,
        help='Accession number of the data to download',
    )
    parser.add_argument(
        '--timeout',
        type=int,
        default=300,
        help='Timeout in seconds for the download to complete. Default is 300 seconds.',
    )

    options = parser.parse_args()
    accession, timeout = options.accession, options.timeout
    main(accession, timeout)

download_data(accession, urls, timeout=300)

Download data from the ENA.

Parameters:

Name Type Description Default
accession str

The accession number of the data to download.

required
urls List[str]

The URLs of the data to download.

required
timeout int

The timeout in seconds for the download to complete. Default is 300 seconds.

300

Returns:

Type Description
None
Source code in ena_download/__init__.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def download_data(accession: str, urls: List[str], timeout: int = 300) -> None:
    """
    Download data from the ENA.

    Each URL is fetched with the Aspera ``ascp`` client found under
    ``~/.aspera/cli`` and written into a directory named after the accession.
    Every attempt is guarded by a ``SIGALRM`` alarm of ``timeout`` seconds and
    a URL is tried up to 3 times before giving up.

    Parameters
    ----------
    accession : str
        The accession number of the data to download. It is also used as the
        name of the output directory, which is created if it is missing.
    urls : List[str]
        The URLs of the data to download.
    timeout : int
        The timeout in seconds for the download to complete. Default is 300 seconds.

    Returns
    -------
    None

    Raises
    ------
    TimeoutError
        If a URL could not be downloaded after 3 attempts. The error of the
        last attempt is attached as the cause.
    """
    max_attempts = 3

    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(accession, exist_ok=True)

    home = os.path.expanduser("~")
    ascp = os.path.join(home, '.aspera/cli/bin/ascp')
    opensshfile = os.path.join(home, '.aspera/cli/etc/asperaweb_id_dsa.openssh')

    for url in urls:
        # The command does not change between attempts, so build it once per URL.
        path = url.replace('ftp.sra.ebi.ac.uk/', 'era-fasp@fasp.sra.ebi.ac.uk:')
        command = [
            ascp, '-T', '-l', '300m', '-P', '33001', '-i', opensshfile,
            path, accession + '/'
        ]
        signal.signal(signal.SIGALRM, handler)

        for attempt in range(1, max_attempts + 1):
            sys.stderr.write(f"Attempt {attempt} at downloading {url}...\n")
            signal.alarm(timeout)
            try:
                sp.run(command, check=True)
            except (sp.SubprocessError, OSError) as err:
                # OSError covers the TimeoutError raised by the alarm handler as
                # well as a missing or non-executable ascp binary. Unlike a bare
                # except, KeyboardInterrupt and SystemExit are not retried.
                if attempt == max_attempts:
                    raise TimeoutError(f"Download failed after {max_attempts} attempts") from err
            else:
                break
            finally:
                # Always cancel the alarm, so a failed attempt cannot leave a
                # pending SIGALRM that fires after this function has returned.
                signal.alarm(0)
    return None

extract_data_path(accession)

Get the URL of the data to download.

Parameters:

Name Type Description Default
accession str

The accession number of the data to download.

required

Returns:

Type Description
List[str]

The URLs of the data to download.

Examples:

>>> extract_data_path("ERR11466368")
['ftp.sra.ebi.ac.uk/vol1/fastq/ERR114/068/ERR11466368/ERR11466368_1.fastq.gz', 'ftp.sra.ebi.ac.uk/vol1/fastq/ERR114/068/ERR11466368/ERR11466368_2.fastq.gz']
Source code in ena_download/__init__.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def extract_data_path(accession: str) -> List[str]:
    """
    Get the URLs of the data to download.

    Queries the ENA portal file report for the accession and reads the
    semicolon-separated FASTQ locations from the second column of the second
    line of the tab-separated response.

    Parameters
    ----------
    accession : str
        The accession number of the data to download.

    Returns
    -------
    List[str]
        The URLs of the data to download.

    Raises
    ------
    ValueError
        If the request does not return HTTP 200, if the report contains no
        data row, or if the data row lists no FASTQ file.

    Examples
    --------
    >>> extract_data_path("ERR11466368")
    ['ftp.sra.ebi.ac.uk/vol1/fastq/ERR114/068/ERR11466368/ERR11466368_1.fastq.gz', 'ftp.sra.ebi.ac.uk/vol1/fastq/ERR114/068/ERR11466368/ERR11466368_2.fastq.gz']
    """

    url = f"https://www.ebi.ac.uk/ena/portal/api/filereport?accession={accession}&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        raise ValueError(f"Invalid URL: {url}")

    # The first line is skipped; the run itself is described on the second one.
    rows = response.text.splitlines()
    if len(rows) < 2:
        raise ValueError(f"No run found in the ENA file report for accession: {accession}")

    columns = rows[1].split("\t")
    if len(columns) < 2:
        raise ValueError(f"Malformed ENA file report for accession: {accession}")

    # An empty field would otherwise be returned as [''] and fail only later,
    # when the empty URL is handed to the downloader.
    paths = [path for path in columns[1].split(";") if path]
    if not paths:
        raise ValueError(f"No FASTQ files listed for accession: {accession}")
    return paths

handler(signum, frame)

Signal handler for the download timeout.

Source code in ena_download/__init__.py
73
74
75
def handler(signum, frame):
    """
    Signal handler for the download timeout.

    Parameters
    ----------
    signum :
        Number of the received signal (not used).
    frame :
        Stack frame that was interrupted (not used).

    Raises
    ------
    TimeoutError
        Always; raising is the only thing this handler does.
    """
    message = 'Download timeout reached, trying again!'
    raise TimeoutError(message)

is_valid_accession(accession)

Check whether the accession number is valid by looking it up in the ENA browser API.

Parameters:

Name Type Description Default
accession str

The run accession number of the data to download.

required

Returns:

Type Description
bool

True if the accession number is valid; a ValueError is raised otherwise.

Examples:

>>> is_valid_accession("ERR11466368")
True
>>> is_valid_accession("ERR0000000")
Traceback (most recent call last):
ValueError: Invalid accession number: ERR0000000
Source code in ena_download/__init__.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def is_valid_accession(accession: str) -> bool:
    """
    Check that an accession number is known to the ENA.

    The accession is looked up through the ENA browser XML API. Any response
    other than HTTP 200 is treated as an invalid accession.

    Parameters
    ----------
    accession : str
        The run accession number of the data to download.

    Returns
    -------
    bool
        Always True; an invalid accession is reported by raising instead.

    Raises
    ------
    ValueError
        If the lookup is not answered with HTTP 200.

    Examples
    --------
    >>> is_valid_accession("ERR11466368")
    True
    >>> is_valid_accession("ERR0000000")
    Traceback (most recent call last):
    ValueError: Invalid accession number: ERR0000000
    """

    url = f"https://www.ebi.ac.uk/ena/browser/api/xml/{accession}"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        raise ValueError(f"Invalid accession number: {accession}")

    return True

main(accession, timeout=300)

Function that calls all the other functions to download data from the ENA.

Parameters:

Name Type Description Default
accession str

The accession number of the data to download.

required
timeout int

The timeout in seconds for the download to complete. Default is 300 seconds.

300

Returns:

Type Description
None
Source code in ena_download/__init__.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def main(accession: str, timeout: int = 300) -> None:
    """
    Run the complete download workflow for one accession.

    The accession is validated first, then its data URLs are looked up, and
    finally those URLs are downloaded.

    Parameters
    ----------
    accession : str
        The accession number of the data to download.
    timeout : int
        The timeout in seconds for the download to complete. Default is 300 seconds.

    Returns
    -------
    None
    """
    # Raises ValueError for an invalid accession, before any lookup or download.
    is_valid_accession(accession)
    download_data(accession, extract_data_path(accession), timeout)