Source code for iris.data

"""
Utilities for finding and downloading IRIS data
"""

from __future__ import annotations
from typing import Sequence
import pathlib
import shutil
import requests
import astropy.time
import iris

# Names exported as the public API of this module.
__all__ = [
    "query_hek",
    "urls_hek",
    "download",
    "decompress",
]



[docs]
def query_hek(
    time_start: None | astropy.time.Time = None,
    time_stop: None | astropy.time.Time = None,
    description: str = "",
    obs_id: None | int = None,
    limit: int = 200,
    nrt: bool = False,
) -> str:
    """
    Constructs a query that can be sent to the Heliophysics Event Knowledge
    Base (HEK) to receive a list of URLs.

    The ``description`` is percent-encoded before being inserted into the
    query, so characters such as spaces, ``&`` or ``#`` cannot corrupt
    the other query parameters.

    Parameters
    ----------
    time_start
        The start time of the search period. If :obj:`None`, the start of operations,
        2013-07-20 will be used.
    time_stop
        The end time of the search period. If :obj:`None`, the current time will be used.
    description
        The description of the observation. If an empty string, observations with
        any description will be returned.
    obs_id
        the OBSID of the observation, a number which describes the size, cadence,
        etc. of the observation. If :obj:`None`, all OBSIDs will be used.
    limit
        the maximum number of results requested from the query
    nrt
        Whether to return results with near-real-time (NRT) data.
        If :obj:`True`, the query is built with ``hasData=false``,
        otherwise with ``hasData=true``.

    Returns
    -------
    str
        The complete query URL.

    Examples
    --------

    Construct a query for the first 100 A1: QS monitoring observations in 2023

    .. jupyter-execute::

        import astropy.time
        import iris

        iris.data.query_hek(
            time_start=astropy.time.Time("2023-01-01T00:00"),
            time_stop=astropy.time.Time("2024-01-01T00:00"),
            description="A1: QS monitoring",
            limit=100,
        )
    """
    # Imported locally so this function does not depend on a module-level
    # import that the rest of the file does not need.
    import urllib.parse

    format_spec = "%Y-%m-%dT%H:%M"

    if time_start is None:
        time_start = astropy.time.Time("2013-07-20T00:00")

    if time_stop is None:
        time_stop = astropy.time.Time.now()

    has_data = "false" if nrt else "true"

    # Escape every reserved character: a raw "&", "#", "+" or "=" in the
    # description would otherwise be parsed as query-string syntax.
    description_encoded = urllib.parse.quote(description, safe="")

    query = (
        "https://www.lmsal.com/hek/hcr?cmd=search-events3"
        "&outputformat=json"
        f"&startTime={time_start.strftime(format_spec)}"
        f"&stopTime={time_stop.strftime(format_spec)}"
        f"&hasData={has_data}"
        "&hideMostLimbScans=true"
        f"&obsDesc={description_encoded}"
        f"&limit={limit}"
    )
    if obs_id is not None:
        query += f"&obsId={obs_id}"

    return query




[docs]
@iris.memory.cache
def urls_hek(
    time_start: None | astropy.time.Time = None,
    time_stop: None | astropy.time.Time = None,
    description: str = "",
    obs_id: None | int = None,
    limit: int = 200,
    nrt: bool = False,
    spectrograph: bool = True,
    sji: bool = True,
    deconvolved: bool = False,
    num_retry: int = 5,
) -> list[str]:
    """
    Find a list of URLs to download matching the given parameters.

    Parameters
    ----------
    time_start
        The start time of the search period. If :obj:`None`, the start of operations,
        2013-07-20 will be used.
    time_stop
        The end time of the search period. If :obj:`None`, the current time will be used.
    description
        The description of the observation. If an empty string, observations with
        any description will be returned.
    obs_id
        the OBSID of the observation, a number which describes the size, cadence,
        etc. of the observation. If :obj:`None`, all OBSIDs will be used.
    limit
        The maximum number of observations returned by the query.
        Note that this is not the same as the number of files since there
        are several files per observation.
    nrt
        Whether to return results with near-real-time (NRT) data.
    spectrograph
        Boolean flag controlling whether to include spectrograph data.
    sji
        Boolean flag controlling whether to include SJI data.
    deconvolved
        Boolean flag controlling whether to include the deconvolved slitjaw
        imagery. Has no effect if ``sji`` is :obj:`False`.
    num_retry
        The number of times to try to connect to the server.

    Returns
    -------
    list[str]
        The matching URLs, in the order they appear in the server response.

    Raises
    ------
    ConnectionError
        If none of the ``num_retry`` attempts produced a response.
        The last underlying :mod:`requests` error is attached as the cause.

    Examples
    --------
    Find the URLs of the last 5 "A1: QS monitoring" spectrograph observations
    in 2023.

    .. jupyter-execute::

        import astropy.time
        import iris

        iris.data.urls_hek(
            time_start=astropy.time.Time("2023-01-01T00:00"),
            time_stop=astropy.time.Time("2024-01-01T00:00"),
            description="A1: QS monitoring",
            limit=5,
            sji=False,
        )
    """
    query = query_hek(
        time_start=time_start,
        time_stop=time_stop,
        description=description,
        obs_id=obs_id,
        limit=limit,
        nrt=nrt,
    )

    # Remember the most recent failure so the final error can report why
    # every attempt failed instead of discarding that information.
    last_error = None
    for _ in range(num_retry):
        try:
            response = requests.get(query, timeout=5).json()
            break
        except requests.exceptions.RequestException as error:  # pragma: no cover
            last_error = error
    else:  # pragma: no cover
        raise ConnectionError(f"Could not get query {query}") from last_error

    result = []
    for event in response["Events"]:
        for group in event["groups"]:

            url = group["comp_data_url"].replace("data_lmsal", "data")

            if spectrograph and "raster" in url:
                result.append(url)

            # Deconvolved SJI files are only included on explicit request.
            if sji and "SJI" in url:
                if deconvolved or "deconvolved" not in url:
                    result.append(url)

    return result




[docs]
def download(
    urls: list[str],
    directory: None | pathlib.Path = None,
    overwrite: bool = False,
) -> list[pathlib.Path]:
    """
    Download the given URLs to a specified directory.
    If `overwrite` is :obj:`False`, the file will not be downloaded if it exists.

    Each file is streamed to a temporary ``.part`` file and only renamed to
    its final name once the transfer has completed, so a failed or
    interrupted download is never mistaken for an existing file later.

    Parameters
    ----------
    urls
        The URLs to download.
    directory
        The directory to place the downloaded files.
        If :obj:`None`, ``~/.iris/cache`` is used.
        The directory is created if it does not exist.
    overwrite
        Boolean flag controlling whether to overwrite existing files.

    Returns
    -------
    list[pathlib.Path]
        The sorted local paths corresponding to the given URLs.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers a download request with an error status.

    Examples
    --------
    Download the last "A1: QS monitoring" spectrograph file in 2023.

    .. jupyter-execute::

        import astropy.time
        import iris

        urls = iris.data.urls_hek(
            time_start=astropy.time.Time("2023-01-01T00:00"),
            time_stop=astropy.time.Time("2024-01-01T00:00"),
            description="A1: QS monitoring",
            limit=1,
            sji=False,
        )

        iris.data.download(urls)
    """
    if directory is None:
        directory = pathlib.Path.home() / ".iris/cache"

    directory.mkdir(parents=True, exist_ok=True)

    result = []
    for url in urls:

        # The local filename is the last component of the URL path.
        file = directory / url.split("/")[-1]

        if overwrite or not file.exists():
            partial = file.with_name(file.name + ".part")
            with requests.get(url, stream=True) as response:
                # Without this check an HTTP error page would be saved
                # as if it were the requested archive.
                response.raise_for_status()
                try:
                    with open(partial, "wb") as f:
                        # Write in chunks so the whole file is never
                        # held in memory at once.
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            f.write(chunk)
                except BaseException:
                    partial.unlink(missing_ok=True)
                    raise
            partial.replace(file)

        result.append(file)

    return sorted(result)




[docs]
def decompress(
    archives: Sequence[pathlib.Path],
    directory: None | pathlib.Path = None,
    overwrite: bool = False,
) -> list[pathlib.Path]:
    """
    Decompress a list of ``.tar.gz`` files.

    Each ``.tar.gz`` file is decompressed and the ``.fits`` files within the
    archive are appended to the returned list.

    Every archive is unpacked into its own subdirectory, named after the
    archive with the ``.tar.gz`` suffix removed.

    Parameters
    ----------
    archives
        A list of ``.tar.gz`` files to decompress.
    directory
        A filesystem directory to place the decompressed results.
        If :obj:`None`, the directory of the ``.tar.gz`` archive will be used.
    overwrite
        If the file already exists, it will be overwritten.

    Returns
    -------
    list[pathlib.Path]
        The ``.fits`` files found in each unpacked archive, sorted within
        each archive and concatenated in the order of ``archives``.

    Examples
    --------
    Download the last "A1: QS monitoring" spectrograph file in 2023 and
    decompress it into a list of ``.fits`` files.

    .. jupyter-execute::

        import astropy.time
        import iris

        # Find the URL of the .tar.gz archive
        urls = iris.data.urls_hek(
            time_start=astropy.time.Time("2023-01-01T00:00"),
            time_stop=astropy.time.Time("2024-01-01T00:00"),
            description="A1: QS monitoring",
            limit=1,
            sji=False,
        )

        # Download the .tar.gz archive
        archives = iris.data.download(urls)

        # Decompress the .tar.gz archive into a list of fits files
        iris.data.decompress(archives)
    """

    result = []

    for archive in archives:

        # Resolve the target per archive: reassigning `directory` here would
        # make every later archive reuse the first archive's parent.
        parent = archive.parent if directory is None else directory

        # Strip both suffixes of ".tar.gz" to name the extraction directory.
        destination = parent / pathlib.Path(archive.stem).stem

        if overwrite or not destination.exists():
            shutil.unpack_archive(archive, extract_dir=destination)

        result.extend(sorted(destination.rglob("*.fits")))

    return result