# Source code for pylim.reader

#!/usr/bin/env python
"""A collection of reader functions for instruments operated on HALO and model output files

*author*: Johannes Röttenbacher
"""

import datetime
import logging
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

from pylim import cirrus_hl
from pylim import helpers as h
from pylim import smart

log = logging.getLogger(__name__)


def read_bacardi_raw(filename: str, path: str) -> xr.Dataset:
    """Read raw BACARDI data as provided by DLR.

    Args:
        filename: name of file
        path: path to file

    Returns:
        Dataset with BACARDI data and time as dimension
    """
    filepath = os.path.join(path, filename)
    # the flight date is encoded as an 8 digit number in the filename
    flight_date = re.search(r"\d{8}", filename)[0]
    ds = xr.open_dataset(filepath)
    # make time the dimension and give it the conventional lowercase name
    ds = ds.rename({"TIME": "time"}).swap_dims({"tid": "time"})
    # overwrite time to make a datetime index
    # (raw values are offsets in ms from the start of the flight day)
    new_time = pd.to_datetime(ds.time, unit="ms",
                              origin=pd.to_datetime(flight_date, format="%Y%m%d"))
    return ds.assign({"time": new_time})
def read_bahamas(bahamas_path: str) -> xr.Dataset:
    """Reader function for netcdf BAHAMAS data as provided by DLR.

    Args:
        bahamas_path: full path of netcdf file

    Returns:
        xr.DataSet with BAHAMAS data and time as dimension
    """
    # promote TIME to the dimension, then rename it to the conventional lowercase "time"
    return (
        xr.open_dataset(bahamas_path)
        .swap_dims({"tid": "TIME"})
        .rename({"TIME": "time"})
    )
def read_lamp_file(campaign: str = "cirrus-hl", filename: str = None, plot: bool = True,
                   save_fig: bool = True, save_file: bool = True) -> pd.DataFrame:
    """Read in the 1000W lamp specification file interpolated to 1nm steps.

    Converts W/cm^2 to W/m^2.

    Args:
        campaign: for which campaign should the lamp file be read in?
        filename: name of lamp file to be read in
        plot: plot lamp file?
        save_fig: save figure to standard plot path defined in config.toml?
        save_file: save lamp file to standard calib path defined in config.toml?

    Returns:
        A data frame with the irradiance in W/m² and the corresponding wavelength in nm
    """
    # set paths
    calib_path = h.get_path("calib", campaign=campaign)
    plot_path = h.get_path("plot", campaign=campaign)
    lamp_path = h.get_path("lamp", campaign=campaign)  # get path to lamp defined in config.toml
    # read in lamp file (name for cirrus-hl and halo-ac3)
    lamp_file = "F1587i01_19.std" if filename is None else filename
    names = ["Irradiance"]  # column name
    lamp = pd.read_csv(os.path.join(lamp_path, lamp_file), skiprows=1, header=None, names=names)
    # TODO: make the wavelength definition more flexible,
    #  it should adjust itself according to information given in the lamp file
    lamp["Wavelength"] = np.arange(250, 2501)
    # convert from W/cm^2 to W/m^2; cm = m * 10^-2 => cm^2 = (m * 10^-2)^2 = m^2 * 10^-4 => W*10^4/m^2
    lamp["Irradiance"] = lamp["Irradiance"] * 1e4
    if plot:
        # plot lamp calibration
        lamp.plot(x="Wavelength", y="Irradiance", ylabel="Irradiance (W$\\,$m$^{-2}$)",
                  xlabel="Wavelength (nm)",  # bug fix: label read "Wavelenght (nm)"
                  legend=False, title="1000W Lamp F-1587 interpolated on 1nm steps")
        plt.grid()
        plt.tight_layout()
        if save_fig:
            plt.savefig(f"{plot_path}/1000W_Lamp_F1587_1nm_19.png", dpi=100)
        plt.show()
        plt.close()
    if save_file:
        # save lamp file in calib folder
        lamp.to_csv(f"{calib_path}/1000W_lamp_F1587_1nm_19.dat", index=False)
    return lamp
def read_smart_raw(path: str, filename: str) -> pd.DataFrame:
    """Read raw SMART data files.

    Args:
        path: Path where to find file
        filename: Name of file

    Returns:
        pandas DataFrame with column names and datetime index
    """
    filepath = os.path.join(path, filename)
    date_str, channel, _ = smart.get_info_from_filename(filename)
    # the number of spectrometer pixels depends on the channel
    if channel == "SWIR":
        n_pixels = 256
    elif channel == "VNIR":
        n_pixels = 1024
    else:
        raise ValueError("channel has to be 'SWIR' or 'VNIR'!")
    # first three columns: Time (hh mm ss.ss), integration time (ms), shutter flag
    header = ["time", "t_int", "shutter"] + list(range(1, n_pixels + 1))
    df = pd.read_csv(filepath, sep="\t", header=None, names=header)
    # combine the date from the filename with the per-row time to form a datetime index
    timestamps = pd.to_datetime(date_str + " " + df["time"], format="%Y_%m_%d %H %M %S.%f")
    return df.set_index(timestamps).drop("time", axis=1)
def read_smart_cor(path: str, filename: str) -> pd.DataFrame:
    """Read dark current corrected SMART data files.

    Args:
        path: Path where to find file
        filename: Name of file

    Returns:
        pandas DataFrame with column names and datetime index
    """
    filepath = os.path.join(path, filename)
    data = pd.read_csv(filepath, sep="\t", index_col="time", parse_dates=True)
    # column labels are pixel numbers stored as strings -> make them numeric
    data.columns = pd.to_numeric(data.columns)
    return data
def read_pixel_to_wavelength(path: str, spectrometer: str) -> pd.DataFrame:
    """Read file which maps each pixel to a certain wavelength for a specified spectrometer.

    Args:
        path: Path where to find file
        spectrometer: For which spectrometer the pixel to wavelength mapping should be read in,
            refer to lookup table for possible spectrometers (e.g. ASP06_J3)

    Returns:
        pandas DataFrame relating pixel number to wavelength
    """
    filename = f"pixel_wl_{cirrus_hl.smart_lookup[spectrometer]}.dat"
    file = os.path.join(path, filename)
    # raw string for the regex separator avoids an invalid escape sequence warning (py3.12+)
    df = pd.read_csv(file, sep=r"\s+", skiprows=7, header=None, names=["pixel", "wavelength"])
    # NOTE: sorting by wavelength and resetting the index (necessary for the SWIR
    # spectrometers) is deliberately disabled here; re-enable if needed:
    # df = df.sort_values(by="wavelength").reset_index(drop=True)
    return df
def read_stabbi_data(stabbi_path: str) -> pd.DataFrame:
    """Read in stabilization platform data from SMART.

    Args:
        stabbi_path: full path to dat file

    Returns:
        pandas DataFrame with headers and a DateTimeIndex
    """
    data = pd.read_csv(stabbi_path, skipinitialspace=True, sep="\t")
    # combine the date and PC time columns into one timestamp per row
    timestamps = pd.to_datetime(data["DATE"] + " " + data["PCTIME"],
                                format='%Y/%m/%d %H:%M:%S.%f')
    data["PCTIME"] = timestamps
    data = data.set_index("PCTIME")
    data.index.name = "time"
    return data
def read_nav_data(nav_path: str) -> pd.DataFrame:
    """Reader function for Navigation data file from the INS used by the stabilization of SMART.

    Args:
        nav_path: path to IMS file including filename

    Returns:
        pandas DataFrame with headers and a DateTimeIndex
    """
    # read out the start time information given in the file
    with open(nav_path, encoding="cp1252") as f:
        time_info = f.readlines()[1]
    # the timestamp sits at fixed character positions in the second header line
    start_time = pd.to_datetime(time_info[11:31], format="%m/%d/%Y %H:%M:%S")
    # define the start date of the measurement
    start_date = pd.Timestamp(year=start_time.year, month=start_time.month, day=start_time.day)
    header = ["marker", "seconds", "roll", "pitch", "yaw", "AccS_X", "AccS_Y", "AccS_Z",
              "OmgS_X", "OmgS_Y", "OmgS_Z"]
    # raw string for the regex separator avoids an invalid escape sequence warning (py3.12+)
    nav = pd.read_csv(nav_path, sep=r"\s+", skiprows=13, header=None, names=header,
                      encoding="cp1252")
    # seconds since the start of the measurement day -> absolute timestamps
    nav["time"] = pd.to_datetime(nav["seconds"], origin=start_date, unit="s")
    nav = nav.set_index("time")
    return nav
def read_ins_gps_pos(filepath: str) -> pd.DataFrame:
    """Read in a GPS position file as returned by the HALO-SMART INS system.

    Args:
        filepath: complete path to Nav_GPSPosxxxx.Asc file

    Returns:
        time series with the GPS position data
    """
    # read the start time from fixed character positions in the second header line
    with open(filepath, encoding="cp1252") as f:
        time_info = f.readlines()[1]
    start_time = pd.to_datetime(time_info[11:31], format="%m/%d/%Y %H:%M:%S")
    # define the start date of the measurement
    start_date = pd.Timestamp(year=start_time.year, month=start_time.month, day=start_time.day)
    header = ["marker", "seconds", "lon", "lat", "alt", "lon_std", "lat_std", "alt_std"]
    # raw string for the regex separator avoids an invalid escape sequence warning (py3.12+)
    df = pd.read_csv(filepath, sep=r"\s+", skiprows=10, header=None, names=header,
                     encoding="cp1252")
    # seconds since the start of the measurement day -> absolute timestamps
    df["time"] = pd.to_datetime(df["seconds"], origin=start_date, unit="s")
    df = df.set_index("time")
    return df
def read_ins_gps_vel(filepath: str) -> pd.DataFrame:
    """Read in a GPS velocity file as returned by the HALO-SMART INS system.

    Args:
        filepath: complete path to Nav_GPSVelxxxx.Asc file

    Returns:
        time series with the GPS velocity data
    """
    # read the start time from fixed character positions in the second header line
    with open(filepath, encoding="cp1252") as f:
        time_info = f.readlines()[1]
    start_time = pd.to_datetime(time_info[11:31], format="%m/%d/%Y %H:%M:%S")
    # define the start date of the measurement
    start_date = pd.Timestamp(year=start_time.year, month=start_time.month, day=start_time.day)
    header = ["marker", "seconds", "v_east", "v_north", "v_up",
              "v_east_std", "v_north_std", "v_up_std"]
    # raw string for the regex separator avoids an invalid escape sequence warning (py3.12+)
    df = pd.read_csv(filepath, sep=r"\s+", skiprows=10, header=None, names=header,
                     encoding="cp1252")
    # seconds since the start of the measurement day -> absolute timestamps
    df["time"] = pd.to_datetime(df["seconds"], origin=start_date, unit="s")
    df = df.set_index("time")
    return df
def read_libradtran(flight: str, filename: str) -> pd.DataFrame:
    """Read an old libRadtran simulation file generated by
    ``01_dirdiff_BBR_Cirrus_HL_Server_jr.pro`` and add a DateTime Index.

    Args:
        flight: which flight does the simulation belong to (e.g. Flight_20210629a)
        filename: filename

    Returns:
        DataFrame with libRadtran output data
    """
    # bug fix: actually use the given filename (the parameter was previously unused
    # and the path contained a hard coded placeholder)
    file_path = f"{h.get_path('libradtran', flight)}/{filename}"
    # raw string for the regex separator avoids an invalid escape sequence warning (py3.12+)
    bbr_sim = pd.read_csv(file_path, sep=r"\s+", skiprows=34)
    # the flight string carries the date at a fixed position (e.g. Flight_20210629a)
    date_dt = datetime.datetime.strptime(flight[7:15], "%Y%m%d")
    date_ts = pd.Timestamp(year=date_dt.year, month=date_dt.month, day=date_dt.day)
    bbr_sim["time"] = pd.to_datetime(bbr_sim["sod"], origin=date_ts, unit="s")  # add a datetime column
    bbr_sim = bbr_sim.set_index("time")  # set it as index
    return bbr_sim
def read_ozone_sonde(filepath: str) -> pd.DataFrame:
    """Reader function for ames formatted ozone sonde data from http://www.ndaccdemo.org/

    The column layout and the number of header lines to skip are chosen from
    substrings ("sc", "ny" or "ho") found in the file path.

    Args:
        filepath: complete path to file

    Returns:
        pandas DataFrame with ozone volume mixing ratio
    """
    # bug fix: the return annotation was the call ``pd.DataFrame()`` instead of the type
    header_ny = ["ElapTime", "Press", "GeopHgt", "Temp", "RH", "PO3", "DD", "FF", "GPSHgt",
                 "Lon", "Lat", "PmpT", "Ozi", "Vpmp", "Ipmp"]
    header_sc = ["ElapTime", "Press", "GeopHgt", "Temp", "RH", "T_styro", "PO3",
                 "HorWindDir", "HorWindSpeed"]
    header_ho = ["Press", "ElapTime", "GeopHgt", "Temp", "RH", "PO3",
                 "HorWindDir", "HorWindSpeed"]
    # raw strings for the regex separator avoid invalid escape sequence warnings (py3.12+)
    if "sc" in filepath:
        df = pd.read_csv(filepath, skiprows=134, sep=r"\s+", names=header_sc, na_values=[99.99])
        df["o3_vmr"] = df.PO3 * 1e-3 / (df.Press * 1e2)  # mPa and hPa
    elif "ny" in filepath:
        df = pd.read_csv(filepath, skiprows=170, sep=r"\s+", names=header_ny, na_values=[99.99])
        df["o3_vmr"] = df.PO3 * 1e-3 / (df.Press * 1e2)  # mPa and hPa
    else:
        assert "ho" in filepath, "No header information for given file! Adjust reader function!"
        df = pd.read_csv(filepath, skiprows=68, sep=r"\s+", names=header_ho, na_values=[999.9])
        # NOTE(review): no o3_vmr column is computed for the "ho" layout — confirm intended
    return df
def read_ecrad_output(filepath: str) -> xr.Dataset:
    """Read in and preprocess a merged ecRad output file from :py:module:ecrad_merge_files.py

    - Assign coordinates to dimensions
    - Calculate pressure height in m

    Args:
        filepath: Full path to file

    Returns:
        xarray DataSet with added coordinates to all dimensions of the file
    """
    # deprecated: this preprocessing lives in its own script now
    warnings.warn("This is done in its own script now. The function is obsolete and should not be used.",
                  DeprecationWarning, stacklevel=2)
    ds = xr.open_dataset(filepath)
    # assign coordinates to band_sw, band_lw and half_level
    # (14 shortwave bands, 16 longwave bands, 138 half levels — TODO confirm against model setup)
    ds = ds.assign_coords({"band_sw": range(1, 15), "band_lw": range(1, 17), "half_level": range(138)})
    # calculate pressure height
    q_air = 1.292  # dry air density at 0°C in kg/m3
    g_geo = 9.81  # earth acceleration in m/s^2
    pressure_hl = ds["pressure_hl"]
    # height from pressure relative to the lowest half level (index 137):
    # -p_ref * ln(p / p_ref) / (rho * g)
    ds["press_height"] = -(pressure_hl[:, :, 137]) * np.log(pressure_hl[:, :, :] / pressure_hl[:, :, 137]) / (q_air * g_geo)
    # replace TOA height (calculated as infinity) with nan
    ds["press_height"] = ds["press_height"].where(ds["press_height"] != np.inf, np.nan)
    return ds
[docs]def read_cams_file(filepath: str) -> xr.Dataset: """ Read a CAMS atmosphere file Args: filepath: full path to nc file Returns: DataSet with """