Source code for windpowerlib.data

"""
The ``data`` module contains functions to handle the needed data.

SPDX-FileCopyrightText: 2019 oemof developer group <contact@oemof.org>
SPDX-License-Identifier: MIT
"""

import logging
import os
import warnings
from shutil import copyfile

import pandas as pd
import requests
from windpowerlib.tools import WindpowerlibUserWarning
from windpowerlib.wind_turbine import WindTurbine



[docs]
def get_turbine_types(turbine_library="local", print_out=True, filter_=True):
    r"""
    Get the wind turbine types for which data is provided.

    Choose by `turbine_library` whether to get wind turbine types provided by
    the OpenEnergy Database ('oedb') or wind turbine types provided in your
    local file(s) ('local').
    By default only turbine types for which a power coefficient curve or power
    curve is provided are returned. Set `filter_=False` to see all turbine
    types for which any data (e.g. hub height, rotor diameter, ...) is
    provided.

    Parameters
    ----------
    turbine_library : str
        Specifies if the oedb turbine library ('oedb') or your local turbine
        data file ('local') is evaluated. Default: 'local'.
    print_out : bool
        Directly prints a table containing the turbine types in column
        'turbine_type', the manufacturer in column 'manufacturer' and
        information about whether a power (coefficient) curve exists (True) or
        not (False) in columns 'has_power_curve' and 'has_cp_curve'.
        While printing, the pandas option 'display.max_rows' is temporarily
        set to the number of rows; afterwards the previous value is restored.
        Default: True.
    filter_ : bool
        If True only turbine types for which a power coefficient curve or
        power curve is provided in the selected turbine library are
        returned. Default: True.

    Returns
    -------
    :pandas:`pandas.DataFrame<frame>`
        Contains turbine types in column 'turbine_type', the manufacturer in
        column 'manufacturer' and information about whether a power
        (coefficient) curve exists (True) or not (False) in columns
        'has_power_curve' and 'has_cp_curve'.

    Raises
    ------
    ValueError
        If `turbine_library` is neither 'local' nor 'oedb'.

    Notes
    -----
    If the power (coefficient) curve of the desired turbine type (or the
    turbine type itself) is missing you can contact us via github or
    windpowerlib@rl-institut.de. You can help us by providing data in the
    format as shown in `the data base
    <https://openenergy-platform.org/dataedit/view/supply/wind_turbine_library>`_.

    Examples
    --------
    >>> from windpowerlib import get_turbine_types
    >>> df=get_turbine_types(print_out=False)
    >>> print(df[df["turbine_type"].str.contains("E-126")].iloc[0])
    manufacturer          Enercon
    turbine_type       E-126/4200
    has_power_curve          True
    has_cp_curve             True
    Name: 5, dtype: object
    >>> print(df[df["manufacturer"].str.contains("Enercon")].iloc[0])
    manufacturer          Enercon
    turbine_type       E-101/3050
    has_power_curve          True
    has_cp_curve             True
    Name: 1, dtype: object

    """
    if turbine_library == "local":
        filename = os.path.join(
            os.path.dirname(__file__), "oedb", "turbine_data.csv"
        )
        # the turbine type is stored as index column; turn it back into a
        # regular column
        df = pd.read_csv(filename, index_col=0).reset_index()
    elif turbine_library == "oedb":
        df = fetch_turbine_data_from_oedb()
    else:
        raise ValueError(
            "`turbine_library` is '{}' ".format(turbine_library)
            + "but must be 'local' or 'oedb'."
        )

    if filter_:
        # missing flags are treated as "no curve available"
        cp_curves_df = df.loc[df["has_cp_curve"].fillna(False)][
            ["manufacturer", "turbine_type", "has_cp_curve"]
        ]
        p_curves_df = df.loc[df["has_power_curve"].fillna(False)][
            ["manufacturer", "turbine_type", "has_power_curve"]
        ]
        # the outer merge keeps turbines that have at least one of the two
        # curves; the flag of the missing curve is NaN and becomes False
        curves_df = pd.merge(
            p_curves_df, cp_curves_df, how="outer", sort=True
        ).fillna(False)
    else:
        curves_df = df[
            ["manufacturer", "turbine_type", "has_power_curve", "has_cp_curve"]
        ]

    if print_out:
        # option_context restores whatever value the caller had configured
        # (not the pandas default) and does so even if printing fails
        with pd.option_context("display.max_rows", len(curves_df)):
            print(curves_df)
    return curves_df



def fetch_turbine_data_from_oedb(
    schema="supply", table="wind_turbine_library"
):
    r"""
    Fetches turbine library from the OpenEnergy database (oedb).

    Parameters
    ----------
    schema : str
        Database schema of the turbine library.
    table : str
        Table name of the turbine library.

    Returns
    -------
    :pandas:`pandas.DataFrame<frame>`
        Turbine data of different turbines such as 'manufacturer',
        'turbine_type', 'nominal_power'.

    Raises
    ------
    ConnectionError
        If the server answers with a status code other than 200.

    """
    # url of OpenEnergy Platform that contains the oedb
    oep_url = "https://oep.iks.cs.ovgu.de/"
    # the base url already ends with a slash, so the path is appended without
    # a leading one to avoid a doubled slash in the request url
    url = oep_url + "api/v0/schema/{}/tables/{}/rows/?".format(schema, table)

    # load data
    result = requests.get(url, verify=True)
    if result.status_code != 200:
        raise ConnectionError(
            "Database (oep) connection not successful. \nURL: {2}\n"
            "Response: [{0}] \n{1}".format(
                result.status_code, result.text, url
            )
        )
    return pd.DataFrame(result.json())


def load_turbine_data_from_oedb(schema="supply", table="wind_turbine_library"):
    r"""
    Deprecated alias of :func:`store_turbine_data_from_oedb`.

    Emits a ``FutureWarning`` and forwards the call.

    Parameters
    ----------
    schema : str
        Database schema of the turbine library.
    table : str
        Table name of the turbine library.

    Returns
    -------
    Whatever :func:`store_turbine_data_from_oedb` returns for the given
    `schema` and `table`.

    """
    msg = (
        "\nUse >>store_turbine_data_from_oedb<< and not"
        " >>load_turbine_data_from_oedb<< in the future."
    )
    # stacklevel=2 attributes the warning to the code calling this deprecated
    # function instead of to this line
    warnings.warn(msg, FutureWarning, stacklevel=2)
    return store_turbine_data_from_oedb(schema=schema, table=table)



[docs]
def store_turbine_data_from_oedb(
    schema="supply", table="wind_turbine_library", threshold=0.2
):
    r"""
    Loads turbine library from the OpenEnergy database (oedb).

    The downloaded turbine data is written to csv files ('power_curves.csv',
    'power_coefficient_curves.csv' and 'turbine_data.csv' in the 'oedb'
    directory next to this module) for offline usage of the windpowerlib.
    If the files already exist they are overwritten.
    In case the turbine library on the oedb contains too many faulty
    turbines, the already existing files are not overwritten. The accepted
    share of faulty turbines can be set through the parameter `threshold`.
    Afterwards the stored data is checked with :func:`check_turbine_data`.

    Parameters
    ----------
    schema : str
        Database schema of the turbine library.
    table : str
        Table name of the turbine library.
    threshold : float
        In case there are turbines in the turbine library with faulty data
        (e.g. duplicate wind speed entries in the power (coefficient) curve
        data), the threshold defines the share of accepted faulty turbine
        data up to which the existing turbine data is overwritten by the
        newly downloaded data.
        For example, a threshold of 0.1 means that more than 10% of the
        turbines would need to have invalid data in order to discard the
        downloaded data. This is to make sure that in the rare case the oedb
        data is too buggy, the turbine data that is by default provided with
        the windpowerlib is not overwritten by poor data.

    Returns
    -------
    :pandas:`pandas.DataFrame<frame>`
        Turbine data of different turbines such as 'manufacturer',
        'turbine_type', 'nominal_power'.

    """
    raw_data = fetch_turbine_data_from_oedb(schema=schema, table=table)
    processed_data = _process_and_save_oedb_data(
        raw_data, threshold=threshold
    )
    # "{0}" is a placeholder the integrity check fills with the file name
    csv_template = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
    check_turbine_data(filename=csv_template)
    return processed_data



def _process_and_save_oedb_data(turbine_data, threshold=0.2):
    """
    Helper function to extract power (coefficient) curve data from the turbine library.

    Parameters
    -----------
    turbine_data : :pandas:`pandas.DataFrame<frame>`
        Raw turbine data downloaded from the oedb with
        :func:`fetch_turbine_data_from_oedb`.
    threshold : float
        See parameter `threshold` in func:`store_turbine_data_from_oedb`
        for more information.

    Returns
    --------
    :pandas:`pandas.DataFrame<frame>`
        Turbine data of different turbines such as 'manufacturer',
        'turbine_type', 'nominal_power'.

    """
    curve_types = ["power_curve", "power_coefficient_curve"]
    # get all power (coefficient) curves
    curve_dict = {}
    broken_turbines_dict = {}
    for curve_type in curve_types:
        broken_turbine_data = []
        curves_df = pd.DataFrame(columns=["wind_speed"])
        for index in turbine_data.index:
            if (
                turbine_data["{}_wind_speeds".format(curve_type)][index]
                and turbine_data["{}_values".format(curve_type)][index]
            ):
                try:
                    df = (
                        pd.DataFrame(
                            data=[
                                eval(
                                    turbine_data[
                                        "{}_wind_speeds".format(curve_type)
                                    ][index]
                                ),
                                eval(
                                    turbine_data["{}_values".format(curve_type)][
                                        index
                                    ]
                                ),
                            ]
                        )
                        .transpose()
                        .rename(
                            columns={
                                0: "wind_speed",
                                1: turbine_data["turbine_type"][index],
                            }
                        )
                    )
                    if not df.wind_speed.duplicated().any():
                        curves_df = pd.merge(
                            left=curves_df, right=df, how="outer", on="wind_speed"
                        )
                    else:
                        broken_turbine_data.append(
                            turbine_data.loc[index, "turbine_type"])
                except:
                    broken_turbine_data.append(turbine_data.loc[index, "turbine_type"])
        curve_dict[curve_type] = curves_df
        broken_turbines_dict[curve_type] = broken_turbine_data

    # check if there are faulty turbines and if so, raise warning
    # if there are too many, don't save downloaded data to disk but keep existing data
    if any(len(_) > 0 for _ in broken_turbines_dict.values()):
        issue_link = ("https://github.com/OpenEnergyPlatform/data-preprocessing"
                      "/issues/28")
        # in case only some data is faulty, only give out warning
        if all(len(_) < threshold * len(turbine_data)
               for _ in broken_turbines_dict.values()):
            save_turbine_data = True
            for curve_type in curve_types:
                if len(broken_turbines_dict[curve_type]) > 0:
                    logging.warning(
                        f"The turbine library data contains faulty {curve_type}s. The "
                        f"{curve_type} data can therefore not be loaded for the  "
                        f"following turbines: {broken_turbine_data}. "
                        f"Please report this in the following issue, in case it hasn't "
                        f"already been reported: {issue_link}"
                    )
                # set has_power_(coefficient)_curve to False for faulty turbines
                for turb in broken_turbines_dict[curve_type]:
                    ind = turbine_data[turbine_data.turbine_type == turb].index[0]
                    col = ("has_power_curve" if curve_type == "power_curve"
                           else "has_cp_curve")
                    turbine_data.at[ind, col] = False
        # in case most data is faulty, do not store downloaded data
        else:
            logging.warning(
                f"The turbine library data contains too many faulty turbine datasets "
                f"wherefore it is not loaded from the oedb. "
                f"In case you want to circumvent this behaviour, you can specify a "
                f"higher tolerance through the parameter 'threshold'."
                f"Please report this in the following issue, in case it hasn't "
                f"already been reported: {issue_link}"
            )
            save_turbine_data = False
    else:
        save_turbine_data = True

    if save_turbine_data:
        # standard file name for saving data
        filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
        # save curve data to csv
        for curve_type in curve_types:
            curves_df = curve_dict[curve_type].set_index(
                "wind_speed").sort_index().transpose()
            # power curve values in W
            if curve_type == "power_curve":
                curves_df *= 1000
            curves_df.index.name = "turbine_type"
            curves_df.sort_index(inplace=True)
            curves_df.to_csv(filename.format("{}s".format(curve_type)))

        # save turbine data to file (excl. curves)
        turbine_data_df = turbine_data.drop(
            [
                "power_curve_wind_speeds",
                "power_curve_values",
                "power_coefficient_curve_wind_speeds",
                "power_coefficient_curve_values",
                "thrust_coefficient_curve_wind_speeds",
                "thrust_coefficient_curve_values",
            ],
            axis=1,
        ).set_index("turbine_type")
        # nominal power in W
        turbine_data_df["nominal_power"] *= 1000
        turbine_data_df.sort_index(inplace=True)
        turbine_data_df.to_csv(filename.format("turbine_data"))
    return turbine_data


def check_turbine_data(filename):
    """
    Run the integrity check on the stored turbine data.

    If :func:`check_data_integrity` raises an exception, the default turbine
    data is restored with :func:`restore_default_turbine_data` and the
    exception is raised again.

    Parameters
    ----------
    filename : str
        Passed on unchanged to :func:`check_data_integrity`.

    Returns
    -------
    The return value of :func:`check_data_integrity`.

    """
    try:
        return check_data_integrity(filename)
    except Exception:
        # fall back to the shipped defaults before propagating the error
        restore_default_turbine_data()
        raise


def check_data_integrity(filename, min_pc_length=5):
    """
    Check the stored turbine data for inconsistencies and log warnings.

    For every row of the 'turbine_data' csv file a ``WindTurbine`` is created
    (with a fixed hub height of 135) and a warning is logged if

    - no power curve was found although 'has_power_curve' is True,
    - no power coefficient curve was found although 'has_cp_curve' is True,
    - the power curve has fewer than `min_pc_length` entries.

    Parameters
    ----------
    filename : str
        File name template containing the placeholder ``{0}``, which is
        replaced by 'turbine_data' to get the csv file that is read.
    min_pc_length : int
        Minimum number of power curve entries below which a warning is
        logged. Default: 5.

    Returns
    -------
    :pandas:`pandas.DataFrame<frame>`
        Content of the 'turbine_data' csv file with its first column as
        index.

    """
    data = pd.read_csv(filename.format("turbine_data"), index_col=[0])
    for wt_type, row in data.iterrows():
        # warnings raised while creating the turbine are suppressed; problems
        # are reported through the logging calls below
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            wt = WindTurbine(
                turbine_type="{0}".format(wt_type), hub_height=135
            )
            if wt.power_curve is None and row.has_power_curve is True:
                logging.warning(
                    f"{wt_type}: No power curve but has_power_curve=True."
                )
            if (
                wt.power_coefficient_curve is None
                and row.has_cp_curve is True
            ):
                logging.warning(
                    f"{wt_type}: No cp-curve but has_cp_curve=True."
                )
            if (
                wt.power_curve is not None
                and len(wt.power_curve) < min_pc_length
            ):
                logging.warning(
                    f"{wt_type}: power_curve is too short "
                    f"({len(wt.power_curve)} values),"
                )
    return data


def restore_default_turbine_data():
    """
    Copy the default turbine data files into the 'oedb' directory.

    Every entry of the directory 'data/default_turbine_data' next to this
    module is copied to the directory 'oedb' next to this module under the
    same name. Files that already exist there are overwritten.

    Returns
    -------
    None

    Examples
    --------
    >>> restore_default_turbine_data()

    """
    package_dir = os.path.dirname(__file__)
    defaults_dir = os.path.join(package_dir, "data", "default_turbine_data")
    target_dir = os.path.join(package_dir, "oedb")

    for name in os.listdir(defaults_dir):
        copyfile(
            os.path.join(defaults_dir, name), os.path.join(target_dir, name)
        )


def check_weather_data(weather_data):
    """
    Check weather Data Frame.

    - Convert the second column level (data heights) to numeric values.
    - Raise a warning if there are nan values.

    The columns of the passed data frame are replaced in place.

    Parameters
    ----------
    weather_data : pandas.DataFrame
        A weather table with MultiIndex columns (name, data height)

    Returns
    -------
    pandas.DataFrame : The passed weather table with numeric data heights.

    """
    # In some cases the data heights are strings, so make them numeric.
    variable_names = weather_data.columns.get_level_values(0)
    data_heights = pd.to_numeric(weather_data.columns.get_level_values(1))
    weather_data.columns = pd.MultiIndex.from_arrays(
        [variable_names, data_heights]
    )

    # check for nan values
    has_nan = weather_data.isnull().any()
    if has_nan.any():
        nan_columns = list(weather_data.columns[has_nan])
        msg = (
            "The following columns of the weather data contain invalid "
            "values like 'nan': {0}"
        )
        warnings.warn(msg.format(nan_columns), WindpowerlibUserWarning)
    return weather_data