a
    ٴ:eK                     @   s:  d dl mZmZmZmZ d dlZd dlZd dl	Z
e
jddejeejee f dddZee ddd	d
Ze
jddejeeeeef eeef ejdddZejeeeef eeef ejdddZejedddZejeeeef ejdddZejeeejdddZe
jddejeeef eeef eeeejee f dddZejeeef ejdddZejeeef eejee f d d!d"Zejeejd#d$d%Zee dd&d'd(Zejeeef ejdd)d*Ze
jddejeeef ejd+d,d-Ze
jddejeeef ejd+d.d/Zejeeef dd d0d1Zeeef eeef eeef eeef eeeef ed2d3d4Z eeef eeef eeeeef eeef eeef eeef eeef eejeeef f d5
d6d7Z!e
jddejeeef ejd8d9d:Z"dS );    )AnyDictListTupleNi,  )ttl)dfreturnc                 C   s0   | j dd}t||dk  j}| j|dd|fS )af  Remove columns with strictly less than 2 distinct values in input dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe whose columns will be checked and potentially removed.

    Returns
    -------
    pd.DataFrame
        Dataframe with empty columns removed.
    list
        List of columns that have been removed.
    Fdropna      axis)nuniquelistindexdrop)r   Z
count_cols
empty_cols r   iC:\Users\mjcastro\Documents\data-science-function-store\apps\fbco_simulator\app\..\lib\dataprep\format.pyremove_empty_cols   s    r   )r   r   c                 C   s`   t | }|dkr\td|dkr"dnd d|dkr4dnd d|dkrFd	nd
 dd|   dS )zDisplays a message in streamlit dashboard if the input list is not empty.

    Parameters
    ----------
    empty_cols : list
        List of columns that have been removed.
    r   The following columnr   s  have been removed because z	they havezit hasz <= 1 distinct values: , Nlensterrorjoin)r   Lr   r   r   print_empty_cols   s    &r$   )df_inputdate_col
target_colconfigload_optionsr   c                 C   s2   |   }t||||}t|||}t|||}|S )a  Formats date and target columns of input dataframe.

    Parameters
    ----------
    df_input : pd.DataFrame
        Input dataframe whose columns will be formatted.
    date_col : str
        Name of date column in input dataframe.
    target_col : str
        Name of target column in input dataframe.
    config : Dict
        Lib configuration dictionary.
    load_options : Dict
        Loading options selected by user.

    Returns
    -------
    pd.DataFrame
        Dataframe with columns formatted.
    )copy_format_date_format_target_rename_cols)r%   r&   r'   r(   r)   r   r   r   r   format_date_and_target-   s
    r.   )r   r&   r)   r(   r   c                 C   s   zt | | }t||d d |d kB rBt j| | |d d}|| |< | |  | |   j}| |  | |   j}|dk |dk @ t|t|@ B rt	
d t	  | W S    t	
d t	  Y n0 dS )a  Formats date column of input dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe whose columns will be formatted.
    date_col : str
        Name of date column in input dataframe.
    load_options : Dict
        Loading options selected by user.
    config : Dict
        Lib config dictionary containing information about default date format.

    Returns
    -------
    pd.DataFrame
        Dataframe with date column formatted.
    dataprepdate_format)formatr   zNPlease select the correct date column (selected column has a time range < 1s).zQPlease select a valid date format (selected column can't be converted into date).N)pdto_datetime__check_date_formatmaxmindayssecondsnpisnanr    r!   stop)r   r&   r)   r(   date_seriesZ
days_rangeZ	sec_ranger   r   r   r+   P   s(    $r+   )r<   r   c                 C   sZ   |  dd  dk }|  dd  dk }|  dd  dk }||@ |@ rRdS dS dS )	a  Checks whether the date column has been correctly converted to datetime.

    Parameters
    ----------
    date_series : pd.Series
        Date column that has been converted.

    Returns
    -------
    bool
        False if conversion has not worked correctly, True otherwise.
    c                 S   s   | j S N)yearxr   r   r   <lambda>       z%__check_date_format.<locals>.<lambda>r   c                 S   s   | j S r=   )monthr?   r   r   r   rA      rB   c                 S   s   | j S r=   )dayr?   r   r   r   rA      rB   TFN)mapr   )r<   Ztest1Ztest2Ztest3r   r   r   r4   {   s    r4   )r   r'   r(   r   c                 C   sf   zB| |  d| |< | |  |d d k r>td t  | W S    td t  Y n0 dS )a;  Formats target column of input dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe whose columns will be formatted.
    target_col : str
        Name of target column in input dataframe.

    Returns
    -------
    pd.DataFrame
        Dataframe with date column formatted.
    floatvalidityZmin_target_cardinalityzOPlease select the correct target column (should be numerical, not categorical).zIPlease select the correct target column (should be of type int or float).N)astyper   r    r!   r;   )r   r'   r(   r   r   r   r,      s    
r,   )r   r&   r'   r   c                 C   s\   |dkr"d| j v r"| jddid} |dkrDd| j v rD| jddid} | j|d|did} | S )a  Renames date and target columns of input dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe whose columns will be renamed.
    date_col : str
        Name of date column in input dataframe.
    target_col : str
        Name of target column in input dataframe.

    Returns
    -------
    pd.DataFrame
        Dataframe with columns renamed.
    yZy_2columnsdsZds_2)rK   rename)r   r&   r'   r   r   r   r-      s    r-   )r%   
dimensionsr(   r&   r'   r   c                 C   s2   |   }t||}t||\}}t||}||fS )a{  Filters and aggregates input dataframe according to dimensions dictionary specifications.

    Parameters
    ----------
    df_input : pd.DataFrame
        Input dataframe that will be filtered and/or aggregated.
    dimensions : Dict
        Filtering and aggregation specifications.
    config : Dict
        Lib configuration dictionary.
    date_col : str
        Name of date column in input dataframe.
    target_col : str
        Name of target column in input dataframe.

    Returns
    -------
    pd.DataFrame
        Dataframe filtered and/or aggregated.
    list
        List of columns removed from input dataframe.
    )r*   _filter_format_regressors
_aggregate)r%   rN   r(   r&   r'   r   cols_to_dropr   r   r   filter_and_aggregate_df   s
    

rS   )r   rN   r   c                 C   sF   t t| dh }|D ]}| j| | ||  } q| j|ddS )aC  Filters input dataframe according to dimensions dictionary specifications.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe that will be filtered and/or aggregated.
    dimensions : Dict
        Filtering specifications.

    Returns
    -------
    pd.DataFrame
        Filtered dataframe.
    aggr   r   )r   setkeyslocisinr   )r   rN   Zfilter_colscolr   r   r   rO      s    rO   )r   r(   r   c                 C   s   g }t | jddh D ]}| | jdddk r:|| q| | jdddkrx| | tt| |  ddg| |< q| |  |d d	 krt| |} qz| | 	d
| |< W q   || Y q0 q| j
|dd|fS )aX  Format some columns in input dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe whose columns will be formatted.
    config : Dict
        Lib configuration dictionary.

    Returns
    -------
    pd.DataFrame
        Formatted dataframe.
    list
        List of columns removed from input dataframe.
    rL   rI   Fr	   r   r   r   rG   Zmax_cat_reg_cardinalityrF   r   )rU   rK   r   appendrE   dictzipunique__one_hot_encodingrH   r   )r   r(   rR   rY   r   r   r   rP     s    *rP   )r   rY   r   c                 C   s.   t j| t j| | |dgdd} | j|ddS )a<  Applies one-hot encoding to some columns of input dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe whose columns will be one-hot encoded.
    col : list
        List of columns to one-hot encode.

    Returns
    -------
    pd.DataFrame
        One-hot encoded dataframe.
    )prefixr   r   )r2   concatget_dummiesr   )r   rY   r   r   r   r^   "  s     r^   )cols_removedr   c                 C   s`   t | }|dkr\td|dkr"dnd d|dkr4dnd d|dkrFd	nd
 dd|   dS )zDisplays a message in streamlit dashboard if the input list is not empty.

    Parameters
    ----------
    cols_removed : list
        List of columns that have been removed.
    r   r   r   r   r   r   r   r   zthey arezit iszA neither the target, nor a dimension, nor a potential regressor: r   Nr   )rb   r#   r   r   r   print_removed_cols5  s    &rc   c                    sH   t  jddh } fdd|D }|d  |d<  d| S )aH  Aggregates input dataframe according to dimensions dictionary specifications.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe that will be filtered and/or aggregated.
    dimensions : Dict
        Filtering specifications.

    Returns
    -------
    pd.DataFrame
        Aggregated dataframe.
    rL   rI   c                    s&   i | ]}| |   d krdndqS r   meanr5   r   .0rY   r   r   r   
<dictcomp>V  rB   z_aggregate.<locals>.<dictcomp>rT   )rU   rK   lowergroupbyrT   reset_index)r   rN   cols_to_aggagg_dictr   ri   r   rQ   F  s    rQ   )r%   
resamplingr   c                 C   sD   |   }|d d dv r@|d dd |d< t|d |d< |S )a  Formats date column to datetime in input dataframe.

    Parameters
    ----------
    df_input : pd.DataFrame
        Input dataframe whose date column will be formatted to datetime.
    resampling : Dict
        Dictionary whose "freq" key contains the frequency of input dataframe.

    Returns
    -------
    pd.DataFrame
        Dataframe with date column formatted to datetime.
    freq)Hr   rL   c                 S   s
   |  dS )Nz%Y-%m-%d %H:%M:%S)strftimer?   r   r   r   rA   m  rB   z!format_datetime.<locals>.<lambda>)r*   rE   r2   r3   )r%   rp   r   r   r   r   format_datetime[  s
    ru   c                    sj   |    |d rft jddh } fdd|D }|d  |d<  d|d d |   S )	a<  Resamples input dataframe according to resampling dictionary specifications.

    Parameters
    ----------
    df_input : pd.DataFrame
        Input dataframe that will be resampled.
    resampling : Dict
        Resampling specifications.

    Returns
    -------
    pd.DataFrame
        Resampled dataframe.
    resamplerL   rI   c                    s&   i | ]}| |   d krdndqS rd   rf   rg   ri   r   r   rj     rB   zresample_df.<locals>.<dictcomp>rT   rq   rr   )r*   rU   rK   rk   	set_indexrv   rT   rm   )r%   rp   rn   ro   r   ri   r   resample_dfr  s    "rx   c                 C   sB   t | |d d |d d  kr>tdt |  d t  dS )a  Displays a message in streamlit dashboard and stops it if the input dataframe has not enough rows.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    config : Dict
        Lib configuration dictionary where the minimum number of rows is given.
    rG   Zmin_data_points_trainZmin_data_points_valz(The dataset has not enough data points (zj data points only) to make a forecast. Please resample with a higher frequency or change cleaning options.N)r   r    r!   r;   )r   r(   r   r   r   check_dataset_size  s    ry   )datasetsdatesparamsrp   r&   rN   r   c                    s  d}d|   v r|| d jvr:td| d t  | d |  }ttj|d |d |d d}||krtd	| d
| d|d 	d d|d 	d d|d  d t  t
|d   }	t
| d  t |	t|	krZ fdd|	D }
t|
dkr<tdd|
dd  d|
d  d ntd|
d  d t  dd |  D }t |t|kr fdd|D }t|dkrtd d|dd  d|d  d ntd!|d  d t  d"}|S )#a  Displays a message if the future regressors dataframe is incorrect and says whether or not to use it afterwards.

    Parameters
    ----------
    datasets : Dict
        Dictionary storing all dataframes.
    dates : Dict
        Dictionary containing future forecasting dates information.
    params : Dict
        Dictionary containing all model parameters and list of selected regressors.
    resampling : Dict
        Dictionary containing dataset frequency information.
    date_col : str
        Name of date column.
    dimensions : Dict
        Dictionary containing dimensions information.

    Returns
    -------
    bool
        Whether or not to use regressors for future forecast.
    Ffuture_regressorszDate column 'z:' not found in the dataset provided for future regressors.forecast_start_dateforecast_end_daterq   startendrq   z\The dataset provided for future regressors has the right number of distinct dates (expected z, found z3). Please make sure that the date column goes from z%Y-%m-%dz to z at frequency z) without skipping any date in this range.
regressorsc                    s   g | ]}| vr|qS r   r   )rh   regZ
input_colsr   r   
<listcomp>  rB   z.check_future_regressors_df.<locals>.<listcomp>r   zColumns r   Nrr   z and z; are missing in the dataset provided for future regressors.zColumn r   z: is missing in the dataset provided for future regressors.c                 S   s   h | ]}|d kr|qS )rT   r   rh   dimr   r   r   	<setcomp>  rB   z-check_future_regressors_df.<locals>.<setcomp>c                    s   g | ]}| vr|qS r   r   r   r   r   r   r     rB   zDimension columns zDimension column T)rV   rK   r    r!   r;   r   r   r2   
date_rangert   rU   intersectionr"   )rz   r{   r|   rp   r&   rN   Zuse_regressorsZN_dates_inputZN_dates_expectedZregressors_expectedZmissing_regressorsZdim_expectedZmissing_dimr   r   r   check_future_regressors_df  sn    
""r   )
rz   r{   r&   r'   rN   r)   r(   rp   r|   r   c	                 C   s   d|   v r| d }	d|	|< tj| d t|	j |	gdd}	t|	\}	}
t|	||||}	t|	||||\}	}
t|	|}	t	|	|}	|	j
|	d |d k  | d< |	jdd	d}	n2tj| d j |d
 |d d}tj|dgd}	t|	|}	|	| fS )a*  Applies data preparation to the dataset provided with future regressors.

    Parameters
    ----------
    datasets : Dict
        Dictionary storing all dataframes.
    dates : Dict
        Dictionary containing future forecasting dates information.
    date_col : str
        Name of date column.
    target_col : str
        Name of target column.
    dimensions : Dict
        Dictionary containing dimensions information.
    load_options : Dict
        Loading options selected by user.
    config : Dict
        Lib configuration dictionary.
    resampling : Dict
        Resampling specifications.
    params : Dict
        Dictionary containing all model parameters

    Returns
    -------
    pd.DataFrame
        Prepared  future dataframe.
    dict
        Dictionary storing all dataframes.
    r}   r   uploadedr   rL   r~   fullrI   r   r   Zforecast_freqr   rJ   )rV   r2   r`   r   rK   r   r.   rS   ru   rx   rW   r   r   rL   r6   	DataFrameadd_cap_and_floor_cols)rz   r{   r&   r'   rN   r)   r(   rp   r|   future_Zfuture_datesr   r   r   prepare_future_df  s&    ) 


r   )r%   r|   r   c                 C   s<   |   }|d d dkr8|d d |d< |d d |d< |S )aM  Resamples input dataframe according to resampling dictionary specifications.

    Parameters
    ----------
    df_input : pd.DataFrame
        Input dataframe that will be resampled.
    params : Dict
        Model parameters.

    Returns
    -------
    pd.DataFrame
        Dataframe with cap and floor columns if specified.
    otherZgrowthlogisticZ
saturationcapfloor)r*   )r%   r|   r   r   r   r   r   8  s
    r   )#typingr   r   r   r   numpyr9   pandasr2   	streamlitr    
cache_datar   r   r$   strr.   r+   Seriesboolr4   r,   r-   rS   rO   rP   r^   rc   rQ   ru   rx   ry   r   r   r   r   r   r   r   <module>   sv   
"


#+ 


$*!
 
 




Z






?
