This commit is contained in:
2023-11-20 15:36:38 +08:00
committed by GitHub
commit 1295646024
10 changed files with 200 additions and 0 deletions

9
LICENSE Normal file
View File

@@ -0,0 +1,9 @@
MIT License
Copyright (c) 2023 Yinr
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

1
README.md Normal file
View File

@@ -0,0 +1 @@
# Stats Helper

130
StatsHelper/DataLoader.py Normal file
View File

@@ -0,0 +1,130 @@
from dataclasses import dataclass
from typing import Callable, TypeAlias
from pathlib import Path
from functools import reduce
import pickle
import logging
import pandas as pd
from utils.helper import asList
# Name under which this module's logger is registered.
MODULE_NAME = 'DataLoader'
logger = logging.getLogger(MODULE_NAME)
# Parent of the directory that contains this file.
BASE_DIR = Path(__file__).parents[1]
# Conventional sub-directories of BASE_DIR; they are only defined here, not created.
DATA_DIR = BASE_DIR / 'data'
OUTPUT_DIR = BASE_DIR / 'output'
# A preprocessing step: receives a DataFrame and returns a DataFrame.
DataLoaderPreprocessor: TypeAlias = Callable[[pd.DataFrame], pd.DataFrame]
@dataclass
class DataLoaderExtMetaItem:
    """Record pairing a list of file extensions with a loader callable."""

    # File suffixes (presumably including the leading dot, e.g. '.csv' -- TODO confirm).
    supported_ext: list[str]
    # Callable associated with those suffixes (presumably reads such a file -- TODO confirm).
    loader: Callable
# Alias used to annotate the DataLoader.ExtMeta class attribute.
# NOTE(review): DataLoader.ExtMeta maps each format key to a plain dict with
# 'supported_ext' and 'loader' entries, not to DataLoaderExtMetaItem values, so
# this alias does not describe that table accurately -- TODO reconcile.
DataLoaderExtMeta: TypeAlias = dict[str, dict[str, DataLoaderExtMetaItem]]
class DataLoader(object):
    """Load a data file into ``self.data``, optionally cached as a pickle dump.

    On construction the loader either restores a previously dumped
    ``DataLoader`` from ``dump_path`` or reads ``file_path`` (CSV, Excel or
    pickle, chosen by file suffix) and runs the configured preprocessors on
    the result.

    SECURITY: both the dump file and ``.pkl`` sources are read with
    ``pickle.load``, which can execute arbitrary code. Only load files that
    come from a trusted source.
    """

    # NOTE(review): this table is not consulted by ``load`` (which dispatches
    # on the file suffix directly), and its values are plain dicts rather than
    # ``DataLoaderExtMetaItem`` instances as the annotation suggests.
    ExtMeta: DataLoaderExtMeta = {
        'csv': {
            'supported_ext': ['.csv'],
            'loader': pd.read_csv,
        },
        'xlsx': {
            'supported_ext': ['.xls', '.xlsx'],
            'loader': pd.read_excel,
        },
        'pkl': {
            'supported_ext': ['.pkl'],
            'loader': pickle.load,
        },
    }

    def __init__(
        self,
        filename: str,
        *,
        preprocessor: DataLoaderPreprocessor | list[DataLoaderPreprocessor] | None = None,
        force_load=False,
        data_dir=None,
        dump_dir=None,
        dump_name="",
        dump_suffix=".dlpkl",
        auto_dump=False,
        force_dump=False,
        **kwargs,
    ):
        """
        Configure paths and immediately load the data.

        Parameters:
            filename (str): Name of the source file, relative to ``data_dir``.
            preprocessor: A single callable or a list/tuple of callables applied
                in order to freshly loaded data. Defaults to no preprocessing.
            force_load (bool): If True, read the source file even if a dump exists.
            data_dir: Directory containing ``filename``. Defaults to ``'.'``.
            dump_dir: Directory of the dump file. Defaults to ``data_dir``.
            dump_name (str): File name of the dump. When empty, ``filename``
                with its suffix replaced by ``dump_suffix`` is used.
            dump_suffix (str): Suffix for the derived dump file name.
            auto_dump (bool): If True, dump the loader right after loading.
            force_dump (bool): With ``auto_dump``, overwrite an existing dump.
            **kwargs: Forwarded to the reader (``pandas.read_csv``,
                ``pandas.read_excel`` or ``pickle.load``).
        """
        # TODO: support both the `dump_dir` + `dump_filename` form and a single `dump_file` form
        # TODO: add support list of preprocessor
        self.data_dir = Path(data_dir if data_dir else '.')
        self.file_path = self.data_dir / filename
        self.dump_dir = Path(dump_dir if dump_dir else self.data_dir)
        if dump_name:
            self.dump_path = self.dump_dir / dump_name
        else:
            self.dump_path = (self.dump_dir / filename).with_suffix(dump_suffix)
        self.load_args: dict[str, object] = kwargs
        self.preprocessor: list[DataLoaderPreprocessor] = asList(preprocessor) if preprocessor is not None else []
        logger.debug('DataLoader Filepath: %s', self.file_path)
        logger.debug('DataLoader Dumppath: %s', self.dump_path)
        logger.debug('DataLoader Preprocessor Count: %s', len(self.preprocessor))
        self.data: pd.DataFrame | None = None
        self.load(force=force_load)
        if auto_dump:
            self.dump(overwrite=force_dump)

    def load(self, *, force=False):
        """
        Loads data from a file or a dump file.

        Parameters:
            force (bool): If True, forces the data to be loaded even if a dump file exists.

        Returns:
            DataLoader: Returns the DataLoader object itself.

        Raises:
            FileNotFoundError: If the file path specified does not exist.
            ValueError: If the file type is not supported.
        """
        if not force and self.dump_path.exists():
            logger.debug('Loading from %s', self.dump_path)
            # SECURITY: unpickling executes code embedded in the file; the dump
            # must come from a trusted source.
            with open(self.dump_path, 'rb') as f:
                cached: DataLoader = pickle.load(f)
            self.data = cached.data
            # Settings passed to this instance win; otherwise fall back to the
            # ones stored in the dump.
            if not self.preprocessor:
                self.preprocessor = cached.preprocessor
            if not self.load_args:
                self.load_args = cached.load_args
            return self

        if not self.file_path.exists():
            raise FileNotFoundError(f'{self.file_path} not found.')
        data = self._read_source()
        if self.preprocessor:
            logger.debug('Applying Preprocessor')
            # Preprocessors run only on freshly read data, in list order.
            for step in self.preprocessor:
                data = step(data)
        self.data = data
        return self

    def _read_source(self):
        """Read ``self.file_path`` with the reader that matches its suffix."""
        suffix = self.file_path.suffix.lower()
        if suffix == '.csv':
            logger.debug('Loading from "%s" using pandas.read_csv', self.file_path)
            return pd.read_csv(self.file_path, **self.load_args)
        if suffix in ('.xls', '.xlsx'):
            logger.debug('Loading from "%s" using pandas.read_excel', self.file_path)
            return pd.read_excel(self.file_path, **self.load_args)
        if suffix == '.pkl':
            logger.debug('Loading from "%s" using pickle.load', self.file_path)
            # pickle.load needs an open binary file object, not a path.
            with open(self.file_path, 'rb') as f:
                return pickle.load(f, **self.load_args)
        raise ValueError(f'{self.file_path.suffix} file not supported.')

    def dump(self, *, overwrite=False):
        """
        Dump the object to a file.

        The whole DataLoader instance (data, preprocessors and load arguments)
        is pickled to ``self.dump_path``.

        Parameters:
            overwrite (bool): Whether to overwrite the file if it already exists. Defaults to False.

        Returns:
            DataLoader: Returns the DataLoader object itself.
        """
        if overwrite or not self.dump_path.exists():
            logger.debug('Dumping to "%s"', self.dump_path)
            with open(self.dump_path, 'wb') as f:
                pickle.dump(self, f)
        else:
            logger.debug('File "%s" already exists, skipping dumping.', self.dump_path)
        return self

    def clean_dump(self):
        """
        Deletes the dump file if it exists.

        Returns:
            DataLoader: Returns the DataLoader object itself.
        """
        if self.dump_path.exists():
            self.dump_path.unlink()
        return self

0
StatsHelper/__init__.py Normal file
View File

View File

@@ -0,0 +1,37 @@
import pandas as pd
def describe(df: pd.DataFrame):
    """Return count, mean, std, min and max per column, one row per column."""
    summary_stats = ['count', 'mean', 'std', 'min', 'max']
    per_stat = df.agg(summary_stats)
    # Flip so that the original columns become the row index.
    return per_stat.transpose()
def proportion(df: pd.Series, *, dropna=True, **kwargs) -> pd.DataFrame:
    """
    Calculate the count and proportion of each unique value in a pandas Series.

    Parameters:
        df (pd.Series): The pandas Series to calculate the proportions for.
        dropna (bool, optional): Whether to drop NaN values before calculating.
            Defaults to True.
        **kwargs: Additional keyword arguments to pass to the value_counts
            method. Do not pass ``normalize``; it is set internally.

    Returns:
        pd.DataFrame: A DataFrame indexed by the unique values with two
            columns, ``count`` and ``proportion``.
    """
    # Name both results explicitly: how value_counts names its result differs
    # between pandas versions, and pd.merge cannot merge an unnamed Series.
    counts = df.value_counts(dropna=dropna, **kwargs).rename('count')
    shares = df.value_counts(normalize=True, dropna=dropna, **kwargs).rename('proportion')
    return pd.merge(counts, shares, left_index=True, right_index=True)
def proportionby(df: pd.DataFrame, groupby: str) -> pd.DataFrame:
    """Pass the per-group counts of *df* to ``proportion`` (work in progress).

    The frame is grouped by *groupby* with the key kept as a regular column,
    the non-null values per group are counted, and that count table is handed
    to ``proportion``.
    """
    # TODO: WIP
    group_counts = df.groupby(groupby, as_index=False).count()
    result = proportion(group_counts)
    return result
def sumby(df: pd.DataFrame, groupby: str) -> pd.DataFrame:
    """Group *df* by *groupby* and return the sum of each group."""
    groups = df.groupby(groupby)
    return groups.sum()
def countby(df: pd.DataFrame, groupby: str) -> pd.DataFrame:
    """Group *df* by *groupby* and return the non-null count of each column per group."""
    groups = df.groupby(groupby)
    return groups.count()

View File

View File

View File

@@ -0,0 +1,10 @@
from typing import Sequence, TypeVar
T = TypeVar('T')
# def asList[T](x: Sequence[T] | T) -> list[T]: # for python 3.12
def asList(x: Sequence[T] | T) -> list[T]:
if isinstance(x, (list, tuple)):
return list(x)
return [x]

1
requirements.txt Normal file
View File

@@ -0,0 +1 @@
pandas

12
setup.py Normal file
View File

@@ -0,0 +1,12 @@
from setuptools import setup, find_packages
# Distribution metadata for the Stats Helper project.
setup(
    name='Stats Helper',
    version='0.1.0',
    author='Yinr',
    description='Stats helper',
    # Packages are discovered with find_packages() instead of being listed by hand.
    packages=find_packages(),
    # Runtime dependencies; pandas is listed without a version constraint.
    install_requires=[
        'pandas',
    ],
)