init

2026-03-15 12:43:16 +08:00 · 2023-11-20 15:36:38 +08:00
commit 1295646024
10 changed files with 200 additions and 0 deletions
--- a/9
+++ b/9
@@ -0,0 +1,9 @@
+MIT License
+
+Copyright (c) 2023 Yinr
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1 @@
+# Stats Helper
--- a/StatsHelper/DataLoader.py
+++ b/StatsHelper/DataLoader.py
@@ -0,0 +1,130 @@
+from dataclasses import dataclass
+from typing import Callable, TypeAlias
+from pathlib import Path
+from functools import reduce
+import pickle
+import logging
+import pandas as pd
+
+from utils.helper import asList
+
+# Name under which this module's logger is registered.
+MODULE_NAME = 'DataLoader'
+
+logger = logging.getLogger(MODULE_NAME)
+
+# BASE_DIR is the parent of the directory that contains this file; the data
+# and output directories are resolved relative to it.
+BASE_DIR = Path(__file__).parents[1]
+DATA_DIR = BASE_DIR / 'data'
+OUTPUT_DIR = BASE_DIR / 'output'
+
+# A preprocessor is a callable that takes a DataFrame and returns a DataFrame.
+DataLoaderPreprocessor: TypeAlias = Callable[[pd.DataFrame], pd.DataFrame]
+@dataclass
+class DataLoaderExtMetaItem():
+    supported_ext: list[str]
+    loader: Callable
+DataLoaderExtMeta: TypeAlias = dict[str, dict[str, DataLoaderExtMetaItem]]
+
+class DataLoader(object):
+    ExtMeta: DataLoaderExtMeta = {
+        'csv': {
+            'supported_ext': ['.csv'],
+            'loader': pd.read_csv,
+        },
+        'xlsx': {
+            'supported_ext': ['.xls', '.xlsx'],
+            'loader': pd.read_excel,
+        },
+        'pkl': {
+            'supported_ext': ['.pkl'],
+            'loader': pickle.load,
+        },
+    }
+    def __init__(self, filename: str, *, preprocessor: DataLoaderPreprocessor | list[DataLoaderPreprocessor] = None, force_load = False, data_dir = None, dump_dir = None, dump_name = "", dump_suffix = ".dlpkl", auto_dump = False, force_dump = False, **kwargs):
+        # TODO: 增加支持 `dump_dir` + `dump_filename` 以及 `dump_file` 两种格式
+        # TODO: add support list of preprocessor
+        self.data_dir = Path(data_dir if data_dir else '.')
+        self.file_path = self.data_dir / filename
+        self.dump_dir = Path(dump_dir if dump_dir else self.data_dir)
+        self.dump_path = self.dump_dir / dump_name if dump_name else (self.dump_dir / filename).with_suffix(dump_suffix)
+        self.load_args: dict[str, dict] = kwargs
+        self.preprocessor: list[DataLoaderPreprocessor] = asList(preprocessor) if preprocessor is not None else []
+        logger.debug(f'DataLoader Filepath: {self.file_path}')
+        logger.debug(f'DataLoader Dumppath: {self.dump_path}')
+        logger.debug(f'DataLoader Preprocessor Count: {len(self.preprocessor)}')
+        self.data: pd.DataFrame | None = None
+        self.load(force=force_load)
+        if auto_dump:
+            self.dump(overwrite=force_dump)
+
+    def load(self, *, force = False):
+        """
+        Loads data from a file or a dump file.
+
+        Parameters:
+            force (bool): If True, forces the data to be loaded even if a dump file exists.
+
+        Returns:
+            DataLoader: Returns the DataLoader object itself.
+
+        Raises:
+            FileNotFoundError: If the file path specified does not exist.
+            ValueError: If the file type is not supported.
+        """
+        if not force and self.dump_path.exists():
+            logger.debug('Loading from %s' % (self.dump_path))
+            with open(self.dump_path, 'rb') as f:
+                dataLoader: DataLoader = pickle.load(f)
+                self.data = dataLoader.data
+                # restore preprocessor
+                if len(self.preprocessor) == 0:
+                    self.preprocessor = dataLoader.preprocessor
+                # restore load_args
+                if len(self.load_args) == 0:
+                    self.load_args = dataLoader.load_args
+        else:
+            if not self.file_path.exists():
+                raise FileNotFoundError(f'{self.file_path} not found.')
+            if self.file_path.suffix.lower() == '.csv':
+                logger.debug('Loading from "%s" using pandas.read_csv' % (self.file_path))
+                data = pd.read_csv(self.file_path, **self.load_args)
+            elif self.file_path.suffix.lower() in ['.xls', '.xlsx']:
+                logger.debug('Loading from "%s" using pandas.read_excel' % (self.file_path))
+                data = pd.read_excel(self.file_path, **self.load_args)
+            elif self.file_path.suffix.lower() in ['.pkl']:
+                logger.debug('Loading from "%s" using pickle.load' % (self.file_path))
+                data = pickle.load(self.file_path, **self.load_args)
+            else:
+                raise ValueError(f'{self.file_path.suffix} file not supported.')
+            if len(self.preprocessor) > 0:
+                logger.debug('Applying Preprocessor')
+                data = reduce(lambda data, preprocessor: preprocessor(data), self.preprocessor, data)
+            self.data = data
+        return self
+
+    def dump(self, *, overwrite = False):
+        """
+        Dump the object to a file.
+
+        Parameters:
+            overwrite (bool): Whether to overwrite the file if it already exists. Defaults to False.
+
+        Returns:
+            DataLoader: Returns the DataLoader object itself.
+        """
+        if overwrite or not self.dump_path.exists():
+            logger.debug('Dumping to "%s"' % (self.dump_path))
+            with open(self.dump_path, 'wb') as f:
+                pickle.dump(self, f)
+        else:
+            logger.debug('File "%s" already exists, skipping dumping.' % (self.dump_path))
+        return self
+
+    def clean_dump(self):
+        """
+        Deletes the dump file if it exists.
+
+        Returns:
+            DataLoader: Returns the DataLoader object itself.
+        """
+        if self.dump_path.exists():
+            self.dump_path.unlink()
+        return self
--- a/StatsHelper/init.py
+++ b/StatsHelper/init.py
--- a/StatsHelper/stats/Describe.py
+++ b/StatsHelper/stats/Describe.py
@@ -0,0 +1,37 @@
+import pandas as pd
+
+def describe(df: pd.DataFrame):
+    return df.agg(['count', 'mean', 'std', 'min', 'max']).T
+
+def proportion(df: pd.Series, *, dropna=True, **kwargs) -> pd.DataFrame:
+    """
+    Calculate the proportion of each unique value in a pandas Series.
+
+    Parameters:
+        df (pd.Series): The pandas Series to calculate the proportions for.
+        dropna (bool, optional): Whether to drop NaN values before calculating.
+            Defaults to True.
+        **kwargs: Additional keyword arguments to pass to the value_counts
+            method.
+
+    Returns:
+        pd.DataFrame: A DataFrame with two columns: the counts and proportions
+            of each unique value in the Series.
+    """
+    return pd.merge(
+        df.value_counts(dropna=dropna, **kwargs),
+        df.value_counts(normalize=True, dropna=dropna, **kwargs),
+        left_index=True,
+        right_index=True
+    )
+
+def proportionby(df: pd.DataFrame, groupby: str) -> pd.DataFrame:
+    # TODO: WIP
+    grouped = df.groupby(groupby, as_index=False).count()
+    return proportion(grouped)
+
+def sumby(df: pd.DataFrame, groupby: str) -> pd.DataFrame:
+    return df.groupby(groupby).sum()
+
+def countby(df: pd.DataFrame, groupby: str) -> pd.DataFrame:
+    return df.groupby(groupby).count()
--- a/StatsHelper/stats/init.py
+++ b/StatsHelper/stats/init.py
--- a/StatsHelper/utils/init.py
+++ b/StatsHelper/utils/init.py
--- a/StatsHelper/utils/helper.py
+++ b/StatsHelper/utils/helper.py
@@ -0,0 +1,10 @@
+from typing import Sequence, TypeVar
+
+
+T = TypeVar('T')
+
+# def asList[T](x: Sequence[T] | T) -> list[T]: # for python 3.12
+def asList(x: Sequence[T] | T) -> list[T]:
+    if isinstance(x, (list, tuple)):
+        return list(x)
+    return [x]
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1 @@
+pandas
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup, find_packages
+
+# Package metadata handed to setuptools.
+setup(
+    # NOTE(review): the name contains a space -- confirm how the packaging
+    # tools normalize it before publishing.
+    name='Stats Helper',
+    version='0.1.0',
+    author='Yinr',
+    description='Stats helper',
+    # Let setuptools discover the packages instead of listing them by hand.
+    packages=find_packages(),
+    # Runtime dependencies.
+    install_requires=[
+        'pandas',
+    ],
+)