_tabular_datanode_mixin.py 3.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. # Copyright 2021-2024 Avaiga Private Limited
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
  4. # the License. You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
  9. # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
  10. # specific language governing permissions and limitations under the License.
  11. from typing import Any, Callable, Dict, List, Union
  12. import numpy as np
  13. import pandas as pd
  14. from ..exceptions.exceptions import InvalidExposedType
  15. class _TabularDataNodeMixin(object):
  16. """Mixin class designed to handle tabular representable data nodes
  17. (CSVDataNode, ParquetDataNode, ExcelDataNode, SQLTableDataNode and SQLDataNode)."""
  18. _HAS_HEADER_PROPERTY = "has_header"
  19. _EXPOSED_TYPE_PROPERTY = "exposed_type"
  20. _EXPOSED_TYPE_NUMPY = "numpy"
  21. _EXPOSED_TYPE_PANDAS = "pandas"
  22. _EXPOSED_TYPE_MODIN = "modin" # Deprecated in favor of pandas since 3.1.0
  23. __VALID_STRING_EXPOSED_TYPES = [_EXPOSED_TYPE_PANDAS, _EXPOSED_TYPE_NUMPY]
  24. def __init__(self, **kwargs) -> None:
  25. self._decoder: Union[Callable[[List[Any]], Any], Callable[[Dict[Any, Any]], Any]]
  26. self.custom_document = kwargs.get(self._EXPOSED_TYPE_PROPERTY)
  27. if kwargs.get(self._HAS_HEADER_PROPERTY, True):
  28. self._decoder = self._default_decoder_with_header
  29. else:
  30. self._decoder = self._default_decoder_without_header
  31. custom_decoder = getattr(self.custom_document, "decode", None)
  32. if callable(custom_decoder):
  33. self._decoder = custom_decoder
  34. self._encoder = self._default_encoder
  35. custom_encoder = getattr(self.custom_document, "encode", None)
  36. if callable(custom_encoder):
  37. self._encoder = custom_encoder
  38. def _convert_data_to_dataframe(self, exposed_type: Any, data: Any) -> Union[pd.DataFrame, pd.Series]:
  39. if exposed_type == self._EXPOSED_TYPE_PANDAS and isinstance(data, (pd.DataFrame, pd.Series)):
  40. return data
  41. elif exposed_type == self._EXPOSED_TYPE_NUMPY and isinstance(data, np.ndarray):
  42. return pd.DataFrame(data)
  43. elif isinstance(data, list) and not isinstance(exposed_type, str):
  44. return pd.DataFrame.from_records([self._encoder(row) for row in data])
  45. return pd.DataFrame(data)
  46. @classmethod
  47. def _get_valid_exposed_type(cls, properties: Dict):
  48. if (
  49. cls._EXPOSED_TYPE_PROPERTY not in properties.keys()
  50. or properties[cls._EXPOSED_TYPE_PROPERTY] == cls._EXPOSED_TYPE_MODIN
  51. ):
  52. # Default exposed type is pandas
  53. # Deprecated modin exposed type in favor of pandas since 3.1.0
  54. return cls._EXPOSED_TYPE_PANDAS
  55. return properties[cls._EXPOSED_TYPE_PROPERTY]
  56. @classmethod
  57. def _check_exposed_type(cls, exposed_type):
  58. valid_string_exposed_types = cls.__VALID_STRING_EXPOSED_TYPES
  59. if isinstance(exposed_type, str) and exposed_type not in valid_string_exposed_types:
  60. raise InvalidExposedType(
  61. f"Invalid string exposed type {exposed_type}. Supported values are "
  62. f"{', '.join(valid_string_exposed_types)}"
  63. )
  64. def _default_decoder_with_header(self, document: Dict) -> Any:
  65. if self.custom_document:
  66. return self.custom_document(**document)
  67. def _default_decoder_without_header(self, document: List) -> Any:
  68. if self.custom_document:
  69. return self.custom_document(*document)
  70. def _default_encoder(self, document_object: Any) -> Dict:
  71. return document_object.__dict__