excel.py

# Copyright 2021-2024 Avaiga Private Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.

import os
from collections import defaultdict
from datetime import datetime, timedelta
from os.path import isfile
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import modin.pandas as modin_pd
import numpy as np
import pandas as pd
from openpyxl import load_workbook

from taipy.config.common.scope import Scope

from .._backup._backup import _replace_in_backup_file
from .._entity._reload import _self_reload
from .._version._version_manager_factory import _VersionManagerFactory
from ..exceptions.exceptions import ExposedTypeLengthMismatch, NonExistingExcelSheet, SheetNameLengthMismatch
from ..job.job_id import JobId
from ._abstract_file import _AbstractFileDataNode
from ._abstract_tabular import _AbstractTabularDataNode
from .data_node import DataNode
from .data_node_id import DataNodeId, Edit


class ExcelDataNode(DataNode, _AbstractFileDataNode, _AbstractTabularDataNode):
    """Data Node stored as an Excel file.

    The Excel file format is _xlsx_.

    Attributes:
        config_id (str): Identifier of this data node configuration. It must be a valid Python
            identifier.
        scope (Scope^): The scope of this data node.
        id (str): The unique identifier of this data node.
        owner_id (str): The identifier of the owner (sequence_id, scenario_id, cycle_id) or
            `None`.
        parent_ids (Optional[Set[str]]): The identifiers of the parent tasks or `None`.
        last_edit_date (datetime): The date and time of the last modification.
        edits (List[Edit^]): The ordered list of edits for that job.
        version (str): The string indicating the application version of the data node to instantiate. If not
            provided, the current version is used.
        validity_period (Optional[timedelta]): The duration implemented as a timedelta since the last edit date for
            which the data node can be considered up-to-date. Once the validity period has passed, the data node is
            considered stale and relevant tasks will run even if they are skippable (see the
            [Task management page](../core/entities/task-mgt.md) for more details).
            If _validity_period_ is set to `None`, the data node is always up-to-date.
        edit_in_progress (bool): True if a task computing the data node has been submitted
            and not completed yet. False otherwise.
        editor_id (Optional[str]): The identifier of the user who is currently editing the data node.
        editor_expiration_date (Optional[datetime]): The expiration date of the editor lock.
        path (str): The path to the Excel file.
        properties (dict[str, Any]): A dictionary of additional properties. The _properties_
            must have a _"default_path"_ or _"path"_ entry with the path of the Excel file:

            - _"default_path"_ `(str)`: The path of the Excel file.\n
            - _"has_header"_ `(bool)`: If True, indicates that the Excel file has a header.\n
            - _"sheet_name"_ `(Union[List[str], str])`: The list of sheet names to be used. This
                can be a unique name.\n
            - _"exposed_type"_: The exposed type of the data read from the Excel file. The default value is
                `pandas`.\n
    """

    __STORAGE_TYPE = "excel"
    __EXPOSED_TYPE_PROPERTY = "exposed_type"
    __EXPOSED_TYPE_NUMPY = "numpy"
    __EXPOSED_TYPE_PANDAS = "pandas"
    __EXPOSED_TYPE_MODIN = "modin"
    __VALID_STRING_EXPOSED_TYPES = [__EXPOSED_TYPE_PANDAS, __EXPOSED_TYPE_MODIN, __EXPOSED_TYPE_NUMPY]
    __PATH_KEY = "path"
    __DEFAULT_DATA_KEY = "default_data"
    __DEFAULT_PATH_KEY = "default_path"
    __HAS_HEADER_PROPERTY = "has_header"
    __SHEET_NAME_PROPERTY = "sheet_name"
    _REQUIRED_PROPERTIES: List[str] = []

    def __init__(
        self,
        config_id: str,
        scope: Scope,
        id: Optional[DataNodeId] = None,
        owner_id: Optional[str] = None,
        parent_ids: Optional[Set[str]] = None,
        last_edit_date: Optional[datetime] = None,
        edits: List[Edit] = None,
        version: str = None,
        validity_period: Optional[timedelta] = None,
        edit_in_progress: bool = False,
        editor_id: Optional[str] = None,
        editor_expiration_date: Optional[datetime] = None,
        properties: Dict = None,
    ):
        if properties is None:
            properties = {}
        default_value = properties.pop(self.__DEFAULT_DATA_KEY, None)
        self._path = properties.get(self.__PATH_KEY, properties.get(self.__DEFAULT_PATH_KEY))
        properties[self.__PATH_KEY] = self._path
        if self.__SHEET_NAME_PROPERTY not in properties.keys():
            properties[self.__SHEET_NAME_PROPERTY] = None
        if self.__HAS_HEADER_PROPERTY not in properties.keys():
            properties[self.__HAS_HEADER_PROPERTY] = True
        if self.__EXPOSED_TYPE_PROPERTY not in properties.keys():
            properties[self.__EXPOSED_TYPE_PROPERTY] = self.__EXPOSED_TYPE_PANDAS
        self._check_exposed_type(properties[self.__EXPOSED_TYPE_PROPERTY], self.__VALID_STRING_EXPOSED_TYPES)

        super().__init__(
            config_id,
            scope,
            id,
            owner_id,
            parent_ids,
            last_edit_date,
            edits,
            version or _VersionManagerFactory._build_manager()._get_latest_version(),
            validity_period,
            edit_in_progress,
            editor_id,
            editor_expiration_date,
            **properties,
        )
        if not self._path:
            self._path = self._build_path(self.storage_type())
            properties[self.__PATH_KEY] = self._path

        if default_value is not None and not os.path.exists(self._path):
            self._write(default_value)
            self._last_edit_date = datetime.now()
            self._edits.append(
                Edit(
                    {
                        "timestamp": self._last_edit_date,
                        "writer_identifier": "TAIPY",
                        "comments": "Default data written.",
                    }
                )
            )

        if not self._last_edit_date and isfile(self._path):
            self._last_edit_date = datetime.now()

        self._TAIPY_PROPERTIES.update(
            {
                self.__EXPOSED_TYPE_PROPERTY,
                self.__PATH_KEY,
                self.__DEFAULT_PATH_KEY,
                self.__DEFAULT_DATA_KEY,
                self.__HAS_HEADER_PROPERTY,
                self.__SHEET_NAME_PROPERTY,
            }
        )

    @property  # type: ignore
    @_self_reload(DataNode._MANAGER_NAME)
    def path(self):
        return self._path

    @path.setter
    def path(self, value):
        tmp_old_path = self._path
        self._path = value
        self.properties[self.__PATH_KEY] = value
        _replace_in_backup_file(old_file_path=tmp_old_path, new_file_path=self._path)

    @classmethod
    def storage_type(cls) -> str:
        return cls.__STORAGE_TYPE
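
    # The "exposed_type" property may be a single string ("pandas", "modin", "numpy"), a custom class,
    # a list of types aligned with the configured sheet names, or a dict mapping sheet names to types.
    # Validation below only checks the string values against the supported string exposed types.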
    @staticmethod
    def _check_exposed_type(exposed_type, valid_string_exposed_types):
        if isinstance(exposed_type, str):
            _AbstractTabularDataNode._check_exposed_type(exposed_type, valid_string_exposed_types)
        elif isinstance(exposed_type, list):
            for t in exposed_type:
                _AbstractTabularDataNode._check_exposed_type(t, valid_string_exposed_types)
        elif isinstance(exposed_type, dict):
            for t in exposed_type.values():
                _AbstractTabularDataNode._check_exposed_type(t, valid_string_exposed_types)
    def _read(self):
        if self.properties[self.__EXPOSED_TYPE_PROPERTY] == self.__EXPOSED_TYPE_PANDAS:
            return self._read_as_pandas_dataframe()
        if self.properties[self.__EXPOSED_TYPE_PROPERTY] == self.__EXPOSED_TYPE_MODIN:
            return self._read_as_modin_dataframe()
        if self.properties[self.__EXPOSED_TYPE_PROPERTY] == self.__EXPOSED_TYPE_NUMPY:
            return self._read_as_numpy()
        return self._read_as()
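
    # When no "sheet_name" property is configured, every sheet of the workbook is used; a single
    # sheet name is normalized to a one-element list.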
    def __sheet_name_to_list(self, properties):
        if properties[self.__SHEET_NAME_PROPERTY]:
            sheet_names = properties[self.__SHEET_NAME_PROPERTY]
        else:
            excel_file = load_workbook(properties[self.__PATH_KEY])
            sheet_names = excel_file.sheetnames
            excel_file.close()
        return sheet_names if isinstance(sheet_names, (List, Set, Tuple)) else [sheet_names]
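
    # Mixed exposed types: each requested sheet is resolved to its own exposed type. Sheets exposed as
    # "pandas" or "numpy" are delegated to the pandas reader; for a custom class, every row of the sheet
    # becomes an instance of that class (keyword arguments when a header row is present, positional
    # arguments otherwise). A single requested sheet is returned directly, otherwise a dict keyed by
    # sheet name is returned.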
    def _read_as(self):
        excel_file = load_workbook(self._path)
        exposed_type = self.properties[self.__EXPOSED_TYPE_PROPERTY]
        work_books = defaultdict()
        sheet_names = excel_file.sheetnames
        provided_sheet_names = self.__sheet_name_to_list(self.properties)

        for sheet_name in provided_sheet_names:
            if sheet_name not in sheet_names:
                raise NonExistingExcelSheet(sheet_name, self._path)

        if isinstance(exposed_type, List):
            if len(provided_sheet_names) != len(self.properties[self.__EXPOSED_TYPE_PROPERTY]):
                raise ExposedTypeLengthMismatch(
                    f"Expected {len(provided_sheet_names)} exposed types, got "
                    f"{len(self.properties[self.__EXPOSED_TYPE_PROPERTY])}"
                )

        for i, sheet_name in enumerate(provided_sheet_names):
            work_sheet = excel_file[sheet_name]
            sheet_exposed_type = exposed_type

            if not isinstance(sheet_exposed_type, str):
                if isinstance(exposed_type, dict):
                    sheet_exposed_type = exposed_type.get(sheet_name, self.__EXPOSED_TYPE_PANDAS)
                elif isinstance(exposed_type, List):
                    sheet_exposed_type = exposed_type[i]

                if isinstance(sheet_exposed_type, str):
                    if sheet_exposed_type == self.__EXPOSED_TYPE_NUMPY:
                        work_books[sheet_name] = self._read_as_pandas_dataframe(sheet_name).to_numpy()
                    elif sheet_exposed_type == self.__EXPOSED_TYPE_PANDAS:
                        work_books[sheet_name] = self._read_as_pandas_dataframe(sheet_name)
                    continue

            res = list()
            for row in work_sheet.rows:
                res.append([col.value for col in row])
            if self.properties[self.__HAS_HEADER_PROPERTY] and res:
                header = res.pop(0)
                for i, row in enumerate(res):
                    res[i] = sheet_exposed_type(**dict([[h, r] for h, r in zip(header, row)]))
            else:
                for i, row in enumerate(res):
                    res[i] = sheet_exposed_type(*row)
            work_books[sheet_name] = res

        excel_file.close()

        if len(provided_sheet_names) == 1:
            return work_books[provided_sheet_names[0]]
        return work_books

    def _read_as_numpy(self):
        sheets = self._read_as_pandas_dataframe()
        if isinstance(sheets, dict):
            return {sheet_name: df.to_numpy() for sheet_name, df in sheets.items()}
        return sheets.to_numpy()
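
    # Helper shared by the pandas and modin readers: the file is loaded with pandas.read_excel and,
    # for the modin engine, converted sheet by sheet into modin DataFrames.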
    def _do_read_excel(self, engine, sheet_names, kwargs) -> Union[Dict[Union[int, str], pd.DataFrame], pd.DataFrame]:
        df = pd.read_excel(
            self._path,
            sheet_name=sheet_names,
            **kwargs,
        )
        # We are using pandas to load modin dataframes because of a modin issue
        # https://github.com/modin-project/modin/issues/4924
        if engine == "modin":
            if isinstance(df, dict):  # Check if it is a multi-sheet Excel file
                for key, value in df.items():
                    df[key] = modin_pd.DataFrame(value)
                return df
            return modin_pd.DataFrame(df)
        return df

    def __get_sheet_names_and_header(self, sheet_names):
        kwargs: Dict[str, Any] = {}
        if sheet_names is None:
            sheet_names = self.properties[self.__SHEET_NAME_PROPERTY]
        if not self.properties[self.__HAS_HEADER_PROPERTY]:
            kwargs["header"] = None
        return sheet_names, kwargs

    def _read_as_pandas_dataframe(self, sheet_names=None) -> Union[Dict[Union[int, str], pd.DataFrame], pd.DataFrame]:
        sheet_names, kwargs = self.__get_sheet_names_and_header(sheet_names)
        try:
            return self._do_read_excel("pandas", sheet_names, kwargs)
        except pd.errors.EmptyDataError:
            return pd.DataFrame()

    def _read_as_modin_dataframe(
        self, sheet_names=None
    ) -> Union[Dict[Union[int, str], modin_pd.DataFrame], modin_pd.DataFrame]:
        sheet_names, kwargs = self.__get_sheet_names_and_header(sheet_names)
        try:
            if kwargs.get("header", None):
                return modin_pd.read_excel(
                    self._path,
                    sheet_name=sheet_names,
                    **kwargs,
                )
            else:
                return self._do_read_excel("modin", sheet_names, kwargs)
        except pd.errors.EmptyDataError:
            return modin_pd.DataFrame()
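
    # Appending reopens the workbook in append mode (mode="a" with if_sheet_exists="overlay") and
    # writes new rows starting at the current max_row of the target sheet, so existing data is kept.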
    def __append_excel_with_single_sheet(self, append_excel_fct, *args, **kwargs):
        sheet_name = self.properties.get(self.__SHEET_NAME_PROPERTY)

        with pd.ExcelWriter(self._path, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            if sheet_name:
                if not isinstance(sheet_name, str):
                    sheet_name = sheet_name[0]
                append_excel_fct(
                    writer, *args, **kwargs, sheet_name=sheet_name, startrow=writer.sheets[sheet_name].max_row
                )
            else:
                sheet_name = list(writer.sheets.keys())[0]
                append_excel_fct(writer, *args, **kwargs, startrow=writer.sheets[sheet_name].max_row)

    def __append_excel_with_multiple_sheets(self, data: Any, columns: List[str] = None):
        with pd.ExcelWriter(self._path, mode="a", engine="openpyxl", if_sheet_exists="overlay") as writer:
            # Each key stands for a sheet name
            for sheet_name in data.keys():
                if isinstance(data[sheet_name], np.ndarray):
                    df = pd.DataFrame(data[sheet_name])
                else:
                    df = data[sheet_name]

                if columns:
                    data[sheet_name].columns = columns

                df.to_excel(
                    writer,
                    sheet_name=sheet_name,
                    index=False,
                    header=False,
                    startrow=writer.sheets[sheet_name].max_row,
                )
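
    # A dict of DataFrames (or numpy arrays) is appended sheet by sheet, one sheet per key; any other
    # value is appended to the single configured (or first) sheet, with non-DataFrame data first
    # wrapped in a pandas DataFrame.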
    def _append(self, data: Any):
        if isinstance(data, Dict) and all(
            isinstance(x, (pd.DataFrame, modin_pd.DataFrame, np.ndarray)) for x in data.values()
        ):
            self.__append_excel_with_multiple_sheets(data)
        elif isinstance(data, (pd.DataFrame, modin_pd.DataFrame)):
            self.__append_excel_with_single_sheet(data.to_excel, index=False, header=False)
        else:
            self.__append_excel_with_single_sheet(pd.DataFrame(data).to_excel, index=False, header=False)
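
    # Single-sheet writes honor the configured "sheet_name"; configuring more than one sheet name for
    # a single-sheet write raises SheetNameLengthMismatch.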
    def __write_excel_with_single_sheet(self, write_excel_fct, *args, **kwargs):
        sheet_name = self.properties.get(self.__SHEET_NAME_PROPERTY)
        if sheet_name:
            if not isinstance(sheet_name, str):
                if len(sheet_name) > 1:
                    raise SheetNameLengthMismatch
                else:
                    sheet_name = sheet_name[0]
            write_excel_fct(*args, **kwargs, sheet_name=sheet_name)
        else:
            write_excel_fct(*args, **kwargs)

    def __write_excel_with_multiple_sheets(self, data: Any, columns: List[str] = None):
        with pd.ExcelWriter(self._path) as writer:
            # Each key stands for a sheet name
            for key in data.keys():
                if isinstance(data[key], np.ndarray):
                    df = pd.DataFrame(data[key])
                else:
                    df = data[key]

                if columns:
                    data[key].columns = columns

                df.to_excel(writer, key, index=False)

    def _write(self, data: Any):
        if isinstance(data, Dict) and all(
            isinstance(x, (pd.DataFrame, modin_pd.DataFrame, np.ndarray)) for x in data.values()
        ):
            self.__write_excel_with_multiple_sheets(data)
        elif isinstance(data, (pd.DataFrame, modin_pd.DataFrame)):
            self.__write_excel_with_single_sheet(data.to_excel, self._path, index=False)
        else:
            self.__write_excel_with_single_sheet(pd.DataFrame(data).to_excel, self._path, index=False)

    def write_with_column_names(self, data: Any, columns: List[str] = None, job_id: Optional[JobId] = None):
        """Write a set of columns.

        Parameters:
            data (Any): The data to write.
            columns (List[str]): The list of column names to write.
            job_id (JobId^): An optional identifier of the writer.
        """
        if isinstance(data, Dict) and all(
            isinstance(x, (pd.DataFrame, modin_pd.DataFrame, np.ndarray)) for x in data.values()
        ):
            self.__write_excel_with_multiple_sheets(data, columns=columns)
        else:
            df = pd.DataFrame(data)
            if columns:
                df.columns = pd.Index(columns, dtype="object")
            self.__write_excel_with_single_sheet(df.to_excel, self.path, index=False)
        self.track_edit(timestamp=datetime.now(), job_id=job_id)