test_excel_data_node.py 74 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756
  1. # Copyright 2023 Avaiga Private Limited
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
  4. # the License. You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
  9. # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
  10. # specific language governing permissions and limitations under the License.
  11. import os
  12. import pathlib
  13. from datetime import datetime
  14. from time import sleep
  15. from typing import Dict
  16. import modin.pandas as modin_pd
  17. import numpy as np
  18. import pandas as pd
  19. import pytest
  20. from modin.pandas.test.utils import df_equals
  21. from pandas.testing import assert_frame_equal
  22. from taipy.config.common.scope import Scope
  23. from taipy.config.config import Config
  24. from taipy.core.data._data_manager import _DataManager
  25. from taipy.core.data.data_node_id import DataNodeId
  26. from taipy.core.data.excel import ExcelDataNode
  27. from taipy.core.data.operator import JoinOperator, Operator
  28. from taipy.core.exceptions.exceptions import (
  29. ExposedTypeLengthMismatch,
  30. InvalidExposedType,
  31. NoData,
  32. NonExistingExcelSheet,
  33. SheetNameLengthMismatch,
  34. )
  35. @pytest.fixture(scope="function", autouse=True)
  36. def cleanup():
  37. yield
  38. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/temp.xlsx")
  39. if os.path.exists(path):
  40. os.remove(path)
  41. class MyCustomObject:
  42. def __init__(self, id, integer, text):
  43. self.id = id
  44. self.integer = integer
  45. self.text = text
  46. class MyCustomObject1:
  47. def __init__(self, id, integer, text):
  48. self.id = id
  49. self.integer = integer
  50. self.text = text
  51. class MyCustomObject2:
  52. def __init__(self, id, integer, text):
  53. self.id = id
  54. self.integer = integer
  55. self.text = text
  56. class TestExcelDataNode:
  57. def test_new_excel_data_node_with_existing_file_is_ready_for_reading(self):
  58. not_ready_dn_cfg = Config.configure_data_node("not_ready_data_node_config_id", "excel", path="NOT_EXISTING.csv")
  59. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  60. ready_dn_cfg = Config.configure_data_node("ready_data_node_config_id", "excel", path=path)
  61. dns = _DataManager._bulk_get_or_create([not_ready_dn_cfg, ready_dn_cfg])
  62. assert not dns[not_ready_dn_cfg].is_ready_for_reading
  63. assert dns[ready_dn_cfg].is_ready_for_reading
  64. def test_create(self):
  65. path = "data/node/path"
  66. sheet_names = ["sheet_name_1", "sheet_name_2"]
  67. dn = ExcelDataNode(
  68. "foo_bar",
  69. Scope.SCENARIO,
  70. properties={"path": path, "has_header": False, "sheet_name": sheet_names, "name": "super name"},
  71. )
  72. assert isinstance(dn, ExcelDataNode)
  73. assert dn.storage_type() == "excel"
  74. assert dn.config_id == "foo_bar"
  75. assert dn.name == "super name"
  76. assert dn.scope == Scope.SCENARIO
  77. assert dn.id is not None
  78. assert dn.owner_id is None
  79. assert dn.parent_ids == set()
  80. assert dn.last_edit_date is None
  81. assert dn.job_ids == []
  82. assert not dn.is_ready_for_reading
  83. assert dn.path == path
  84. assert dn.has_header is False
  85. assert dn.sheet_name == sheet_names
  86. def test_get_user_properties(self, excel_file):
  87. dn_1 = ExcelDataNode("dn_1", Scope.SCENARIO, properties={"path": "data/node/path"})
  88. assert dn_1._get_user_properties() == {}
  89. dn_2 = ExcelDataNode(
  90. "dn_2",
  91. Scope.SCENARIO,
  92. properties={
  93. "exposed_type": "numpy",
  94. "default_data": "foo",
  95. "default_path": excel_file,
  96. "has_header": False,
  97. "sheet_name": ["sheet_name_1", "sheet_name_2"],
  98. "foo": "bar",
  99. },
  100. )
  101. # exposed_type, default_data, default_path, path, has_header are filtered out
  102. assert dn_2._get_user_properties() == {"foo": "bar"}
  103. def test_read_with_header(self):
  104. with pytest.raises(NoData):
  105. not_existing_excel = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": "WRONG.xlsx"})
  106. assert not_existing_excel.read() is None
  107. not_existing_excel.read_or_raise()
  108. empty_excel_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/empty.xlsx")
  109. empty_excel = ExcelDataNode(
  110. "foo",
  111. Scope.SCENARIO,
  112. properties={"path": empty_excel_path, "exposed_type": MyCustomObject, "has_header": True},
  113. )
  114. assert len(empty_excel.read()) == 0
  115. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  116. # Create ExcelDataNode without exposed_type (Default is pandas.DataFrame)
  117. excel_data_node_as_pandas = ExcelDataNode(
  118. "bar", Scope.SCENARIO, properties={"path": path, "sheet_name": "Sheet1"}
  119. )
  120. data_pandas = excel_data_node_as_pandas.read()
  121. assert isinstance(data_pandas, pd.DataFrame)
  122. assert len(data_pandas) == 5
  123. assert np.array_equal(data_pandas.to_numpy(), pd.read_excel(path).to_numpy())
  124. # Create ExcelDataNode with numpy exposed_type
  125. excel_data_node_as_numpy = ExcelDataNode(
  126. "bar", Scope.SCENARIO, properties={"path": path, "exposed_type": "numpy", "sheet_name": "Sheet1"}
  127. )
  128. data_numpy = excel_data_node_as_numpy.read()
  129. assert isinstance(data_numpy, np.ndarray)
  130. assert len(data_numpy) == 5
  131. assert np.array_equal(data_numpy, pd.read_excel(path).to_numpy())
  132. # Create the same ExcelDataNode but with custom exposed_type
  133. non_existing_sheet_name_custom = ExcelDataNode(
  134. "bar",
  135. Scope.SCENARIO,
  136. properties={"path": path, "sheet_name": "abc", "exposed_type": MyCustomObject},
  137. )
  138. with pytest.raises(NonExistingExcelSheet):
  139. non_existing_sheet_name_custom.read()
  140. excel_data_node_as_custom_object = ExcelDataNode(
  141. "bar",
  142. Scope.SCENARIO,
  143. properties={"path": path, "exposed_type": MyCustomObject, "sheet_name": "Sheet1"},
  144. )
  145. data_custom = excel_data_node_as_custom_object.read()
  146. assert isinstance(data_custom, list)
  147. assert len(data_custom) == 5
  148. for (_, row_pandas), row_custom in zip(data_pandas.iterrows(), data_custom):
  149. assert isinstance(row_custom, MyCustomObject)
  150. assert row_pandas["id"] == row_custom.id
  151. assert row_pandas["integer"] == row_custom.integer
  152. assert row_pandas["text"] == row_custom.text
  153. @pytest.mark.modin
  154. def test_read_with_header_modin(self):
  155. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  156. # Create ExcelDataNode with modin exposed_type
  157. excel_data_node_as_modin = ExcelDataNode(
  158. "bar", Scope.SCENARIO, properties={"path": path, "sheet_name": "Sheet1", "exposed_type": "modin"}
  159. )
  160. data_modin = excel_data_node_as_modin.read()
  161. assert isinstance(data_modin, modin_pd.DataFrame)
  162. assert len(data_modin) == 5
  163. assert np.array_equal(data_modin.to_numpy(), pd.read_excel(path).to_numpy())
  164. def test_read_without_header(self):
  165. not_existing_excel = ExcelDataNode(
  166. "foo", Scope.SCENARIO, properties={"path": "WRONG.xlsx", "has_header": False}
  167. )
  168. with pytest.raises(NoData):
  169. assert not_existing_excel.read() is None
  170. not_existing_excel.read_or_raise()
  171. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  172. # Create ExcelDataNode without exposed_type (Default is pandas.DataFrame)
  173. excel_data_node_as_pandas = ExcelDataNode(
  174. "bar", Scope.SCENARIO, properties={"path": path, "has_header": False, "sheet_name": "Sheet1"}
  175. )
  176. data_pandas = excel_data_node_as_pandas.read()
  177. assert isinstance(data_pandas, pd.DataFrame)
  178. assert len(data_pandas) == 6
  179. assert np.array_equal(data_pandas.to_numpy(), pd.read_excel(path, header=None).to_numpy())
  180. # Create ExcelDataNode with numpy exposed_type
  181. excel_data_node_as_numpy = ExcelDataNode(
  182. "bar",
  183. Scope.SCENARIO,
  184. properties={"path": path, "has_header": False, "exposed_type": "numpy", "sheet_name": "Sheet1"},
  185. )
  186. data_numpy = excel_data_node_as_numpy.read()
  187. assert isinstance(data_numpy, np.ndarray)
  188. assert len(data_numpy) == 6
  189. assert np.array_equal(data_numpy, pd.read_excel(path, header=None).to_numpy())
  190. # Create the same ExcelDataNode but with custom exposed_type
  191. non_existing_sheet_name_custom = ExcelDataNode(
  192. "bar",
  193. Scope.SCENARIO,
  194. properties={"path": path, "has_header": False, "sheet_name": "abc", "exposed_type": MyCustomObject},
  195. )
  196. with pytest.raises(NonExistingExcelSheet):
  197. non_existing_sheet_name_custom.read()
  198. excel_data_node_as_custom_object = ExcelDataNode(
  199. "bar",
  200. Scope.SCENARIO,
  201. properties={
  202. "path": path,
  203. "has_header": False,
  204. "exposed_type": MyCustomObject,
  205. "sheet_name": "Sheet1",
  206. },
  207. )
  208. data_custom = excel_data_node_as_custom_object.read()
  209. assert isinstance(data_custom, list)
  210. assert len(data_custom) == 6
  211. for (_, row_pandas), row_custom in zip(data_pandas.iterrows(), data_custom):
  212. assert isinstance(row_custom, MyCustomObject)
  213. assert row_pandas[0] == row_custom.id
  214. assert row_pandas[1] == row_custom.integer
  215. assert row_pandas[2] == row_custom.text
  216. @pytest.mark.modin
  217. def test_read_without_header_modin(self):
  218. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  219. # Create ExcelDataNode with modin exposed_type
  220. excel_data_node_as_modin = ExcelDataNode(
  221. "bar",
  222. Scope.SCENARIO,
  223. properties={"path": path, "has_header": False, "sheet_name": "Sheet1", "exposed_type": "modin"},
  224. )
  225. data_modin = excel_data_node_as_modin.read()
  226. assert isinstance(data_modin, modin_pd.DataFrame)
  227. assert len(data_modin) == 6
  228. assert np.array_equal(data_modin.to_numpy(), pd.read_excel(path, header=None).to_numpy())
  229. @pytest.mark.parametrize(
  230. "content,columns",
  231. [
  232. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], None),
  233. ([[11, 22, 33], [44, 55, 66]], None),
  234. ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"]),
  235. ],
  236. )
  237. def test_write(self, excel_file, default_data_frame, content, columns):
  238. excel_dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1"})
  239. assert np.array_equal(excel_dn.read().values, default_data_frame.values)
  240. if not columns:
  241. excel_dn.write(content)
  242. df = pd.DataFrame(content)
  243. else:
  244. excel_dn.write_with_column_names(content, columns)
  245. df = pd.DataFrame(content, columns=columns)
  246. assert np.array_equal(excel_dn.read().values, df.values)
  247. excel_dn.write(None)
  248. assert len(excel_dn.read()) == 0
  249. @pytest.mark.parametrize(
  250. "content,sheet_name",
  251. [
  252. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], "sheet_name"),
  253. ([[11, 22, 33], [44, 55, 66]], ["sheet_name"]),
  254. ],
  255. )
  256. def test_write_with_sheet_name(self, excel_file_with_sheet_name, default_data_frame, content, sheet_name):
  257. excel_dn = ExcelDataNode(
  258. "foo", Scope.SCENARIO, properties={"path": excel_file_with_sheet_name, "sheet_name": sheet_name}
  259. )
  260. df = pd.DataFrame(content)
  261. if isinstance(sheet_name, str):
  262. assert np.array_equal(excel_dn.read().values, default_data_frame.values)
  263. else:
  264. assert np.array_equal(excel_dn.read()["sheet_name"].values, default_data_frame.values)
  265. excel_dn.write(content)
  266. if isinstance(sheet_name, str):
  267. assert np.array_equal(excel_dn.read().values, df.values)
  268. else:
  269. assert np.array_equal(excel_dn.read()["sheet_name"].values, df.values)
  270. sheet_names = pd.ExcelFile(excel_file_with_sheet_name).sheet_names
  271. expected_sheet_name = sheet_name[0] if isinstance(sheet_name, list) else sheet_name
  272. assert sheet_names[0] == expected_sheet_name
  273. excel_dn.write(None)
  274. if isinstance(sheet_name, str):
  275. assert len(excel_dn.read()) == 0
  276. else:
  277. assert len(excel_dn.read()) == 1
  278. @pytest.mark.parametrize(
  279. "content,sheet_name",
  280. [
  281. ([[11, 22, 33], [44, 55, 66]], ["sheet_name_1", "sheet_name_2"]),
  282. ],
  283. )
  284. def test_raise_write_with_sheet_name_length_mismatch(
  285. self, excel_file_with_sheet_name, default_data_frame, content, sheet_name
  286. ):
  287. excel_dn = ExcelDataNode(
  288. "foo", Scope.SCENARIO, properties={"path": excel_file_with_sheet_name, "sheet_name": sheet_name}
  289. )
  290. with pytest.raises(SheetNameLengthMismatch):
  291. excel_dn.write(content)
  292. @pytest.mark.parametrize(
  293. "content",
  294. [
  295. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
  296. ],
  297. )
  298. def test_write_without_sheet_name(self, excel_file_with_sheet_name, default_data_frame, content):
  299. excel_dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file_with_sheet_name})
  300. default_data_frame = {"sheet_name": default_data_frame}
  301. df = {"Sheet1": pd.DataFrame(content)}
  302. assert np.array_equal(excel_dn.read()["sheet_name"].values, default_data_frame["sheet_name"].values)
  303. excel_dn.write(content)
  304. assert np.array_equal(excel_dn.read()["Sheet1"].values, df["Sheet1"].values)
  305. sheet_names = pd.ExcelFile(excel_file_with_sheet_name).sheet_names
  306. expected_sheet_name = "Sheet1"
  307. assert sheet_names[0] == expected_sheet_name
  308. excel_dn.write(None)
  309. assert len(excel_dn.read()) == 1
  310. @pytest.mark.parametrize(
  311. "content,columns,sheet_name",
  312. [
  313. ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"], "sheet_name"),
  314. ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"], ["sheet_name"]),
  315. ],
  316. )
  317. def test_write_with_column_and_sheet_name(
  318. self, excel_file_with_sheet_name, default_data_frame, content, columns, sheet_name
  319. ):
  320. excel_dn = ExcelDataNode(
  321. "foo", Scope.SCENARIO, properties={"path": excel_file_with_sheet_name, "sheet_name": sheet_name}
  322. )
  323. df = pd.DataFrame(content)
  324. if isinstance(sheet_name, str):
  325. assert np.array_equal(excel_dn.read().values, default_data_frame.values)
  326. else:
  327. assert np.array_equal(excel_dn.read()["sheet_name"].values, default_data_frame.values)
  328. excel_dn.write_with_column_names(content, columns)
  329. if isinstance(sheet_name, str):
  330. assert np.array_equal(excel_dn.read().values, df.values)
  331. else:
  332. assert np.array_equal(excel_dn.read()["sheet_name"].values, df.values)
  333. sheet_names = pd.ExcelFile(excel_file_with_sheet_name).sheet_names
  334. expected_sheet_name = sheet_name[0] if isinstance(sheet_name, list) else sheet_name
  335. assert sheet_names[0] == expected_sheet_name
  336. excel_dn.write(None)
  337. if isinstance(sheet_name, str):
  338. assert len(excel_dn.read()) == 0
  339. else:
  340. assert len(excel_dn.read()) == 1
  341. @pytest.mark.modin
  342. @pytest.mark.parametrize(
  343. "content,columns",
  344. [
  345. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], None),
  346. ([[11, 22, 33], [44, 55, 66]], None),
  347. ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"]),
  348. ],
  349. )
  350. def test_write_modin(self, excel_file, default_data_frame, content, columns):
  351. excel_dn = ExcelDataNode(
  352. "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "modin"}
  353. )
  354. assert np.array_equal(excel_dn.read().values, default_data_frame.values)
  355. if not columns:
  356. excel_dn.write(content)
  357. df = modin_pd.DataFrame(content)
  358. else:
  359. excel_dn.write_with_column_names(content, columns)
  360. df = modin_pd.DataFrame(content, columns=columns)
  361. assert np.array_equal(excel_dn.read().values, df.values)
  362. excel_dn.write(None)
  363. assert len(excel_dn.read()) == 0
  364. def test_read_multi_sheet_with_header(self):
  365. not_existing_excel = ExcelDataNode(
  366. "foo",
  367. Scope.SCENARIO,
  368. properties={"path": "WRONG.xlsx", "sheet_name": ["sheet_name_1", "sheet_name_2"]},
  369. )
  370. with pytest.raises(NoData):
  371. assert not_existing_excel.read() is None
  372. not_existing_excel.read_or_raise()
  373. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  374. sheet_names = ["Sheet1", "Sheet2"]
  375. # Create ExcelDataNode without exposed_type (Default is pandas.DataFrame)
  376. excel_data_node_as_pandas = ExcelDataNode(
  377. "bar", Scope.SCENARIO, properties={"path": path, "sheet_name": sheet_names}
  378. )
  379. data_pandas = excel_data_node_as_pandas.read()
  380. assert isinstance(data_pandas, Dict)
  381. assert len(data_pandas) == 2
  382. assert all(
  383. len(data_pandas[sheet_name] == 5) and isinstance(data_pandas[sheet_name], pd.DataFrame)
  384. for sheet_name in sheet_names
  385. )
  386. assert list(data_pandas.keys()) == sheet_names
  387. for sheet_name in sheet_names:
  388. assert data_pandas[sheet_name].equals(pd.read_excel(path, sheet_name=sheet_name))
  389. excel_data_node_as_pandas_no_sheet_name = ExcelDataNode("bar", Scope.SCENARIO, properties={"path": path})
  390. data_pandas_no_sheet_name = excel_data_node_as_pandas_no_sheet_name.read()
  391. assert isinstance(data_pandas_no_sheet_name, Dict)
  392. for key in data_pandas_no_sheet_name.keys():
  393. assert isinstance(data_pandas_no_sheet_name[key], pd.DataFrame)
  394. assert data_pandas[key].equals(data_pandas_no_sheet_name[key])
  395. # Create ExcelDataNode with numpy exposed_type
  396. excel_data_node_as_numpy = ExcelDataNode(
  397. "bar",
  398. Scope.SCENARIO,
  399. properties={"path": path, "sheet_name": sheet_names, "exposed_type": "numpy"},
  400. )
  401. data_numpy = excel_data_node_as_numpy.read()
  402. assert isinstance(data_numpy, Dict)
  403. assert len(data_numpy) == 2
  404. assert all(
  405. len(data_numpy[sheet_name] == 5) and isinstance(data_numpy[sheet_name], np.ndarray)
  406. for sheet_name in sheet_names
  407. )
  408. assert list(data_numpy.keys()) == sheet_names
  409. for sheet_name in sheet_names:
  410. assert np.array_equal(data_pandas[sheet_name], pd.read_excel(path, sheet_name=sheet_name).to_numpy())
  411. excel_data_node_as_numpy_no_sheet_name = ExcelDataNode(
  412. "bar",
  413. Scope.SCENARIO,
  414. properties={"path": path, "exposed_type": "numpy"},
  415. )
  416. data_numpy_no_sheet_name = excel_data_node_as_numpy_no_sheet_name.read()
  417. assert isinstance(data_numpy_no_sheet_name, Dict)
  418. for key in data_numpy_no_sheet_name.keys():
  419. assert isinstance(data_numpy_no_sheet_name[key], np.ndarray)
  420. assert np.array_equal(data_numpy[key], data_numpy_no_sheet_name[key])
  421. # Create the same ExcelDataNode but with custom exposed_type
  422. non_existing_sheet_name_custom = ExcelDataNode(
  423. "bar",
  424. Scope.SCENARIO,
  425. properties={
  426. "path": path,
  427. "sheet_name": ["Sheet1", "xyz"],
  428. "exposed_type": MyCustomObject1,
  429. },
  430. )
  431. with pytest.raises(NonExistingExcelSheet):
  432. non_existing_sheet_name_custom.read()
  433. excel_data_node_as_custom_object = ExcelDataNode(
  434. "bar",
  435. Scope.SCENARIO,
  436. properties={"path": path, "sheet_name": sheet_names, "exposed_type": MyCustomObject1},
  437. )
  438. data_custom = excel_data_node_as_custom_object.read()
  439. assert isinstance(data_custom, Dict)
  440. assert len(data_custom) == 2
  441. assert all(len(data_custom[sheet_name]) == 5 for sheet_name in sheet_names)
  442. assert list(data_custom.keys()) == sheet_names
  443. for sheet_name in sheet_names:
  444. sheet_data_pandas, sheet_data_custom = data_pandas[sheet_name], data_custom[sheet_name]
  445. for (_, row_pandas), row_custom in zip(sheet_data_pandas.iterrows(), sheet_data_custom):
  446. assert isinstance(row_custom, MyCustomObject1)
  447. assert row_pandas["id"] == row_custom.id
  448. assert row_pandas["integer"] == row_custom.integer
  449. assert row_pandas["text"] == row_custom.text
  450. excel_data_node_as_custom_object_no_sheet_name = ExcelDataNode(
  451. "bar",
  452. Scope.SCENARIO,
  453. properties={"path": path, "exposed_type": MyCustomObject1},
  454. )
  455. data_custom_no_sheet_name = excel_data_node_as_custom_object_no_sheet_name.read()
  456. assert isinstance(data_custom_no_sheet_name, Dict)
  457. assert len(data_custom_no_sheet_name) == 2
  458. assert data_custom.keys() == data_custom_no_sheet_name.keys()
  459. for sheet_name in sheet_names:
  460. sheet_data_custom_no_sheet_name, sheet_data_custom = (
  461. data_custom_no_sheet_name[sheet_name],
  462. data_custom[sheet_name],
  463. )
  464. for row_custom_no_sheet_name, row_custom in zip(sheet_data_custom_no_sheet_name, sheet_data_custom):
  465. assert isinstance(row_custom_no_sheet_name, MyCustomObject1)
  466. assert row_custom_no_sheet_name.id == row_custom.id
  467. assert row_custom_no_sheet_name.integer == row_custom.integer
  468. assert row_custom_no_sheet_name.text == row_custom.text
  469. with pytest.raises(ExposedTypeLengthMismatch):
  470. dn = ExcelDataNode(
  471. "bar",
  472. Scope.SCENARIO,
  473. properties={
  474. "path": path,
  475. "sheet_name": ["Sheet1"],
  476. "exposed_type": [MyCustomObject1, MyCustomObject2],
  477. },
  478. )
  479. dn.read()
  480. custom_class_dict = {"Sheet1": MyCustomObject1, "Sheet2": MyCustomObject2}
  481. excel_data_node_as_multi_custom_object = ExcelDataNode(
  482. "bar",
  483. Scope.SCENARIO,
  484. properties={"path": path, "sheet_name": sheet_names, "exposed_type": custom_class_dict},
  485. )
  486. assert excel_data_node_as_multi_custom_object.properties["exposed_type"] == custom_class_dict
  487. excel_data_node_as_multi_custom_object = ExcelDataNode(
  488. "bar",
  489. Scope.SCENARIO,
  490. properties={"path": path, "sheet_name": sheet_names, "exposed_type": [MyCustomObject1, MyCustomObject2]},
  491. )
  492. assert excel_data_node_as_multi_custom_object.properties["exposed_type"] == [MyCustomObject1, MyCustomObject2]
  493. multi_data_custom = excel_data_node_as_multi_custom_object.read()
  494. assert isinstance(multi_data_custom, Dict)
  495. assert len(multi_data_custom) == 2
  496. assert all(len(multi_data_custom[sheet_name]) == 5 for sheet_name in sheet_names)
  497. assert list(multi_data_custom.keys()) == sheet_names
  498. for sheet_name, custom_class in custom_class_dict.items():
  499. sheet_data_pandas, sheet_data_custom = data_pandas[sheet_name], multi_data_custom[sheet_name]
  500. for (_, row_pandas), row_custom in zip(sheet_data_pandas.iterrows(), sheet_data_custom):
  501. assert isinstance(row_custom, custom_class)
  502. assert row_pandas["id"] == row_custom.id
  503. assert row_pandas["integer"] == row_custom.integer
  504. assert row_pandas["text"] == row_custom.text
  505. excel_data_node_as_multi_custom_object_no_sheet_name = ExcelDataNode(
  506. "bar",
  507. Scope.SCENARIO,
  508. properties={"path": path, "exposed_type": custom_class_dict},
  509. )
  510. assert excel_data_node_as_multi_custom_object_no_sheet_name.properties["exposed_type"] == custom_class_dict
  511. multi_data_custom_no_sheet_name = excel_data_node_as_multi_custom_object_no_sheet_name.read()
  512. assert isinstance(multi_data_custom_no_sheet_name, Dict)
  513. assert len(multi_data_custom_no_sheet_name) == 2
  514. assert multi_data_custom.keys() == multi_data_custom_no_sheet_name.keys()
  515. for sheet_name, custom_class in custom_class_dict.items():
  516. sheet_data_custom_no_sheet_name, sheet_data_custom = (
  517. multi_data_custom_no_sheet_name[sheet_name],
  518. multi_data_custom[sheet_name],
  519. )
  520. for row_custom_no_sheet_name, row_custom in zip(sheet_data_custom_no_sheet_name, sheet_data_custom):
  521. assert isinstance(row_custom_no_sheet_name, custom_class)
  522. assert row_custom_no_sheet_name.id == row_custom.id
  523. assert row_custom_no_sheet_name.integer == row_custom.integer
  524. assert row_custom_no_sheet_name.text == row_custom.text
  525. @pytest.mark.modin
  526. def test_read_multi_sheet_with_header_modin(self):
  527. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  528. sheet_names = ["Sheet1", "Sheet2"]
  529. # Create ExcelDataNode with modin exposed_type
  530. excel_data_node_as_modin = ExcelDataNode(
  531. "bar", Scope.SCENARIO, properties={"path": path, "sheet_name": sheet_names, "exposed_type": "modin"}
  532. )
  533. data_modin = excel_data_node_as_modin.read()
  534. assert isinstance(data_modin, Dict)
  535. assert len(data_modin) == 2
  536. assert all(
  537. len(data_modin[sheet_name] == 5) and isinstance(data_modin[sheet_name], modin_pd.DataFrame)
  538. for sheet_name in sheet_names
  539. )
  540. assert list(data_modin.keys()) == sheet_names
  541. for sheet_name in sheet_names:
  542. assert data_modin[sheet_name].equals(modin_pd.read_excel(path, sheet_name=sheet_name))
  543. excel_data_node_as_pandas_no_sheet_name = ExcelDataNode(
  544. "bar", Scope.SCENARIO, properties={"path": path, "exposed_type": "modin"}
  545. )
  546. data_modin_no_sheet_name = excel_data_node_as_pandas_no_sheet_name.read()
  547. assert isinstance(data_modin_no_sheet_name, Dict)
  548. for key in data_modin_no_sheet_name.keys():
  549. assert isinstance(data_modin_no_sheet_name[key], modin_pd.DataFrame)
  550. assert data_modin[key].equals(data_modin_no_sheet_name[key])
  551. def test_read_multi_sheet_without_header(self):
  552. not_existing_excel = ExcelDataNode(
  553. "foo",
  554. Scope.SCENARIO,
  555. properties={"path": "WRONG.xlsx", "has_header": False, "sheet_name": ["sheet_name_1", "sheet_name_2"]},
  556. )
  557. with pytest.raises(NoData):
  558. assert not_existing_excel.read() is None
  559. not_existing_excel.read_or_raise()
  560. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  561. sheet_names = ["Sheet1", "Sheet2"]
  562. # Create ExcelDataNode without exposed_type (Default is pandas.DataFrame)
  563. excel_data_node_as_pandas = ExcelDataNode(
  564. "bar", Scope.SCENARIO, properties={"path": path, "has_header": False, "sheet_name": sheet_names}
  565. )
  566. data_pandas = excel_data_node_as_pandas.read()
  567. assert isinstance(data_pandas, Dict)
  568. assert len(data_pandas) == 2
  569. assert all(len(data_pandas[sheet_name]) == 6 for sheet_name in sheet_names)
  570. assert list(data_pandas.keys()) == sheet_names
  571. for sheet_name in sheet_names:
  572. assert isinstance(data_pandas[sheet_name], pd.DataFrame)
  573. assert data_pandas[sheet_name].equals(pd.read_excel(path, header=None, sheet_name=sheet_name))
  574. excel_data_node_as_pandas_no_sheet_name = ExcelDataNode(
  575. "bar", Scope.SCENARIO, properties={"path": path, "has_header": False}
  576. )
  577. data_pandas_no_sheet_name = excel_data_node_as_pandas_no_sheet_name.read()
  578. assert isinstance(data_pandas_no_sheet_name, Dict)
  579. for key in data_pandas_no_sheet_name.keys():
  580. assert isinstance(data_pandas_no_sheet_name[key], pd.DataFrame)
  581. assert data_pandas[key].equals(data_pandas_no_sheet_name[key])
  582. # Create ExcelDataNode with numpy exposed_type
  583. excel_data_node_as_numpy = ExcelDataNode(
  584. "bar",
  585. Scope.SCENARIO,
  586. properties={"path": path, "has_header": False, "sheet_name": sheet_names, "exposed_type": "numpy"},
  587. )
  588. data_numpy = excel_data_node_as_numpy.read()
  589. assert isinstance(data_numpy, Dict)
  590. assert len(data_numpy) == 2
  591. assert all(
  592. len(data_numpy[sheet_name] == 6) and isinstance(data_numpy[sheet_name], np.ndarray)
  593. for sheet_name in sheet_names
  594. )
  595. assert list(data_numpy.keys()) == sheet_names
  596. for sheet_name in sheet_names:
  597. assert np.array_equal(
  598. data_pandas[sheet_name], pd.read_excel(path, header=None, sheet_name=sheet_name).to_numpy()
  599. )
  600. excel_data_node_as_numpy_no_sheet_name = ExcelDataNode(
  601. "bar",
  602. Scope.SCENARIO,
  603. properties={"path": path, "has_header": False, "exposed_type": "numpy"},
  604. )
  605. data_numpy_no_sheet_name = excel_data_node_as_numpy_no_sheet_name.read()
  606. assert isinstance(data_numpy_no_sheet_name, Dict)
  607. for key in data_numpy_no_sheet_name.keys():
  608. assert isinstance(data_numpy_no_sheet_name[key], np.ndarray)
  609. assert np.array_equal(data_numpy[key], data_numpy_no_sheet_name[key])
  610. # Create the same ExcelDataNode but with custom exposed_type
  611. non_existing_sheet_name_custom = ExcelDataNode(
  612. "bar",
  613. Scope.SCENARIO,
  614. properties={
  615. "path": path,
  616. "has_header": False,
  617. "sheet_name": ["Sheet1", "xyz"],
  618. "exposed_type": MyCustomObject1,
  619. },
  620. )
  621. with pytest.raises(NonExistingExcelSheet):
  622. non_existing_sheet_name_custom.read()
  623. excel_data_node_as_custom_object = ExcelDataNode(
  624. "bar",
  625. Scope.SCENARIO,
  626. properties={
  627. "path": path,
  628. "has_header": False,
  629. "sheet_name": sheet_names,
  630. "exposed_type": MyCustomObject1,
  631. },
  632. )
  633. data_custom = excel_data_node_as_custom_object.read()
  634. assert excel_data_node_as_custom_object.exposed_type == MyCustomObject1
  635. assert isinstance(data_custom, Dict)
  636. assert len(data_custom) == 2
  637. assert all(len(data_custom[sheet_name]) == 6 for sheet_name in sheet_names)
  638. assert list(data_custom.keys()) == sheet_names
  639. for sheet_name in sheet_names:
  640. sheet_data_pandas, sheet_data_custom = data_pandas[sheet_name], data_custom[sheet_name]
  641. for (_, row_pandas), row_custom in zip(sheet_data_pandas.iterrows(), sheet_data_custom):
  642. assert isinstance(row_custom, MyCustomObject1)
  643. assert row_pandas[0] == row_custom.id
  644. assert row_pandas[1] == row_custom.integer
  645. assert row_pandas[2] == row_custom.text
  646. excel_data_node_as_custom_object_no_sheet_name = ExcelDataNode(
  647. "bar",
  648. Scope.SCENARIO,
  649. properties={"path": path, "has_header": False, "exposed_type": MyCustomObject1},
  650. )
  651. data_custom_no_sheet_name = excel_data_node_as_custom_object_no_sheet_name.read()
  652. assert isinstance(data_custom_no_sheet_name, Dict)
  653. assert len(data_custom_no_sheet_name) == 2
  654. assert data_custom.keys() == data_custom_no_sheet_name.keys()
  655. for sheet_name in sheet_names:
  656. sheet_data_custom_no_sheet_name, sheet_data_custom = (
  657. data_custom_no_sheet_name[sheet_name],
  658. data_custom[sheet_name],
  659. )
  660. for row_custom_no_sheet_name, row_custom in zip(sheet_data_custom_no_sheet_name, sheet_data_custom):
  661. assert isinstance(row_custom_no_sheet_name, MyCustomObject1)
  662. assert row_custom_no_sheet_name.id == row_custom.id
  663. assert row_custom_no_sheet_name.integer == row_custom.integer
  664. assert row_custom_no_sheet_name.text == row_custom.text
  665. with pytest.raises(ExposedTypeLengthMismatch):
  666. dn = ExcelDataNode(
  667. "bar",
  668. Scope.SCENARIO,
  669. properties={
  670. "path": path,
  671. "sheet_name": ["Sheet1"],
  672. "exposed_type": [MyCustomObject1, MyCustomObject2],
  673. "has_header": False,
  674. },
  675. )
  676. dn.read()
  677. custom_class_dict = {"Sheet1": MyCustomObject1, "Sheet2": MyCustomObject2}
  678. excel_data_node_as_multi_custom_object = ExcelDataNode(
  679. "bar",
  680. Scope.SCENARIO,
  681. properties={
  682. "path": path,
  683. "sheet_name": sheet_names,
  684. "exposed_type": custom_class_dict,
  685. "has_header": False,
  686. },
  687. )
  688. assert excel_data_node_as_multi_custom_object.properties["exposed_type"] == custom_class_dict
  689. excel_data_node_as_multi_custom_object = ExcelDataNode(
  690. "bar",
  691. Scope.SCENARIO,
  692. properties={
  693. "path": path,
  694. "sheet_name": sheet_names,
  695. "exposed_type": [MyCustomObject1, MyCustomObject2],
  696. "has_header": False,
  697. },
  698. )
  699. assert excel_data_node_as_multi_custom_object.properties["exposed_type"] == [MyCustomObject1, MyCustomObject2]
  700. multi_data_custom = excel_data_node_as_multi_custom_object.read()
  701. assert isinstance(multi_data_custom, Dict)
  702. assert len(multi_data_custom) == 2
  703. assert all(len(multi_data_custom[sheet_name]) == 6 for sheet_name in sheet_names)
  704. assert list(multi_data_custom.keys()) == sheet_names
  705. for sheet_name, custom_class in custom_class_dict.items():
  706. sheet_data_pandas, sheet_data_custom = data_pandas[sheet_name], multi_data_custom[sheet_name]
  707. for (_, row_pandas), row_custom in zip(sheet_data_pandas.iterrows(), sheet_data_custom):
  708. assert isinstance(row_custom, custom_class)
  709. assert row_pandas[0] == row_custom.id
  710. assert row_pandas[1] == row_custom.integer
  711. assert row_pandas[2] == row_custom.text
  712. excel_data_node_as_multi_custom_object_no_sheet_name = ExcelDataNode(
  713. "bar",
  714. Scope.SCENARIO,
  715. properties={"path": path, "has_header": False, "exposed_type": custom_class_dict},
  716. )
  717. multi_data_custom_no_sheet_name = excel_data_node_as_multi_custom_object_no_sheet_name.read()
  718. assert isinstance(multi_data_custom_no_sheet_name, Dict)
  719. assert len(multi_data_custom_no_sheet_name) == 2
  720. assert multi_data_custom.keys() == multi_data_custom_no_sheet_name.keys()
  721. for sheet_name, custom_class in custom_class_dict.items():
  722. sheet_data_custom_no_sheet_name, sheet_data_custom = (
  723. multi_data_custom_no_sheet_name[sheet_name],
  724. multi_data_custom[sheet_name],
  725. )
  726. for row_custom_no_sheet_name, row_custom in zip(sheet_data_custom_no_sheet_name, sheet_data_custom):
  727. assert isinstance(row_custom_no_sheet_name, custom_class)
  728. assert row_custom_no_sheet_name.id == row_custom.id
  729. assert row_custom_no_sheet_name.integer == row_custom.integer
  730. assert row_custom_no_sheet_name.text == row_custom.text
  731. @pytest.mark.modin
  732. def test_read_multi_sheet_without_header_modin(self):
  733. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  734. sheet_names = ["Sheet1", "Sheet2"]
  735. # Create ExcelDataNode with modin exposed_type
  736. excel_data_node_as_modin = ExcelDataNode(
  737. "bar",
  738. Scope.SCENARIO,
  739. properties={"path": path, "has_header": False, "sheet_name": sheet_names, "exposed_type": "modin"},
  740. )
  741. data_modin = excel_data_node_as_modin.read()
  742. assert isinstance(data_modin, Dict)
  743. assert len(data_modin) == 2
  744. assert all(len(data_modin[sheet_name]) == 6 for sheet_name in sheet_names)
  745. assert list(data_modin.keys()) == sheet_names
  746. for sheet_name in sheet_names:
  747. assert isinstance(data_modin[sheet_name], modin_pd.DataFrame)
  748. assert data_modin[sheet_name].equals(pd.read_excel(path, header=None, sheet_name=sheet_name))
  749. excel_data_node_as_modin_no_sheet_name = ExcelDataNode(
  750. "bar", Scope.SCENARIO, properties={"path": path, "has_header": False, "exposed_type": "modin"}
  751. )
  752. data_modin_no_sheet_name = excel_data_node_as_modin_no_sheet_name.read()
  753. assert isinstance(data_modin_no_sheet_name, Dict)
  754. for key in data_modin_no_sheet_name.keys():
  755. assert isinstance(data_modin_no_sheet_name[key], modin_pd.DataFrame)
  756. assert data_modin[key].equals(data_modin_no_sheet_name[key])
  757. @pytest.mark.parametrize(
  758. "content,columns",
  759. [
  760. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], None),
  761. ([[11, 22, 33], [44, 55, 66]], None),
  762. ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"]),
  763. ],
  764. )
  765. def test_write_multi_sheet(self, excel_file_with_multi_sheet, default_multi_sheet_data_frame, content, columns):
  766. sheet_names = ["Sheet1", "Sheet2"]
  767. excel_dn = ExcelDataNode(
  768. "foo",
  769. Scope.SCENARIO,
  770. properties={"path": excel_file_with_multi_sheet, "sheet_name": sheet_names},
  771. )
  772. for sheet_name in sheet_names:
  773. assert np.array_equal(excel_dn.read()[sheet_name].values, default_multi_sheet_data_frame[sheet_name].values)
  774. multi_sheet_content = {sheet_name: pd.DataFrame(content) for sheet_name in sheet_names}
  775. excel_dn.write(multi_sheet_content)
  776. for sheet_name in sheet_names:
  777. assert np.array_equal(excel_dn.read()[sheet_name].values, multi_sheet_content[sheet_name].values)
  778. def test_write_multi_sheet_numpy(self, excel_file_with_multi_sheet):
  779. sheet_names = ["Sheet1", "Sheet2"]
  780. excel_dn = ExcelDataNode(
  781. "foo",
  782. Scope.SCENARIO,
  783. properties={"path": excel_file_with_multi_sheet, "sheet_name": sheet_names, "exposed_type": "numpy"},
  784. )
  785. sheets_data = [[11, 22, 33], [44, 55, 66]]
  786. data = {
  787. sheet_name: pd.DataFrame(sheet_data).to_numpy() for sheet_name, sheet_data in zip(sheet_names, sheets_data)
  788. }
  789. excel_dn.write(data)
  790. read_data = excel_dn.read()
  791. assert all(np.array_equal(data[sheet_name], read_data[sheet_name]) for sheet_name in sheet_names)
  792. @pytest.mark.modin
  793. @pytest.mark.parametrize(
  794. "content,columns",
  795. [
  796. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], None),
  797. ([[11, 22, 33], [44, 55, 66]], None),
  798. ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"]),
  799. ],
  800. )
  801. def test_write_multi_sheet_with_modin(
  802. self, excel_file_with_multi_sheet, default_multi_sheet_data_frame, content, columns
  803. ):
  804. sheet_names = ["Sheet1", "Sheet2"]
  805. excel_dn = ExcelDataNode(
  806. "foo",
  807. Scope.SCENARIO,
  808. properties={"path": excel_file_with_multi_sheet, "sheet_name": sheet_names, "exposed_type": "modin"},
  809. )
  810. for sheet_name in sheet_names:
  811. assert np.array_equal(excel_dn.read()[sheet_name].values, default_multi_sheet_data_frame[sheet_name].values)
  812. multi_sheet_content = {sheet_name: modin_pd.DataFrame(content) for sheet_name in sheet_names}
  813. excel_dn.write(multi_sheet_content)
  814. for sheet_name in sheet_names:
  815. assert np.array_equal(excel_dn.read()[sheet_name].values, multi_sheet_content[sheet_name].values)
  816. @pytest.mark.parametrize(
  817. "content",
  818. [
  819. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
  820. (pd.DataFrame([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}])),
  821. ([[11, 22, 33], [44, 55, 66]]),
  822. ],
  823. )
  824. def test_append_pandas_with_sheetname(self, excel_file, default_data_frame, content):
  825. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1"})
  826. assert_frame_equal(dn.read(), default_data_frame)
  827. dn.append(content)
  828. assert_frame_equal(
  829. dn.read(),
  830. pd.concat([default_data_frame, pd.DataFrame(content, columns=["a", "b", "c"])]).reset_index(drop=True),
  831. )
  832. @pytest.mark.parametrize(
  833. "content",
  834. [
  835. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
  836. (pd.DataFrame([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}])),
  837. ([[11, 22, 33], [44, 55, 66]]),
  838. ],
  839. )
  840. def test_append_pandas_without_sheetname(self, excel_file, default_data_frame, content):
  841. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file})
  842. assert_frame_equal(dn.read()["Sheet1"], default_data_frame)
  843. dn.append(content)
  844. assert_frame_equal(
  845. dn.read()["Sheet1"],
  846. pd.concat([default_data_frame, pd.DataFrame(content, columns=["a", "b", "c"])]).reset_index(drop=True),
  847. )
  848. @pytest.mark.parametrize(
  849. "content",
  850. [
  851. (
  852. {
  853. "Sheet1": pd.DataFrame([{"a": 11, "b": 22, "c": 33}]),
  854. "Sheet2": pd.DataFrame([{"a": 44, "b": 55, "c": 66}]),
  855. }
  856. ),
  857. (
  858. {
  859. "Sheet1": pd.DataFrame({"a": [11, 44], "b": [22, 55], "c": [33, 66]}),
  860. "Sheet2": pd.DataFrame([{"a": 77, "b": 88, "c": 99}]),
  861. }
  862. ),
  863. ({"Sheet1": np.array([[11, 22, 33], [44, 55, 66]]), "Sheet2": np.array([[77, 88, 99]])}),
  864. ],
  865. )
  866. def test_append_pandas_multisheet(self, excel_file_with_multi_sheet, default_multi_sheet_data_frame, content):
  867. dn = ExcelDataNode(
  868. "foo", Scope.SCENARIO, properties={"path": excel_file_with_multi_sheet, "sheet_name": ["Sheet1", "Sheet2"]}
  869. )
  870. assert_frame_equal(dn.read()["Sheet1"], default_multi_sheet_data_frame["Sheet1"])
  871. assert_frame_equal(dn.read()["Sheet2"], default_multi_sheet_data_frame["Sheet2"])
  872. dn.append(content)
  873. assert_frame_equal(
  874. dn.read()["Sheet1"],
  875. pd.concat(
  876. [default_multi_sheet_data_frame["Sheet1"], pd.DataFrame(content["Sheet1"], columns=["a", "b", "c"])]
  877. ).reset_index(drop=True),
  878. )
  879. assert_frame_equal(
  880. dn.read()["Sheet2"],
  881. pd.concat(
  882. [default_multi_sheet_data_frame["Sheet2"], pd.DataFrame(content["Sheet2"], columns=["a", "b", "c"])]
  883. ).reset_index(drop=True),
  884. )
  885. @pytest.mark.parametrize(
  886. "content",
  887. [
  888. ({"Sheet1": pd.DataFrame([{"a": 11, "b": 22, "c": 33}])}),
  889. (pd.DataFrame({"a": [11, 44], "b": [22, 55], "c": [33, 66]})),
  890. ([[11, 22, 33], [44, 55, 66]]),
  891. ],
  892. )
  893. def test_append_only_first_sheet_of_a_multisheet_file(
  894. self, excel_file_with_multi_sheet, default_multi_sheet_data_frame, content
  895. ):
  896. dn = ExcelDataNode(
  897. "foo", Scope.SCENARIO, properties={"path": excel_file_with_multi_sheet, "sheet_name": ["Sheet1", "Sheet2"]}
  898. )
  899. assert_frame_equal(dn.read()["Sheet1"], default_multi_sheet_data_frame["Sheet1"])
  900. assert_frame_equal(dn.read()["Sheet2"], default_multi_sheet_data_frame["Sheet2"])
  901. dn.append(content)
  902. appended_content = content["Sheet1"] if isinstance(content, dict) else content
  903. assert_frame_equal(
  904. dn.read()["Sheet1"],
  905. pd.concat(
  906. [default_multi_sheet_data_frame["Sheet1"], pd.DataFrame(appended_content, columns=["a", "b", "c"])]
  907. ).reset_index(drop=True),
  908. )
  909. assert_frame_equal(dn.read()["Sheet2"], default_multi_sheet_data_frame["Sheet2"])
  910. @pytest.mark.modin
  911. @pytest.mark.parametrize(
  912. "content",
  913. [
  914. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
  915. (modin_pd.DataFrame([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}])),
  916. ([[11, 22, 33], [44, 55, 66]]),
  917. ],
  918. )
  919. def test_append_modin_with_sheetname(self, excel_file, default_data_frame, content):
  920. dn = ExcelDataNode(
  921. "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "modin"}
  922. )
  923. df_equals(dn.read(), modin_pd.DataFrame(default_data_frame))
  924. dn.append(content)
  925. df_equals(
  926. dn.read(),
  927. modin_pd.concat(
  928. [modin_pd.DataFrame(default_data_frame), modin_pd.DataFrame(content, columns=["a", "b", "c"])]
  929. ).reset_index(drop=True),
  930. )
  931. @pytest.mark.modin
  932. @pytest.mark.parametrize(
  933. "content",
  934. [
  935. ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
  936. (modin_pd.DataFrame([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}])),
  937. ([[11, 22, 33], [44, 55, 66]]),
  938. ],
  939. )
  940. def test_append_modin_without_sheetname(self, excel_file, default_data_frame, content):
  941. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "modin"})
  942. df_equals(dn.read()["Sheet1"], default_data_frame)
  943. dn.append(content)
  944. df_equals(
  945. dn.read()["Sheet1"],
  946. modin_pd.concat([default_data_frame, modin_pd.DataFrame(content, columns=["a", "b", "c"])]).reset_index(
  947. drop=True
  948. ),
  949. )
  950. @pytest.mark.modin
  951. @pytest.mark.parametrize(
  952. "content",
  953. [
  954. (
  955. {
  956. "Sheet1": modin_pd.DataFrame([{"a": 11, "b": 22, "c": 33}]),
  957. "Sheet2": modin_pd.DataFrame([{"a": 44, "b": 55, "c": 66}]),
  958. }
  959. ),
  960. (
  961. {
  962. "Sheet1": modin_pd.DataFrame({"a": [11, 44], "b": [22, 55], "c": [33, 66]}),
  963. "Sheet2": modin_pd.DataFrame([{"a": 77, "b": 88, "c": 99}]),
  964. }
  965. ),
  966. ({"Sheet1": np.array([[11, 22, 33], [44, 55, 66]]), "Sheet2": np.array([[77, 88, 99]])}),
  967. ],
  968. )
  969. def test_append_modin_multisheet(self, excel_file_with_multi_sheet, default_multi_sheet_data_frame, content):
  970. dn = ExcelDataNode(
  971. "foo",
  972. Scope.SCENARIO,
  973. properties={
  974. "path": excel_file_with_multi_sheet,
  975. "sheet_name": ["Sheet1", "Sheet2"],
  976. "exposed_type": "modin",
  977. },
  978. )
  979. df_equals(dn.read()["Sheet1"], default_multi_sheet_data_frame["Sheet1"])
  980. df_equals(dn.read()["Sheet2"], default_multi_sheet_data_frame["Sheet2"])
  981. dn.append(content)
  982. df_equals(
  983. dn.read()["Sheet1"],
  984. modin_pd.concat(
  985. [
  986. default_multi_sheet_data_frame["Sheet1"],
  987. modin_pd.DataFrame(content["Sheet1"], columns=["a", "b", "c"]),
  988. ]
  989. ).reset_index(drop=True),
  990. )
  991. df_equals(
  992. dn.read()["Sheet2"],
  993. modin_pd.concat(
  994. [
  995. default_multi_sheet_data_frame["Sheet2"],
  996. modin_pd.DataFrame(content["Sheet2"], columns=["a", "b", "c"]),
  997. ]
  998. ).reset_index(drop=True),
  999. )
  1000. def test_filter_pandas_exposed_type_with_sheetname(self, excel_file):
  1001. dn = ExcelDataNode(
  1002. "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "pandas"}
  1003. )
  1004. dn.write(
  1005. [
  1006. {"foo": 1, "bar": 1},
  1007. {"foo": 1, "bar": 2},
  1008. {"foo": 1},
  1009. {"foo": 2, "bar": 2},
  1010. {"bar": 2},
  1011. ]
  1012. )
  1013. # Test datanode indexing and slicing
  1014. assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None]))
  1015. assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2]))
  1016. assert dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))
  1017. # Test filter data
  1018. filtered_by_filter_method = dn.filter(("foo", 1, Operator.EQUAL))
  1019. filtered_by_indexing = dn[dn["foo"] == 1]
  1020. expected_data = pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}, {"foo": 1.0}])
  1021. assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1022. assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)
  1023. filtered_by_filter_method = dn.filter(("foo", 1, Operator.NOT_EQUAL))
  1024. filtered_by_indexing = dn[dn["foo"] != 1]
  1025. expected_data = pd.DataFrame([{"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
  1026. assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1027. assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)
  1028. filtered_by_filter_method = dn.filter(("bar", 2, Operator.EQUAL))
  1029. filtered_by_indexing = dn[dn["bar"] == 2]
  1030. expected_data = pd.DataFrame([{"foo": 1.0, "bar": 2.0}, {"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
  1031. assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1032. assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)
  1033. filtered_by_filter_method = dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)
  1034. filtered_by_indexing = dn[(dn["bar"] == 1) | (dn["bar"] == 2)]
  1035. expected_data = pd.DataFrame(
  1036. [
  1037. {"foo": 1.0, "bar": 1.0},
  1038. {"foo": 1.0, "bar": 2.0},
  1039. {"foo": 2.0, "bar": 2.0},
  1040. {"bar": 2.0},
  1041. ]
  1042. )
  1043. assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1044. assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)
  1045. def test_filter_pandas_exposed_type_without_sheetname(self, excel_file):
  1046. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "pandas"})
  1047. dn.write(
  1048. [
  1049. {"foo": 1, "bar": 1},
  1050. {"foo": 1, "bar": 2},
  1051. {"foo": 1},
  1052. {"foo": 2, "bar": 2},
  1053. {"bar": 2},
  1054. ]
  1055. )
  1056. assert len(dn.filter(("foo", 1, Operator.EQUAL))["Sheet1"]) == 3
  1057. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["Sheet1"]) == 2
  1058. assert len(dn.filter(("bar", 2, Operator.EQUAL))["Sheet1"]) == 3
  1059. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["Sheet1"]) == 4
  1060. assert dn["Sheet1"]["foo"].equals(pd.Series([1, 1, 1, 2, None]))
  1061. assert dn["Sheet1"]["bar"].equals(pd.Series([1, 2, None, 2, 2]))
  1062. assert dn["Sheet1"][:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))
  1063. def test_filter_pandas_exposed_type_multisheet(self, excel_file):
  1064. dn = ExcelDataNode(
  1065. "foo",
  1066. Scope.SCENARIO,
  1067. properties={"path": excel_file, "sheet_name": ["sheet_1", "sheet_2"], "exposed_type": "pandas"},
  1068. )
  1069. dn.write(
  1070. {
  1071. "sheet_1": pd.DataFrame(
  1072. [
  1073. {"foo": 1, "bar": 1},
  1074. {"foo": 1, "bar": 2},
  1075. {"foo": 1},
  1076. {"foo": 2, "bar": 2},
  1077. {"bar": 2},
  1078. ]
  1079. ),
  1080. "sheet_2": pd.DataFrame(
  1081. [
  1082. {"foo": 1, "bar": 3},
  1083. {"foo": 1, "bar": 4},
  1084. {"foo": 1},
  1085. {"foo": 2, "bar": 4},
  1086. {"bar": 4},
  1087. ]
  1088. ),
  1089. }
  1090. )
  1091. assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 2
  1092. assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_1"]) == 3
  1093. assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_2"]) == 3
  1094. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2
  1095. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_1"]) == 2
  1096. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_2"]) == 2
  1097. assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 2
  1098. assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_1"]) == 3
  1099. assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_2"]) == 0
  1100. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 2
  1101. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_1"]) == 4
  1102. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_2"]) == 0
  1103. assert dn["sheet_1"]["foo"].equals(pd.Series([1, 1, 1, 2, None]))
  1104. assert dn["sheet_2"]["foo"].equals(pd.Series([1, 1, 1, 2, None]))
  1105. assert dn["sheet_1"]["bar"].equals(pd.Series([1, 2, None, 2, 2]))
  1106. assert dn["sheet_2"]["bar"].equals(pd.Series([3, 4, None, 4, 4]))
  1107. assert dn["sheet_1"][:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))
  1108. assert dn["sheet_2"][:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 3.0}, {"foo": 1.0, "bar": 4.0}]))
  1109. @pytest.mark.modin
  1110. def test_filter_modin_exposed_type_with_sheetname(self, excel_file):
  1111. dn = ExcelDataNode(
  1112. "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "modin"}
  1113. )
  1114. dn.write(
  1115. [
  1116. {"foo": 1, "bar": 1},
  1117. {"foo": 1, "bar": 2},
  1118. {"foo": 1},
  1119. {"foo": 2, "bar": 2},
  1120. {"bar": 2},
  1121. ]
  1122. )
  1123. # Test datanode indexing and slicing
  1124. assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None]))
  1125. assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2]))
  1126. assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))
  1127. # Test filter data
  1128. filtered_by_filter_method = dn.filter(("foo", 1, Operator.EQUAL))
  1129. filtered_by_indexing = dn[dn["foo"] == 1]
  1130. expected_data = modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}, {"foo": 1.0}])
  1131. df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1132. df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)
  1133. filtered_by_filter_method = dn.filter(("foo", 1, Operator.NOT_EQUAL))
  1134. filtered_by_indexing = dn[dn["foo"] != 1]
  1135. expected_data = modin_pd.DataFrame([{"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
  1136. df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1137. df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)
  1138. filtered_by_filter_method = dn.filter(("bar", 2, Operator.EQUAL))
  1139. filtered_by_indexing = dn[dn["bar"] == 2]
  1140. expected_data = modin_pd.DataFrame([{"foo": 1.0, "bar": 2.0}, {"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
  1141. df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1142. df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)
  1143. filtered_by_filter_method = dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)
  1144. filtered_by_indexing = dn[(dn["bar"] == 1) | (dn["bar"] == 2)]
  1145. expected_data = modin_pd.DataFrame(
  1146. [
  1147. {"foo": 1.0, "bar": 1.0},
  1148. {"foo": 1.0, "bar": 2.0},
  1149. {"foo": 2.0, "bar": 2.0},
  1150. {"bar": 2.0},
  1151. ]
  1152. )
  1153. df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
  1154. df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)
  1155. @pytest.mark.modin
  1156. def test_filter_modin_exposed_type_without_sheetname(self, excel_file):
  1157. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "modin"})
  1158. dn.write(
  1159. [
  1160. {"foo": 1, "bar": 1},
  1161. {"foo": 1, "bar": 2},
  1162. {"foo": 1},
  1163. {"foo": 2, "bar": 2},
  1164. {"bar": 2},
  1165. ]
  1166. )
  1167. assert len(dn.filter(("foo", 1, Operator.EQUAL))["Sheet1"]) == 3
  1168. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["Sheet1"]) == 2
  1169. assert len(dn.filter(("bar", 2, Operator.EQUAL))["Sheet1"]) == 3
  1170. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["Sheet1"]) == 4
  1171. assert dn["Sheet1"]["foo"].equals(modin_pd.Series([1, 1, 1, 2, None]))
  1172. assert dn["Sheet1"]["bar"].equals(modin_pd.Series([1, 2, None, 2, 2]))
  1173. assert dn["Sheet1"][:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))
  1174. @pytest.mark.modin
  1175. def test_filter_modin_exposed_type_multisheet(self, excel_file):
  1176. dn = ExcelDataNode(
  1177. "foo",
  1178. Scope.SCENARIO,
  1179. properties={"path": excel_file, "sheet_name": ["sheet_1", "sheet_2"], "exposed_type": "modin"},
  1180. )
  1181. dn.write(
  1182. {
  1183. "sheet_1": pd.DataFrame(
  1184. [
  1185. {"foo": 1, "bar": 1},
  1186. {"foo": 1, "bar": 2},
  1187. {"foo": 1},
  1188. {"foo": 2, "bar": 2},
  1189. {"bar": 2},
  1190. ]
  1191. ),
  1192. "sheet_2": pd.DataFrame(
  1193. [
  1194. {"foo": 1, "bar": 3},
  1195. {"foo": 1, "bar": 4},
  1196. {"foo": 1},
  1197. {"foo": 2, "bar": 4},
  1198. {"bar": 4},
  1199. ]
  1200. ),
  1201. }
  1202. )
  1203. assert len(dn.filter(("foo", 1, Operator.EQUAL))) == 2
  1204. assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_1"]) == 3
  1205. assert len(dn.filter(("foo", 1, Operator.EQUAL))["sheet_2"]) == 3
  1206. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))) == 2
  1207. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_1"]) == 2
  1208. assert len(dn.filter(("foo", 1, Operator.NOT_EQUAL))["sheet_2"]) == 2
  1209. assert len(dn.filter(("bar", 2, Operator.EQUAL))) == 2
  1210. assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_1"]) == 3
  1211. assert len(dn.filter(("bar", 2, Operator.EQUAL))["sheet_2"]) == 0
  1212. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)) == 2
  1213. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_1"]) == 4
  1214. assert len(dn.filter([("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR)["sheet_2"]) == 0
  1215. assert dn["sheet_1"]["foo"].equals(modin_pd.Series([1, 1, 1, 2, None]))
  1216. assert dn["sheet_2"]["foo"].equals(modin_pd.Series([1, 1, 1, 2, None]))
  1217. assert dn["sheet_1"]["bar"].equals(modin_pd.Series([1, 2, None, 2, 2]))
  1218. assert dn["sheet_2"]["bar"].equals(modin_pd.Series([3, 4, None, 4, 4]))
  1219. assert dn["sheet_1"][:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))
  1220. assert dn["sheet_2"][:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 3.0}, {"foo": 1.0, "bar": 4.0}]))
  1221. def test_filter_numpy_exposed_type_with_sheetname(self, excel_file):
  1222. dn = ExcelDataNode(
  1223. "foo", Scope.SCENARIO, properties={"path": excel_file, "sheet_name": "Sheet1", "exposed_type": "numpy"}
  1224. )
  1225. dn.write(
  1226. [
  1227. [1, 1],
  1228. [1, 2],
  1229. [1, 3],
  1230. [2, 1],
  1231. [2, 2],
  1232. [2, 3],
  1233. ]
  1234. )
  1235. # Test datanode indexing and slicing
  1236. assert np.array_equal(dn[0], np.array([1, 1]))
  1237. assert np.array_equal(dn[1], np.array([1, 2]))
  1238. assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]]))
  1239. assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2]))
  1240. assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]]))
  1241. # Test filter data
  1242. assert np.array_equal(dn.filter((0, 1, Operator.EQUAL)), np.array([[1, 1], [1, 2], [1, 3]]))
  1243. assert np.array_equal(dn[dn[:, 0] == 1], np.array([[1, 1], [1, 2], [1, 3]]))
  1244. assert np.array_equal(dn.filter((0, 1, Operator.NOT_EQUAL)), np.array([[2, 1], [2, 2], [2, 3]]))
  1245. assert np.array_equal(dn[dn[:, 0] != 1], np.array([[2, 1], [2, 2], [2, 3]]))
  1246. assert np.array_equal(dn.filter((1, 2, Operator.EQUAL)), np.array([[1, 2], [2, 2]]))
  1247. assert np.array_equal(dn[dn[:, 1] == 2], np.array([[1, 2], [2, 2]]))
  1248. assert np.array_equal(
  1249. dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR),
  1250. np.array([[1, 1], [1, 2], [2, 1], [2, 2]]),
  1251. )
  1252. assert np.array_equal(dn[(dn[:, 1] == 1) | (dn[:, 1] == 2)], np.array([[1, 1], [1, 2], [2, 1], [2, 2]]))
  1253. def test_filter_numpy_exposed_type_without_sheetname(self, excel_file):
  1254. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": excel_file, "exposed_type": "numpy"})
  1255. dn.write(
  1256. [
  1257. [1, 1],
  1258. [1, 2],
  1259. [1, 3],
  1260. [2, 1],
  1261. [2, 2],
  1262. [2, 3],
  1263. ]
  1264. )
  1265. assert len(dn.filter((0, 1, Operator.EQUAL))["Sheet1"]) == 3
  1266. assert len(dn.filter((0, 1, Operator.NOT_EQUAL))["Sheet1"]) == 3
  1267. assert len(dn.filter((1, 2, Operator.EQUAL))["Sheet1"]) == 2
  1268. assert len(dn.filter([(0, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)["Sheet1"]) == 4
  1269. assert np.array_equal(dn["Sheet1"][0], np.array([1, 1]))
  1270. assert np.array_equal(dn["Sheet1"][1], np.array([1, 2]))
  1271. assert np.array_equal(dn["Sheet1"][:3], np.array([[1, 1], [1, 2], [1, 3]]))
  1272. assert np.array_equal(dn["Sheet1"][:, 0], np.array([1, 1, 1, 2, 2, 2]))
  1273. assert np.array_equal(dn["Sheet1"][1:4, :1], np.array([[1], [1], [2]]))
  1274. def test_filter_numpy_exposed_type_multisheet(self, excel_file):
  1275. dn = ExcelDataNode(
  1276. "foo",
  1277. Scope.SCENARIO,
  1278. properties={"path": excel_file, "sheet_name": ["sheet_1", "sheet_2"], "exposed_type": "numpy"},
  1279. )
  1280. dn.write(
  1281. {
  1282. "sheet_1": pd.DataFrame(
  1283. [
  1284. [1, 1],
  1285. [1, 2],
  1286. [1, 3],
  1287. [2, 1],
  1288. [2, 2],
  1289. [2, 3],
  1290. ]
  1291. ),
  1292. "sheet_2": pd.DataFrame(
  1293. [
  1294. [1, 4],
  1295. [1, 5],
  1296. [1, 6],
  1297. [2, 4],
  1298. [2, 5],
  1299. [2, 6],
  1300. ]
  1301. ),
  1302. }
  1303. )
  1304. assert len(dn.filter((0, 1, Operator.EQUAL))) == 2
  1305. assert len(dn.filter((0, 1, Operator.EQUAL))["sheet_1"]) == 3
  1306. assert len(dn.filter((0, 1, Operator.EQUAL))["sheet_2"]) == 3
  1307. assert len(dn.filter((0, 1, Operator.NOT_EQUAL))) == 2
  1308. assert len(dn.filter((0, 1, Operator.NOT_EQUAL))["sheet_1"]) == 3
  1309. assert len(dn.filter((0, 1, Operator.NOT_EQUAL))["sheet_2"]) == 3
  1310. assert len(dn.filter((1, 2, Operator.EQUAL))) == 2
  1311. assert len(dn.filter((1, 2, Operator.EQUAL))["sheet_1"]) == 2
  1312. assert len(dn.filter((1, 2, Operator.EQUAL))["sheet_2"]) == 0
  1313. assert len(dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)) == 2
  1314. assert len(dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)["sheet_1"]) == 4
  1315. assert len(dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR)["sheet_2"]) == 0
  1316. assert np.array_equal(dn["sheet_1"][0], np.array([1, 1]))
  1317. assert np.array_equal(dn["sheet_2"][0], np.array([1, 4]))
  1318. assert np.array_equal(dn["sheet_1"][1], np.array([1, 2]))
  1319. assert np.array_equal(dn["sheet_2"][1], np.array([1, 5]))
  1320. assert np.array_equal(dn["sheet_1"][:3], np.array([[1, 1], [1, 2], [1, 3]]))
  1321. assert np.array_equal(dn["sheet_2"][:3], np.array([[1, 4], [1, 5], [1, 6]]))
  1322. assert np.array_equal(dn["sheet_1"][:, 0], np.array([1, 1, 1, 2, 2, 2]))
  1323. assert np.array_equal(dn["sheet_2"][:, 1], np.array([4, 5, 6, 4, 5, 6]))
  1324. assert np.array_equal(dn["sheet_1"][1:4, :1], np.array([[1], [1], [2]]))
  1325. assert np.array_equal(dn["sheet_2"][1:4, 1:2], np.array([[5], [6], [4]]))
  1326. def test_set_path(self):
  1327. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"default_path": "foo.xlsx"})
  1328. assert dn.path == "foo.xlsx"
  1329. dn.path = "bar.xlsx"
  1330. assert dn.path == "bar.xlsx"
  1331. @pytest.mark.parametrize(
  1332. ["properties", "exists"],
  1333. [
  1334. ({}, False),
  1335. ({"default_data": {"a": ["foo", "bar"]}}, True),
  1336. ],
  1337. )
  1338. def test_create_with_default_data(self, properties, exists):
  1339. dn = ExcelDataNode("foo", Scope.SCENARIO, DataNodeId("dn_id"), properties=properties)
  1340. assert os.path.exists(dn.path) is exists
  1341. def test_read_write_after_modify_path(self):
  1342. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  1343. new_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/temp.xlsx")
  1344. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"default_path": path})
  1345. read_data = dn.read()
  1346. assert read_data is not None
  1347. dn.path = new_path
  1348. with pytest.raises(FileNotFoundError):
  1349. dn.read()
  1350. dn.write(read_data)
  1351. for sheet, df in dn.read().items():
  1352. assert np.array_equal(df.values, read_data[sheet].values)
  1353. def test_exposed_type_custom_class_after_modify_path(self):
  1354. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx") # ["Sheet1", "Sheet2"]
  1355. new_path = os.path.join(
  1356. pathlib.Path(__file__).parent.resolve(), "data_sample/example_2.xlsx"
  1357. ) # ["Sheet1", "Sheet2", "Sheet3"]
  1358. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"default_path": path, "exposed_type": MyCustomObject1})
  1359. assert dn.exposed_type == MyCustomObject1
  1360. dn.read()
  1361. dn.path = new_path
  1362. dn.read()
  1363. dn = ExcelDataNode(
  1364. "foo",
  1365. Scope.SCENARIO,
  1366. properties={"default_path": path, "exposed_type": MyCustomObject1, "sheet_name": ["Sheet4"]},
  1367. )
  1368. assert dn.exposed_type == MyCustomObject1
  1369. with pytest.raises(NonExistingExcelSheet):
  1370. dn.read()
  1371. def test_exposed_type_dict(self):
  1372. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx") # ["Sheet1", "Sheet2"]
  1373. dn = ExcelDataNode(
  1374. "foo",
  1375. Scope.SCENARIO,
  1376. properties={
  1377. "default_path": path,
  1378. "exposed_type": {
  1379. "Sheet1": MyCustomObject1,
  1380. "Sheet2": MyCustomObject2,
  1381. "Sheet3": MyCustomObject1,
  1382. },
  1383. },
  1384. )
  1385. data = dn.read()
  1386. assert isinstance(data, Dict)
  1387. assert isinstance(data["Sheet1"][0], MyCustomObject1)
  1388. assert isinstance(data["Sheet2"][0], MyCustomObject2)
  1389. def test_exposed_type_list(self):
  1390. path_1 = os.path.join(
  1391. pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx"
  1392. ) # ["Sheet1", "Sheet2"]
  1393. path_2 = os.path.join(
  1394. pathlib.Path(__file__).parent.resolve(), "data_sample/example_2.xlsx"
  1395. ) # ["Sheet1", "Sheet2", "Sheet3"]
  1396. dn = ExcelDataNode(
  1397. "foo",
  1398. Scope.SCENARIO,
  1399. properties={"default_path": path_1, "exposed_type": [MyCustomObject1, MyCustomObject2]},
  1400. )
  1401. data = dn.read()
  1402. assert isinstance(data, Dict)
  1403. assert isinstance(data["Sheet1"][0], MyCustomObject1)
  1404. assert isinstance(data["Sheet2"][0], MyCustomObject2)
  1405. dn.path = path_2
  1406. with pytest.raises(ExposedTypeLengthMismatch):
  1407. dn.read()
  1408. def test_not_trying_to_read_sheet_names_when_exposed_type_is_set(self):
  1409. dn = ExcelDataNode(
  1410. "foo", Scope.SCENARIO, properties={"default_path": "notexistyet.xlsx", "exposed_type": MyCustomObject1}
  1411. )
  1412. assert dn.path == "notexistyet.xlsx"
  1413. assert dn.exposed_type == MyCustomObject1
  1414. dn = ExcelDataNode(
  1415. "foo",
  1416. Scope.SCENARIO,
  1417. properties={"default_path": "notexistyet.xlsx", "exposed_type": [MyCustomObject1, MyCustomObject2]},
  1418. )
  1419. assert dn.path == "notexistyet.xlsx"
  1420. assert dn.exposed_type == [MyCustomObject1, MyCustomObject2]
  1421. dn = ExcelDataNode(
  1422. "foo",
  1423. Scope.SCENARIO,
  1424. properties={
  1425. "default_path": "notexistyet.xlsx",
  1426. "exposed_type": {"Sheet1": MyCustomObject1, "Sheet2": MyCustomObject2},
  1427. },
  1428. )
  1429. assert dn.path == "notexistyet.xlsx"
  1430. assert dn.exposed_type == {"Sheet1": MyCustomObject1, "Sheet2": MyCustomObject2}
  1431. def test_exposed_type_default(self):
  1432. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  1433. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"default_path": path, "sheet_name": "Sheet1"})
  1434. assert dn.exposed_type == "pandas"
  1435. data = dn.read()
  1436. assert isinstance(data, pd.DataFrame)
  1437. def test_pandas_exposed_type(self):
  1438. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  1439. dn = ExcelDataNode(
  1440. "foo", Scope.SCENARIO, properties={"default_path": path, "exposed_type": "pandas", "sheet_name": "Sheet1"}
  1441. )
  1442. assert dn.exposed_type == "pandas"
  1443. data = dn.read()
  1444. assert isinstance(data, pd.DataFrame)
  1445. def test_complex_exposed_type_dict(self):
  1446. # ["Sheet1", "Sheet2", "Sheet3", "Sheet4", "Sheet5"]
  1447. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example_4.xlsx")
  1448. dn = ExcelDataNode(
  1449. "foo",
  1450. Scope.SCENARIO,
  1451. properties={
  1452. "default_path": path,
  1453. "exposed_type": {
  1454. "Sheet1": MyCustomObject1,
  1455. "Sheet2": "numpy",
  1456. "Sheet3": "pandas",
  1457. },
  1458. "sheet_name": ["Sheet1", "Sheet2", "Sheet3", "Sheet4"],
  1459. },
  1460. )
  1461. data = dn.read()
  1462. assert isinstance(data, dict)
  1463. assert isinstance(data["Sheet1"], list)
  1464. assert isinstance(data["Sheet1"][0], MyCustomObject1)
  1465. assert isinstance(data["Sheet2"], np.ndarray)
  1466. assert isinstance(data["Sheet3"], pd.DataFrame)
  1467. assert isinstance(data["Sheet4"], pd.DataFrame)
  1468. assert data.get("Sheet5") is None
  1469. def test_complex_exposed_type_list(self):
  1470. # ["Sheet1", "Sheet2", "Sheet3", "Sheet4","Sheet5"]
  1471. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example_4.xlsx")
  1472. dn = ExcelDataNode(
  1473. "foo",
  1474. Scope.SCENARIO,
  1475. properties={
  1476. "default_path": path,
  1477. "exposed_type": [MyCustomObject1, "numpy", "pandas"],
  1478. "sheet_name": ["Sheet1", "Sheet2", "Sheet3"],
  1479. },
  1480. )
  1481. data = dn.read()
  1482. assert isinstance(data, dict)
  1483. assert isinstance(data["Sheet1"], list)
  1484. assert isinstance(data["Sheet1"][0], MyCustomObject1)
  1485. assert isinstance(data["Sheet2"], np.ndarray)
  1486. assert isinstance(data["Sheet3"], pd.DataFrame)
  1487. def test_invalid_exposed_type(self):
  1488. path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.xlsx")
  1489. with pytest.raises(InvalidExposedType):
  1490. ExcelDataNode(
  1491. "foo",
  1492. Scope.SCENARIO,
  1493. properties={"default_path": path, "exposed_type": "invalid", "sheet_name": "Sheet1"},
  1494. )
  1495. with pytest.raises(InvalidExposedType):
  1496. ExcelDataNode(
  1497. "foo",
  1498. Scope.SCENARIO,
  1499. properties={
  1500. "default_path": path,
  1501. "exposed_type": ["numpy", "invalid", "pandas"],
  1502. "sheet_name": "Sheet1",
  1503. },
  1504. )
  1505. with pytest.raises(InvalidExposedType):
  1506. ExcelDataNode(
  1507. "foo",
  1508. Scope.SCENARIO,
  1509. properties={
  1510. "default_path": path,
  1511. "exposed_type": {"Sheet1": "pandas", "Sheet2": "invalid"},
  1512. "sheet_name": "Sheet1",
  1513. },
  1514. )
  1515. def test_get_system_modified_date_instead_of_last_edit_date(self, tmpdir_factory):
  1516. temp_file_path = str(tmpdir_factory.mktemp("data").join("temp.xlsx"))
  1517. pd.DataFrame([]).to_excel(temp_file_path)
  1518. dn = ExcelDataNode("foo", Scope.SCENARIO, properties={"path": temp_file_path, "exposed_type": "pandas"})
  1519. dn.write(pd.DataFrame([1, 2, 3]))
  1520. previous_edit_date = dn.last_edit_date
  1521. sleep(0.1)
  1522. pd.DataFrame([4, 5, 6]).to_excel(temp_file_path)
  1523. new_edit_date = datetime.fromtimestamp(os.path.getmtime(temp_file_path))
  1524. assert previous_edit_date < dn.last_edit_date
  1525. assert new_edit_date == dn.last_edit_date
  1526. sleep(0.1)
  1527. dn.write(pd.DataFrame([7, 8, 9]))
  1528. assert new_edit_date < dn.last_edit_date
  1529. os.unlink(temp_file_path)