# Copyright 2023 Avaiga Private Limited
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.

import os
import pathlib
from datetime import datetime
from time import sleep

import modin.pandas as modin_pd
import numpy as np
import pandas as pd
import pytest
from modin.pandas.test.utils import df_equals
from pandas.testing import assert_frame_equal

from taipy.config.common.scope import Scope
from taipy.config.config import Config
from taipy.config.exceptions.exceptions import InvalidConfigurationId
from taipy.core.data._data_manager import _DataManager
from taipy.core.data.csv import CSVDataNode
from taipy.core.data.data_node_id import DataNodeId
from taipy.core.data.operator import JoinOperator, Operator
from taipy.core.exceptions.exceptions import InvalidExposedType, NoData


@pytest.fixture(scope="function", autouse=True)
def cleanup():
    yield
    path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/temp.csv")
    if os.path.isfile(path):
        os.remove(path)
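
# Note: this autouse fixture removes data_sample/temp.csv after every test; the file
# is created by TestCSVDataNode.test_read_write_after_modify_path, which redirects a
# data node's path to a new location and writes there.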


class MyCustomObject:
    def __init__(self, id, integer, text):
        self.id = id
        self.integer = integer
        self.text = text


class TestCSVDataNode:
    def test_create(self):
        path = "data/node/path"
        dn = CSVDataNode(
            "foo_bar", Scope.SCENARIO, properties={"path": path, "has_header": False, "name": "super name"}
        )
        assert isinstance(dn, CSVDataNode)
        assert dn.storage_type() == "csv"
        assert dn.config_id == "foo_bar"
        assert dn.name == "super name"
        assert dn.scope == Scope.SCENARIO
        assert dn.id is not None
        assert dn.owner_id is None
        assert dn.last_edit_date is None
        assert dn.job_ids == []
        assert not dn.is_ready_for_reading
        assert dn.path == path
        assert dn.has_header is False
        assert dn.exposed_type == "pandas"

        with pytest.raises(InvalidConfigurationId):
            CSVDataNode(
                "foo bar", Scope.SCENARIO, properties={"path": path, "has_header": False, "name": "super name"}
            )
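
    # The second construction is expected to fail: "foo bar" contains a space and is
    # not a valid identifier, which is what InvalidConfigurationId guards against.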

    def test_get_user_properties(self, csv_file):
        dn_1 = CSVDataNode("dn_1", Scope.SCENARIO, properties={"path": "data/node/path"})
        assert dn_1._get_user_properties() == {}

        dn_2 = CSVDataNode(
            "dn_2",
            Scope.SCENARIO,
            properties={
                "exposed_type": "numpy",
                "default_data": "foo",
                "default_path": csv_file,
                "has_header": False,
                "foo": "bar",
            },
        )

        # exposed_type, default_data, default_path, path, has_header, sheet_name are filtered out
        assert dn_2._get_user_properties() == {"foo": "bar"}

    def test_new_csv_data_node_with_existing_file_is_ready_for_reading(self):
        not_ready_dn_cfg = Config.configure_data_node("not_ready_data_node_config_id", "csv", path="NOT_EXISTING.csv")
        not_ready_dn = _DataManager._bulk_get_or_create([not_ready_dn_cfg])[not_ready_dn_cfg]
        assert not not_ready_dn.is_ready_for_reading

        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")
        ready_dn_cfg = Config.configure_data_node("ready_data_node_config_id", "csv", path=path)
        ready_dn = _DataManager._bulk_get_or_create([ready_dn_cfg])[ready_dn_cfg]
        assert ready_dn.is_ready_for_reading
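
    # For file-based nodes, is_ready_for_reading appears to track whether the target
    # file actually exists on disk, independently of any explicit write.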

    @pytest.mark.parametrize(
        ["properties", "exists"],
        [
            ({}, False),
            ({"default_data": ["foo", "bar"]}, True),
        ],
    )
    def test_create_with_default_data(self, properties, exists):
        dn = CSVDataNode("foo", Scope.SCENARIO, DataNodeId("dn_id"), properties=properties)
        assert os.path.exists(dn.path) is exists
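
    # Providing default_data makes the node write its file at creation time, which
    # is why the file exists in the second parametrized case only.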

    def test_read_with_header_pandas(self):
        not_existing_csv = CSVDataNode("foo", Scope.SCENARIO, properties={"path": "WRONG.csv", "has_header": True})
        with pytest.raises(NoData):
            assert not_existing_csv.read() is None
            not_existing_csv.read_or_raise()

        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")

        # Create CSVDataNode without exposed_type (Default is pandas.DataFrame)
        csv_data_node_as_pandas = CSVDataNode("bar", Scope.SCENARIO, properties={"path": path})
        data_pandas = csv_data_node_as_pandas.read()
        assert isinstance(data_pandas, pd.DataFrame)
        assert len(data_pandas) == 10
        assert np.array_equal(data_pandas.to_numpy(), pd.read_csv(path).to_numpy())

    @pytest.mark.modin
    def test_read_with_header_modin(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")

        # Create CSVDataNode with modin exposed_type
        csv_data_node_as_modin = CSVDataNode("bar", Scope.SCENARIO, properties={"path": path, "exposed_type": "modin"})
        data_modin = csv_data_node_as_modin.read()
        assert isinstance(data_modin, modin_pd.DataFrame)
        assert len(data_modin) == 10
        assert np.array_equal(data_modin.to_numpy(), modin_pd.read_csv(path).to_numpy())

    def test_read_with_header_numpy(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")

        # Create CSVDataNode with numpy exposed_type
        csv_data_node_as_numpy = CSVDataNode(
            "bar", Scope.SCENARIO, properties={"path": path, "has_header": True, "exposed_type": "numpy"}
        )
        data_numpy = csv_data_node_as_numpy.read()
        assert isinstance(data_numpy, np.ndarray)
        assert len(data_numpy) == 10
        assert np.array_equal(data_numpy, pd.read_csv(path).to_numpy())

    def test_read_with_header_custom_exposed_type(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")
        csv_data_node_as_pandas = CSVDataNode("bar", Scope.SCENARIO, properties={"path": path})
        data_pandas = csv_data_node_as_pandas.read()

        # Create the same CSVDataNode but with custom exposed_type
        csv_data_node_as_custom_object = CSVDataNode(
            "bar", Scope.SCENARIO, properties={"path": path, "exposed_type": MyCustomObject}
        )
        data_custom = csv_data_node_as_custom_object.read()
        assert isinstance(data_custom, list)
        assert len(data_custom) == 10

        for (_, row_pandas), row_custom in zip(data_pandas.iterrows(), data_custom):
            assert isinstance(row_custom, MyCustomObject)
            assert row_pandas["id"] == row_custom.id
            assert str(row_pandas["integer"]) == row_custom.integer
            assert row_pandas["text"] == row_custom.text
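
    # With a class as exposed_type, read() apparently returns one instance per CSV
    # row, with the row's values passed to the constructor. The values arrive as
    # strings, which is why the integer column is compared via str(...) above.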

    def test_read_without_header(self):
        not_existing_csv = CSVDataNode("foo", Scope.SCENARIO, properties={"path": "WRONG.csv", "has_header": False})
        with pytest.raises(NoData):
            assert not_existing_csv.read() is None
            not_existing_csv.read_or_raise()

        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")

        # Create CSVDataNode without exposed_type (Default is pandas.DataFrame)
        csv_data_node_as_pandas = CSVDataNode("bar", Scope.SCENARIO, properties={"path": path, "has_header": False})
        data_pandas = csv_data_node_as_pandas.read()
        assert isinstance(data_pandas, pd.DataFrame)
        assert len(data_pandas) == 11
        assert np.array_equal(data_pandas.to_numpy(), pd.read_csv(path, header=None).to_numpy())

        # Create CSVDataNode with numpy exposed_type
        csv_data_node_as_numpy = CSVDataNode(
            "qux", Scope.SCENARIO, properties={"path": path, "has_header": False, "exposed_type": "numpy"}
        )
        data_numpy = csv_data_node_as_numpy.read()
        assert isinstance(data_numpy, np.ndarray)
        assert len(data_numpy) == 11
        assert np.array_equal(data_numpy, pd.read_csv(path, header=None).to_numpy())

        # Create the same CSVDataNode but with custom exposed_type
        csv_data_node_as_custom_object = CSVDataNode(
            "quux", Scope.SCENARIO, properties={"path": path, "has_header": False, "exposed_type": MyCustomObject}
        )
        data_custom = csv_data_node_as_custom_object.read()
        assert isinstance(data_custom, list)
        assert len(data_custom) == 11

        for (_, row_pandas), row_custom in zip(data_pandas.iterrows(), data_custom):
            assert isinstance(row_custom, MyCustomObject)
            assert row_pandas[0] == row_custom.id
            assert str(row_pandas[1]) == row_custom.integer
            assert row_pandas[2] == row_custom.text

    @pytest.mark.modin
    def test_read_without_header_modin(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")

        # Create CSVDataNode with modin exposed_type
        csv_data_node_as_modin = CSVDataNode(
            "baz", Scope.SCENARIO, properties={"path": path, "has_header": False, "exposed_type": "modin"}
        )
        data_modin = csv_data_node_as_modin.read()
        assert isinstance(data_modin, modin_pd.DataFrame)
        assert len(data_modin) == 11
        assert np.array_equal(data_modin.to_numpy(), modin_pd.read_csv(path, header=None).to_numpy())

    @pytest.mark.parametrize(
        "content",
        [
            ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
            (pd.DataFrame([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}])),
            ([[11, 22, 33], [44, 55, 66]]),
        ],
    )
    def test_append(self, csv_file, default_data_frame, content):
        csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file})
        assert_frame_equal(csv_dn.read(), default_data_frame)

        csv_dn.append(content)
        assert_frame_equal(
            csv_dn.read(),
            pd.concat([default_data_frame, pd.DataFrame(content, columns=["a", "b", "c"])]).reset_index(drop=True),
        )
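
    # append accepts a list of dicts, a DataFrame, or bare row lists; in each case
    # the new rows are concatenated after the existing CSV content.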

    @pytest.mark.modin
    @pytest.mark.parametrize(
        "content",
        [
            ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}]),
            (pd.DataFrame([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}])),
            ([[11, 22, 33], [44, 55, 66]]),
        ],
    )
    def test_append_modin(self, csv_file, default_data_frame, content):
        csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "modin"})
        df_equals(csv_dn.read(), modin_pd.DataFrame(default_data_frame))

        csv_dn.append(content)
        df_equals(
            csv_dn.read(),
            modin_pd.concat(
                [modin_pd.DataFrame(default_data_frame), modin_pd.DataFrame(content, columns=["a", "b", "c"])]
            ).reset_index(drop=True),
        )

    @pytest.mark.parametrize(
        "content,columns",
        [
            ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], None),
            ([[11, 22, 33], [44, 55, 66]], None),
            ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"]),
        ],
    )
    def test_write(self, csv_file, default_data_frame, content, columns):
        csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file})
        assert np.array_equal(csv_dn.read().values, default_data_frame.values)

        if not columns:
            csv_dn.write(content)
            df = pd.DataFrame(content)
        else:
            csv_dn.write_with_column_names(content, columns)
            df = pd.DataFrame(content, columns=columns)
        assert np.array_equal(csv_dn.read().values, df.values)

        csv_dn.write(None)
        assert len(csv_dn.read()) == 0
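
    # write replaces the file contents entirely (unlike append), and writing None
    # empties the file; write_with_column_names supplies an explicit header row.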

    def test_write_with_different_encoding(self, csv_file):
        data = pd.DataFrame([{"≥a": 1, "b": 2}])

        utf8_dn = CSVDataNode("utf8_dn", Scope.SCENARIO, properties={"default_path": csv_file})
        utf16_dn = CSVDataNode("utf16_dn", Scope.SCENARIO, properties={"default_path": csv_file, "encoding": "utf-16"})

        # If a file is written with utf-8 encoding, it can only be read with utf-8, not utf-16 encoding
        utf8_dn.write(data)
        assert np.array_equal(utf8_dn.read(), data)
        with pytest.raises(UnicodeError):
            utf16_dn.read()

        # If a file is written with utf-16 encoding, it can only be read with utf-16, not utf-8 encoding
        utf16_dn.write(data)
        assert np.array_equal(utf16_dn.read(), data)
        with pytest.raises(UnicodeError):
            utf8_dn.read()
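
    # The non-ASCII "≥" in the column name looks deliberate: it guarantees content
    # that decodes differently under utf-8 and utf-16, so reading with the wrong
    # codec raises UnicodeError.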

    @pytest.mark.modin
    @pytest.mark.parametrize(
        "content,columns",
        [
            ([{"a": 11, "b": 22, "c": 33}, {"a": 44, "b": 55, "c": 66}], None),
            ([[11, 22, 33], [44, 55, 66]], None),
            ([[11, 22, 33], [44, 55, 66]], ["e", "f", "g"]),
        ],
    )
    def test_write_modin(self, csv_file, default_data_frame, content, columns):
        default_data_frame = modin_pd.DataFrame(default_data_frame)
        csv_dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "modin"})
        assert np.array_equal(csv_dn.read().values, default_data_frame.values)

        if not columns:
            csv_dn.write(content)
            df = pd.DataFrame(content)
        else:
            csv_dn.write_with_column_names(content, columns)
            df = pd.DataFrame(content, columns=columns)
        assert np.array_equal(csv_dn.read().values, df.values)

        csv_dn.write(None)
        assert len(csv_dn.read()) == 0

    @pytest.mark.modin
    def test_write_modin_with_different_encoding(self, csv_file):
        data = pd.DataFrame([{"≥a": 1, "b": 2}])

        utf8_dn = CSVDataNode("utf8_dn", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "modin"})
        utf16_dn = CSVDataNode(
            "utf16_dn", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "modin", "encoding": "utf-16"}
        )

        # If a file is written with utf-8 encoding, it can only be read with utf-8, not utf-16 encoding
        utf8_dn.write(data)
        assert np.array_equal(utf8_dn.read(), data)
        with pytest.raises(UnicodeError):
            utf16_dn.read()

        # If a file is written with utf-16 encoding, it can only be read with utf-16, not utf-8 encoding
        utf16_dn.write(data)
        assert np.array_equal(utf16_dn.read(), data)
        with pytest.raises(UnicodeError):
            utf8_dn.read()

    def test_set_path(self):
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"default_path": "foo.csv"})
        assert dn.path == "foo.csv"
        dn.path = "bar.csv"
        assert dn.path == "bar.csv"

    def test_read_write_after_modify_path(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")
        new_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/temp.csv")
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"default_path": path})
        read_data = dn.read()
        assert read_data is not None

        dn.path = new_path
        with pytest.raises(FileNotFoundError):
            dn.read()

        dn.write(read_data)
        assert dn.read().equals(read_data)
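
    # This is the test that creates data_sample/temp.csv, which the module-level
    # cleanup fixture removes after each test.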

    def test_pandas_exposed_type(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": path, "exposed_type": "pandas"})
        assert isinstance(dn.read(), pd.DataFrame)

    def test_filter_pandas_exposed_type(self, csv_file):
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "pandas"})
        dn.write(
            [
                {"foo": 1, "bar": 1},
                {"foo": 1, "bar": 2},
                {"foo": 1},
                {"foo": 2, "bar": 2},
                {"bar": 2},
            ]
        )

        # Test datanode indexing and slicing
        assert dn["foo"].equals(pd.Series([1, 1, 1, 2, None]))
        assert dn["bar"].equals(pd.Series([1, 2, None, 2, 2]))
        assert dn[:2].equals(pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))

        # Test filter data
        filtered_by_filter_method = dn.filter(("foo", 1, Operator.EQUAL))
        filtered_by_indexing = dn[dn["foo"] == 1]
        expected_data = pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}, {"foo": 1.0}])
        assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
        assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)

        filtered_by_filter_method = dn.filter(("foo", 1, Operator.NOT_EQUAL))
        filtered_by_indexing = dn[dn["foo"] != 1]
        expected_data = pd.DataFrame([{"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
        assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
        assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)

        filtered_by_filter_method = dn.filter(("bar", 2, Operator.EQUAL))
        filtered_by_indexing = dn[dn["bar"] == 2]
        expected_data = pd.DataFrame([{"foo": 1.0, "bar": 2.0}, {"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
        assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
        assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)

        filtered_by_filter_method = dn.filter(
            [("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR
        )
        filtered_by_indexing = dn[(dn["bar"] == 1) | (dn["bar"] == 2)]
        expected_data = pd.DataFrame(
            [
                {"foo": 1.0, "bar": 1.0},
                {"foo": 1.0, "bar": 2.0},
                {"foo": 2.0, "bar": 2.0},
                {"bar": 2.0},
            ]
        )
        assert_frame_equal(filtered_by_filter_method.reset_index(drop=True), expected_data)
        assert_frame_equal(filtered_by_indexing.reset_index(drop=True), expected_data)
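
    # dn.filter(...) and pandas-style boolean indexing are expected to return the
    # same rows; keys missing from a row are read back as NaN, hence the float
    # values in the expected frames.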

    @pytest.mark.modin
    def test_filter_modin_exposed_type(self, csv_file):
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "modin"})
        dn.write(
            [
                {"foo": 1, "bar": 1},
                {"foo": 1, "bar": 2},
                {"foo": 1},
                {"foo": 2, "bar": 2},
                {"bar": 2},
            ]
        )

        # Test datanode indexing and slicing
        assert dn["foo"].equals(modin_pd.Series([1, 1, 1, 2, None]))
        assert dn["bar"].equals(modin_pd.Series([1, 2, None, 2, 2]))
        assert dn[:2].equals(modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}]))

        # Test filter data
        filtered_by_filter_method = dn.filter(("foo", 1, Operator.EQUAL))
        filtered_by_indexing = dn[dn["foo"] == 1]
        expected_data = modin_pd.DataFrame([{"foo": 1.0, "bar": 1.0}, {"foo": 1.0, "bar": 2.0}, {"foo": 1.0}])
        df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
        df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)

        filtered_by_filter_method = dn.filter(("foo", 1, Operator.NOT_EQUAL))
        filtered_by_indexing = dn[dn["foo"] != 1]
        expected_data = modin_pd.DataFrame([{"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
        df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
        df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)

        filtered_by_filter_method = dn.filter(("bar", 2, Operator.EQUAL))
        filtered_by_indexing = dn[dn["bar"] == 2]
        expected_data = modin_pd.DataFrame([{"foo": 1.0, "bar": 2.0}, {"foo": 2.0, "bar": 2.0}, {"bar": 2.0}])
        df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
        df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)

        filtered_by_filter_method = dn.filter(
            [("bar", 1, Operator.EQUAL), ("bar", 2, Operator.EQUAL)], JoinOperator.OR
        )
        filtered_by_indexing = dn[(dn["bar"] == 1) | (dn["bar"] == 2)]
        expected_data = modin_pd.DataFrame(
            [
                {"foo": 1.0, "bar": 1.0},
                {"foo": 1.0, "bar": 2.0},
                {"foo": 2.0, "bar": 2.0},
                {"bar": 2.0},
            ]
        )
        df_equals(filtered_by_filter_method.reset_index(drop=True), expected_data)
        df_equals(filtered_by_indexing.reset_index(drop=True), expected_data)

    def test_filter_numpy_exposed_type(self, csv_file):
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": csv_file, "exposed_type": "numpy"})
        dn.write(
            [
                [1, 1],
                [1, 2],
                [1, 3],
                [2, 1],
                [2, 2],
                [2, 3],
            ]
        )

        # Test datanode indexing and slicing
        assert np.array_equal(dn[0], np.array([1, 1]))
        assert np.array_equal(dn[1], np.array([1, 2]))
        assert np.array_equal(dn[:3], np.array([[1, 1], [1, 2], [1, 3]]))
        assert np.array_equal(dn[:, 0], np.array([1, 1, 1, 2, 2, 2]))
        assert np.array_equal(dn[1:4, :1], np.array([[1], [1], [2]]))

        # Test filter data
        assert np.array_equal(dn.filter((0, 1, Operator.EQUAL)), np.array([[1, 1], [1, 2], [1, 3]]))
        assert np.array_equal(dn[dn[:, 0] == 1], np.array([[1, 1], [1, 2], [1, 3]]))
        assert np.array_equal(dn.filter((0, 1, Operator.NOT_EQUAL)), np.array([[2, 1], [2, 2], [2, 3]]))
        assert np.array_equal(dn[dn[:, 0] != 1], np.array([[2, 1], [2, 2], [2, 3]]))
        assert np.array_equal(dn.filter((1, 2, Operator.EQUAL)), np.array([[1, 2], [2, 2]]))
        assert np.array_equal(dn[dn[:, 1] == 2], np.array([[1, 2], [2, 2]]))
        assert np.array_equal(
            dn.filter([(1, 1, Operator.EQUAL), (1, 2, Operator.EQUAL)], JoinOperator.OR),
            np.array([[1, 1], [1, 2], [2, 1], [2, 2]]),
        )
        assert np.array_equal(dn[(dn[:, 1] == 1) | (dn[:, 1] == 2)], np.array([[1, 1], [1, 2], [2, 1], [2, 2]]))
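
    # With the numpy exposed_type there are no column names, so filter tuples address
    # columns by position: (0, 1, Operator.EQUAL) reads as "column 0 equals 1".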

    def test_raise_error_invalid_exposed_type(self):
        path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data_sample/example.csv")
        with pytest.raises(InvalidExposedType):
            CSVDataNode("foo", Scope.SCENARIO, properties={"path": path, "exposed_type": "foo"})

    def test_get_system_modified_date_instead_of_last_edit_date(self, tmpdir_factory):
        temp_file_path = str(tmpdir_factory.mktemp("data").join("temp.csv"))
        pd.DataFrame([]).to_csv(temp_file_path)
        dn = CSVDataNode("foo", Scope.SCENARIO, properties={"path": temp_file_path, "exposed_type": "pandas"})

        dn.write(pd.DataFrame([1, 2, 3]))
        previous_edit_date = dn.last_edit_date

        sleep(0.1)
        pd.DataFrame([4, 5, 6]).to_csv(temp_file_path)
        new_edit_date = datetime.fromtimestamp(os.path.getmtime(temp_file_path))

        assert previous_edit_date < dn.last_edit_date
        assert new_edit_date == dn.last_edit_date

        sleep(0.1)
        dn.write(pd.DataFrame([7, 8, 9]))
        assert new_edit_date < dn.last_edit_date

        os.unlink(temp_file_path)