From 9b3daf6881eece5eefcf8e923d34bd9f52c61a26 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 10:56:10 -0400 Subject: [PATCH 01/17] new: test cases --- tests/test_document.py | 8 ++++++++ tests/test_graph.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tests/test_document.py b/tests/test_document.py index 95446da1..65875622 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -1832,6 +1832,14 @@ def test_document_import_bulk(col, bad_col, docs): assert col[doc_key]["loc"] == doc["loc"] empty_collection(col) + # Test import bulk with batch_size + results = col.import_bulk(docs, batch_size=len(docs) // 2) + assert type(results) is list + assert len(results) == 2 + + result = col.import_bulk(docs, batch_size=len(docs) * 2) + assert type(result) is dict + # Test import bulk on_duplicate actions doc = docs[0] doc_key = doc["_key"] diff --git a/tests/test_graph.py b/tests/test_graph.py index de440642..34215661 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -57,6 +57,34 @@ def test_graph_properties(graph, bad_graph, db): assert isinstance(properties["revision"], str) +def test_graph_provision(graph, db): + vertices = [{"foo": i} for i in range(1, 101)] + edges = [ + {"_from": f"numbers/{j}", "_to": f"numbers/{i}", "result": j / i} + for i in range(1, 101) + for j in range(1, 101) + if j % i == 0 + ] + e_d = [ + { + "edge_collection": "is_divisible_by", + "from_vertex_collections": ["numbers"], + "to_vertex_collections": ["numbers"], + } + ] + graph = db.create_graph( + name="DivisibilityGraph", + edge_definitions=e_d, + collections={ + "numbers": {"docs": vertices, "options": {"batch_size": 5}}, + "is_divisible_by": {"docs": edges, "options": {"sync": True}}, + }, + ) + + assert graph.vertex_collection("numbers").count() == len(vertices) + assert graph.edge_collection("is_divisible_by").count() == len(edges) + + def test_graph_management(db, bad_db): # Test create graph graph_name = generate_graph_name() From 0a61e9cb3474cae8118460e19414c40e207f15c2 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 10:58:56 -0400 Subject: [PATCH 02/17] initial feature commit --- arango/collection.py | 45 +++++++++++++++++++++++++++++------------ arango/database.py | 48 ++++++++++++++++++++++++++++++++++++++++---- arango/utils.py | 26 +++++++++++++++++++++++- 3 files changed, 101 insertions(+), 18 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index 4fd848fa..8e819619 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -42,7 +42,7 @@ from arango.response import Response from arango.result import Result from arango.typings import Fields, Headers, Json, Params -from arango.utils import get_doc_id, is_none_or_int, is_none_or_str +from arango.utils import get_batches, get_doc_id, is_none_or_int, is_none_or_str class Collection(ApiGroup): @@ -187,11 +187,15 @@ def _ensure_key_in_body(self, body: Json) -> Json: return body raise DocumentParseError('field "_key" or "_id" required') - def _ensure_key_from_id(self, body: Json) -> Json: + def _ensure_key_from_id(self, body: Json, index: Optional[int] = None) -> Json: """Return the body with "_key" field if it has "_id" field. + If it has neither, set the "_key" value to i, where i + is the document's index position in the sequence. :param body: Document body. :type body: dict + :param index: Document index value in the original list of documents. + :param index: int | None :return: Document body with "_key" field if it has "_id" field. :rtype: dict """ @@ -199,6 +203,11 @@ def _ensure_key_from_id(self, body: Json) -> Json: doc_id = self._validate_id(body["_id"]) body = body.copy() body["_key"] = doc_id[len(self._id_prefix) :] + + if "_id" not in body and "_key" not in body: + body = body.copy() + body["_key"] = str(index) + return body @property @@ -1934,7 +1943,8 @@ def import_bulk( overwrite: Optional[bool] = None, on_duplicate: Optional[str] = None, sync: Optional[bool] = None, - ) -> Result[Json]: + batch_size: Optional[int] = None, + ) -> Union[Result[Json], List[Result[Json]]]: """Insert multiple documents into the collection. .. note:: @@ -1984,11 +1994,16 @@ def import_bulk( :type on_duplicate: str :param sync: Block until operation is synchronized to disk. :type sync: bool | None + :param batch_size: Max number of documents to import at once. If + unspecified, will import all documents at once. + :type batch_size: int | None :return: Result of the bulk import. :rtype: dict :raise arango.exceptions.DocumentInsertError: If import fails. """ - documents = [self._ensure_key_from_id(doc) for doc in documents] + documents = [ + self._ensure_key_from_id(doc, i) for i, doc in enumerate(documents, 1) + ] params: Params = {"type": "array", "collection": self.name} if halt_on_error is not None: @@ -2006,21 +2021,25 @@ def import_bulk( if sync is not None: params["waitForSync"] = sync - request = Request( - method="post", - endpoint="/_api/import", - data=documents, - params=params, - write=self.name, - ) - def response_handler(resp: Response) -> Json: if resp.is_success: result: Json = resp.body return result raise DocumentInsertError(resp, request) - return self._execute(request, response_handler) + result = [] + for batch in get_batches(documents, batch_size): + request = Request( + method="post", + endpoint="/_api/import", + data=batch, + params=params, + write=self.name, + ) + + result.append(self._execute(request, response_handler)) + + return result[0] if len(result) == 1 else result class StandardCollection(Collection): diff --git a/arango/database.py b/arango/database.py index ea9ac058..89e92955 100644 --- a/arango/database.py +++ b/arango/database.py @@ -1170,6 +1170,7 @@ def create_graph( shard_count: Optional[int] = None, replication_factor: Optional[int] = None, write_concern: Optional[int] = None, + collections: Optional[Json] = None, ) -> Result[Graph]: """Create a new graph. @@ -1217,18 +1218,49 @@ def create_graph( parameter cannot be larger than that of **replication_factor**. Default value is 1. Used for clusters only. :type write_concern: int + :param collections: A list collection data objects to provision + the graph with. See below for example. + :type collections: dict | None :return: Graph API wrapper. :rtype: arango.graph.Graph :raise arango.exceptions.GraphCreateError: If create fails. Here is an example entry for parameter **edge_definitions**: + .. code-block:: python + + [ + { + 'edge_collection': 'teach', + 'from_vertex_collections': ['teachers'], + 'to_vertex_collections': ['lectures'] + } + ] + + Here is an example entry for parameter **collections**: + TODO: Rework **collections** data structure? .. code-block:: python { - 'edge_collection': 'teach', - 'from_vertex_collections': ['teachers'], - 'to_vertex_collections': ['lectures'] + 'teachers': { + 'docs': teacher_vertices_to_insert + 'options': { + 'overwrite' = True, + 'sync' = True, + 'batch_size' = 50 + } + }, + 'lectures': { + 'docs': lecture_vertices_to_insert + 'options': { + 'overwrite' = False, + 'sync' = False, + 'batch_size' = 4 + } + }, + 'teach': { + 'docs': teach_edges_to_insert + } } """ data: Json = {"name": name, "options": dict()} @@ -1263,7 +1295,15 @@ def response_handler(resp: Response) -> Graph: return Graph(self._conn, self._executor, name) raise GraphCreateError(resp, request) - return self._execute(request, response_handler) + graph = self._execute(request, response_handler) + + if collections is not None: + for name, data in collections.items(): + self.collection(name).import_bulk( + data["docs"], **data.get("options", {}) + ) + + return graph def delete_graph( self, diff --git a/arango/utils.py b/arango/utils.py index 42d7fff3..2b48d94b 100644 --- a/arango/utils.py +++ b/arango/utils.py @@ -8,7 +8,7 @@ import logging from contextlib import contextmanager -from typing import Any, Iterator, Union +from typing import Any, Iterator, List, Optional, Union from arango.exceptions import DocumentParseError from arango.typings import Json @@ -82,3 +82,27 @@ def is_none_or_str(obj: Any) -> bool: :rtype: bool """ return obj is None or isinstance(obj, str) + + +def get_batches( + l: List[Any], batch_size: Optional[int] = None +) -> Union[List[List[Any]], Iterator[List[Any]]]: + """Generator to split a list in batches + of (maximum) **batch_size** elements each. + If **batch_size** is invalid, return entire + list as one batch. + + :param l: The list of elements. + :type l: list + :param batch_size: Number of elements per batch. + :type batch_size: int | None + """ + if batch_size is None or batch_size <= 0 or batch_size >= len(l): + return [l] + + def generator() -> Iterator[List[Any]]: + n = int(batch_size) # type: ignore # (false positive) + for i in range(0, len(l), n): + yield l[i : i + n] + + return generator() From 247ed1c8c7defb28054edf90616c9c5625d2496e Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 11:06:38 -0400 Subject: [PATCH 03/17] fix: empty_collection(col) --- tests/test_document.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_document.py b/tests/test_document.py index 65875622..eacb407b 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -1836,9 +1836,11 @@ def test_document_import_bulk(col, bad_col, docs): results = col.import_bulk(docs, batch_size=len(docs) // 2) assert type(results) is list assert len(results) == 2 + empty_collection(col) result = col.import_bulk(docs, batch_size=len(docs) * 2) assert type(result) is dict + empty_collection(col) # Test import bulk on_duplicate actions doc = docs[0] From 87fb1c1021152e65a15a7497d4cd87051bfe3d4f Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 15:22:20 -0400 Subject: [PATCH 04/17] attempt fix: tests --- arango/collection.py | 29 ++++++++++++++--------------- arango/utils.py | 8 ++++---- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index 8e819619..89697871 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -169,14 +169,18 @@ def _prep_from_doc( else: return doc_id, doc_id, {"If-Match": rev} - def _ensure_key_in_body(self, body: Json) -> Json: - """Return the document body with "_key" field populated. + def _ensure_key_in_body(self, body: Json, index: Optional[int] = None) -> Json: + """Return the document body with "_key" field populated. If + neither "_key" or "_id" exist, set "_key" value to **index**, + where **index** is the document's position in the sequence. :param body: Document body. :type body: dict + :param index: Document index value in the original list of documents. + :param index: int | None :return: Document body with "_key" field. :rtype: dict - :raise arango.exceptions.DocumentParseError: On missing ID and key. + :raise arango.exceptions.DocumentParseError: On missing _key, _id, & index. """ if "_key" in body: return body @@ -185,17 +189,17 @@ def _ensure_key_in_body(self, body: Json) -> Json: body = body.copy() body["_key"] = doc_id[len(self._id_prefix) :] return body + elif index: + body = body.copy() + body["_key"] = str(index) + return body + raise DocumentParseError('field "_key" or "_id" required') - def _ensure_key_from_id(self, body: Json, index: Optional[int] = None) -> Json: + def _ensure_key_from_id(self, body: Json) -> Json: """Return the body with "_key" field if it has "_id" field. - If it has neither, set the "_key" value to i, where i - is the document's index position in the sequence. - :param body: Document body. :type body: dict - :param index: Document index value in the original list of documents. - :param index: int | None :return: Document body with "_key" field if it has "_id" field. :rtype: dict """ @@ -203,11 +207,6 @@ def _ensure_key_from_id(self, body: Json, index: Optional[int] = None) -> Json: doc_id = self._validate_id(body["_id"]) body = body.copy() body["_key"] = doc_id[len(self._id_prefix) :] - - if "_id" not in body and "_key" not in body: - body = body.copy() - body["_key"] = str(index) - return body @property @@ -2002,7 +2001,7 @@ def import_bulk( :raise arango.exceptions.DocumentInsertError: If import fails. """ documents = [ - self._ensure_key_from_id(doc, i) for i, doc in enumerate(documents, 1) + self._ensure_key_in_body(doc, i) for i, doc in enumerate(documents, 1) ] params: Params = {"type": "array", "collection": self.name} diff --git a/arango/utils.py b/arango/utils.py index 2b48d94b..4f9242b5 100644 --- a/arango/utils.py +++ b/arango/utils.py @@ -8,7 +8,7 @@ import logging from contextlib import contextmanager -from typing import Any, Iterator, List, Optional, Union +from typing import Any, Iterator, List, Optional, Sequence, Union from arango.exceptions import DocumentParseError from arango.typings import Json @@ -85,8 +85,8 @@ def is_none_or_str(obj: Any) -> bool: def get_batches( - l: List[Any], batch_size: Optional[int] = None -) -> Union[List[List[Any]], Iterator[List[Any]]]: + l: Sequence[Json], batch_size: Optional[int] = None +) -> Union[List[Sequence[Json]], Iterator[Sequence[Json]]]: """Generator to split a list in batches of (maximum) **batch_size** elements each. If **batch_size** is invalid, return entire @@ -100,7 +100,7 @@ def get_batches( if batch_size is None or batch_size <= 0 or batch_size >= len(l): return [l] - def generator() -> Iterator[List[Any]]: + def generator() -> Iterator[Sequence[Json]]: n = int(batch_size) # type: ignore # (false positive) for i in range(0, len(l), n): yield l[i : i + n] From 918391e69be6eedc62fe767d7f8215bd1af5185c Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 15:25:02 -0400 Subject: [PATCH 05/17] improve readability --- arango/collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arango/collection.py b/arango/collection.py index 89697871..8f116ab9 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -189,7 +189,7 @@ def _ensure_key_in_body(self, body: Json, index: Optional[int] = None) -> Json: body = body.copy() body["_key"] = doc_id[len(self._id_prefix) :] return body - elif index: + elif index is not None: body = body.copy() body["_key"] = str(index) return body From 3549999b217b64bb427bfbc79b5e3ef23955e33d Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 15:42:37 -0400 Subject: [PATCH 06/17] cleanup: test_graph_provision --- tests/test_graph.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_graph.py b/tests/test_graph.py index 34215661..14898459 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -72,8 +72,11 @@ def test_graph_provision(graph, db): "to_vertex_collections": ["numbers"], } ] + + name = "divisibility-graph" + db.delete_graph(name, drop_collections=True, ignore_missing=True) graph = db.create_graph( - name="DivisibilityGraph", + name=name, edge_definitions=e_d, collections={ "numbers": {"docs": vertices, "options": {"batch_size": 5}}, From 0d1b1b2728bd1238962ebb9724bddd78b6d52db5 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 14 Jun 2022 21:02:58 -0400 Subject: [PATCH 07/17] whitespace --- arango/collection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arango/collection.py b/arango/collection.py index 8f116ab9..bc399800 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -198,6 +198,7 @@ def _ensure_key_in_body(self, body: Json, index: Optional[int] = None) -> Json: def _ensure_key_from_id(self, body: Json) -> Json: """Return the body with "_key" field if it has "_id" field. + :param body: Document body. :type body: dict :return: Document body with "_key" field if it has "_id" field. From 2095fdd71bd0d5f900aca05b214e5bdc125e2a33 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Fri, 17 Jun 2022 10:48:55 -0400 Subject: [PATCH 08/17] fix: typo --- arango/collection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index bc399800..a04f6332 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -2027,7 +2027,7 @@ def response_handler(resp: Response) -> Json: return result raise DocumentInsertError(resp, request) - result = [] + results = [] for batch in get_batches(documents, batch_size): request = Request( method="post", @@ -2037,9 +2037,9 @@ def response_handler(resp: Response) -> Json: write=self.name, ) - result.append(self._execute(request, response_handler)) + results.append(self._execute(request, response_handler)) - return result[0] if len(result) == 1 else result + return results[0] if len(results) == 1 else results class StandardCollection(Collection): From 800fafdf2e80a7fc35066425bce2be322e2c1606 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Fri, 17 Jun 2022 20:05:16 -0400 Subject: [PATCH 09/17] revert: _ensure_key_in_body changes (no suitable for client-side functionality) --- arango/collection.py | 20 ++++---------------- tests/test_graph.py | 2 +- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index a04f6332..549710e2 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -169,18 +169,13 @@ def _prep_from_doc( else: return doc_id, doc_id, {"If-Match": rev} - def _ensure_key_in_body(self, body: Json, index: Optional[int] = None) -> Json: - """Return the document body with "_key" field populated. If - neither "_key" or "_id" exist, set "_key" value to **index**, - where **index** is the document's position in the sequence. - + def _ensure_key_in_body(self, body: Json) -> Json: + """Return the document body with "_key" field populated. :param body: Document body. :type body: dict - :param index: Document index value in the original list of documents. - :param index: int | None :return: Document body with "_key" field. :rtype: dict - :raise arango.exceptions.DocumentParseError: On missing _key, _id, & index. + :raise arango.exceptions.DocumentParseError: On missing ID and key. """ if "_key" in body: return body @@ -189,11 +184,6 @@ def _ensure_key_in_body(self, body: Json, index: Optional[int] = None) -> Json: body = body.copy() body["_key"] = doc_id[len(self._id_prefix) :] return body - elif index is not None: - body = body.copy() - body["_key"] = str(index) - return body - raise DocumentParseError('field "_key" or "_id" required') def _ensure_key_from_id(self, body: Json) -> Json: @@ -2001,9 +1991,7 @@ def import_bulk( :rtype: dict :raise arango.exceptions.DocumentInsertError: If import fails. """ - documents = [ - self._ensure_key_in_body(doc, i) for i, doc in enumerate(documents, 1) - ] + documents = [self._ensure_key_from_id(doc) for doc in documents] params: Params = {"type": "array", "collection": self.name} if halt_on_error is not None: diff --git a/tests/test_graph.py b/tests/test_graph.py index 14898459..7b9c8427 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -58,7 +58,7 @@ def test_graph_properties(graph, bad_graph, db): def test_graph_provision(graph, db): - vertices = [{"foo": i} for i in range(1, 101)] + vertices = [{"_key": str(i)} for i in range(1, 101)] edges = [ {"_from": f"numbers/{j}", "_to": f"numbers/{i}", "result": j / i} for i in range(1, 101) From 847ab74449d4e1b4ec352ab3d05216f3e5b4fe68 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Fri, 17 Jun 2022 20:06:12 -0400 Subject: [PATCH 10/17] fix: whitespace --- arango/collection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arango/collection.py b/arango/collection.py index 549710e2..fb06fb60 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -171,6 +171,7 @@ def _prep_from_doc( def _ensure_key_in_body(self, body: Json) -> Json: """Return the document body with "_key" field populated. + :param body: Document body. :type body: dict :return: Document body with "_key" field. From dc6f87fa3abe3f08d4dfd6181f8b6e8d1408ec2d Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 21 Jun 2022 13:36:49 -0400 Subject: [PATCH 11/17] fix: address comment pt1 https://github.com/ArangoDB-Community/python-arango/pull/207#discussion_r901236443 --- arango/collection.py | 2 +- arango/utils.py | 25 +++++++------------------ 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index fb06fb60..995ddb81 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -2017,7 +2017,7 @@ def response_handler(resp: Response) -> Json: raise DocumentInsertError(resp, request) results = [] - for batch in get_batches(documents, batch_size): + for batch in get_batches(documents, batch_size or len(documents)): request = Request( method="post", endpoint="/_api/import", diff --git a/arango/utils.py b/arango/utils.py index 4f9242b5..f3e6702a 100644 --- a/arango/utils.py +++ b/arango/utils.py @@ -8,7 +8,7 @@ import logging from contextlib import contextmanager -from typing import Any, Iterator, List, Optional, Sequence, Union +from typing import Any, Iterator, Sequence, Union from arango.exceptions import DocumentParseError from arango.typings import Json @@ -84,25 +84,14 @@ def is_none_or_str(obj: Any) -> bool: return obj is None or isinstance(obj, str) -def get_batches( - l: Sequence[Json], batch_size: Optional[int] = None -) -> Union[List[Sequence[Json]], Iterator[Sequence[Json]]]: +def get_batches(elements: Sequence[Json], batch_size: int) -> Iterator[Sequence[Json]]: """Generator to split a list in batches of (maximum) **batch_size** elements each. - If **batch_size** is invalid, return entire - list as one batch. - :param l: The list of elements. - :type l: list + :param elements: The list of elements. + :type elements: Sequence[Json] :param batch_size: Number of elements per batch. - :type batch_size: int | None + :type batch_size: int """ - if batch_size is None or batch_size <= 0 or batch_size >= len(l): - return [l] - - def generator() -> Iterator[Sequence[Json]]: - n = int(batch_size) # type: ignore # (false positive) - for i in range(0, len(l), n): - yield l[i : i + n] - - return generator() + for index in range(0, len(elements), batch_size): + yield elements[index : index + batch_size] From 56d129fd337e7aa6eb97cf6c8dd33086827c4674 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 21 Jun 2022 13:40:53 -0400 Subject: [PATCH 12/17] address comment pt2 https://github.com/ArangoDB-Community/python-arango/pull/207#discussion_r902156983 --- arango/collection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index 995ddb81..e7cd0b96 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -1986,10 +1986,11 @@ def import_bulk( :param sync: Block until operation is synchronized to disk. :type sync: bool | None :param batch_size: Max number of documents to import at once. If - unspecified, will import all documents at once. + unspecified, will import all documents at once. Note that the + output type changes to list[dict] if **batch_size** is specified. :type batch_size: int | None :return: Result of the bulk import. - :rtype: dict + :rtype: dict | list[dict] :raise arango.exceptions.DocumentInsertError: If import fails. """ documents = [self._ensure_key_from_id(doc) for doc in documents] From 56d74fd898bc598561c6b44d06f64efd25286769 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Tue, 21 Jun 2022 14:00:24 -0400 Subject: [PATCH 13/17] remove: graph provision functionality https://github.com/ArangoDB-Community/python-arango/pull/207#discussion_r901241261 --- arango/database.py | 37 +------------------------------------ tests/test_graph.py | 31 ------------------------------- 2 files changed, 1 insertion(+), 67 deletions(-) diff --git a/arango/database.py b/arango/database.py index 89e92955..ea3adc6a 100644 --- a/arango/database.py +++ b/arango/database.py @@ -1170,7 +1170,6 @@ def create_graph( shard_count: Optional[int] = None, replication_factor: Optional[int] = None, write_concern: Optional[int] = None, - collections: Optional[Json] = None, ) -> Result[Graph]: """Create a new graph. @@ -1236,32 +1235,6 @@ def create_graph( 'to_vertex_collections': ['lectures'] } ] - - Here is an example entry for parameter **collections**: - TODO: Rework **collections** data structure? - .. code-block:: python - - { - 'teachers': { - 'docs': teacher_vertices_to_insert - 'options': { - 'overwrite' = True, - 'sync' = True, - 'batch_size' = 50 - } - }, - 'lectures': { - 'docs': lecture_vertices_to_insert - 'options': { - 'overwrite' = False, - 'sync' = False, - 'batch_size' = 4 - } - }, - 'teach': { - 'docs': teach_edges_to_insert - } - } """ data: Json = {"name": name, "options": dict()} if edge_definitions is not None: @@ -1295,15 +1268,7 @@ def response_handler(resp: Response) -> Graph: return Graph(self._conn, self._executor, name) raise GraphCreateError(resp, request) - graph = self._execute(request, response_handler) - - if collections is not None: - for name, data in collections.items(): - self.collection(name).import_bulk( - data["docs"], **data.get("options", {}) - ) - - return graph + return self._execute(request, response_handler) def delete_graph( self, diff --git a/tests/test_graph.py b/tests/test_graph.py index 7b9c8427..de440642 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -57,37 +57,6 @@ def test_graph_properties(graph, bad_graph, db): assert isinstance(properties["revision"], str) -def test_graph_provision(graph, db): - vertices = [{"_key": str(i)} for i in range(1, 101)] - edges = [ - {"_from": f"numbers/{j}", "_to": f"numbers/{i}", "result": j / i} - for i in range(1, 101) - for j in range(1, 101) - if j % i == 0 - ] - e_d = [ - { - "edge_collection": "is_divisible_by", - "from_vertex_collections": ["numbers"], - "to_vertex_collections": ["numbers"], - } - ] - - name = "divisibility-graph" - db.delete_graph(name, drop_collections=True, ignore_missing=True) - graph = db.create_graph( - name=name, - edge_definitions=e_d, - collections={ - "numbers": {"docs": vertices, "options": {"batch_size": 5}}, - "is_divisible_by": {"docs": edges, "options": {"sync": True}}, - }, - ) - - assert graph.vertex_collection("numbers").count() == len(vertices) - assert graph.edge_collection("is_divisible_by").count() == len(edges) - - def test_graph_management(db, bad_db): # Test create graph graph_name = generate_graph_name() From 3aeadecfa2db5ce9f65463d3037f4f01d84be100 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Date: Tue, 21 Jun 2022 15:25:30 -0400 Subject: [PATCH 14/17] Update database.py --- arango/database.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/arango/database.py b/arango/database.py index ea3adc6a..5e9dcfa8 100644 --- a/arango/database.py +++ b/arango/database.py @@ -1217,9 +1217,6 @@ def create_graph( parameter cannot be larger than that of **replication_factor**. Default value is 1. Used for clusters only. :type write_concern: int - :param collections: A list collection data objects to provision - the graph with. See below for example. - :type collections: dict | None :return: Graph API wrapper. :rtype: arango.graph.Graph :raise arango.exceptions.GraphCreateError: If create fails. From 5db70551dc96de6b241bdff373121ea6b3111e46 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Wed, 22 Jun 2022 08:33:24 -0400 Subject: [PATCH 15/17] address docstring comments --- arango/collection.py | 13 +++++++++---- arango/utils.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index e7cd0b96..01d35b73 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -1985,10 +1985,15 @@ def import_bulk( :type on_duplicate: str :param sync: Block until operation is synchronized to disk. :type sync: bool | None - :param batch_size: Max number of documents to import at once. If - unspecified, will import all documents at once. Note that the - output type changes to list[dict] if **batch_size** is specified. - :type batch_size: int | None + :param batch_size: Split up **documents** into batches of max length + **batch_size** and import them in a loop on the client side. If + **batch_size** is specified, the return type of this method + changes from a result object to a list of result objects. + IMPORTANT NOTE: this parameter may go through breaking changes + in the future where the return type may not be a list of result + objects anymore. Use it at your own risk, and avoid + depending on the return value if possible. + :type batch_size: int :return: Result of the bulk import. :rtype: dict | list[dict] :raise arango.exceptions.DocumentInsertError: If import fails. diff --git a/arango/utils.py b/arango/utils.py index f3e6702a..27d459ba 100644 --- a/arango/utils.py +++ b/arango/utils.py @@ -90,7 +90,7 @@ def get_batches(elements: Sequence[Json], batch_size: int) -> Iterator[Sequence[ :param elements: The list of elements. :type elements: Sequence[Json] - :param batch_size: Number of elements per batch. + :param batch_size: Max number of elements per batch. :type batch_size: int """ for index in range(0, len(elements), batch_size): From f722effe4fa8136f49c72c47ab6e328d93cc82d0 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Wed, 22 Jun 2022 09:15:14 -0400 Subject: [PATCH 16/17] adjust import_bulk() return behavior (always return result in `list` format if `batch_size` is specified) --- arango/collection.py | 2 +- tests/test_document.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index 01d35b73..4c834efc 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -2034,7 +2034,7 @@ def response_handler(resp: Response) -> Json: results.append(self._execute(request, response_handler)) - return results[0] if len(results) == 1 else results + return results[0] if batch_size is None else results class StandardCollection(Collection): diff --git a/tests/test_document.py b/tests/test_document.py index eacb407b..02c47bc4 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -1839,7 +1839,8 @@ def test_document_import_bulk(col, bad_col, docs): empty_collection(col) result = col.import_bulk(docs, batch_size=len(docs) * 2) - assert type(result) is dict + assert type(result) is list + assert len(result) == 1 empty_collection(col) # Test import bulk on_duplicate actions From c65af2add238347c12d990a70bb7fb733a0a0ed7 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Wed, 22 Jun 2022 20:43:57 -0400 Subject: [PATCH 17/17] address comments --- arango/collection.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arango/collection.py b/arango/collection.py index 4c834efc..3aea6f81 100644 --- a/arango/collection.py +++ b/arango/collection.py @@ -2022,19 +2022,29 @@ def response_handler(resp: Response) -> Json: return result raise DocumentInsertError(resp, request) - results = [] - for batch in get_batches(documents, batch_size or len(documents)): + if batch_size is None: request = Request( method="post", endpoint="/_api/import", - data=batch, + data=documents, params=params, write=self.name, ) - results.append(self._execute(request, response_handler)) + return self._execute(request, response_handler) + else: + results = [] + for batch in get_batches(documents, batch_size): + request = Request( + method="post", + endpoint="/_api/import", + data=batch, + params=params, + write=self.name, + ) + results.append(self._execute(request, response_handler)) - return results[0] if batch_size is None else results + return results class StandardCollection(Collection):