Skip to content

Commit 371be70

Browse files
authored
[DE-664] Deprecating Batch API (#279)
* Using ThreadPoolExecutor for batch API * Adapted tests * Adapted formatter * Changing changelog and driver version * Updating docs
1 parent bedbd72 commit 371be70

File tree

9 files changed

+126
-140
lines changed

9 files changed

+126
-140
lines changed

CHANGELOG.md

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,33 @@
11
main
22
-----
33

4+
* Refactoring `BatchDatabase` API
5+
6+
- The batch API is deprecated since ArangoDB 3.8.0 and will be removed in a future version.
7+
- The `BatchDatabase` is still available, but it now uses a `ThreadPoolExecutor` internally.
8+
- For backwards compatibility, the `BatchDatabase` uses only one worker thread, essentially
9+
sending the requests sequentially. Feel free to set the `max_workers` parameter to a higher
10+
value if you want to use multiple threads, but be aware that the requests will be sent in
11+
parallel, which may cause problems if you are using transactions.
12+
- To discourage the use of this API, we now issue a warning when the `BatchDatabase` is used.
13+
14+
Note that `{"foo": "bar"}` may be inserted after `{"foo": "baz"}` in the following example:
15+
```python
16+
with db.begin_batch_execution(max_workers=2) as batch_db:
17+
job1 = batch_db.collection.insert({"foo": "bar"})
18+
job2 = batch_db.collection.insert({"foo": "baz"})
19+
```
20+
21+
7.6.2
22+
-----
23+
24+
* Fix: build_filter_conditions utils method
25+
426
7.6.1
527
-----
628

729
* [DE-542] Added `shards()` method to `Collection` by @apetenchea in https://github.com/ArangoDB-Community/python-arango/pull/274
8-
* [DE-584] Refactor deprecated `/_api/simple` methods by @aMahanna in https://github.com/ArangoDB-Community/python-arango/pull/268
30+
* [DE-584] Refactor deprecated `/_api/simple` methods by @aMahanna in https://github.com/ArangoDB-Community/python-arango/pull/275
931
* Added `raise_on_document_error` parameter to `Collection.update_many()` by @aMahanna in https://github.com/ArangoDB-Community/python-arango/pull/273
1032
* Added `computed_values` parameter to `Collection.configure()` by @aMahanna in https://github.com/ArangoDB-Community/python-arango/pull/268
1133
* Various bug fixes

arango/database.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from datetime import datetime
1010
from numbers import Number
1111
from typing import Any, List, Optional, Sequence, Union
12+
from warnings import warn
1213

1314
from arango.api import ApiGroup
1415
from arango.aql import AQL
@@ -2549,18 +2550,38 @@ def begin_async_execution(self, return_result: bool = True) -> "AsyncDatabase":
25492550
"""
25502551
return AsyncDatabase(self._conn, return_result)
25512552

2552-
def begin_batch_execution(self, return_result: bool = True) -> "BatchDatabase":
2553+
def begin_batch_execution(
2554+
self,
2555+
return_result: bool = True,
2556+
max_workers: Optional[int] = 1,
2557+
) -> "BatchDatabase":
25532558
"""Begin batch execution.
25542559
2560+
.. warning::
2561+
2562+
The batch request API is deprecated since ArangoDB 3.8.0.
2563+
This functionality should no longer be used.
2564+
To send multiple documents at once to an ArangoDB instance,
2565+
please use any of :class:`arango.collection.Collection` methods
2566+
that accept a list of documents as input.
2567+
See :func:`~arango.collection.Collection.insert_many`,
2568+
:func:`~arango.collection.Collection.update_many`,
2569+
:func:`~arango.collection.Collection.replace_many`,
2570+
:func:`~arango.collection.Collection.delete_many`.
2571+
25552572
:param return_result: If set to True, API executions return instances
25562573
of :class:`arango.job.BatchJob` that are populated with results on
25572574
commit. If set to False, API executions return None and no results
25582575
are tracked client-side.
25592576
:type return_result: bool
2577+
:param max_workers: Maximum number of workers to use for submitting
2578+
requests asynchronously. If None, the default value is the minimum
2579+
between `os.cpu_count()` and the number of requests.
2580+
:type max_workers: Optional[int]
25602581
:return: Database API wrapper object specifically for batch execution.
25612582
:rtype: arango.database.BatchDatabase
25622583
"""
2563-
return BatchDatabase(self._conn, return_result)
2584+
return BatchDatabase(self._conn, return_result, max_workers)
25642585

25652586
def begin_transaction(
25662587
self,
@@ -2648,20 +2669,34 @@ def __repr__(self) -> str:
26482669
class BatchDatabase(Database):
26492670
"""Database API wrapper tailored specifically for batch execution.
26502671
2651-
See :func:`arango.database.StandardDatabase.begin_batch_execution`.
2672+
.. note::
2673+
2674+
This class is not intended to be instantiated directly.
2675+
See
2676+
:func:`arango.database.StandardDatabase.begin_batch_execution`.
26522677
26532678
:param connection: HTTP connection.
26542679
:param return_result: If set to True, API executions return instances of
26552680
:class:`arango.job.BatchJob` that are populated with results on commit.
26562681
If set to False, API executions return None and no results are tracked
26572682
client-side.
26582683
:type return_result: bool
2684+
:param max_workers: Use a thread pool of at most `max_workers`.
2685+
:type max_workers: Optional[int]
26592686
"""
26602687

2661-
def __init__(self, connection: Connection, return_result: bool) -> None:
2688+
def __init__(
2689+
self, connection: Connection, return_result: bool, max_workers: Optional[int]
2690+
) -> None:
26622691
self._executor: BatchApiExecutor
26632692
super().__init__(
2664-
connection=connection, executor=BatchApiExecutor(connection, return_result)
2693+
connection=connection,
2694+
executor=BatchApiExecutor(connection, return_result, max_workers),
2695+
)
2696+
warn(
2697+
"The batch request API is deprecated since ArangoDB version 3.8.0.",
2698+
FutureWarning,
2699+
stacklevel=3,
26652700
)
26662701

26672702
def __repr__(self) -> str:

arango/executor.py

Lines changed: 23 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,13 @@
88
]
99

1010
from collections import OrderedDict
11+
from concurrent.futures import ThreadPoolExecutor
12+
from os import cpu_count
1113
from typing import Any, Callable, Optional, Sequence, Tuple, TypeVar, Union
12-
from urllib.parse import urlencode
13-
from uuid import uuid4
1414

1515
from arango.connection import Connection
1616
from arango.exceptions import (
1717
AsyncExecuteError,
18-
BatchExecuteError,
1918
BatchStateError,
2019
OverloadControlExecutorError,
2120
TransactionAbortError,
@@ -27,7 +26,6 @@
2726
from arango.request import Request
2827
from arango.response import Response
2928
from arango.typings import Fields, Json
30-
from arango.utils import suppress_warning
3129

3230
ApiExecutor = Union[
3331
"DefaultApiExecutor",
@@ -126,35 +124,29 @@ class BatchApiExecutor:
126124
If set to False, API executions return None and no results are tracked
127125
client-side.
128126
:type return_result: bool
127+
:param max_workers: Use a thread pool of at most `max_workers`. If None,
128+
the default value is the number of CPUs. For backwards compatibility,
129+
the default value is 1, effectively behaving like single-threaded
130+
execution.
131+
:type max_workers: Optional[int]
129132
"""
130133

131-
def __init__(self, connection: Connection, return_result: bool) -> None:
134+
def __init__(
135+
self,
136+
connection: Connection,
137+
return_result: bool,
138+
max_workers: Optional[int] = 1,
139+
) -> None:
132140
self._conn = connection
133141
self._return_result: bool = return_result
134142
self._queue: OrderedDict[str, Tuple[Request, BatchJob[Any]]] = OrderedDict()
135143
self._committed: bool = False
144+
self._max_workers: int = max_workers or cpu_count() # type: ignore
136145

137146
@property
138147
def context(self) -> str:
139148
return "batch"
140149

141-
def _stringify_request(self, request: Request) -> str:
142-
path = request.endpoint
143-
144-
if request.params is not None:
145-
path += f"?{urlencode(request.params)}"
146-
buffer = [f"{request.method} {path} HTTP/1.1"]
147-
148-
if request.headers is not None:
149-
for key, value in sorted(request.headers.items()):
150-
buffer.append(f"{key}: {value}")
151-
152-
if request.data is not None:
153-
serialized = self._conn.serialize(request.data)
154-
buffer.append("\r\n" + serialized)
155-
156-
return "\r\n".join(buffer)
157-
158150
@property
159151
def jobs(self) -> Optional[Sequence[BatchJob[Any]]]:
160152
"""Return the queued batch jobs.
@@ -190,7 +182,7 @@ def execute(
190182
return job if self._return_result else None
191183

192184
def commit(self) -> Optional[Sequence[BatchJob[Any]]]:
193-
"""Execute the queued requests in a single batch API request.
185+
"""Execute the queued requests in a batch of requests.
194186
195187
If **return_result** parameter was set to True during initialization,
196188
:class:`arango.job.BatchJob` instances are populated with results.
@@ -199,9 +191,7 @@ def commit(self) -> Optional[Sequence[BatchJob[Any]]]:
199191
False during initialization.
200192
:rtype: [arango.job.BatchJob] | None
201193
:raise arango.exceptions.BatchStateError: If batch state is invalid
202-
(e.g. batch was already committed or size of response from server
203-
did not match the expected).
204-
:raise arango.exceptions.BatchExecuteError: If commit fails.
194+
(e.g. batch was already committed).
205195
"""
206196
if self._committed:
207197
raise BatchStateError("batch already committed")
@@ -211,65 +201,18 @@ def commit(self) -> Optional[Sequence[BatchJob[Any]]]:
211201
if len(self._queue) == 0:
212202
return self.jobs
213203

214-
# Boundary used for multipart request
215-
boundary = uuid4().hex
204+
with ThreadPoolExecutor(
205+
max_workers=min(self._max_workers, len(self._queue))
206+
) as executor:
207+
for req, job in self._queue.values():
208+
job._future = executor.submit(self._conn.send_request, req)
216209

217-
# Build the batch request payload
218-
buffer = []
219-
for req, job in self._queue.values():
220-
buffer.append(f"--{boundary}")
221-
buffer.append("Content-Type: application/x-arango-batchpart")
222-
buffer.append(f"Content-Id: {job.id}")
223-
buffer.append("\r\n" + self._stringify_request(req))
224-
buffer.append(f"--{boundary}--")
225-
226-
request = Request(
227-
method="post",
228-
endpoint="/_api/batch",
229-
headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
230-
data="\r\n".join(buffer),
231-
)
232-
with suppress_warning("requests.packages.urllib3.connectionpool"):
233-
resp = self._conn.send_request(request)
234-
235-
if not resp.is_success:
236-
raise BatchExecuteError(resp, request)
210+
for _, job in self._queue.values():
211+
job._status = "done"
237212

238213
if not self._return_result:
239214
return None
240215

241-
url_prefix = resp.url.strip("/_api/batch")
242-
raw_responses = resp.raw_body.split(f"--{boundary}")[1:-1]
243-
244-
if len(self._queue) != len(raw_responses):
245-
raise BatchStateError(
246-
"expecting {} parts in batch response but got {}".format(
247-
len(self._queue), len(raw_responses)
248-
)
249-
)
250-
for raw_resp in raw_responses:
251-
# Parse and breakdown the batch response body
252-
resp_parts = raw_resp.strip().split("\r\n")
253-
raw_content_id = resp_parts[1]
254-
raw_body = resp_parts[-1]
255-
raw_status = resp_parts[3]
256-
job_id = raw_content_id.split(" ")[1]
257-
_, status_code, status_text = raw_status.split(" ", 2)
258-
259-
# Update the corresponding batch job
260-
queued_req, queued_job = self._queue[job_id]
261-
262-
queued_job._status = "done"
263-
resp = Response(
264-
method=queued_req.method,
265-
url=url_prefix + queued_req.endpoint,
266-
headers={},
267-
status_code=int(status_code),
268-
status_text=status_text,
269-
raw_body=raw_body,
270-
)
271-
queued_job._response = self._conn.prep_response(resp)
272-
273216
return self.jobs
274217

275218

arango/formatter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ def format_database(body: Json) -> Json:
157157
result["replication_factor"] = body["replicationFactor"]
158158
if "writeConcern" in body:
159159
result["write_concern"] = body["writeConcern"]
160+
if "replicationVersion" in body:
161+
result["replication_version"] = body["replicationVersion"]
160162

161163
return verify_format(body, result)
162164

arango/job.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
__all__ = ["AsyncJob", "BatchJob"]
22

3+
from concurrent.futures import Future
34
from typing import Callable, Generic, Optional, TypeVar
45
from uuid import uuid4
56

@@ -160,13 +161,13 @@ class BatchJob(Generic[T]):
160161
:type response_handler: callable
161162
"""
162163

163-
__slots__ = ["_id", "_status", "_response", "_response_handler"]
164+
__slots__ = ["_id", "_status", "_response_handler", "_future"]
164165

165166
def __init__(self, response_handler: Callable[[Response], T]) -> None:
166167
self._id = uuid4().hex
167168
self._status = "pending"
168-
self._response: Optional[Response] = None
169169
self._response_handler = response_handler
170+
self._future: Optional[Future[Response]] = None
170171

171172
def __repr__(self) -> str:
172173
return f"<BatchJob {self._id}>"
@@ -200,7 +201,7 @@ def result(self) -> T:
200201
:raise arango.exceptions.BatchJobResultError: If job result is not
201202
available (i.e. batch is not committed yet).
202203
"""
203-
if self._status == "pending" or self._response is None:
204+
if self._status == "pending" or self._future is None or not self._future.done():
204205
raise BatchJobResultError("result not available yet")
205206

206-
return self._response_handler(self._response)
207+
return self._response_handler(self._future.result())

arango/request.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def normalize_headers(
1212
if driver_flags is not None:
1313
for flag in driver_flags:
1414
flags = flags + flag + ";"
15-
driver_version = "7.6.1"
15+
driver_version = "7.6.2"
1616
driver_header = "python-arango/" + driver_version + " (" + flags + ")"
1717
normalized_headers: Headers = {
1818
"charset": "utf-8",

docs/batch.rst

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,30 @@
11
Batch API Execution
22
-------------------
3-
4-
In **batch API executions**, requests to ArangoDB server are stored in client-side
5-
in-memory queue, and committed together in a single HTTP call. After the commit,
6-
results can be retrieved later from :ref:`BatchJob` objects.
3+
.. warning::
4+
5+
The batch request API is deprecated since ArangoDB 3.8.0.
6+
We discourage its use, as it will be removed in a future release.
7+
It is already slow and seems to regularly create weird errors when
8+
used with recent versions of ArangoDB.
9+
10+
The driver functionality has been refactored to no longer use the batch API,
11+
but a `ThreadPoolExecutor` instead. For backwards compatibility,
12+
`max_workers` is set to 1 by default, but can be increased to speed up
13+
batch operations. Essentially, the batch API can now be used to send
14+
multiple requests in parallel, but not to send multiple requests in a
15+
single HTTP call. Note that sending multiple requests in parallel may
16+
cause conflicts on the server's side (for example, requests that modify the same document).
17+
18+
To send multiple documents at once to an ArangoDB instance,
19+
please use any of :class:`arango.collection.Collection` methods
20+
that accept a list of documents as input, such as:
21+
22+
* :func:`~arango.collection.Collection.insert_many`
23+
* :func:`~arango.collection.Collection.update_many`
24+
* :func:`~arango.collection.Collection.replace_many`
25+
* :func:`~arango.collection.Collection.delete_many`
26+
27+
After the commit, results can be retrieved later from :ref:`BatchJob` objects.
728

829
**Example:**
930

tests/helpers.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pytest
77

88
from arango.cursor import Cursor
9-
from arango.exceptions import AsyncExecuteError, BatchExecuteError, TransactionInitError
9+
from arango.exceptions import AsyncExecuteError, TransactionInitError
1010

1111

1212
def generate_db_name():
@@ -180,6 +180,4 @@ def assert_raises(*exc):
180180
:param exc: Expected exception(s).
181181
:type: exc
182182
"""
183-
return pytest.raises(
184-
exc + (AsyncExecuteError, BatchExecuteError, TransactionInitError)
185-
)
183+
return pytest.raises(exc + (AsyncExecuteError, TransactionInitError))

0 commit comments

Comments
 (0)