Skip to content

Explicit trace ID propagation for SFN #526

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions datadog_lambda/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
# This product includes software developed at Datadog (https://www.datadoghq.com/).
# Copyright 2019 Datadog, Inc.

# Datadog trace sampling priority


# Datadog trace sampling priority
class SamplingPriority(object):
USER_REJECT = -1
AUTO_REJECT = 0
Expand All @@ -18,6 +17,7 @@ class TraceHeader(object):
TRACE_ID = "x-datadog-trace-id"
PARENT_ID = "x-datadog-parent-id"
SAMPLING_PRIORITY = "x-datadog-sampling-priority"
TAGS = "x-datadog-tags"


# X-Ray subsegment to save Datadog trace metadata
Expand Down
59 changes: 40 additions & 19 deletions datadog_lambda/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,10 @@ def extract_context_from_kinesis_event(event, lambda_context):


def _deterministic_sha256_hash(s: str, part: str) -> (int, int):
sha256_hash = hashlib.sha256(s.encode()).hexdigest()
return _sha256_to_binary_part(hashlib.sha256(s.encode()).hexdigest(), part)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Split this up with a helper so we have an entrypoint for our sha256_hash from the x-datadog-trace-id-hash and x-datadog-parent-id-hash

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!



def _sha256_to_binary_part(sha256_hash: str, part: str) -> (int, int):
# First two chars is '0b'. zfill to ensure 256 bits, but we only care about the first 128 bits
binary_hash = bin(int(sha256_hash, 16))[2:].zfill(256)
if part == HIGHER_64_BITS:
Expand All @@ -377,30 +379,49 @@ def extract_context_from_step_functions(event, lambda_context):
into lambda's event dict.
"""
try:
execution_id = event.get("Execution").get("Id")
state_name = event.get("State").get("Name")
state_entered_time = event.get("State").get("EnteredTime")
# returning 128 bits since 128bit traceId will be break up into
# traditional traceId and _dd.p.tid tag
# https://github.com/DataDog/dd-trace-py/blob/3e34d21cb9b5e1916e549047158cb119317b96ab/ddtrace/propagation/http.py#L232-L240
trace_id = _deterministic_sha256_hash(execution_id, LOWER_64_BITS)

parent_id = _deterministic_sha256_hash(
f"{execution_id}#{state_name}#{state_entered_time}", HIGHER_64_BITS
)
meta = {}

if "_datadog" in event:
dd_data = event.get("_datadog")
parent_id = _sha256_to_binary_part(
dd_data.get("x-datadog-parent-id-hash"), HIGHER_64_BITS
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: Since this extract_context_from_step_functions() is executed when we know _datadog.serverless-version=v2, we don't need to check if this hash key exists. The v2 contract is that they need to have the _datadog.x-datadog-parent-id-hash.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be misunderstanding, we're not checking if the hash key exists right?

We're computing the parent_id no matter what, the only thing we check in this branch is if x-datadog-trace-id exists

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we're saying it's the same EventTypes.STEP_FUNCTIONS whether its the old format with context object or new one with _datadog. Both cases will enter extract_context_from_step_functions()

If we want to separate them, one thing we could do is make a new V2 event with a V2 extractor to go along with it

)
if "x-datadog-trace-id" in dd_data: # lambda root
dd_data["x-datadog-parent-id"] = str(parent_id)
return propagator.extract(dd_data)
else: # sfn root
trace_id = _sha256_to_binary_part(
dd_data.get("x-datadog-trace-id-hash"), LOWER_64_BITS
)
meta["_dd.p.tid"] = hex(
_sha256_to_binary_part(
dd_data.get("x-datadog-trace-id-hash"), HIGHER_64_BITS
)
)[2:]
Comment on lines +392 to +400
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code Quality Violation

else is not necessary since the if clause has a return (...read more)

If the code in the if branch returns a value, do not have the else branch present.

View in Datadog  Leave us feedback  Documentation

else:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole branch should be the old behavior

execution_id = event.get("Execution").get("Id")
state_name = event.get("State").get("Name")
state_entered_time = event.get("State").get("EnteredTime")
# returning 128 bits since 128bit traceId will be break up into
# traditional traceId and _dd.p.tid tag
# https://github.com/DataDog/dd-trace-py/blob/3e34d21cb9b5e1916e549047158cb119317b96ab/ddtrace/propagation/http.py#L232-L240
trace_id = _deterministic_sha256_hash(execution_id, LOWER_64_BITS)
# take the higher 64 bits as _dd.p.tid tag and use hex to encode
# [2:] to remove '0x' in the hex str
meta["_dd.p.tid"] = hex(
_deterministic_sha256_hash(execution_id, HIGHER_64_BITS)
)[2:]

parent_id = _deterministic_sha256_hash(
f"{execution_id}#{state_name}#{state_entered_time}", HIGHER_64_BITS
)

sampling_priority = SamplingPriority.AUTO_KEEP
return Context(
trace_id=trace_id,
span_id=parent_id,
sampling_priority=sampling_priority,
# take the higher 64 bits as _dd.p.tid tag and use hex to encode
# [2:] to remove '0x' in the hex str
meta={
"_dd.p.tid": hex(
_deterministic_sha256_hash(execution_id, HIGHER_64_BITS)
)[2:]
},
meta=meta,
)
except Exception as e:
logger.debug("The Step Functions trace extractor returned with error %s", e)
Expand Down
4 changes: 3 additions & 1 deletion datadog_lambda/trigger.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,9 @@ def parse_event_source(event: dict) -> _EventSource:
if event.get("source") == "aws.events" or has_event_categories:
event_source = _EventSource(EventTypes.CLOUDWATCH_EVENTS)

if "Execution" in event and "StateMachine" in event and "State" in event:
if (
"_datadog" in event and event.get("_datadog").get("serverless-version") == "v2"
) or ("Execution" in event and "StateMachine" in event and "State" in event):
event_source = _EventSource(EventTypes.STEPFUNCTIONS)

event_record = get_first_record(event)
Expand Down
78 changes: 75 additions & 3 deletions tests/test_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ def test_with_complete_datadog_trace_headers_with_trigger_tags(self):
@with_trace_propagation_style("datadog")
def test_step_function_trace_data(self):
lambda_ctx = get_mock_context()
sqs_event = {
sfn_event = {
"Execution": {
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
},
Expand All @@ -627,7 +627,7 @@ def test_step_function_trace_data(self):
"EnteredTime": "Mon Nov 13 12:43:33 PST 2023",
},
}
ctx, source, event_source = extract_dd_trace_context(sqs_event, lambda_ctx)
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
self.assertEqual(source, "event")
expected_context = Context(
trace_id=3675572987363469717,
Expand All @@ -642,7 +642,79 @@ def test_step_function_trace_data(self):
TraceHeader.TRACE_ID: "3675572987363469717",
TraceHeader.PARENT_ID: "10713633173203262661",
TraceHeader.SAMPLING_PRIORITY: "1",
"x-datadog-tags": "_dd.p.tid=e987c84b36b11ab",
TraceHeader.TAGS: "_dd.p.tid=e987c84b36b11ab",
},
)
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
self.mock_send_segment.assert_called_with(
XraySubsegment.TRACE_KEY,
expected_context,
)

@with_trace_propagation_style("datadog")
def test_step_function_trace_data_lambda_root(self):
lambda_ctx = get_mock_context()
sfn_event = {
"_datadog": {
"x-datadog-trace-id": "5821803790426892636",
"x-datadog-sampling-priority": "1",
"x-datadog-tags": "_dd.p.dm=-0,_dd.p.tid=672a7cb100000000",
"traceparent": "00-672a7cb10000000050cb33b3c06ae95c-5fda9d8d1d1373f9-01",
"tracestate": "dd=p:5fda9d8d1d1373f9;s:1;t.dm:-0;t.tid:672a7cb100000000",
"x-datadog-parent-id-hash": "a926584eba705d6ec904c54db2ecc4d4a2c91e7dabe7ce87ac26edb43388fbc5",
"serverless-version": "v2",
}
}
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
self.assertEqual(source, "event")
expected_context = Context(
trace_id=137131089076080415507232535361568303452,
span_id=2965154499828669806,
sampling_priority=1,
meta={"_dd.p.dm": "-0", "_dd.p.tid": "672a7cb100000000"},
)
self.assertEqual(ctx, expected_context)
self.assertEqual(
get_dd_trace_context(),
{
TraceHeader.TRACE_ID: "5821803790426892636",
TraceHeader.PARENT_ID: "10713633173203262661",
TraceHeader.SAMPLING_PRIORITY: "1",
TraceHeader.TAGS: "_dd.p.dm=-0,_dd.p.tid=672a7cb100000000",
},
)
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
self.mock_send_segment.assert_called_with(
XraySubsegment.TRACE_KEY,
expected_context,
)

@with_trace_propagation_style("datadog")
def test_step_function_trace_data_sfn_root(self):
lambda_ctx = get_mock_context()
sfn_event = {
"_datadog": {
"x-datadog-trace-id-hash": "fed93f8c162880cb9aa90fcd1f8395383835841d5470d30215f3dd52906ebc58",
"x-datadog-parent-id-hash": "c5eb94cc9220ab5783e1db53debd54b8c93f6f2a3eae1c680d7b849f2d34e551",
"serverless-version": "v2",
}
}
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
self.assertEqual(source, "event")
expected_context = Context(
trace_id=1921084089721656632,
span_id=5038284214489885527,
sampling_priority=1,
meta={"_dd.p.tid": "7ed93f8c162880cb"},
)
self.assertEqual(ctx, expected_context)
self.assertEqual(
get_dd_trace_context(),
{
TraceHeader.TRACE_ID: "1921084089721656632",
TraceHeader.PARENT_ID: "10713633173203262661",
TraceHeader.SAMPLING_PRIORITY: "1",
TraceHeader.TAGS: "_dd.p.tid=7ed93f8c162880cb",
},
)
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
Expand Down
Loading