From 57bb8a64d1ab5f493c857ac91cb722f6f24bfad6 Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Wed, 4 Dec 2024 10:31:43 -0500 Subject: [PATCH 1/8] use redrive count to generate step functions parent ID --- datadog_lambda/tracing.py | 11 ++++++++++- tests/test_tracing.py | 6 +++--- tests/test_xray.py | 12 ++++++------ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/datadog_lambda/tracing.py b/datadog_lambda/tracing.py index 347e2fb7..f620940c 100644 --- a/datadog_lambda/tracing.py +++ b/datadog_lambda/tracing.py @@ -384,12 +384,21 @@ def _parse_high_64_bits(trace_tags: str) -> str: def _generate_sfn_parent_id(context: dict) -> int: + """ + The upstream Step Function can propagate its execution context to downstream Lambdas. The Lambda can use these + details to share the same traceID and infer its parent's spanID. + + Excluding redriveCount when its 0 to account for cases where customers are using an old version of the Lambda layer + that doesn't use this value for its parentID generation. + """ execution_id = context.get("Execution").get("Id") + redrive_count = context.get("Execution").get("RedriveCount") state_name = context.get("State").get("Name") state_entered_time = context.get("State").get("EnteredTime") return _deterministic_sha256_hash( - f"{execution_id}#{state_name}#{state_entered_time}", HIGHER_64_BITS + f"{execution_id}#{state_name}#{state_entered_time}#{redrive_count}", + HIGHER_64_BITS, ) diff --git a/tests/test_tracing.py b/tests/test_tracing.py index 4c530ad0..680067bd 100644 --- a/tests/test_tracing.py +++ b/tests/test_tracing.py @@ -631,7 +631,7 @@ def test_step_function_trace_data(self): self.assertEqual(source, "event") expected_context = Context( trace_id=3675572987363469717, - span_id=6880978411788117524, + span_id=4929949072763648481, sampling_priority=1, meta={"_dd.p.tid": "e987c84b36b11ab"}, ) @@ -673,7 +673,7 @@ def test_step_function_trace_data_lambda_root(self): self.assertEqual(source, "event") expected_context = Context( trace_id=5821803790426892636, - span_id=6880978411788117524, + span_id=4929949072763648481, sampling_priority=1, meta={"_dd.p.tid": "672a7cb100000000"}, ) @@ -714,7 +714,7 @@ def test_step_function_trace_data_sfn_root(self): self.assertEqual(source, "event") expected_context = Context( trace_id=4521899030418994483, - span_id=6880978411788117524, + span_id=4929949072763648481, sampling_priority=1, meta={"_dd.p.tid": "12d1270d99cc5e03"}, ) diff --git a/tests/test_xray.py b/tests/test_xray.py index 7f33f891..8177e46e 100644 --- a/tests/test_xray.py +++ b/tests/test_xray.py @@ -34,9 +34,9 @@ def test_get_xray_host_port_success(self): def test_send_segment_sampled_out(self): os.environ["AWS_XRAY_DAEMON_ADDRESS"] = "fake-agent.com:8080" - os.environ[ - "_X_AMZN_TRACE_ID" - ] = "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=0;Lineage=c6c5b1b9:0" + os.environ["_X_AMZN_TRACE_ID"] = ( + "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=0;Lineage=c6c5b1b9:0" + ) with patch( "datadog_lambda.xray.sock.send", MagicMock(return_value=None) @@ -47,9 +47,9 @@ def test_send_segment_sampled_out(self): def test_send_segment_sampled(self): os.environ["AWS_XRAY_DAEMON_ADDRESS"] = "fake-agent.com:8080" - os.environ[ - "_X_AMZN_TRACE_ID" - ] = "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=1;Lineage=c6c5b1b9:0" + os.environ["_X_AMZN_TRACE_ID"] = ( + "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=1;Lineage=c6c5b1b9:0" + ) with patch( "datadog_lambda.xray.sock.send", MagicMock(return_value=None) ) as mock_send: From bec850f949a7ae1eedd12e4501c2a14620df70ea Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Wed, 4 Dec 2024 10:38:10 -0500 Subject: [PATCH 2/8] lint --- tests/test_xray.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_xray.py b/tests/test_xray.py index 8177e46e..7f33f891 100644 --- a/tests/test_xray.py +++ b/tests/test_xray.py @@ -34,9 +34,9 @@ def test_get_xray_host_port_success(self): def test_send_segment_sampled_out(self): os.environ["AWS_XRAY_DAEMON_ADDRESS"] = "fake-agent.com:8080" - os.environ["_X_AMZN_TRACE_ID"] = ( - "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=0;Lineage=c6c5b1b9:0" - ) + os.environ[ + "_X_AMZN_TRACE_ID" + ] = "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=0;Lineage=c6c5b1b9:0" with patch( "datadog_lambda.xray.sock.send", MagicMock(return_value=None) @@ -47,9 +47,9 @@ def test_send_segment_sampled_out(self): def test_send_segment_sampled(self): os.environ["AWS_XRAY_DAEMON_ADDRESS"] = "fake-agent.com:8080" - os.environ["_X_AMZN_TRACE_ID"] = ( - "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=1;Lineage=c6c5b1b9:0" - ) + os.environ[ + "_X_AMZN_TRACE_ID" + ] = "Root=1-5e272390-8c398be037738dc042009320;Parent=94ae789b969f1cc5;Sampled=1;Lineage=c6c5b1b9:0" with patch( "datadog_lambda.xray.sock.send", MagicMock(return_value=None) ) as mock_send: From 007658165e339e7fc38aa3b3df1c346c55049a1e Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Wed, 4 Dec 2024 10:42:35 -0500 Subject: [PATCH 3/8] lint --- datadog_lambda/tracing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datadog_lambda/tracing.py b/datadog_lambda/tracing.py index f620940c..6a4683ad 100644 --- a/datadog_lambda/tracing.py +++ b/datadog_lambda/tracing.py @@ -385,11 +385,11 @@ def _parse_high_64_bits(trace_tags: str) -> str: def _generate_sfn_parent_id(context: dict) -> int: """ - The upstream Step Function can propagate its execution context to downstream Lambdas. The Lambda can use these - details to share the same traceID and infer its parent's spanID. + The upstream Step Function can propagate its execution context to downstream Lambdas. The + Lambda can use these details to share the same traceID and infer its parent's spanID. - Excluding redriveCount when its 0 to account for cases where customers are using an old version of the Lambda layer - that doesn't use this value for its parentID generation. + Excluding redriveCount when its 0 to account for cases where customers are using an old + version of the Lambda layer that doesn't use this value for its parentID generation. """ execution_id = context.get("Execution").get("Id") redrive_count = context.get("Execution").get("RedriveCount") From 4ec08d3d41f93aefe57b866f8f1cff8af8e7b549 Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Wed, 4 Dec 2024 15:05:25 -0500 Subject: [PATCH 4/8] dont use redriveCount of 0 --- datadog_lambda/tracing.py | 4 +++- tests/test_tracing.py | 47 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/datadog_lambda/tracing.py b/datadog_lambda/tracing.py index 6a4683ad..890310c9 100644 --- a/datadog_lambda/tracing.py +++ b/datadog_lambda/tracing.py @@ -396,8 +396,10 @@ def _generate_sfn_parent_id(context: dict) -> int: state_name = context.get("State").get("Name") state_entered_time = context.get("State").get("EnteredTime") + redrive_postfix = "" if redrive_count == "0" else f"#{redrive_count}" + return _deterministic_sha256_hash( - f"{execution_id}#{state_name}#{state_entered_time}#{redrive_count}", + f"{execution_id}#{state_name}#{state_entered_time}{redrive_postfix}", HIGHER_64_BITS, ) diff --git a/tests/test_tracing.py b/tests/test_tracing.py index 680067bd..96d03469 100644 --- a/tests/test_tracing.py +++ b/tests/test_tracing.py @@ -620,6 +620,7 @@ def test_step_function_trace_data(self): sfn_event = { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", + "RedriveCount": "0", }, "StateMachine": {}, "State": { @@ -631,7 +632,45 @@ def test_step_function_trace_data(self): self.assertEqual(source, "event") expected_context = Context( trace_id=3675572987363469717, - span_id=4929949072763648481, + span_id=6880978411788117524, + sampling_priority=1, + meta={"_dd.p.tid": "e987c84b36b11ab"}, + ) + self.assertEqual(ctx, expected_context) + self.assertEqual( + get_dd_trace_context(), + { + TraceHeader.TRACE_ID: "3675572987363469717", + TraceHeader.PARENT_ID: "10713633173203262661", + TraceHeader.SAMPLING_PRIORITY: "1", + TraceHeader.TAGS: "_dd.p.tid=e987c84b36b11ab", + }, + ) + create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY) + self.mock_send_segment.assert_called_with( + XraySubsegment.TRACE_KEY, + expected_context, + ) + + @with_trace_propagation_style("datadog") + def test_step_function_trace_data_redrive(self): + lambda_ctx = get_mock_context() + sfn_event = { + "Execution": { + "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", + "RedriveCount": "1", + }, + "StateMachine": {}, + "State": { + "Name": "my-awesome-state", + "EnteredTime": "Mon Nov 13 12:43:33 PST 2023", + }, + } + ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx) + self.assertEqual(source, "event") + expected_context = Context( + trace_id=3675572987363469717, + span_id=1201185214297576513, sampling_priority=1, meta={"_dd.p.tid": "e987c84b36b11ab"}, ) @@ -658,6 +697,7 @@ def test_step_function_trace_data_lambda_root(self): "_datadog": { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", + "RedriveCount": "0", }, "StateMachine": {}, "State": { @@ -673,7 +713,7 @@ def test_step_function_trace_data_lambda_root(self): self.assertEqual(source, "event") expected_context = Context( trace_id=5821803790426892636, - span_id=4929949072763648481, + span_id=6880978411788117524, sampling_priority=1, meta={"_dd.p.tid": "672a7cb100000000"}, ) @@ -700,6 +740,7 @@ def test_step_function_trace_data_sfn_root(self): "_datadog": { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", + "RedriveCount": "0", }, "StateMachine": {}, "State": { @@ -714,7 +755,7 @@ def test_step_function_trace_data_sfn_root(self): self.assertEqual(source, "event") expected_context = Context( trace_id=4521899030418994483, - span_id=4929949072763648481, + span_id=6880978411788117524, sampling_priority=1, meta={"_dd.p.tid": "12d1270d99cc5e03"}, ) From 06bffa4e6406cd666cabc4ad6038b8b1c62fc50d Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Mon, 16 Dec 2024 12:04:22 -0500 Subject: [PATCH 5/8] read redriveCount as int --- datadog_lambda/tracing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datadog_lambda/tracing.py b/datadog_lambda/tracing.py index 890310c9..ae07c2da 100644 --- a/datadog_lambda/tracing.py +++ b/datadog_lambda/tracing.py @@ -396,7 +396,7 @@ def _generate_sfn_parent_id(context: dict) -> int: state_name = context.get("State").get("Name") state_entered_time = context.get("State").get("EnteredTime") - redrive_postfix = "" if redrive_count == "0" else f"#{redrive_count}" + redrive_postfix = "" if redrive_count == 0 else f"#{redrive_count}" return _deterministic_sha256_hash( f"{execution_id}#{state_name}#{state_entered_time}{redrive_postfix}", From 9c77aa8d2ac2e993300c753be47af07a6f151050 Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Mon, 16 Dec 2024 12:54:45 -0500 Subject: [PATCH 6/8] update redriveCount value in tests --- tests/test_tracing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_tracing.py b/tests/test_tracing.py index 96d03469..b60013a1 100644 --- a/tests/test_tracing.py +++ b/tests/test_tracing.py @@ -620,7 +620,7 @@ def test_step_function_trace_data(self): sfn_event = { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", - "RedriveCount": "0", + "RedriveCount": 0, }, "StateMachine": {}, "State": { @@ -658,7 +658,7 @@ def test_step_function_trace_data_redrive(self): sfn_event = { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", - "RedriveCount": "1", + "RedriveCount": 1, }, "StateMachine": {}, "State": { @@ -697,7 +697,7 @@ def test_step_function_trace_data_lambda_root(self): "_datadog": { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", - "RedriveCount": "0", + "RedriveCount": 0, }, "StateMachine": {}, "State": { @@ -740,7 +740,7 @@ def test_step_function_trace_data_sfn_root(self): "_datadog": { "Execution": { "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", - "RedriveCount": "0", + "RedriveCount": 0, }, "StateMachine": {}, "State": { From 93fd23fb259dabbcfd221ea9cff317674842b423 Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Tue, 17 Dec 2024 10:57:06 -0500 Subject: [PATCH 7/8] match test case with logs-backend snapshot --- tests/test_tracing.py | 51 ++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/tests/test_tracing.py b/tests/test_tracing.py index b60013a1..2c90478e 100644 --- a/tests/test_tracing.py +++ b/tests/test_tracing.py @@ -619,31 +619,38 @@ def test_step_function_trace_data(self): lambda_ctx = get_mock_context() sfn_event = { "Execution": { - "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", + "Id": "arn:aws:states:sa-east-1:425362996713:execution:abhinav-activity-state-machine:72a7ca3e-901c-41bb-b5a3-5f279b92a316", + "Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316", + "RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j", + "StartTime": "2024-12-04T19:38:04.069Z", "RedriveCount": 0, }, - "StateMachine": {}, "State": { - "Name": "my-awesome-state", - "EnteredTime": "Mon Nov 13 12:43:33 PST 2023", + "Name": "Lambda Invoke", + "EnteredTime": "2024-12-04T19:38:04.118Z", + "RetryCount": 0, + }, + "StateMachine": { + "Id": "arn:aws:states:sa-east-1:425362996713:stateMachine:abhinav-activity-state-machine", + "Name": "abhinav-activity-state-machine", }, } ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx) self.assertEqual(source, "event") expected_context = Context( - trace_id=3675572987363469717, - span_id=6880978411788117524, + trace_id=435175499815315247, + span_id=3929055471293792800, sampling_priority=1, - meta={"_dd.p.tid": "e987c84b36b11ab"}, + meta={"_dd.p.tid": "3e7a89d1b7310603"}, ) self.assertEqual(ctx, expected_context) self.assertEqual( get_dd_trace_context(), { - TraceHeader.TRACE_ID: "3675572987363469717", + TraceHeader.TRACE_ID: "435175499815315247", TraceHeader.PARENT_ID: "10713633173203262661", TraceHeader.SAMPLING_PRIORITY: "1", - TraceHeader.TAGS: "_dd.p.tid=e987c84b36b11ab", + TraceHeader.TAGS: "_dd.p.tid=3e7a89d1b7310603", }, ) create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY) @@ -652,36 +659,44 @@ def test_step_function_trace_data(self): expected_context, ) + # https://github.com/DataDog/logs-backend/blob/c17618cb552fc369ca40282bae0a65803f82f694/domains/serverless/apps/logs-to-traces-reducer/src/test/resources/test-json-files/stepfunctions/RedriveTest/snapshots/RedriveLambdaSuccessTraceMerging.json#L46 @with_trace_propagation_style("datadog") def test_step_function_trace_data_redrive(self): lambda_ctx = get_mock_context() sfn_event = { "Execution": { - "Id": "665c417c-1237-4742-aaca-8b3becbb9e75", + "Id": "arn:aws:states:sa-east-1:425362996713:execution:abhinav-activity-state-machine:72a7ca3e-901c-41bb-b5a3-5f279b92a316", + "Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316", + "RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j", + "StartTime": "2024-12-04T19:38:04.069Z", "RedriveCount": 1, }, - "StateMachine": {}, "State": { - "Name": "my-awesome-state", - "EnteredTime": "Mon Nov 13 12:43:33 PST 2023", + "Name": "Lambda Invoke", + "EnteredTime": "2024-12-04T19:38:04.118Z", + "RetryCount": 0, + }, + "StateMachine": { + "Id": "arn:aws:states:sa-east-1:425362996713:stateMachine:abhinav-activity-state-machine", + "Name": "abhinav-activity-state-machine", }, } ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx) self.assertEqual(source, "event") expected_context = Context( - trace_id=3675572987363469717, - span_id=1201185214297576513, + trace_id=435175499815315247, + span_id=5063839446130725204, sampling_priority=1, - meta={"_dd.p.tid": "e987c84b36b11ab"}, + meta={"_dd.p.tid": "3e7a89d1b7310603"}, ) self.assertEqual(ctx, expected_context) self.assertEqual( get_dd_trace_context(), { - TraceHeader.TRACE_ID: "3675572987363469717", + TraceHeader.TRACE_ID: "435175499815315247", TraceHeader.PARENT_ID: "10713633173203262661", TraceHeader.SAMPLING_PRIORITY: "1", - TraceHeader.TAGS: "_dd.p.tid=e987c84b36b11ab", + TraceHeader.TAGS: "_dd.p.tid=3e7a89d1b7310603", }, ) create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY) From b7462df3fb90f069430fbf136041a51d96d76324 Mon Sep 17 00:00:00 2001 From: Abhinav Vedmala Date: Mon, 30 Dec 2024 13:39:18 -0500 Subject: [PATCH 8/8] account for missing redrive count --- datadog_lambda/tracing.py | 2 +- tests/test_tracing.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/datadog_lambda/tracing.py b/datadog_lambda/tracing.py index ae07c2da..c90baff1 100644 --- a/datadog_lambda/tracing.py +++ b/datadog_lambda/tracing.py @@ -392,7 +392,7 @@ def _generate_sfn_parent_id(context: dict) -> int: version of the Lambda layer that doesn't use this value for its parentID generation. """ execution_id = context.get("Execution").get("Id") - redrive_count = context.get("Execution").get("RedriveCount") + redrive_count = context.get("Execution").get("RedriveCount", 0) state_name = context.get("State").get("Name") state_entered_time = context.get("State").get("EnteredTime") diff --git a/tests/test_tracing.py b/tests/test_tracing.py index 2c90478e..f7d54542 100644 --- a/tests/test_tracing.py +++ b/tests/test_tracing.py @@ -623,7 +623,6 @@ def test_step_function_trace_data(self): "Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316", "RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j", "StartTime": "2024-12-04T19:38:04.069Z", - "RedriveCount": 0, }, "State": { "Name": "Lambda Invoke",