From e434e7a6df4f2f463e7ab762e8bf6207608958cc Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Tue, 26 Sep 2023 14:34:54 +0100 Subject: [PATCH] Add documentation on how to use memray --- docs-website/sidebars.js | 1 + .../docs/dev_guides/profiling_ingestions.md | 55 +++++++++++++++++++ metadata-ingestion/setup.py | 2 +- 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/docs/dev_guides/profiling_ingestions.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 03ea38fd622d4d..d8e21fd1740b9c 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -133,6 +133,7 @@ module.exports = { "metadata-ingestion/docs/dev_guides/classification", "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source", "metadata-ingestion/docs/dev_guides/sql_profiles", + "metadata-ingestion/docs/dev_guides/profiling_ingestions", ], }, ], diff --git a/metadata-ingestion/docs/dev_guides/profiling_ingestions.md b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md new file mode 100644 index 00000000000000..7896c051ade90e --- /dev/null +++ b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md @@ -0,0 +1,55 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Profiling ingestions + +<FeatureAvailability/> + +**🤝 Version compatibility** +> Open Source DataHub: **0.10.6** | Acryl: **0.2.12** + +This page documents how to profile the memory usage of ingestion runs. +It is useful when trying to size the amount of resources necessary to ingest some source or when developing new features or sources. + +## How to use +Install the `debug` plugin for DataHub's CLI wherever the ingestion runs: + +```bash +pip install 'acryl-datahub[debug]' +``` + +This will install [memray](https://github.com/bloomberg/memray) in your Python environment. 
+ +Add a flag to your ingestion recipe to generate a memray memory dump of your ingestion: +````yaml +source: + ... + +sink: + ... + +flags: + generate_memory_profiles: "<path to folder where dumps will be written to>" +```` + +Once the ingestion run starts, a binary file will be created and appended to during the execution of the ingestion. + +These files follow the pattern `file-<pipeline-name>-<pipeline-run-id>.bin` for unique identification. +Once the ingestion has finished you can use `memray` to analyze the memory dump in a flamegraph view using: + +`$ memray flamegraph file-None-file-2023_09_18-21_38_43.bin` + +This will generate an interactive HTML file for analysis: + +

+ +

+ + +`memray` has an extensive set of features for memory investigation. Take a look at their [documentation](https://bloomberg.github.io/memray/overview.html) to see the full feature set. + + +## Questions + +If you've got any questions on configuring profiling, feel free to ping us on [our Slack](https://slack.datahubproject.io/)! diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 5354ed82588265..af1ea42a4c1e09 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -697,7 +697,7 @@ def get_long_description(): }, entry_points=entry_points, # Dependencies. - install_requires=list(base_requirements | framework_common | debug_requirements), + install_requires=list(base_requirements | framework_common), extras_require={ "base": list(framework_common), **{