From e73e92699947084b5ecb1f5d3e0c5762dc446bbf Mon Sep 17 00:00:00 2001 From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com> Date: Wed, 8 Nov 2023 12:32:41 +0530 Subject: [PATCH 01/29] feat(integration/fivetran): Fivetran connector integration (#9018) Co-authored-by: Harshal Sheth --- .../app/ingest/source/builder/constants.ts | 4 + .../app/ingest/source/builder/sources.json | 7 + datahub-web-react/src/images/fivetranlogo.png | Bin 0 -> 10230 bytes .../docs/sources/fivetran/fivetran_pre.md | 86 +++ .../docs/sources/fivetran/fivetran_recipe.yml | 43 ++ metadata-ingestion/setup.py | 3 + .../datahub/api/entities/datajob/datajob.py | 25 +- .../dataprocess/dataprocess_instance.py | 27 +- metadata-ingestion/src/datahub/emitter/mcp.py | 4 +- .../datahub/ingestion/api/source_helpers.py | 13 +- .../ingestion/source/fivetran/__init__.py | 0 .../ingestion/source/fivetran/config.py | 145 ++++ .../ingestion/source/fivetran/data_classes.py | 36 + .../ingestion/source/fivetran/fivetran.py | 289 ++++++++ .../source/fivetran/fivetran_log_api.py | 147 ++++ .../source/fivetran/fivetran_query.py | 76 ++ .../ingestion/source_config/sql/snowflake.py | 82 ++- .../integration/fivetran/fivetran_golden.json | 658 ++++++++++++++++++ .../integration/fivetran/test_fivetran.py | 192 +++++ .../main/resources/boot/data_platforms.json | 10 + 20 files changed, 1777 insertions(+), 70 deletions(-) create mode 100644 datahub-web-react/src/images/fivetranlogo.png create mode 100644 metadata-ingestion/docs/sources/fivetran/fivetran_pre.md create mode 100644 metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py create mode 100644 metadata-ingestion/tests/integration/fivetran/fivetran_golden.json create mode 100644 metadata-ingestion/tests/integration/fivetran/test_fivetran.py diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index dba8e8bb1dce6b..fdb094d721304b 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -29,6 +29,7 @@ import databricksLogo from '../../../../images/databrickslogo.png'; import verticaLogo from '../../../../images/verticalogo.png'; import mlflowLogo from '../../../../images/mlflowlogo.png'; import dynamodbLogo from '../../../../images/dynamodblogo.png'; +import fivetranLogo from '../../../../images/fivetranlogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -105,6 +106,8 @@ export const DBT_CLOUD = 'dbt-cloud'; export const DBT_CLOUD_URN = `urn:li:dataPlatform:dbt`; export const VERTICA = 'vertica'; export const VERTICA_URN = `urn:li:dataPlatform:${VERTICA}`; +export const FIVETRAN = 'fivetran'; +export const FIVETRAN_URN = `urn:li:dataPlatform:${FIVETRAN}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -138,6 +141,7 @@ export const PLATFORM_URN_TO_LOGO = { [SUPERSET_URN]: supersetLogo, 
[UNITY_CATALOG_URN]: databricksLogo,
     [VERTICA_URN]: verticaLogo,
+    [FIVETRAN_URN]: fivetranLogo,
 };
 
 export const SOURCE_TO_PLATFORM_URN = {
diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json
index b18384909c33f0..9619abebbd54e6 100644
--- a/datahub-web-react/src/app/ingest/source/builder/sources.json
+++ b/datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -216,6 +216,13 @@
     "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertica/",
     "recipe": "source:\n type: vertica\n config:\n # Coordinates\n host_port: localhost:5433\n # The name of the vertica database\n database: Database_Name\n # Credentials\n username: Vertica_User\n password: Vertica_Password\n\n include_tables: true\n include_views: true\n include_projections: true\n include_models: true\n include_view_lineage: true\n include_projection_lineage: true\n profiling:\n enabled: false\n stateful_ingestion:\n enabled: true "
   },
+  {
+    "urn": "urn:li:dataPlatform:fivetran",
+    "name": "fivetran",
+    "displayName": "Fivetran",
+    "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/fivetran/",
+    "recipe": "source:\n type: fivetran\n config:\n # Fivetran log connector destination server configurations\n fivetran_log_config:\n destination_platform: snowflake\n destination_config:\n # Coordinates\n account_id: snowflake_account_id\n warehouse: warehouse_name\n database: snowflake_db\n log_schema: fivetran_log_schema\n\n # Credentials\n username: ${SNOWFLAKE_USER}\n password: ${SNOWFLAKE_PASS}\n role: snowflake_role\n\n # Optional - filter for certain connector names instead of ingesting everything.\n # connector_patterns:\n # allow:\n # - connector_name\n\n # Optional -- This mapping is optional and only required to configure platform-instance for source\n # A mapping of Fivetran connector id to data platform instance\n # sources_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV\n\n # Optional -- This mapping is optional and only required to configure platform-instance for destination.\n # A mapping of Fivetran destination id to data platform instance\n # destination_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV"
+  },
   {
     "urn": "urn:li:dataPlatform:custom",
     "name": "custom",
diff --git a/datahub-web-react/src/images/fivetranlogo.png b/datahub-web-react/src/images/fivetranlogo.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c999ad2d86e99273971dd0d31a18fd5e94733b
Binary files /dev/null and b/datahub-web-react/src/images/fivetranlogo.png differ
zOxA4$m1i4V0OYcs!oAzjjXwtz$qAh2>yVG%pa7>fa;DXkCYg@)5JM)kOe>6u=`-4a zf^-6eA&6gnqipOmF}~CQQc3E=7J#g=Sf9O3EzT}&4#xvC4T89Os z$nig^+E1& zQzGe>7MXjh(zX$b&1&wEGbU#pYD_mGCr1Kkc}W6|w;Dn)^04a#e<_<%V`~qpF(Vrm zPqQvEtm<+Xj&C*93Lmu|x-^{%S#zSWaHM~`EciUDW+rZOWPrKoP;xRX>tf$5Uc=83 zml3}^KRLo$9O&Otn*z>k+7}jlh0zOP($XJlW+;PqRy3<3r72+wwMwY;OVs$0>cWR- zO#UiWMH*AyqjvM>n$DU3Fdz0_&Uc50^8Bp`k8c)46aC-Aj1R#pE5=9(493t2mHGkQ zXv|b`uZ6`>H&x8+?&hSre-~%Gho3*QsggC?r5Pu)BYoIQaW}NeS|cn85~}(mgKv(0Jt8#oE+c zT^2#1Ry#V=yyLV&4E0afpJlDBKB+GJaM6Uv9{eD3ZyC)som&&Qey3lt%=nCpt$Cy3 zO1qlV8Z}KDF4EV__@$7^--}g|wUqG}+yv{g;hVUJ>A7%0gIkmzbx?ubtE;ZzsZujx z#(fXQCI2jqnN?ANn~kLU$=Jeojah$|D;z=iWAr`Nyq~{kzLUb=Ngr?-kaQ##!MCuh zFem-aC%pBNlHZ7yE&F#~uHdw!<*;JFM7so#B4QcD^Q+)*?OVrf!X0T zA9efPBCg36Uj+&sd|Y%hn@Lfty?+T5LtZw3A^o>Ru^;3)JLVCN9J?8TeVF#Z;H>X?%owb0Cw%k?Wv4 z0U=QaJ@jGvv}Opw&~Sc=en}!6dtv3bJ~%mPtEkk&#&6(Q@?mQOY=n%S?iB+=UDPba z6l(*2m=*p;(7p&MoaUuSW_ZO05-?|$f^9BlikbePC|7!C0A(w68>*?L_cLLM%9!hj z#Z&eXhWJ+f1Ab(M9EK79$#Slw1QZY5WQ!|VHw+Y1DGf$i#CJ!WqEII?KzF+?nN&FT z=Fm`%*XmLX&0)8g#P+G}y6XHk^=R&NMA4PV-8g(p~>YADlW<)d-N5mK-*B z!4*P~RLKKth+$S(p0gXhpzT=hI>k%DLjyF6{MD?6>Ir*#8XlnQeP(YGA``W@CGO0c z{*Q-Uk~Jr1iF%YF26p(T*Q^VkIqlA9xz3~}Vx5i3USF^u4dyrb>r)l^kXERQ4S^EA z;=Hg>((ta4vqd)Dij#8{#aGC=)l_)yt(6o3(=i#p?0A{_GI*Xi74D4ks&<51iT1F#3OS$=_f$= z%_&ExXtmos+8IFebmZr&J6C-@ADgYHMww^jjh1|LKUQJ^-JEKR|GQoiAl=J6ElZ>bBMh_4JlKWCrjg>`T|1(o=*8Z$M=mviT=iQ)$+ zPt#9ab!7b!T+jV)~QRcA-sgsjIq>5mpq;0@hK=1^1qQJ|iUg zkaMk!%(>vlt-o_(<2T9Jm;JbZP(KnQAv?SHr2q9%-*NUB-s1;-X!M=`DbRMxrLMDsZICgSOCC^&OR zr|^eZQ^_hxiz1e^&vWc!7m5(uzg`ZgxjWaGyekHc{qzI_W#I>g-8b(;Fy_PR1#i?V zT`5Pkc*1|V=UW-|odx{=U;)unucUP@m!9GkT@Zk*ISIX4Y?>V5V=KZ_uHJnbf{9Gf zDA0fZJ#6>Z>XJEiQIoJdc0SzC!Lf*?7oBgC_P)wCib;EnS>k-rUKrFgVPnCmwKW+L z6S(qM4%#LiP}9S@Y>BtPVvK#nCe;FXH&e~zRcEwA)j!Xb941)pfJVy8s#^nfnFn$k zP39@JHK#Y$7f(7chUG>~s;#g{{4(EtQ`dIZCY(1s;LGLjyw=8+q3ZjIK9~gWv9+nA zfdI;{YTr?|5pzcz^{Q64DrLvjw(`_g1$-=;sMZ=Lp_MhwikBFwBfpG11w~Ov()MPd z#bd_ji(;#2!?Uedf7%%5i_4+{7;7TG7)Cz*x@WilWcO9xjmp&7`p7TuWK&t+RMw27 zDqb(S?=;8i!&p{`?O~u0e!$MHR;-k}f7Hlp{h4+~r%93bdg15u^EWExhsV1_R={Y3 zuN04YH}?yB`G?MqZ$Ei7J<~f|q~z3vQqZb" to role fivetran_datahub; + +// Grant access to view database and schema in which your log and metadata tables exist +grant usage on DATABASE "" to role fivetran_datahub; +grant usage on SCHEMA ""."" to role fivetran_datahub; + +// Grant access to execute select query on schema in which your log and metadata tables exist +grant select on all tables in SCHEMA ""."" to role fivetran_datahub; + +// Grant the fivetran_datahub to the snowflake user. +grant role fivetran_datahub to user snowflake_user; +``` + +## Advanced Configurations + +### Working with Platform Instances +If you've multiple instances of source/destination systems that are referred in your `fivetran` setup, you'd need to configure platform instance for these systems in `fivetran` recipe to generate correct lineage edges. Refer the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to understand more about this. + +While configuration of platform instance for source system you need to provide connector id as key and for destination system provide destination id as key. 
+
+#### Example - Multiple Postgres Source Connectors, each reading from a different postgres instance
+```yml
+    # Map of connector source to platform instance
+    sources_to_platform_instance:
+      postgres_connector_id1:
+        platform_instance: cloud_postgres_instance
+        env: PROD
+
+      postgres_connector_id2:
+        platform_instance: local_postgres_instance
+        env: DEV
+```
+
+#### Example - Multiple Snowflake Destinations, each writing to a different snowflake instance
+```yml
+    # Map of destination to platform instance
+    destination_to_platform_instance:
+      snowflake_destination_id1:
+        platform_instance: prod_snowflake_instance
+        env: PROD
+
+      snowflake_destination_id2:
+        platform_instance: dev_snowflake_instance
+        env: DEV
+```
diff --git a/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml b/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml
new file mode 100644
index 00000000000000..7c654df59723c1
--- /dev/null
+++ b/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml
@@ -0,0 +1,43 @@
+source:
+  type: fivetran
+  config:
+    # Fivetran log connector destination server configurations
+    fivetran_log_config:
+      destination_platform: snowflake
+      destination_config:
+        # Coordinates
+        account_id: "abc48144"
+        warehouse: "COMPUTE_WH"
+        database: "MY_SNOWFLAKE_DB"
+        log_schema: "FIVETRAN_LOG"
+
+        # Credentials
+        username: "${SNOWFLAKE_USER}"
+        password: "${SNOWFLAKE_PASS}"
+        role: "snowflake_role"
+
+    # Optional - filter for certain connector names instead of ingesting everything.
+    # connector_patterns:
+    #   allow:
+    #     - connector_name
+
+    # Optional -- A mapping from connector id to its source database.
+    # sources_to_database:
+    #   connector_id: source_db
+
+    # Optional -- Only required to configure a platform instance for the source.
+    # A mapping of Fivetran connector id to data platform instance
+    # sources_to_platform_instance:
+    #   connector_id:
+    #     platform_instance: cloud_instance
+    #     env: DEV
+
+    # Optional -- Only required to configure a platform instance for the destination.
+    # A mapping of Fivetran destination id to data platform instance
+    # destination_to_platform_instance:
+    #   destination_id:
+    #     platform_instance: cloud_instance
+    #     env: DEV
+
+sink:
+  # sink configs
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index afce8dcee840b4..2392fce0580613 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -395,6 +395,7 @@
     "powerbi-report-server": powerbi_report_server,
     "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"},
     "unity-catalog": databricks | sqllineage_lib,
+    "fivetran": snowflake_common,
 }
 
 # This is mainly used to exclude plugins from the Docker image.
@@ -525,6 +526,7 @@ "nifi", "vertica", "mode", + "fivetran", "kafka-connect", ] if plugin @@ -629,6 +631,7 @@ "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource", "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource", "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource", + "fivetran = datahub.ingestion.source.fivetran.fivetran:FivetranSource", ], "datahub.ingestion.transformer.plugins": [ "simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership", diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 0face6415bacc4..6c42e830e223b1 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -100,7 +100,9 @@ def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]: ) return [tags] - def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: + def generate_mcp( + self, materialize_iolets: bool = True + ) -> Iterable[MetadataChangeProposalWrapper]: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), aspect=DataJobInfoClass( @@ -113,7 +115,9 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: ) yield mcp - yield from self.generate_data_input_output_mcp() + yield from self.generate_data_input_output_mcp( + materialize_iolets=materialize_iolets + ) for owner in self.generate_ownership_aspect(): mcp = MetadataChangeProposalWrapper( @@ -144,7 +148,9 @@ def emit( for mcp in self.generate_mcp(): emitter.emit(mcp, callback) - def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: + def generate_data_input_output_mcp( + self, materialize_iolets: bool + ) -> Iterable[MetadataChangeProposalWrapper]: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), aspect=DataJobInputOutputClass( @@ -157,10 +163,9 @@ def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapp yield mcp # Force entity materialization - for iolet in self.inlets + self.outlets: - mcp = MetadataChangeProposalWrapper( - entityUrn=str(iolet), - aspect=StatusClass(removed=False), - ) - - yield mcp + if materialize_iolets: + for iolet in self.inlets + self.outlets: + yield MetadataChangeProposalWrapper( + entityUrn=str(iolet), + aspect=StatusClass(removed=False), + ) diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py index cf6080c7072e69..2f07e4a112f934 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py +++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py @@ -220,12 +220,10 @@ def emit_process_end( self._emit_mcp(mcp, emitter, callback) def generate_mcp( - self, created_ts_millis: Optional[int] = None + self, created_ts_millis: Optional[int] = None, materialize_iolets: bool = True ) -> Iterable[MetadataChangeProposalWrapper]: - """ - Generates mcps from the object - :rtype: Iterable[MetadataChangeProposalWrapper] - """ + """Generates mcps from the object""" + mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), aspect=DataProcessInstanceProperties( @@ -253,7 +251,7 @@ def generate_mcp( ) yield mcp - yield from self.generate_inlet_outlet_mcp() + yield from self.generate_inlet_outlet_mcp(materialize_iolets=materialize_iolets) @staticmethod def _emit_mcp( @@ 
-329,7 +327,9 @@ def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance":
         dpi._template_object = dataflow
         return dpi
 
-    def generate_inlet_outlet_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
+    def generate_inlet_outlet_mcp(
+        self, materialize_iolets: bool
+    ) -> Iterable[MetadataChangeProposalWrapper]:
         if self.inlets:
             mcp = MetadataChangeProposalWrapper(
                 entityUrn=str(self.urn),
@@ -349,10 +349,9 @@ def generate_inlet_outlet_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
             yield mcp
 
         # Force entity materialization
-        for iolet in self.inlets + self.outlets:
-            mcp = MetadataChangeProposalWrapper(
-                entityUrn=str(iolet),
-                aspect=StatusClass(removed=False),
-            )
-
-            yield mcp
+        if materialize_iolets:
+            for iolet in self.inlets + self.outlets:
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=str(iolet),
+                    aspect=StatusClass(removed=False),
+                )
diff --git a/metadata-ingestion/src/datahub/emitter/mcp.py b/metadata-ingestion/src/datahub/emitter/mcp.py
index 9085ac152ea0b2..d6aa695665e4ef 100644
--- a/metadata-ingestion/src/datahub/emitter/mcp.py
+++ b/metadata-ingestion/src/datahub/emitter/mcp.py
@@ -240,7 +240,7 @@ def from_obj_require_wrapper(
         return mcp
 
     def as_workunit(
-        self, *, treat_errors_as_warnings: bool = False
+        self, *, treat_errors_as_warnings: bool = False, is_primary_source: bool = True
    ) -> "MetadataWorkUnit":
         from datahub.ingestion.api.workunit import MetadataWorkUnit
 
@@ -254,10 +254,12 @@ def as_workunit(
                 id=f"{self.entityUrn}-{self.aspectName}-{ts}",
                 mcp=self,
                 treat_errors_as_warnings=treat_errors_as_warnings,
+                is_primary_source=is_primary_source,
             )
 
         return MetadataWorkUnit(
             id=f"{self.entityUrn}-{self.aspectName}",
             mcp=self,
             treat_errors_as_warnings=treat_errors_as_warnings,
+            is_primary_source=is_primary_source,
         )
diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
index 2ce9e07bc57bc8..fae260226195ce 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
@@ -17,6 +17,7 @@
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.emitter.mce_builder import make_dataplatform_instance_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import entity_supports_aspect
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
@@ -64,9 +65,9 @@ def auto_status_aspect(
     """
     For all entities that don't have a status aspect, add one with removed set to false.
     """
-
     all_urns: Set[str] = set()
     status_urns: Set[str] = set()
+    skip_urns: Set[str] = set()
     for wu in stream:
         urn = wu.get_urn()
         all_urns.add(urn)
@@ -89,9 +90,17 @@
         else:
             raise ValueError(f"Unexpected type {type(wu.metadata)}")
 
+        if not isinstance(
+            wu.metadata, MetadataChangeEventClass
+        ) and not entity_supports_aspect(wu.metadata.entityType, StatusClass):
+            # If an entity type does not support the 'status' aspect, skip adding it for that entity.
+            # For example, dataProcessInstance does not support the status aspect.
+            # If it is not skipped, the server throws an error like:
+            # java.lang.RuntimeException: Unknown aspect status for entity dataProcessInstance
+            skip_urns.add(urn)
+
         yield wu
 
-    for urn in sorted(all_urns - status_urns):
+    for urn in sorted(all_urns - status_urns - skip_urns):
         yield MetadataChangeProposalWrapper(
             entityUrn=urn,
             aspect=StatusClass(removed=False),
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py
new file mode 100644
index 00000000000000..b0843182c5cac4
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py
@@ -0,0 +1,145 @@
+import logging
+from dataclasses import dataclass, field as dataclass_field
+from typing import Dict, List, Optional
+
+import pydantic
+from pydantic import Field, root_validator
+
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)
+from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig
+
+logger = logging.getLogger(__name__)
+
+
+class Constant:
+    """
+    Keys used in the fivetran plugin.
+    """
+
+    ORCHESTRATOR = "fivetran"
+    # table column names
+    SOURCE_SCHEMA_NAME = "source_schema_name"
+    SOURCE_TABLE_NAME = "source_table_name"
+    SOURCE_TABLE_ID = "source_table_id"
+    SOURCE_COLUMN_NAME = "source_column_name"
+    DESTINATION_SCHEMA_NAME = "destination_schema_name"
+    DESTINATION_TABLE_NAME = "destination_table_name"
+    DESTINATION_TABLE_ID = "destination_table_id"
+    DESTINATION_COLUMN_NAME = "destination_column_name"
+    SYNC_ID = "sync_id"
+    MESSAGE_DATA = "message_data"
+    TIME_STAMP = "time_stamp"
+    STATUS = "status"
+    USER_ID = "user_id"
+    GIVEN_NAME = "given_name"
+    FAMILY_NAME = "family_name"
+    CONNECTOR_ID = "connector_id"
+    CONNECTOR_NAME = "connector_name"
+    CONNECTOR_TYPE_ID = "connector_type_id"
+    PAUSED = "paused"
+    SYNC_FREQUENCY = "sync_frequency"
+    DESTINATION_ID = "destination_id"
+    CONNECTING_USER_ID = "connecting_user_id"
+    # Job status constants
+    SUCCESSFUL = "SUCCESSFUL"
+    FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
+    CANCELED = "CANCELED"
+
+
+KNOWN_DATA_PLATFORM_MAPPING = {
+    "postgres": "postgres",
+    "snowflake": "snowflake",
+}
+
+
+class DestinationConfig(BaseSnowflakeConfig):
+    database: str = Field(description="The fivetran connector log database.")
+    log_schema: str = Field(description="The fivetran connector log schema.")
+
+
+class FivetranLogConfig(ConfigModel):
+    destination_platform: str = pydantic.Field(
+        default="snowflake",
+        description="The destination platform where fivetran connector log tables are dumped.",
+    )
+    destination_config: Optional[DestinationConfig] = pydantic.Field(
+        default=None,
+        description="If destination platform is 'snowflake', provide snowflake configuration.",
+    )
+
+    @root_validator(pre=True)
+    def validate_destination_platform_and_config(cls, values: Dict) -> Dict:
+        # Use .get() with the field's default: in a pre=True validator,
+        # defaults have not yet been applied to `values`.
+        destination_platform = values.get("destination_platform", "snowflake")
+        if destination_platform == "snowflake":
+            if "destination_config" not in values:
+                raise ValueError(
+                    "If destination platform is 'snowflake', user must provide snowflake destination configuration in the recipe."
+                )
+        else:
+            raise ValueError(
+                f"Destination platform '{destination_platform}' is not yet supported."
+            )
+        return values
+
+
+@dataclass
+class FivetranSourceReport(StaleEntityRemovalSourceReport):
+    connectors_scanned: int = 0
+    filtered_connectors: List[str] = dataclass_field(default_factory=list)
+
+    def report_connectors_scanned(self, count: int = 1) -> None:
+        self.connectors_scanned += count
+
+    def report_connectors_dropped(self, model: str) -> None:
+        self.filtered_connectors.append(model)
+
+
+class PlatformDetail(ConfigModel):
+    platform_instance: Optional[str] = pydantic.Field(
+        default=None,
+        description="The instance of the platform that all assets produced by this recipe belong to",
+    )
+    env: str = pydantic.Field(
+        default=DEFAULT_ENV,
+        description="The environment that all assets produced by this ingestion source belong to",
+    )
+
+
+class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
+    fivetran_log_config: FivetranLogConfig = pydantic.Field(
+        description="Fivetran log connector destination server configurations.",
+    )
+    connector_patterns: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for connectors to filter in ingestion.",
+    )
+    include_column_lineage: bool = Field(
+        default=True,
+        description="Populates table->table column lineage.",
+    )
+    sources_to_database: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping from connector id to its source database.",
+    )
+    # Configuration for stateful ingestion
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
+        default=None, description="Fivetran Stateful Ingestion Config."
+    )
+    # Mapping of Fivetran connector id to source platform instance
+    sources_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field(
+        default={},
+        description="A mapping of the connector's source datasets to platform instance. Use connector id as key.",
+    )
+    # Mapping of Fivetran destination id to destination platform instance
+    destination_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field(
+        default={},
+        description="A mapping of destination dataset to platform instance. 
Use destination id as key.", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py new file mode 100644 index 00000000000000..82bb5f3467c2a6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import List + + +@dataclass +class ColumnLineage: + source_column: str + destination_column: str + + +@dataclass +class TableLineage: + source_table: str + destination_table: str + column_lineage: List[ColumnLineage] + + +@dataclass +class Connector: + connector_id: str + connector_name: str + connector_type: str + paused: bool + sync_frequency: int + destination_id: str + user_name: str + table_lineage: List[TableLineage] + jobs: List["Job"] + + +@dataclass +class Job: + job_id: str + start_time: int + end_time: int + status: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py new file mode 100644 index 00000000000000..c0395b4e4e7963 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -0,0 +1,289 @@ +import logging +from typing import Dict, Iterable, List, Optional + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.fivetran.config import ( + KNOWN_DATA_PLATFORM_MAPPING, + Constant, + FivetranSourceConfig, + FivetranSourceReport, + PlatformDetail, +) +from datahub.ingestion.source.fivetran.data_classes import Connector, Job +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, +) +from datahub.metadata.schema_classes import StatusClass +from datahub.utilities.urns.data_flow_urn import DataFlowUrn +from datahub.utilities.urns.dataset_urn import DatasetUrn + +# Logger instance +logger = logging.getLogger(__name__) + + +@platform_name("Fivetran") +@config_class(FivetranSourceConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability( + SourceCapability.LINEAGE_FINE, + "Enabled by default, can be disabled via configuration `include_column_lineage`", +) +class FivetranSource(StatefulIngestionSourceBase): + """ + This plugin extracts fivetran users, connectors, destinations and sync history. + This plugin is in beta and has only been tested on Snowflake connector. 
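Metadata is read from the Fivetran platform connector's log tables in the destination warehouse, not via the Fivetran REST API.
+    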
+ """ + + config: FivetranSourceConfig + report: FivetranSourceReport + platform: str = "fivetran" + + def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): + super(FivetranSource, self).__init__(config, ctx) + self.config = config + self.report = FivetranSourceReport() + + self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) + + # Create and register the stateful ingestion use-case handler. + self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ) + + def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: + input_dataset_urn_list: List[DatasetUrn] = [] + output_dataset_urn_list: List[DatasetUrn] = [] + fine_grained_lineage: List[FineGrainedLineage] = [] + + source_platform_detail: PlatformDetail = PlatformDetail() + destination_platform_detail: PlatformDetail = PlatformDetail() + # Get platform details for connector source + source_platform_detail = self.config.sources_to_platform_instance.get( + connector.connector_id, PlatformDetail() + ) + + # Get platform details for destination + destination_platform_detail = self.config.destination_to_platform_instance.get( + connector.destination_id, PlatformDetail() + ) + + # Get database for connector source + # TODO: Once Fivetran exposes this, we shouldn't ask for it via config. + source_database: Optional[str] = self.config.sources_to_database.get( + connector.connector_id + ) + + if connector.connector_type in KNOWN_DATA_PLATFORM_MAPPING: + source_platform = KNOWN_DATA_PLATFORM_MAPPING[connector.connector_type] + else: + source_platform = connector.connector_type + logger.info( + f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." + ) + + for table_lineage in connector.table_lineage: + input_dataset_urn = DatasetUrn.create_from_ids( + platform_id=source_platform, + table_name=f"{source_database.lower()}.{table_lineage.source_table}" + if source_database + else table_lineage.source_table, + env=source_platform_detail.env, + platform_instance=source_platform_detail.platform_instance, + ) + input_dataset_urn_list.append(input_dataset_urn) + + output_dataset_urn: Optional[DatasetUrn] = None + if self.audit_log.fivetran_log_database: + output_dataset_urn = DatasetUrn.create_from_ids( + platform_id=self.config.fivetran_log_config.destination_platform, + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + env=destination_platform_detail.env, + platform_instance=destination_platform_detail.platform_instance, + ) + output_dataset_urn_list.append(output_dataset_urn) + + if self.config.include_column_lineage: + for column_lineage in table_lineage.column_lineage: + fine_grained_lineage.append( + FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=[ + builder.make_schema_field_urn( + str(input_dataset_urn), + column_lineage.source_column, + ) + ] + if input_dataset_urn + else [], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[ + builder.make_schema_field_urn( + str(output_dataset_urn), + column_lineage.destination_column, + ) + ] + if output_dataset_urn + else [], + ) + ) + + datajob.inlets.extend(input_dataset_urn_list) + datajob.outlets.extend(output_dataset_urn_list) + datajob.fine_grained_lineages.extend(fine_grained_lineage) + return None + + def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow: + return DataFlow( + orchestrator=Constant.ORCHESTRATOR, + 
id=connector.connector_id, + env=self.config.env, + name=connector.connector_name, + platform_instance=self.config.platform_instance, + ) + + def _generate_datajob_from_connector(self, connector: Connector) -> DataJob: + dataflow_urn = DataFlowUrn.create_from_ids( + orchestrator=Constant.ORCHESTRATOR, + flow_id=connector.connector_id, + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + datajob = DataJob( + id=connector.connector_id, + flow_urn=dataflow_urn, + name=connector.connector_name, + owners={connector.user_name}, + ) + + job_property_bag: Dict[str, str] = {} + allowed_connection_keys = [ + Constant.PAUSED, + Constant.SYNC_FREQUENCY, + Constant.DESTINATION_ID, + ] + for key in allowed_connection_keys: + if hasattr(connector, key) and getattr(connector, key) is not None: + job_property_bag[key] = repr(getattr(connector, key)) + datajob.properties = job_property_bag + + # Map connector source and destination table with dataset entity + # Also extend the fine grained lineage of column if include_column_lineage is True + self._extend_lineage(connector=connector, datajob=datajob) + + # TODO: Add fine grained lineages of dataset after FineGrainedLineageDownstreamType.DATASET enabled + + return datajob + + def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance: + return DataProcessInstance.from_datajob( + datajob=datajob, + id=job.job_id, + clone_inlets=True, + clone_outlets=True, + ) + + def _get_dpi_workunits( + self, job: Job, dpi: DataProcessInstance + ) -> Iterable[MetadataWorkUnit]: + status_result_map: Dict[str, InstanceRunResult] = { + Constant.SUCCESSFUL: InstanceRunResult.SUCCESS, + Constant.FAILURE_WITH_TASK: InstanceRunResult.FAILURE, + Constant.CANCELED: InstanceRunResult.SKIPPED, + } + if job.status not in status_result_map: + logger.debug( + f"Status should be either SUCCESSFUL, FAILURE_WITH_TASK or CANCELED and it was " + f"{job.status}" + ) + return [] + result = status_result_map[job.status] + start_timestamp_millis = job.start_time * 1000 + for mcp in dpi.generate_mcp( + created_ts_millis=start_timestamp_millis, materialize_iolets=False + ): + yield mcp.as_workunit() + for mcp in dpi.start_event_mcp(start_timestamp_millis): + yield mcp.as_workunit() + for mcp in dpi.end_event_mcp( + end_timestamp_millis=job.end_time * 1000, + result=result, + result_type=Constant.ORCHESTRATOR, + ): + yield mcp.as_workunit() + + def _get_connector_workunits( + self, connector: Connector + ) -> Iterable[MetadataWorkUnit]: + self.report.report_connectors_scanned() + # Create dataflow entity with same name as connector name + dataflow = self._generate_dataflow_from_connector(connector) + for mcp in dataflow.generate_mcp(): + yield mcp.as_workunit() + + # Map Fivetran's connector entity with Datahub's datajob entity + datajob = self._generate_datajob_from_connector(connector) + for mcp in datajob.generate_mcp(materialize_iolets=True): + if mcp.entityType == "dataset" and isinstance(mcp.aspect, StatusClass): + # While we "materialize" the referenced datasets, we don't want them + # to be tracked by stateful ingestion. 
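+                # Emitting them with is_primary_source=False keeps these dataset urns
+                # out of this source's stateful-ingestion state, so stale-entity removal
+                # will not soft-delete them on subsequent runs.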
+                yield mcp.as_workunit(is_primary_source=False)
+            else:
+                yield mcp.as_workunit()
+
+        # Map Fivetran's job/sync history entity to Datahub's data process entity
+        for job in connector.jobs:
+            dpi = self._generate_dpi_from_job(job, datajob)
+            yield from self._get_dpi_workunits(job, dpi)
+
+    @classmethod
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
+        config = FivetranSourceConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            self.stale_entity_removal_handler.workunit_processor,
+        ]
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        """
+        The Datahub ingestion framework invokes this method.
+        """
+        logger.info("Fivetran plugin execution started")
+        connectors = self.audit_log.get_connectors_list()
+        for connector in connectors:
+            if not self.config.connector_patterns.allowed(connector.connector_name):
+                self.report.report_connectors_dropped(connector.connector_name)
+                continue
+            logger.info(f"Processing connector id: {connector.connector_id}")
+            yield from self._get_connector_workunits(connector)
+
+    def get_report(self) -> SourceReport:
+        return self.report
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
new file mode 100644
index 00000000000000..d5d146559d9183
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -0,0 +1,147 @@
+import json
+import logging
+from typing import Any, Dict, List, Optional
+
+from sqlalchemy import create_engine
+
+from datahub.ingestion.source.fivetran.config import Constant, FivetranLogConfig
+from datahub.ingestion.source.fivetran.data_classes import (
+    ColumnLineage,
+    Connector,
+    Job,
+    TableLineage,
+)
+from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class FivetranLogAPI:
+    def __init__(self, fivetran_log_config: FivetranLogConfig) -> None:
+        self.fivetran_log_database: Optional[str] = None
+        self.fivetran_log_config = fivetran_log_config
+        self.engine = self._get_log_destination_engine()
+
+    def _get_log_destination_engine(self) -> Any:
+        destination_platform = self.fivetran_log_config.destination_platform
+        engine = None
+        # For each supported destination, create a sqlalchemy engine,
+        # select the database and schema, and set the fivetran_log_database class variable.
+        if destination_platform == "snowflake":
+            snowflake_destination_config = self.fivetran_log_config.destination_config
+            if snowflake_destination_config is not None:
+                engine = create_engine(
+                    snowflake_destination_config.get_sql_alchemy_url(),
+                    **snowflake_destination_config.get_options(),
+                )
+                engine.execute(
+                    FivetranLogQuery.use_schema(
+                        snowflake_destination_config.database,
+                        snowflake_destination_config.log_schema,
+                    )
+                )
+                self.fivetran_log_database = snowflake_destination_config.database
+        return engine
+
+    def _query(self, query: str) -> List[Dict]:
+        logger.debug(f"Query: {query}")
+        resp = self.engine.execute(query)
+        return [row for row in resp]
+
+    def _get_table_lineage(self, connector_id: str) -> List[TableLineage]:
+        table_lineage_result = self._query(
+            FivetranLogQuery.get_table_lineage_query(connector_id=connector_id)
+        )
+        table_lineage_list: List[TableLineage] = []
+        for table_lineage in table_lineage_result:
+            column_lineage_result = 
self._query( + FivetranLogQuery.get_column_lineage_query( + source_table_id=table_lineage[Constant.SOURCE_TABLE_ID], + destination_table_id=table_lineage[Constant.DESTINATION_TABLE_ID], + ) + ) + column_lineage_list: List[ColumnLineage] = [ + ColumnLineage( + source_column=column_lineage[Constant.SOURCE_COLUMN_NAME], + destination_column=column_lineage[Constant.DESTINATION_COLUMN_NAME], + ) + for column_lineage in column_lineage_result + ] + table_lineage_list.append( + TableLineage( + source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", + destination_table=f"{table_lineage[Constant.DESTINATION_SCHEMA_NAME]}.{table_lineage[Constant.DESTINATION_TABLE_NAME]}", + column_lineage=column_lineage_list, + ) + ) + + return table_lineage_list + + def _get_jobs_list(self, connector_id: str) -> List[Job]: + jobs: List[Job] = [] + sync_start_logs = { + row[Constant.SYNC_ID]: row + for row in self._query( + FivetranLogQuery.get_sync_start_logs_query(connector_id=connector_id) + ) + } + sync_end_logs = { + row[Constant.SYNC_ID]: row + for row in self._query( + FivetranLogQuery.get_sync_end_logs_query(connector_id=connector_id) + ) + } + for sync_id in sync_start_logs.keys(): + if sync_end_logs.get(sync_id) is None: + # If no sync-end event log for this sync id that means sync is still in progress + continue + + message_data = json.loads(sync_end_logs[sync_id][Constant.MESSAGE_DATA]) + if isinstance(message_data, str): + # Sometimes message_data contains json string inside string + # Ex: '"{\"status\":\"SUCCESSFUL\"}"' + # Hence, need to do json loads twice. + message_data = json.loads(message_data) + + jobs.append( + Job( + job_id=sync_id, + start_time=round( + sync_start_logs[sync_id][Constant.TIME_STAMP].timestamp() + ), + end_time=round( + sync_end_logs[sync_id][Constant.TIME_STAMP].timestamp() + ), + status=message_data[Constant.STATUS], + ) + ) + return jobs + + def _get_user_name(self, user_id: str) -> str: + user_details = self._query(FivetranLogQuery.get_user_query(user_id=user_id))[0] + return ( + f"{user_details[Constant.GIVEN_NAME]} {user_details[Constant.FAMILY_NAME]}" + ) + + def get_connectors_list(self) -> List[Connector]: + connectors: List[Connector] = [] + connector_list = self._query(FivetranLogQuery.get_connectors_query()) + for connector in connector_list: + connectors.append( + Connector( + connector_id=connector[Constant.CONNECTOR_ID], + connector_name=connector[Constant.CONNECTOR_NAME], + connector_type=connector[Constant.CONNECTOR_TYPE_ID], + paused=connector[Constant.PAUSED], + sync_frequency=connector[Constant.SYNC_FREQUENCY], + destination_id=connector[Constant.DESTINATION_ID], + user_name=self._get_user_name( + connector[Constant.CONNECTING_USER_ID] + ), + table_lineage=self._get_table_lineage( + connector[Constant.CONNECTOR_ID] + ), + jobs=self._get_jobs_list(connector[Constant.CONNECTOR_ID]), + ) + ) + return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py new file mode 100644 index 00000000000000..4f52fcd5d884fb --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -0,0 +1,76 @@ +class FivetranLogQuery: + @staticmethod + def use_schema(db_name: str, schema_name: str) -> str: + return f'use schema "{db_name}"."{schema_name}"' + + @staticmethod + def get_connectors_query() -> str: + return """ + SELECT connector_id as "CONNECTOR_ID", + connecting_user_id 
as "CONNECTING_USER_ID", + connector_type_id as "CONNECTOR_TYPE_ID", + connector_name as "CONNECTOR_NAME", + paused as "PAUSED", + sync_frequency as "SYNC_FREQUENCY", + destination_id as "DESTINATION_ID" + FROM CONNECTOR + WHERE _fivetran_deleted = FALSE""" + + @staticmethod + def get_user_query(user_id: str) -> str: + return f""" + SELECT id as "USER_ID", + given_name as "GIVEN_NAME", + family_name as "FAMILY_NAME" + FROM USER + WHERE id = '{user_id}'""" + + @staticmethod + def get_sync_start_logs_query( + connector_id: str, + ) -> str: + return f""" + SELECT time_stamp as "TIME_STAMP", + sync_id as "SYNC_ID" + FROM LOG + WHERE message_event = 'sync_start' + and connector_id = '{connector_id}' order by time_stamp""" + + @staticmethod + def get_sync_end_logs_query(connector_id: str) -> str: + return f""" + SELECT time_stamp as "TIME_STAMP", + sync_id as "SYNC_ID", + message_data as "MESSAGE_DATA" + FROM LOG + WHERE message_event = 'sync_end' + and connector_id = '{connector_id}' order by time_stamp""" + + @staticmethod + def get_table_lineage_query(connector_id: str) -> str: + return f""" + SELECT stm.id as "SOURCE_TABLE_ID", + stm.name as "SOURCE_TABLE_NAME", + ssm.name as "SOURCE_SCHEMA_NAME", + dtm.id as "DESTINATION_TABLE_ID", + dtm.name as "DESTINATION_TABLE_NAME", + dsm.name as "DESTINATION_SCHEMA_NAME" + FROM table_lineage as tl + JOIN source_table_metadata as stm on tl.source_table_id = stm.id + JOIN destination_table_metadata as dtm on tl.destination_table_id = dtm.id + JOIN source_schema_metadata as ssm on stm.schema_id = ssm.id + JOIN destination_schema_metadata as dsm on dtm.schema_id = dsm.id + WHERE stm.connector_id = '{connector_id}'""" + + @staticmethod + def get_column_lineage_query( + source_table_id: str, destination_table_id: str + ) -> str: + return f""" + SELECT scm.name as "SOURCE_COLUMN_NAME", + dcm.name as "DESTINATION_COLUMN_NAME" + FROM column_lineage as cl + JOIN source_column_metadata as scm on + (cl.source_column_id = scm.id and scm.table_id = {source_table_id}) + JOIN destination_column_metadata as dcm on + (cl.destination_column_id = dcm.id and dcm.table_id = {destination_table_id})""" diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index c3e8c175f1de54..9fc697018ecd6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -12,7 +12,7 @@ OAUTH_AUTHENTICATOR, ) -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -42,9 +42,14 @@ SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" -class BaseSnowflakeConfig(BaseTimeWindowConfig): +class BaseSnowflakeConfig(ConfigModel): # Note: this config model is also used by the snowflake-usage source. + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) + scheme: str = "snowflake" username: Optional[str] = pydantic.Field( default=None, description="Snowflake username." 
@@ -82,14 +87,6 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig): default=None, description="Snowflake warehouse." ) role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.") - include_table_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", - ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) connect_args: Optional[Dict[str, Any]] = pydantic.Field( default=None, description="Connect args to pass to Snowflake SqlAlchemy driver", @@ -166,18 +163,6 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: "but should be set when using use_certificate false for oauth_config" ) - @pydantic.root_validator() - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." - ) - return values - def get_sql_alchemy_url( self, database: Optional[str] = None, @@ -261,28 +246,8 @@ def get_connect_args(self) -> dict: self._computed_connect_args = connect_args return connect_args - -class SnowflakeConfig(BaseSnowflakeConfig, SQLCommonConfig): - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - - ignore_start_time_lineage: bool = False - upstream_lineage_in_report: bool = False - - def get_sql_alchemy_url( - self, - database: Optional[str] = None, - username: Optional[str] = None, - password: Optional[pydantic.SecretStr] = None, - role: Optional[str] = None, - ) -> str: - return super().get_sql_alchemy_url( - database=database, username=username, password=password, role=role - ) - def get_options(self) -> dict: - options_connect_args: Dict = super().get_connect_args() + options_connect_args: Dict = self.get_connect_args() options_connect_args.update(self.options.get("connect_args", {})) self.options["connect_args"] = options_connect_args return self.options @@ -372,3 +337,34 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection: else: # not expected to be here raise Exception("Not expected to be here.") + + +class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig): + + include_table_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", + ) + include_view_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. 
view->table lineage requires Snowflake Enterprise Edition or above.", + ) + + database_pattern: AllowDenyPattern = AllowDenyPattern( + deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] + ) + + ignore_start_time_lineage: bool = False + upstream_lineage_in_report: bool = False + + @pydantic.root_validator() + def validate_include_view_lineage(cls, values): + if ( + "include_table_lineage" in values + and not values.get("include_table_lineage") + and values.get("include_view_lineage") + ): + raise ValueError( + "include_table_lineage must be True for include_view_lineage to be set." + ) + return values diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json new file mode 100644 index 00000000000000..a72c960a722969 --- /dev/null +++ b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json @@ -0,0 +1,658 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "name": "postgres" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "paused": "False", + "sync_frequency": "1440", + "destination_id": "'interval_unconstitutional'" + }, + "name": "postgres", + "type": { + "string": "COMMAND" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD),name)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:Shubham Jagtap", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": 
"UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "4c9a03d6-eded-4422-a46a-163266e58243", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1695191853000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1695191853000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1695191885000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": 
"UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1696343730000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343730000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343732000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SKIPPED", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "63c2fc85-600b-455f-9ba0-f576522465be", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1696343755000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": 
"powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343755000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343790000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py 
b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py new file mode 100644 index 00000000000000..62b3df12e1b9d3 --- /dev/null +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -0,0 +1,192 @@ +import datetime +from unittest import mock +from unittest.mock import MagicMock + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.fivetran.config import DestinationConfig +from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery +from tests.test_helpers import mce_helpers + +FROZEN_TIME = "2022-06-07 17:00:00" + + +def default_query_results(query): + if query == FivetranLogQuery.use_schema("TEST_DATABASE", "TEST_SCHEMA"): + return [] + elif query == FivetranLogQuery.get_connectors_query(): + return [ + { + "connector_id": "calendar_elected", + "connecting_user_id": "reapply_phone", + "connector_type_id": "postgres", + "connector_name": "postgres", + "paused": False, + "sync_frequency": 1440, + "destination_id": "interval_unconstitutional", + }, + ] + elif query == FivetranLogQuery.get_table_lineage_query("calendar_elected"): + return [ + { + "source_table_id": "10040", + "source_table_name": "employee", + "source_schema_name": "public", + "destination_table_id": "7779", + "destination_table_name": "employee", + "destination_schema_name": "postgres_public", + }, + { + "source_table_id": "10041", + "source_table_name": "company", + "source_schema_name": "public", + "destination_table_id": "7780", + "destination_table_name": "company", + "destination_schema_name": "postgres_public", + }, + ] + elif query == FivetranLogQuery.get_column_lineage_query( + "10040", "7779" + ) or query == FivetranLogQuery.get_column_lineage_query("10041", "7780"): + return [ + { + "source_column_name": "id", + "destination_column_name": "id", + }, + { + "source_column_name": "name", + "destination_column_name": "name", + }, + ] + elif query == FivetranLogQuery.get_user_query("reapply_phone"): + return [ + { + "user_id": "reapply_phone", + "given_name": "Shubham", + "family_name": "Jagtap", + } + ] + elif query == FivetranLogQuery.get_sync_start_logs_query("calendar_elected"): + return [ + { + "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", + }, + ] + elif query == FivetranLogQuery.get_sync_end_logs_query("calendar_elected"): + return [ + { + "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", + "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", + "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', + }, + ] + # Unreachable code + raise 
Exception(f"Unknown query {query}") + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_fivetran_basic(pytestconfig, tmp_path): + test_resources_dir = pytestconfig.rootpath / "tests/integration/fivetran" + + # Run the metadata ingestion pipeline. + output_file = tmp_path / "fivetran_test_events.json" + golden_file = test_resources_dir / "fivetran_golden.json" + + with mock.patch( + "datahub.ingestion.source.fivetran.fivetran_log_api.create_engine" + ) as mock_create_engine: + connection_magic_mock = MagicMock() + connection_magic_mock.execute.side_effect = default_query_results + + mock_create_engine.return_value = connection_magic_mock + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "fivetran", + "config": { + "fivetran_log_config": { + "destination_platform": "snowflake", + "destination_config": { + "account_id": "TESTID", + "warehouse": "TEST_WH", + "username": "test", + "password": "test@123", + "database": "TEST_DATABASE", + "role": "TESTROLE", + "log_schema": "TEST_SCHEMA", + }, + }, + "connector_patterns": { + "allow": [ + "postgres", + ] + }, + "sources_to_database": { + "calendar_elected": "postgres_db", + }, + "sources_to_platform_instance": { + "calendar_elected": { + "env": "DEV", + } + }, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{output_file}", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "fivetran_golden.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{output_file}", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + +@freeze_time(FROZEN_TIME) +def test_fivetran_snowflake_destination_config(pytestconfig, tmp_path): + snowflake_dest = DestinationConfig( + account_id="TESTID", + warehouse="TEST_WH", + username="test", + password="test@123", + database="TEST_DATABASE", + role="TESTROLE", + log_schema="TEST_SCHEMA", + ) + assert ( + snowflake_dest.get_sql_alchemy_url() + == "snowflake://test:test%40123@TESTID?application=acryl_datahub&authenticator=SNOWFLAKE&role=TESTROLE&warehouse=TEST_WH" + ) diff --git a/metadata-service/war/src/main/resources/boot/data_platforms.json b/metadata-service/war/src/main/resources/boot/data_platforms.json index 3d956c5774dedb..3c70eda8561b86 100644 --- a/metadata-service/war/src/main/resources/boot/data_platforms.json +++ b/metadata-service/war/src/main/resources/boot/data_platforms.json @@ -564,5 +564,15 @@ "type": "KEY_VALUE_STORE", "logoUrl": "/assets/platforms/dynamodblogo.png" } + }, + { + "urn": "urn:li:dataPlatform:fivetran", + "aspect": { + "datasetNameDelimiter": ".", + "name": "fivetran", + "displayName": "Fivetran", + "type": "OTHERS", + "logoUrl": "/assets/platforms/fivetranlogo.png" + } } ] From 399e032dfa2b4bf87b7b406e7b009e34e99a1003 Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Wed, 8 Nov 2023 22:32:13 +0530 Subject: [PATCH 02/29] feat(neo4j): Allow datahub to connect to specific neo4j database (#9179) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- docker/docker-compose.override.yml | 4 ++ .../docker-compose-m1.quickstart.yml | 54 +++++++++---------- .../quickstart/docker-compose.quickstart.yml | 54 +++++++++---------- .../src/main/resources/application.yml | 1 + .../common/Neo4jGraphServiceFactory.java | 7 ++- 5 files changed, 65 insertions(+), 55 deletions(-) diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml index 225aa01fa4e4f8..0907f47d70c3ce 
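The override below threads GRAPH_SERVICE_IMPL through docker-compose with a shell-style default, so quickstart now falls back to elasticsearch unless the variable is explicitly set. As a minimal sketch of that resolution rule (not part of the patch; only the variable name and default are taken from the hunk), this Python snippet shows the unset-or-empty semantics of ${VAR:-default}:

import os

def resolve_graph_service_impl(environ=None) -> str:
    env = os.environ if environ is None else environ
    # ${GRAPH_SERVICE_IMPL:-elasticsearch}: an unset *or empty* variable
    # falls back to the default, which is what ":-" (as opposed to "-") means.
    value = env.get("GRAPH_SERVICE_IMPL", "")
    return value if value else "elasticsearch"

assert resolve_graph_service_impl({}) == "elasticsearch"
assert resolve_graph_service_impl({"GRAPH_SERVICE_IMPL": ""}) == "elasticsearch"
assert resolve_graph_service_impl({"GRAPH_SERVICE_IMPL": "neo4j"}) == "neo4j"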
100644 --- a/docker/docker-compose.override.yml +++ b/docker/docker-compose.override.yml @@ -7,8 +7,12 @@ services: environment: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} volumes: - ${HOME}/.datahub/plugins:/etc/datahub/plugins + datahub-upgrade: + environment: + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} mysql-setup: container_name: mysql-setup hostname: mysql-setup diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index c96baf37551b29..613718306abef6 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -81,32 +81,32 @@ services: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} - DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - - NEO4J_HOST=http://neo4j:7474 - - NEO4J_URI=bolt://neo4j - - NEO4J_USERNAME=neo4j - - NEO4J_PASSWORD=datahub - - JAVA_OPTS=-Xms1g -Xmx1g - - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - - GRAPH_SERVICE_IMPL=neo4j + - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true + - ES_BULK_REFRESH_POLICY=WAIT_UNTIL + - GRAPH_SERVICE_DIFF_MODE_ENABLED=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - JAVA_OPTS=-Xms1g -Xmx1g + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true + - METADATA_SERVICE_AUTH_ENABLED=false + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_PASSWORD=datahub + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j - PE_CONSUMER_ENABLED=true - UI_INGESTION_ENABLED=true - - METADATA_SERVICE_AUTH_ENABLED=false healthcheck: interval: 1s retries: 3 @@ -134,23 +134,23 @@ services: neo4j: condition: service_healthy environment: - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - BACKFILL_BROWSE_PATHS_V2=true + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub + - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - 
ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - - GRAPH_SERVICE_IMPL=elasticsearch - - DATAHUB_GMS_HOST=datahub-gms - - DATAHUB_GMS_PORT=8080 + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - - BACKFILL_BROWSE_PATHS_V2=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false hostname: datahub-upgrade image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index 8a66521cbb5221..30ccbae59be740 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -81,32 +81,32 @@ services: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} - DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - - NEO4J_HOST=http://neo4j:7474 - - NEO4J_URI=bolt://neo4j - - NEO4J_USERNAME=neo4j - - NEO4J_PASSWORD=datahub - - JAVA_OPTS=-Xms1g -Xmx1g - - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - - GRAPH_SERVICE_IMPL=neo4j + - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true + - ES_BULK_REFRESH_POLICY=WAIT_UNTIL + - GRAPH_SERVICE_DIFF_MODE_ENABLED=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - JAVA_OPTS=-Xms1g -Xmx1g + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true + - METADATA_SERVICE_AUTH_ENABLED=false + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_PASSWORD=datahub + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j - PE_CONSUMER_ENABLED=true - UI_INGESTION_ENABLED=true - - METADATA_SERVICE_AUTH_ENABLED=false healthcheck: interval: 1s retries: 3 @@ -134,23 +134,23 @@ services: neo4j: condition: service_healthy environment: - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - BACKFILL_BROWSE_PATHS_V2=true + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 - - 
EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub + - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - - GRAPH_SERVICE_IMPL=elasticsearch - - DATAHUB_GMS_HOST=datahub-gms - - DATAHUB_GMS_PORT=8080 + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - - BACKFILL_BROWSE_PATHS_V2=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false hostname: datahub-upgrade image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index a06891699607bb..46aa02d98572e6 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -251,6 +251,7 @@ neo4j: username: ${NEO4J_USERNAME:neo4j} password: ${NEO4J_PASSWORD:datahub} uri: ${NEO4J_URI:bolt://localhost} + database: ${NEO4J_DATABASE:graph.db} maxConnectionPoolSize: ${NEO4J_MAX_CONNECTION_POOL_SIZE:100} maxConnectionAcquisitionTimeout: ${NEO4J_MAX_CONNECTION_ACQUISITION_TIMEOUT_IN_SECONDS:60} maxConnectionLifetimeInSeconds: ${NEO4j_MAX_CONNECTION_LIFETIME_IN_SECONDS:3600} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java index e62dfd50f897d7..87670ce10f481a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java @@ -6,8 +6,10 @@ import com.linkedin.metadata.models.registry.EntityRegistry; import javax.annotation.Nonnull; import org.neo4j.driver.Driver; +import org.neo4j.driver.SessionConfig; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -24,10 +26,13 @@ public class Neo4jGraphServiceFactory { @Qualifier("entityRegistry") private EntityRegistry entityRegistry; + @Value("${neo4j.database}") + private String neo4jDatabase; + @Bean(name = "neo4jGraphService") @Nonnull protected Neo4jGraphService getInstance() { LineageRegistry lineageRegistry = new LineageRegistry(entityRegistry); - return new Neo4jGraphService(lineageRegistry, neo4jDriver); + return new Neo4jGraphService(lineageRegistry, neo4jDriver, SessionConfig.forDatabase(neo4jDatabase)); } } From 332d4afaab39e4b9e9ff73a48e3bfec9b21fe0b5 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 8 Nov 2023 10:22:09 -0800 Subject: [PATCH 03/29] feat(subtypes): support subtypes for charts in the UI (#9186) --- .../java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java | 
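Patch 02 above pins every graph session to a configurable database by passing SessionConfig.forDatabase(neo4jDatabase) into Neo4jGraphService. A rough Python analogue of the same idea, assuming the official neo4j Python driver (version 4 or later) is available; the URI, credentials, and database name are placeholders mirroring the new defaults, not values from a running deployment:

from neo4j import GraphDatabase

NEO4J_URI = "bolt://localhost"   # placeholder, mirrors the NEO4J_URI default
NEO4J_DATABASE = "graph.db"      # placeholder, mirrors the new NEO4J_DATABASE default

driver = GraphDatabase.driver(NEO4J_URI, auth=("neo4j", "datahub"))

# Equivalent in spirit to SessionConfig.forDatabase(neo4jDatabase): the session
# is opened against the configured database rather than the server default.
with driver.session(database=NEO4J_DATABASE) as session:
    session.run("RETURN 1")

driver.close()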
4 ++++ datahub-graphql-core/src/main/resources/entity.graphql | 5 +++++ datahub-web-react/src/app/entity/chart/ChartEntity.tsx | 4 ++++ .../src/app/entity/chart/preview/ChartPreview.tsx | 5 ++++- datahub-web-react/src/graphql/chart.graphql | 3 +++ datahub-web-react/src/graphql/lineage.graphql | 3 +++ datahub-web-react/src/graphql/search.graphql | 6 ++++++ metadata-models/src/main/resources/entity-registry.yml | 1 + 8 files changed, 30 insertions(+), 1 deletion(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index b99f712034fe03..b0b26f073876c4 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -1433,6 +1433,10 @@ private void configureChartResolvers(final RuntimeWiring.Builder builder) { .dataFetcher("statsSummary", new ChartStatsSummaryResolver(this.timeseriesAspectService)) .dataFetcher("privileges", new EntityPrivilegesResolver(entityClient)) .dataFetcher("exists", new EntityExistsResolver(entityService)) + .dataFetcher("subTypes", new SubTypesResolver( + this.entityClient, + "chart", + "subTypes")) ); builder.type("ChartInfo", typeWiring -> typeWiring .dataFetcher("inputs", new LoadableTypeBatchResolver<>(datasetType, diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index b37a8f34fa0563..035f756a10d557 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -5249,6 +5249,11 @@ type Chart implements EntityWithRelationships & Entity & BrowsableEntity { Whether or not this entity exists on DataHub """ exists: Boolean + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes } """ diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index 0f1b6dbf3d660d..fc898dec9d93af 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -154,10 +154,12 @@ export class ChartEntity implements Entity { getOverridePropertiesFromEntity = (chart?: Chart | null): GenericEntityProperties => { // TODO: Get rid of this once we have correctly formed platform coming back. const name = chart?.properties?.name; + const subTypes = chart?.subTypes; const externalUrl = chart?.properties?.externalUrl; return { name, externalUrl, + entityTypeOverride: subTypes ? 
capitalizeFirstLetterOnly(subTypes.typeNames?.[0]) : '', }; }; @@ -187,6 +189,7 @@ export class ChartEntity implements Entity { return ( { type: EntityType.Chart, icon: entity?.platform?.properties?.logoUrl || undefined, platform: entity?.platform, + subtype: entity?.subTypes?.typeNames?.[0] || undefined, }; }; diff --git a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx index 7d0fc143043e29..b7fbd63ee231e3 100644 --- a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx +++ b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx @@ -15,6 +15,7 @@ import { EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; +import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; import { ChartStatsSummary as ChartStatsSummaryView } from '../shared/ChartStatsSummary'; @@ -43,6 +44,7 @@ export const ChartPreview = ({ snippet, degree, paths, + subType, }: { urn: string; platform?: string; @@ -67,6 +69,7 @@ export const ChartPreview = ({ snippet?: React.ReactNode | null; degree?: number; paths?: EntityPath[]; + subType?: string | null; }): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -76,7 +79,7 @@ export const ChartPreview = ({ name={name || ''} urn={urn} description={description || ''} - type="Chart" + type={capitalizeFirstLetterOnly(subType) || 'Chart'} typeIcon={entityRegistry.getIcon(EntityType.Chart, 14, IconStyleType.ACCENT)} logoUrl={logoUrl || ''} platform={platform} diff --git a/datahub-web-react/src/graphql/chart.graphql b/datahub-web-react/src/graphql/chart.graphql index d4d3c3c9184082..a4b430720fa3d5 100644 --- a/datahub-web-react/src/graphql/chart.graphql +++ b/datahub-web-react/src/graphql/chart.graphql @@ -100,6 +100,9 @@ query getChart($urn: String!) { canEditLineage canEditEmbed } + subTypes { + typeNames + } } } diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql index 52385dee8631ac..8fdfb696e08943 100644 --- a/datahub-web-react/src/graphql/lineage.graphql +++ b/datahub-web-react/src/graphql/lineage.graphql @@ -165,6 +165,9 @@ fragment lineageNodeProperties on EntityWithRelationships { status { removed } + subTypes { + typeNames + } } ... on Dataset { name diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 2297c2d0c1d075..876be12fd335b7 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -105,6 +105,9 @@ fragment autoCompleteFields on Entity { parentContainers { ...parentContainersFields } + subTypes { + typeNames + } } ... on DataFlow { orchestrator @@ -550,6 +553,9 @@ fragment searchResultFields on Entity { } } } + subTypes { + typeNames + } } ... 
on DataFlow { flowId diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 11d0f74305d7be..a5296d074093be 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -120,6 +120,7 @@ entities: - globalTags - glossaryTerms - browsePathsV2 + - subTypes - name: dashboard keyAspect: dashboardKey aspects: From 72135914109a241aa11ceaeb68b9ac56134e7e64 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 8 Nov 2023 14:36:33 -0500 Subject: [PATCH 04/29] feat(ui) Debounce auto-complete in search bar (#9205) --- datahub-web-react/src/app/home/HomePageHeader.tsx | 6 ++++-- datahub-web-react/src/app/search/SearchablePage.tsx | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx index 0052d54f562ebd..c881109f6e419d 100644 --- a/datahub-web-react/src/app/home/HomePageHeader.tsx +++ b/datahub-web-react/src/app/home/HomePageHeader.tsx @@ -1,6 +1,7 @@ import React, { useEffect, useMemo, useState } from 'react'; import { useHistory } from 'react-router'; import { Typography, Image, Row, Button, Tag } from 'antd'; +import { debounce } from 'lodash'; import styled, { useTheme } from 'styled-components/macro'; import { RightOutlined } from '@ant-design/icons'; import { ManageAccount } from '../shared/ManageAccount'; @@ -24,6 +25,7 @@ import { getAutoCompleteInputFromQuickFilter } from '../search/utils/filterUtils import { useUserContext } from '../context/useUserContext'; import AcrylDemoBanner from './AcrylDemoBanner'; import DemoButton from '../entity/shared/components/styled/DemoButton'; +import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants'; const Background = styled.div` width: 100%; @@ -176,7 +178,7 @@ export const HomePageHeader = () => { }); }; - const onAutoComplete = (query: string) => { + const onAutoComplete = debounce((query: string) => { if (query && query.trim() !== '') { getAutoCompleteResultsForMultiple({ variables: { @@ -189,7 +191,7 @@ export const HomePageHeader = () => { }, }); } - }; + }, HALF_SECOND_IN_MS); const onClickExploreAll = () => { analytics.event({ diff --git a/datahub-web-react/src/app/search/SearchablePage.tsx b/datahub-web-react/src/app/search/SearchablePage.tsx index 489687050c749d..9d02d85d3634c0 100644 --- a/datahub-web-react/src/app/search/SearchablePage.tsx +++ b/datahub-web-react/src/app/search/SearchablePage.tsx @@ -1,5 +1,6 @@ import React, { useEffect, useState } from 'react'; import { useHistory, useLocation } from 'react-router'; +import { debounce } from 'lodash'; import * as QueryString from 'query-string'; import { useTheme } from 'styled-components'; import { SearchHeader } from './SearchHeader'; @@ -17,6 +18,7 @@ import { getAutoCompleteInputFromQuickFilter } from './utils/filterUtils'; import { useQuickFiltersContext } from '../../providers/QuickFiltersContext'; import { useUserContext } from '../context/useUserContext'; import { useSelectedSortOption } from './context/SearchContext'; +import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants'; const styles = { children: { @@ -93,7 +95,7 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) => }); }; - const autoComplete = (query: string) => { + const autoComplete = debounce((query: string) => { if (query && query.trim() !== '') { 
getAutoCompleteResults({ variables: { @@ -105,7 +107,7 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) => }, }); } - }; + }, HALF_SECOND_IN_MS); // Load correct autocomplete results on initial page load. useEffect(() => { From 70692b44e995eab252a2344496141acdf6181908 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Wed, 8 Nov 2023 12:49:23 -0800 Subject: [PATCH 05/29] fix(lineage): magical lineage layout fix (#9187) --- .../src/app/lineage/utils/layoutTree.ts | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/datahub-web-react/src/app/lineage/utils/layoutTree.ts b/datahub-web-react/src/app/lineage/utils/layoutTree.ts index cc704007049c20..a972a62308f073 100644 --- a/datahub-web-react/src/app/lineage/utils/layoutTree.ts +++ b/datahub-web-react/src/app/lineage/utils/layoutTree.ts @@ -32,6 +32,21 @@ function getParentRelationship(direction: Direction, parent: VizNode | null, nod return directionRelationships?.find((r) => r?.entity?.urn === node?.urn); } +// this utility function is to help make sure layouts that contain many references to the same URN don't struggle laying out that URN. +function firstAppearanceIndices(arr) { + const seen = new Set(); // To track which strings have been seen + const result = [] as number[]; + + for (let i = 0; i < arr.length; i++) { + if (!seen.has(arr[i])) { + seen.add(arr[i]); // Add the string to the set + result.push(i); // Save the index + } + } + + return result; +} + function layoutNodesForOneDirection( data: NodeData, direction: Direction, @@ -54,12 +69,10 @@ function layoutNodesForOneDirection( while (nodesInCurrentLayer.length > 0) { // if we've already added a node to the viz higher up dont add it again const urnsToAddInCurrentLayer = Array.from(new Set(nodesInCurrentLayer.map(({ node }) => node.urn || ''))); - const nodesToAddInCurrentLayer = urnsToAddInCurrentLayer - .filter((urn, pos) => urnsToAddInCurrentLayer.indexOf(urn) === pos) - .filter((urn) => !nodesByUrn[urn || '']); + const positionsToAddInCurrentLayer = firstAppearanceIndices(urnsToAddInCurrentLayer); const filteredNodesInCurrentLayer = nodesInCurrentLayer - .filter(({ node }) => nodesToAddInCurrentLayer.indexOf(node.urn || '') > -1) + .filter((_, idx) => positionsToAddInCurrentLayer.indexOf(idx) > -1) .filter(({ node }) => node.status?.removed !== true); const layerSize = filteredNodesInCurrentLayer.length; From f87983d69dc62db5c58dc114f8796dcb9eb1cc95 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Wed, 8 Nov 2023 13:29:37 -0800 Subject: [PATCH 06/29] refactor(pdl): Refactoring Assertion model enums out (#9191) Co-authored-by: Harshal Sheth --- .../linkedin/assertion/AssertionResult.pdl | 19 +-------------- .../assertion/AssertionResultType.pdl | 23 +++++++++++++++++++ .../linkedin/assertion/AssertionRunEvent.pdl | 7 +----- .../linkedin/assertion/AssertionRunStatus.pdl | 12 ++++++++++ 4 files changed, 37 insertions(+), 24 deletions(-) create mode 100644 metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl create mode 100644 metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl index ded84e1969153b..935f3e5976dfa5 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl @@ -9,24 +9,7 @@ 
record AssertionResult { */ @TimeseriesField = {} @Searchable = {} - type: enum AssertionResultType { - /** - * The Assertion has not yet been fully evaluated - */ - INIT - /** - * The Assertion Succeeded - */ - SUCCESS - /** - * The Assertion Failed - */ - FAILURE - /** - * The Assertion encountered an Error - */ - ERROR - } + type: AssertionResultType /** * Number of rows for evaluated batch diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl new file mode 100644 index 00000000000000..8954d94cced7bf --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl @@ -0,0 +1,23 @@ +namespace com.linkedin.assertion + +/** +* The final result of evaluating an assertion, e.g. SUCCESS, FAILURE, or ERROR. +*/ +enum AssertionResultType { + /** + * The Assertion has not yet been fully evaluated + */ + INIT + /** + * The Assertion Succeeded + */ + SUCCESS + /** + * The Assertion Failed + */ + FAILURE + /** + * The Assertion encountered an Error + */ + ERROR +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl index 14f12042327404..55bcae77273dbd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl @@ -39,12 +39,7 @@ record AssertionRunEvent { * The status of the assertion run as per this timeseries event. */ @TimeseriesField = {} - status: enum AssertionRunStatus { - /** - * The Assertion Run has completed - */ - COMPLETE - } + status: AssertionRunStatus /** * Results of assertion, present if the status is COMPLETE diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl new file mode 100644 index 00000000000000..e4e17925ede82a --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl @@ -0,0 +1,12 @@ +namespace com.linkedin.assertion + + +/** +* The lifecycle status of an assertion run. 
+*/ +enum AssertionRunStatus { + /** + * The Assertion Run has completed + */ + COMPLETE +} \ No newline at end of file From f38c8087bb508a779d94d04967a9c449f6d93126 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Wed, 8 Nov 2023 22:38:15 +0000 Subject: [PATCH 07/29] feat(auth): Add roles to policy engine validation logic (#9178) --- .../authorization/AuthorizedActors.java | 1 + .../authorization/AuthorizerChain.java | 5 + .../authorization/DataHubAuthorizer.java | 8 +- .../datahub/authorization/PolicyEngine.java | 43 +++----- .../authorization/DataHubAuthorizerTest.java | 97 ++++++++++++++++--- .../authorization/PolicyEngineTest.java | 54 ++++++++++- .../datahub/plugins/test/TestAuthorizer.java | 2 +- 7 files changed, 162 insertions(+), 48 deletions(-) diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java index aec99e1b1e57a8..5a9990552bb34a 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java @@ -15,6 +15,7 @@ public class AuthorizedActors { String privilege; List users; List groups; + List roles; boolean allUsers; boolean allGroups; } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java index f8eca541e1efb4..7e7a1de176f06d 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java @@ -126,11 +126,16 @@ private AuthorizedActors mergeAuthorizedActors(@Nullable AuthorizedActors origin mergedGroups = new ArrayList<>(groups); } + Set roles = new HashSet<>(original.getRoles()); + roles.addAll(other.getRoles()); + List mergedRoles = new ArrayList<>(roles); + return AuthorizedActors.builder() .allUsers(original.isAllUsers() || other.isAllUsers()) .allGroups(original.isAllGroups() || other.isAllGroups()) .users(mergedUsers) .groups(mergedGroups) + .roles(mergedRoles) .build(); } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index f8f99475de23e2..956d635c7901ac 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -133,6 +133,7 @@ public AuthorizedActors authorizedActors( final List authorizedUsers = new ArrayList<>(); final List authorizedGroups = new ArrayList<>(); + final List authorizedRoles = new ArrayList<>(); boolean allUsers = false; boolean allGroups = false; @@ -153,16 +154,17 @@ public AuthorizedActors authorizedActors( // Step 3: For each matching policy, add actors that are authorized. authorizedUsers.addAll(matchingActors.getUsers()); authorizedGroups.addAll(matchingActors.getGroups()); - if (matchingActors.allUsers()) { + authorizedRoles.addAll(matchingActors.getRoles()); + if (matchingActors.getAllUsers()) { allUsers = true; } - if (matchingActors.allGroups()) { + if (matchingActors.getAllGroups()) { allGroups = true; } } // Step 4: Return all authorized users and groups. 
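The loop above folds the actors of every matching policy into flat result lists, now including role URNs alongside users and groups. A schematic Python rendering of that aggregation, with stand-in types (the field names mirror the Java AuthorizedActors/PolicyActors classes; none of this is the actual DataHub API):

from dataclasses import dataclass, field
from typing import List

@dataclass
class PolicyActors:
    users: List[str] = field(default_factory=list)
    groups: List[str] = field(default_factory=list)
    roles: List[str] = field(default_factory=list)
    all_users: bool = False
    all_groups: bool = False

def authorized_actors(matching: List[PolicyActors]) -> PolicyActors:
    result = PolicyActors()
    for actors in matching:
        # Steps 2-3: accumulate users, groups, and the new roles list, and
        # latch the all-users / all-groups flags if any policy sets them.
        result.users.extend(actors.users)
        result.groups.extend(actors.groups)
        result.roles.extend(actors.roles)
        result.all_users = result.all_users or actors.all_users
        result.all_groups = result.all_groups or actors.all_groups
    return result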
- return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, allUsers, allGroups); + return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, authorizedRoles, allUsers, allGroups); } /** diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java index f8c017ea74e1f6..da0ae26f2b1da6 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java @@ -32,7 +32,10 @@ import java.util.stream.Stream; import javax.annotation.Nullable; +import lombok.AccessLevel; +import lombok.AllArgsConstructor; import lombok.RequiredArgsConstructor; +import lombok.Value; import lombok.extern.slf4j.Slf4j; import static com.linkedin.metadata.Constants.*; @@ -75,6 +78,7 @@ public PolicyActors getMatchingActors( final Optional resource) { final List users = new ArrayList<>(); final List groups = new ArrayList<>(); + final List roles = new ArrayList<>(); boolean allUsers = false; boolean allGroups = false; if (policyMatchesResource(policy, resource)) { @@ -96,6 +100,9 @@ public PolicyActors getMatchingActors( if (actorFilter.getGroups() != null) { groups.addAll(actorFilter.getGroups()); } + if (actorFilter.getRoles() != null) { + roles.addAll(actorFilter.getRoles()); + } // 2. Fetch Actors based on resource ownership. if (actorFilter.isResourceOwners() && resource.isPresent()) { @@ -104,7 +111,7 @@ public PolicyActors getMatchingActors( groups.addAll(groupOwners(owners)); } } - return new PolicyActors(users, groups, allUsers, allGroups); + return new PolicyActors(users, groups, roles, allUsers, allGroups); } private boolean isPolicyApplicable( @@ -438,34 +445,14 @@ public boolean isGranted() { /** * Class used to represent all valid users of a policy. 
*/ + @Value + @AllArgsConstructor(access = AccessLevel.PUBLIC) public static class PolicyActors { - final List _users; - final List _groups; - final Boolean _allUsers; - final Boolean _allGroups; - - public PolicyActors(final List users, final List groups, final Boolean allUsers, final Boolean allGroups) { - _users = users; - _groups = groups; - _allUsers = allUsers; - _allGroups = allGroups; - } - - public List getUsers() { - return _users; - } - - public List getGroups() { - return _groups; - } - - public Boolean allUsers() { - return _allUsers; - } - - public Boolean allGroups() { - return _allGroups; - } + List users; + List groups; + List roles; + Boolean allUsers; + Boolean allGroups; } private List userOwners(final Set owners) { diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index babb1c5d00ee8a..b0b206001209c7 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -21,6 +21,7 @@ import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.entity.client.EntityClient; +import com.linkedin.identity.RoleMembership; import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.search.ScrollResult; import com.linkedin.metadata.search.SearchEntity; @@ -55,6 +56,7 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; public class DataHubAuthorizerTest { @@ -63,6 +65,7 @@ public class DataHubAuthorizerTest { private static final Urn PARENT_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:parent"); private static final Urn CHILD_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:child"); + private static final Urn USER_WITH_ADMIN_ROLE = UrnUtils.getUrn("urn:li:corpuser:user-with-admin"); private EntityClient _entityClient; private DataHubAuthorizer _dataHubAuthorizer; @@ -92,40 +95,56 @@ public void setupTest() throws Exception { final EnvelopedAspectMap childDomainPolicyAspectMap = new EnvelopedAspectMap(); childDomainPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(childDomainPolicy.data()))); + final Urn adminPolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:4"); + final DataHubActorFilter actorFilter = new DataHubActorFilter(); + actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Admin")))); + final DataHubPolicyInfo adminPolicy = createDataHubPolicyInfoFor(true, ImmutableList.of("EDIT_USER_PROFILE"), null, actorFilter); + final EnvelopedAspectMap adminPolicyAspectMap = new EnvelopedAspectMap(); + adminPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(adminPolicy.data()))); + final ScrollResult policySearchResult1 = new ScrollResult() .setScrollId("1") - .setNumEntities(4) + .setNumEntities(5) .setEntities( new SearchEntityArray( ImmutableList.of(new SearchEntity().setEntity(activePolicyUrn)))); final ScrollResult policySearchResult2 = new ScrollResult() .setScrollId("2") - .setNumEntities(4) + .setNumEntities(5) .setEntities( new SearchEntityArray( ImmutableList.of(new SearchEntity().setEntity(inactivePolicyUrn)))); final ScrollResult policySearchResult3 = new 
ScrollResult() .setScrollId("3") - .setNumEntities(4) + .setNumEntities(5) .setEntities( new SearchEntityArray( ImmutableList.of(new SearchEntity().setEntity(parentDomainPolicyUrn)))); final ScrollResult policySearchResult4 = new ScrollResult() - .setNumEntities(4) + .setScrollId("4") + .setNumEntities(5) .setEntities( new SearchEntityArray( ImmutableList.of( new SearchEntity().setEntity(childDomainPolicyUrn)))); + final ScrollResult policySearchResult5 = new ScrollResult() + .setNumEntities(5) + .setEntities( + new SearchEntityArray( + ImmutableList.of( + new SearchEntity().setEntity(adminPolicyUrn)))); + when(_entityClient.scrollAcrossEntities(eq(List.of("dataHubPolicy")), eq(""), isNull(), any(), isNull(), anyInt(), eq(new SearchFlags().setFulltext(true).setSkipAggregates(true).setSkipHighlighting(true).setSkipCache(true)), any())) .thenReturn(policySearchResult1) .thenReturn(policySearchResult2) .thenReturn(policySearchResult3) - .thenReturn(policySearchResult4); + .thenReturn(policySearchResult4) + .thenReturn(policySearchResult5); when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME), any(), eq(null), any())).thenAnswer(args -> { Set inputUrns = args.getArgument(1); @@ -140,6 +159,8 @@ public void setupTest() throws Exception { return Map.of(parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap)); case "urn:li:dataHubPolicy:3": return Map.of(childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap)); + case "urn:li:dataHubPolicy:4": + return Map.of(adminPolicyUrn, new EntityResponse().setUrn(adminPolicyUrn).setAspects(adminPolicyAspectMap)); default: throw new IllegalStateException(); } @@ -167,6 +188,10 @@ public void setupTest() throws Exception { when(_entityClient.batchGetV2(any(), eq(Collections.singleton(PARENT_DOMAIN_URN)), eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), any())) .thenReturn(createDomainPropertiesBatchResponse(null)); + // Mocks to reach role membership for a user urn + when(_entityClient.batchGetV2(any(), eq(Collections.singleton(USER_WITH_ADMIN_ROLE)), eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any()) + ).thenReturn(createUserRoleMembershipBatchResponse(USER_WITH_ADMIN_ROLE, UrnUtils.getUrn("urn:li:dataHubRole:Admin"))); + final Authentication systemAuthentication = new Authentication( new Actor(ActorType.USER, DATAHUB_SYSTEM_CLIENT_ID), "" @@ -302,6 +327,32 @@ public void testAuthorizedActorsActivePolicy() throws Exception { )); } + @Test + public void testAuthorizedRoleActivePolicy() throws Exception { + final AuthorizedActors actors = + _dataHubAuthorizer.authorizedActors("EDIT_USER_PROFILE", // Should be inside the active policy. 
+ Optional.of(new EntitySpec("dataset", "urn:li:dataset:1"))); + + assertFalse(actors.isAllUsers()); + assertFalse(actors.isAllGroups()); + assertEquals(new HashSet<>(actors.getUsers()), ImmutableSet.of()); + assertEquals(new HashSet<>(actors.getGroups()), ImmutableSet.of()); + assertEquals(new HashSet<>(actors.getRoles()), ImmutableSet.of(UrnUtils.getUrn("urn:li:dataHubRole:Admin"))); + } + + @Test + public void testAuthorizationBasedOnRoleIsAllowed() { + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); + + AuthorizationRequest request = new AuthorizationRequest( + USER_WITH_ADMIN_ROLE.toString(), + "EDIT_USER_PROFILE", + Optional.of(resourceSpec) + ); + + assertEquals(_dataHubAuthorizer.authorize(request).getType(), AuthorizationResult.Type.ALLOW); + } + @Test public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); @@ -342,13 +393,6 @@ public void testAuthorizationOnDomainWithoutPrivilegeIsDenied() { } private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List privileges, @Nullable final Urn domain) throws Exception { - final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); - dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); - dataHubPolicyInfo.setState(active ? ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE); - dataHubPolicyInfo.setPrivileges(new StringArray(privileges)); - dataHubPolicyInfo.setDisplayName("My Test Display"); - dataHubPolicyInfo.setDescription("My test display!"); - dataHubPolicyInfo.setEditable(true); List users = ImmutableList.of(Urn.createFromString("urn:li:corpuser:user1"), Urn.createFromString("urn:li:corpuser:user2")); List groups = ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group1"), Urn.createFromString("urn:li:corpGroup:group2")); @@ -359,6 +403,20 @@ private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List p actorFilter.setAllGroups(true); actorFilter.setUsers(new UrnArray(users)); actorFilter.setGroups(new UrnArray(groups)); + + return createDataHubPolicyInfoFor(active, privileges, domain, actorFilter); + } + + private DataHubPolicyInfo createDataHubPolicyInfoFor(boolean active, List privileges, + @Nullable final Urn domain, DataHubActorFilter actorFilter) throws Exception { + final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); + dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); + dataHubPolicyInfo.setState(active ? 
ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE); + dataHubPolicyInfo.setPrivileges(new StringArray(privileges)); + dataHubPolicyInfo.setDisplayName("My Test Display"); + dataHubPolicyInfo.setDescription("My test display!"); + dataHubPolicyInfo.setEditable(true); + dataHubPolicyInfo.setActors(actorFilter); final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); @@ -429,6 +487,21 @@ private Map<Urn, EntityResponse> createDomainPropertiesBatchResponse(@Nullable f return batchResponse; } + private Map<Urn, EntityResponse> createUserRoleMembershipBatchResponse(final Urn userUrn, @Nullable final Urn roleUrn) { + final Map<Urn, EntityResponse> batchResponse = new HashMap<>(); + final EntityResponse response = new EntityResponse(); + EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); + final RoleMembership membership = new RoleMembership(); + if (roleUrn != null) { + membership.setRoles(new UrnArray(roleUrn)); + } + aspectMap.put(ROLE_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect() + .setValue(new com.linkedin.entity.Aspect(membership.data()))); + response.setAspects(aspectMap); + batchResponse.put(userUrn, response); + return batchResponse; + } + private AuthorizerContext createAuthorizerContext(final Authentication systemAuthentication, final EntityClient entityClient) { return new AuthorizerContext(Collections.emptyMap(), new DefaultEntitySpecResolver(systemAuthentication, entityClient)); } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java index be8c948f8ef897..2790c16ba75e62 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java @@ -1041,6 +1041,7 @@ public void testGetMatchingActorsResourceMatch() throws Exception { Urn.createFromString("urn:li:corpuser:user2")))); actorFilter.setGroups(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group1"), Urn.createFromString("urn:li:corpGroup:group2")))); + actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:role:Admin")))); dataHubPolicyInfo.setActors(actorFilter); final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); @@ -1056,8 +1057,8 @@ public void testGetMatchingActorsResourceMatch() throws Exception { Collections.emptySet(), Collections.emptySet()); PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); - assertTrue(actors.allUsers()); - assertTrue(actors.allGroups()); + assertTrue(actors.getAllUsers()); + assertTrue(actors.getAllGroups()); assertEquals(actors.getUsers(), ImmutableList.of(Urn.createFromString("urn:li:corpuser:user1"), Urn.createFromString("urn:li:corpuser:user2"), @@ -1068,6 +1069,8 @@ Urn.createFromString("urn:li:corpGroup:group2"), Urn.createFromString(AUTHORIZED_GROUP) // Resource Owner )); + assertEquals(actors.getRoles(), ImmutableList.of(Urn.createFromString("urn:li:role:Admin"))); + // Verify aspect client called, entity client not called. verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), eq(null), any()); @@ -1106,15 +1109,58 @@ public void testGetMatchingActorsNoResourceMatch() throws Exception { buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy.
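[Editor's note] The tests above exercise the new role support end to end: `DataHubAuthorizerTest` mocks a `batchGetV2` lookup of the `roleMembership` aspect for `USER_WITH_ADMIN_ROLE`, and `PolicyEngineTest` asserts that `PolicyActors.getRoles()` surfaces the role URNs a policy names. As a minimal sketch of how a consumer might tie the two together, resolving whether a user holds a matched role by reading that same aspect, consider the following; the helper name `userHasRole` and its wiring are illustrative assumptions, not part of this patch:

```java
// Minimal sketch (assumed helper, not from this patch): resolve a user's roles
// via the roleMembership aspect, mirroring the batchGetV2 mock used in
// DataHubAuthorizerTest above.
private boolean userHasRole(Urn userUrn, Urn roleUrn, EntityClient entityClient,
    Authentication systemAuthentication) throws Exception {
  Map<Urn, EntityResponse> response = entityClient.batchGetV2(
      CORP_USER_ENTITY_NAME,                              // entity type of the actor
      Collections.singleton(userUrn),                     // the user being checked
      Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME), // fetch only roleMembership
      systemAuthentication);
  EnvelopedAspect aspect = response.get(userUrn) == null
      ? null
      : response.get(userUrn).getAspects().get(ROLE_MEMBERSHIP_ASPECT_NAME);
  if (aspect == null) {
    return false;
  }
  // Rehydrate the RoleMembership record, the inverse of what the test helper builds.
  RoleMembership membership = new RoleMembership(aspect.getValue().data());
  return membership.hasRoles() && membership.getRoles().contains(roleUrn);
}
```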
PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); - assertFalse(actors.allUsers()); - assertFalse(actors.allGroups()); + assertFalse(actors.getAllUsers()); + assertFalse(actors.getAllGroups()); assertEquals(actors.getUsers(), Collections.emptyList()); assertEquals(actors.getGroups(), Collections.emptyList()); + //assertEquals(actors.getRoles(), Collections.emptyList()); // Verify no network calls verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } + @Test + public void testGetMatchingActorsByRoleResourceMatch() throws Exception { + final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); + dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); + dataHubPolicyInfo.setState(ACTIVE_POLICY_STATE); + dataHubPolicyInfo.setPrivileges(new StringArray("EDIT_ENTITY_TAGS")); + dataHubPolicyInfo.setDisplayName("My Test Display"); + dataHubPolicyInfo.setDescription("My test display!"); + dataHubPolicyInfo.setEditable(true); + + final DataHubActorFilter actorFilter = new DataHubActorFilter(); + actorFilter.setResourceOwners(true); + actorFilter.setAllUsers(false); + actorFilter.setAllGroups(false); + actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Editor")))); + dataHubPolicyInfo.setActors(actorFilter); + + final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); + resourceFilter.setAllResources(false); + resourceFilter.setType("dataset"); + StringArray resourceUrns = new StringArray(); + resourceUrns.add(RESOURCE_URN); + resourceFilter.setResources(resourceUrns); + dataHubPolicyInfo.setResources(resourceFilter); + + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(), + Collections.emptySet(), Collections.emptySet()); + + PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); + + assertFalse(actors.getAllUsers()); + assertFalse(actors.getAllGroups()); + + assertEquals(actors.getUsers(), ImmutableList.of()); + assertEquals(actors.getGroups(), ImmutableList.of()); + assertEquals(actors.getRoles(), ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Editor"))); + + // Verify aspect client called, entity client not called. 
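[Editor's note] In the `TestAuthorizer` plugin change just below, the `AuthorizedActors` constructor call gains a fourth `null`. Read together with the new `getRoles()` assertions, this suggests the argument order is now (privilege, users, groups, roles, allUsers, allGroups); that ordering is inferred from the call sites in this diff, not confirmed against the class source. A hedged sketch of constructing it with a role:

```java
// Sketch only: assumed constructor order (privilege, users, groups, roles, allUsers, allGroups),
// inferred from the updated call site in TestAuthorizer below.
AuthorizedActors matched = new AuthorizedActors(
    "EDIT_USER_PROFILE",
    ImmutableList.of(),                                            // users
    ImmutableList.of(),                                            // groups
    ImmutableList.of(UrnUtils.getUrn("urn:li:dataHubRole:Admin")), // roles
    false,                                                         // allUsers
    false);                                                        // allGroups
```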
+ verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), + eq(null), any()); + } + private Ownership createOwnershipAspect(final Boolean addUserOwner, final Boolean addGroupOwner) throws Exception { final Ownership ownershipAspect = new Ownership(); final OwnerArray owners = new OwnerArray(); diff --git a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java index 442ac1b0d287b3..e5f3e223ff505d 100644 --- a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java +++ b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java @@ -75,7 +75,7 @@ public AuthorizationResult authorize(@Nonnull AuthorizationRequest request) { @Override public AuthorizedActors authorizedActors(String privilege, Optional<EntitySpec> resourceSpec) { - return new AuthorizedActors("ALL", null, null, true, true); + return new AuthorizedActors("ALL", null, null, null, true, true); } } From f73ecfdcbbc35437fcb80c9e27e78908dae23ea7 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 8 Nov 2023 18:17:49 -0500 Subject: [PATCH 08/29] style(ingest/tableau): Rename tableau_constant to c (#9207) --- .../src/datahub/ingestion/source/tableau.py | 597 ++++++++---------- .../ingestion/source/tableau_common.py | 14 +- 2 files changed, 272 insertions(+), 339 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 4bc40b0aac9649..08df7599510f47 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -59,7 +59,7 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source import tableau_constant +from datahub.ingestion.source import tableau_constant as c from datahub.ingestion.source.common.subtypes import ( BIContainerSubTypes, DatasetSubTypes, @@ -720,16 +720,12 @@ def get_connection_object_page( query, connection_type, query_filter, count, offset, False ) - if tableau_constant.ERRORS in query_data: - errors = query_data[tableau_constant.ERRORS] + if c.ERRORS in query_data: + errors = query_data[c.ERRORS] if all( # The format of the error messages is highly unpredictable, so we have to # be extra defensive with our parsing.
- error - and (error.get(tableau_constant.EXTENSIONS) or {}).get( - tableau_constant.SEVERITY - ) - == tableau_constant.WARNING + error and (error.get(c.EXTENSIONS) or {}).get(c.SEVERITY) == c.WARNING for error in errors ): self.report.report_warning(key=connection_type, reason=f"{errors}") @@ -737,14 +733,14 @@ def get_connection_object_page( raise RuntimeError(f"Query {connection_type} error: {errors}") connection_object = ( - query_data.get(tableau_constant.DATA).get(connection_type, {}) - if query_data.get(tableau_constant.DATA) + query_data.get(c.DATA).get(connection_type, {}) + if query_data.get(c.DATA) else {} ) - total_count = connection_object.get(tableau_constant.TOTAL_COUNT, 0) - has_next_page = connection_object.get(tableau_constant.PAGE_INFO, {}).get( - tableau_constant.HAS_NEXT_PAGE, False + total_count = connection_object.get(c.TOTAL_COUNT, 0) + has_next_page = connection_object.get(c.PAGE_INFO, {}).get( + c.HAS_NEXT_PAGE, False ) return connection_object, total_count, has_next_page @@ -781,7 +777,7 @@ def get_connection_objects( offset += count - for obj in connection_objects.get(tableau_constant.NODES) or []: + for obj in connection_objects.get(c.NODES) or []: yield obj def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: @@ -790,11 +786,11 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: project.name for project in self.tableau_project_registry.values() ] project_names_str: str = json.dumps(project_names) - projects = f"{tableau_constant.PROJECT_NAME_WITH_IN}: {project_names_str}" + projects = f"{c.PROJECT_NAME_WITH_IN}: {project_names_str}" for workbook in self.get_connection_objects( workbook_graphql_query, - tableau_constant.WORKBOOKS_CONNECTION, + c.WORKBOOKS_CONNECTION, projects, page_size_override=self.config.workbook_page_size, ): @@ -804,11 +800,9 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later. project_luid: Optional[str] = self._get_workbook_project_luid(workbook) if project_luid not in self.tableau_project_registry.keys(): - wrk_name: Optional[str] = workbook.get(tableau_constant.NAME) - wrk_id: Optional[str] = workbook.get(tableau_constant.ID) - prj_name: Optional[str] = workbook.get( - tableau_constant.PROJECT_NAME - ) + wrk_name: Optional[str] = workbook.get(c.NAME) + wrk_id: Optional[str] = workbook.get(c.ID) + prj_name: Optional[str] = workbook.get(c.PROJECT_NAME) logger.debug( f"Skipping workbook {wrk_name}({wrk_id}) as it is project {prj_name}({project_luid}) not " @@ -818,25 +812,22 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: yield from self.emit_workbook_as_container(workbook) - for sheet in workbook.get(tableau_constant.SHEETS, []): - self.sheet_ids.append(sheet[tableau_constant.ID]) + for sheet in workbook.get(c.SHEETS, []): + self.sheet_ids.append(sheet[c.ID]) - for dashboard in workbook.get(tableau_constant.DASHBOARDS, []): - self.dashboard_ids.append(dashboard[tableau_constant.ID]) + for dashboard in workbook.get(c.DASHBOARDS, []): + self.dashboard_ids.append(dashboard[c.ID]) - for ds in workbook.get(tableau_constant.EMBEDDED_DATA_SOURCES, []): - self.embedded_datasource_ids_being_used.append( - ds[tableau_constant.ID] - ) + for ds in workbook.get(c.EMBEDDED_DATA_SOURCES, []): + self.embedded_datasource_ids_being_used.append(ds[c.ID]) def _track_custom_sql_ids(self, field: dict) -> None: # Tableau shows custom sql datasource as a table in ColumnField's upstreamColumns. 
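[Editor's note] The comment above is the crux of custom SQL discovery: each field's `upstreamColumns` entries may reference a `table` whose GraphQL `__typename` is `CustomSQLTable`, and `_track_custom_sql_ids` collects those table ids for later processing. A rough sketch of the assumed response shape and the equivalent extraction; the keys are inferred from what this code reads, not from the full Metadata API schema:

```python
# Assumed shape of a ColumnField returned by Tableau's Metadata API,
# reconstructed from the keys accessed in _track_custom_sql_ids().
field = {
    "name": "total_sales",
    "upstreamColumns": [
        {
            "name": "sales",
            "table": {"__typename": "CustomSQLTable", "id": "csql-1234"},
        },
        {
            "name": "region",
            "table": {"__typename": "DatabaseTable", "id": "tbl-5678"},
        },
    ],
}

# Equivalent extraction: collect ids only for CustomSQLTable upstreams.
custom_sql_ids = [
    col["table"]["id"]
    for col in field.get("upstreamColumns", [])
    if col and col.get("table") and col["table"].get("__typename") == "CustomSQLTable"
]
assert custom_sql_ids == ["csql-1234"]
```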
- for column in field.get(tableau_constant.UPSTREAM_COLUMNS, []): + for column in field.get(c.UPSTREAM_COLUMNS, []): table_id = ( - column.get(tableau_constant.TABLE, {}).get(tableau_constant.ID) - if column.get(tableau_constant.TABLE) - and column[tableau_constant.TABLE][tableau_constant.TYPE_NAME] - == tableau_constant.CUSTOM_SQL_TABLE + column.get(c.TABLE, {}).get(c.ID) + if column.get(c.TABLE) + and column[c.TABLE][c.TYPE_NAME] == c.CUSTOM_SQL_TABLE else None ) @@ -861,15 +852,15 @@ def _create_upstream_table_lineage( # and published datasource have same upstreamTables in this case. if upstream_tables and is_embedded_ds: logger.debug( - f"Embedded datasource {datasource.get(tableau_constant.ID)} has upstreamDatasources.\ + f"Embedded datasource {datasource.get(c.ID)} has upstreamDatasources.\ Setting only upstreamDatasources lineage. The upstreamTables lineage \ will be set via upstream published datasource." ) else: # This adds an edge to upstream DatabaseTables using `upstreamTables` upstreams, id_to_urn = self.get_upstream_tables( - datasource.get(tableau_constant.UPSTREAM_TABLES, []), - datasource.get(tableau_constant.NAME), + datasource.get(c.UPSTREAM_TABLES, []), + datasource.get(c.NAME), browse_path, is_custom_sql=False, ) @@ -878,23 +869,23 @@ def _create_upstream_table_lineage( # This adds an edge to upstream CustomSQLTables using `fields`.`upstreamColumns`.`table` csql_upstreams, csql_id_to_urn = self.get_upstream_csql_tables( - datasource.get(tableau_constant.FIELDS) or [], + datasource.get(c.FIELDS) or [], ) upstream_tables.extend(csql_upstreams) table_id_to_urn.update(csql_id_to_urn) logger.debug( - f"A total of {len(upstream_tables)} upstream table edges found for datasource {datasource[tableau_constant.ID]}" + f"A total of {len(upstream_tables)} upstream table edges found for datasource {datasource[c.ID]}" ) datasource_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, - name=datasource[tableau_constant.ID], + name=datasource[c.ID], platform_instance=self.config.platform_instance, env=self.config.env, ) - if datasource.get(tableau_constant.FIELDS): + if datasource.get(c.FIELDS): if self.config.extract_column_level_lineage: # Find fine grained lineage for datasource column to datasource column edge, # upstream columns may be from same datasource @@ -912,20 +903,20 @@ def _create_upstream_table_lineage( fine_grained_lineages.extend(upstream_columns) logger.debug( - f"A total of {len(fine_grained_lineages)} upstream column edges found for datasource {datasource[tableau_constant.ID]}" + f"A total of {len(fine_grained_lineages)} upstream column edges found for datasource {datasource[c.ID]}" ) return upstream_tables, fine_grained_lineages def get_upstream_datasources(self, datasource: dict) -> List[Upstream]: upstream_tables = [] - for ds in datasource.get(tableau_constant.UPSTREAM_DATA_SOURCES, []): - if ds[tableau_constant.ID] not in self.datasource_ids_being_used: - self.datasource_ids_being_used.append(ds[tableau_constant.ID]) + for ds in datasource.get(c.UPSTREAM_DATA_SOURCES, []): + if ds[c.ID] not in self.datasource_ids_being_used: + self.datasource_ids_being_used.append(ds[c.ID]) upstream_ds_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, - name=ds[tableau_constant.ID], + name=ds[c.ID], platform_instance=self.config.platform_instance, env=self.config.env, ) @@ -943,20 +934,15 @@ def get_upstream_csql_tables( csql_id_to_urn = {} for field in fields: - if not field.get(tableau_constant.UPSTREAM_COLUMNS): + if not 
field.get(c.UPSTREAM_COLUMNS): continue - for upstream_col in field[tableau_constant.UPSTREAM_COLUMNS]: + for upstream_col in field[c.UPSTREAM_COLUMNS]: if ( upstream_col - and upstream_col.get(tableau_constant.TABLE) - and upstream_col.get(tableau_constant.TABLE)[ - tableau_constant.TYPE_NAME - ] - == tableau_constant.CUSTOM_SQL_TABLE + and upstream_col.get(c.TABLE) + and upstream_col.get(c.TABLE)[c.TYPE_NAME] == c.CUSTOM_SQL_TABLE ): - upstream_table_id = upstream_col.get(tableau_constant.TABLE)[ - tableau_constant.ID - ] + upstream_table_id = upstream_col.get(c.TABLE)[c.ID] csql_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, @@ -986,18 +972,18 @@ def get_upstream_tables( for table in tables: # skip upstream tables when there is no column info when retrieving datasource # Lineage and Schema details for these will be taken care in self.emit_custom_sql_datasources() - num_tbl_cols: Optional[int] = table.get( - tableau_constant.COLUMNS_CONNECTION - ) and table[tableau_constant.COLUMNS_CONNECTION].get("totalCount") + num_tbl_cols: Optional[int] = table.get(c.COLUMNS_CONNECTION) and table[ + c.COLUMNS_CONNECTION + ].get("totalCount") if not is_custom_sql and not num_tbl_cols: logger.debug( - f"Skipping upstream table with id {table[tableau_constant.ID]}, no columns: {table}" + f"Skipping upstream table with id {table[c.ID]}, no columns: {table}" ) continue - elif table[tableau_constant.NAME] is None: + elif table[c.NAME] is None: self.report.num_upstream_table_skipped_no_name += 1 logger.warning( - f"Skipping upstream table {table[tableau_constant.ID]} from lineage since its name is none: {table}" + f"Skipping upstream table {table[c.ID]} from lineage since its name is none: {table}" ) continue @@ -1014,7 +1000,7 @@ def get_upstream_tables( self.config.platform_instance_map, self.config.lineage_overrides, ) - table_id_to_urn[table[tableau_constant.ID]] = table_urn + table_id_to_urn[table[c.ID]] = table_urn upstream_table = Upstream( dataset=table_urn, @@ -1029,13 +1015,13 @@ def get_upstream_tables( if table_urn not in self.database_tables: self.database_tables[table_urn] = DatabaseTable( urn=table_urn, - id=table[tableau_constant.ID], + id=table[c.ID], num_cols=num_tbl_cols, paths={table_path} if table_path else set(), ) else: self.database_tables[table_urn].update_table( - table[tableau_constant.ID], num_tbl_cols, table_path + table[c.ID], num_tbl_cols, table_path ) return upstream_tables, table_id_to_urn @@ -1047,24 +1033,24 @@ def get_upstream_columns_of_fields_in_datasource( table_id_to_urn: Dict[str, str], ) -> List[FineGrainedLineage]: fine_grained_lineages = [] - for field in datasource.get(tableau_constant.FIELDS) or []: - field_name = field.get(tableau_constant.NAME) + for field in datasource.get(c.FIELDS) or []: + field_name = field.get(c.NAME) # upstreamColumns lineage will be set via upstreamFields. 
# such as for CalculatedField if ( not field_name - or not field.get(tableau_constant.UPSTREAM_COLUMNS) - or field.get(tableau_constant.UPSTREAM_FIELDS) + or not field.get(c.UPSTREAM_COLUMNS) + or field.get(c.UPSTREAM_FIELDS) ): continue input_columns = [] - for upstream_col in field.get(tableau_constant.UPSTREAM_COLUMNS): + for upstream_col in field.get(c.UPSTREAM_COLUMNS): if not upstream_col: continue - name = upstream_col.get(tableau_constant.NAME) + name = upstream_col.get(c.NAME) upstream_table_id = ( - upstream_col.get(tableau_constant.TABLE)[tableau_constant.ID] - if upstream_col.get(tableau_constant.TABLE) + upstream_col.get(c.TABLE)[c.ID] + if upstream_col.get(c.TABLE) else None ) if ( @@ -1110,23 +1096,21 @@ def get_upstream_fields_of_field_in_datasource( self, datasource: dict, datasource_urn: str ) -> List[FineGrainedLineage]: fine_grained_lineages = [] - for field in datasource.get(tableau_constant.FIELDS) or []: - field_name = field.get(tableau_constant.NAME) + for field in datasource.get(c.FIELDS) or []: + field_name = field.get(c.NAME) # It is observed that upstreamFields gives one-hop field # lineage, and not multi-hop field lineage # This behavior is as desired in our case. - if not field_name or not field.get(tableau_constant.UPSTREAM_FIELDS): + if not field_name or not field.get(c.UPSTREAM_FIELDS): continue input_fields = [] - for upstream_field in field.get(tableau_constant.UPSTREAM_FIELDS): + for upstream_field in field.get(c.UPSTREAM_FIELDS): if not upstream_field: continue - name = upstream_field.get(tableau_constant.NAME) + name = upstream_field.get(c.NAME) upstream_ds_id = ( - upstream_field.get(tableau_constant.DATA_SOURCE)[ - tableau_constant.ID - ] - if upstream_field.get(tableau_constant.DATA_SOURCE) + upstream_field.get(c.DATA_SOURCE)[c.ID] + if upstream_field.get(c.DATA_SOURCE) else None ) if name and upstream_ds_id: @@ -1212,35 +1196,37 @@ def get_upstream_fields_from_custom_sql( return fine_grained_lineages def get_transform_operation(self, field: dict) -> str: - field_type = field[tableau_constant.TYPE_NAME] + field_type = field[c.TYPE_NAME] if field_type in ( - tableau_constant.DATA_SOURCE_FIELD, - tableau_constant.COLUMN_FIELD, + c.DATA_SOURCE_FIELD, + c.COLUMN_FIELD, ): - op = tableau_constant.IDENTITY # How to specify exact same - elif field_type == tableau_constant.CALCULATED_FIELD: + op = c.IDENTITY # How to specify exact same + elif field_type == c.CALCULATED_FIELD: op = field_type - if field.get(tableau_constant.FORMULA): - op += f"formula: {field.get(tableau_constant.FORMULA)}" + if field.get(c.FORMULA): + op += f"formula: {field.get(c.FORMULA)}" else: op = field_type # BinField, CombinedField, etc return op def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: - custom_sql_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.custom_sql_ids_being_used)}" + custom_sql_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(self.custom_sql_ids_being_used)}" + ) custom_sql_connection = list( self.get_connection_objects( custom_sql_graphql_query, - tableau_constant.CUSTOM_SQL_TABLE_CONNECTION, + c.CUSTOM_SQL_TABLE_CONNECTION, custom_sql_filter, ) ) unique_custom_sql = get_unique_custom_sql(custom_sql_connection) for csql in unique_custom_sql: - csql_id: str = csql[tableau_constant.ID] + csql_id: str = csql[c.ID] csql_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, name=csql_id, @@ -1256,40 +1242,33 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: datasource_name = None project = None - if 
len(csql[tableau_constant.DATA_SOURCES]) > 0: + if len(csql[c.DATA_SOURCES]) > 0: # A CustomSQLTable id is owned by exactly one Tableau data source logger.debug( - f"Number of datasources referencing CustomSQLTable: {len(csql[tableau_constant.DATA_SOURCES])}" + f"Number of datasources referencing CustomSQLTable: {len(csql[c.DATA_SOURCES])}" ) - datasource = csql[tableau_constant.DATA_SOURCES][0] - datasource_name = datasource.get(tableau_constant.NAME) + datasource = csql[c.DATA_SOURCES][0] + datasource_name = datasource.get(c.NAME) if datasource.get( - tableau_constant.TYPE_NAME - ) == tableau_constant.EMBEDDED_DATA_SOURCE and datasource.get( - tableau_constant.WORKBOOK - ): + c.TYPE_NAME + ) == c.EMBEDDED_DATA_SOURCE and datasource.get(c.WORKBOOK): datasource_name = ( - f"{datasource.get(tableau_constant.WORKBOOK).get(tableau_constant.NAME)}/{datasource_name}" - if datasource_name - and datasource.get(tableau_constant.WORKBOOK).get( - tableau_constant.NAME - ) + f"{datasource.get(c.WORKBOOK).get(c.NAME)}/{datasource_name}" + if datasource_name and datasource.get(c.WORKBOOK).get(c.NAME) else None ) logger.debug( f"Adding datasource {datasource_name}({datasource.get('id')}) to container" ) yield from add_entity_to_container( - self.gen_workbook_key( - datasource[tableau_constant.WORKBOOK][tableau_constant.ID] - ), - tableau_constant.DATASET, + self.gen_workbook_key(datasource[c.WORKBOOK][c.ID]), + c.DATASET, dataset_snapshot.urn, ) project = self._get_project_browse_path_name(datasource) - tables = csql.get(tableau_constant.TABLES, []) + tables = csql.get(c.TABLES, []) if tables: # lineage from custom sql -> datasets/tables # @@ -1306,9 +1285,8 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: # Schema Metadata # if condition is needed as GraphQL may return "columns": None columns: List[Dict[Any, Any]] = ( - cast(List[Dict[Any, Any]], csql.get(tableau_constant.COLUMNS)) - if tableau_constant.COLUMNS in csql - and csql.get(tableau_constant.COLUMNS) is not None + cast(List[Dict[Any, Any]], csql.get(c.COLUMNS)) + if c.COLUMNS in csql and csql.get(c.COLUMNS) is not None else [] ) schema_metadata = self.get_schema_metadata_for_custom_sql(columns) @@ -1320,7 +1298,7 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: if project and datasource_name: browse_paths = BrowsePathsClass( paths=[ - f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource[tableau_constant.NAME]}" + f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource[c.NAME]}" ] ) dataset_snapshot.aspects.append(browse_paths) else: logger.debug(f"Browse path not set for Custom SQL table {csql_id}") dataset_properties = DatasetPropertiesClass( - name=csql.get(tableau_constant.NAME), - description=csql.get(tableau_constant.DESCRIPTION), + name=csql.get(c.NAME), + description=csql.get(c.DESCRIPTION), ) dataset_snapshot.aspects.append(dataset_properties) - if csql.get(tableau_constant.QUERY): + if csql.get(c.QUERY): view_properties = ViewPropertiesClass( materialized=False, - viewLanguage=tableau_constant.SQL, - viewLogic=clean_query(csql[tableau_constant.QUERY]), + viewLanguage=c.SQL, + viewLogic=clean_query(csql[c.QUERY]), ) dataset_snapshot.aspects.append(view_properties) yield self.get_metadata_change_event(dataset_snapshot) yield self.get_metadata_change_proposal( dataset_snapshot.urn, - aspect_name=tableau_constant.SUB_TYPES, - aspect=SubTypesClass( - typeNames=[DatasetSubTypes.VIEW,
tableau_constant.CUSTOM_SQL] - ), + aspect_name=c.SUB_TYPES, + aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]), ) def get_schema_metadata_for_custom_sql( @@ -1359,21 +1335,19 @@ def get_schema_metadata_for_custom_sql( for field in columns: # Datasource fields - if field.get(tableau_constant.NAME) is None: + if field.get(c.NAME) is None: self.report.num_csql_field_skipped_no_name += 1 logger.warning( - f"Skipping field {field[tableau_constant.ID]} from schema since its name is none" + f"Skipping field {field[c.ID]} from schema since its name is none" ) continue - nativeDataType = field.get( - tableau_constant.REMOTE_TYPE, tableau_constant.UNKNOWN - ) + nativeDataType = field.get(c.REMOTE_TYPE, c.UNKNOWN) TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass) schema_field = SchemaField( - fieldPath=field[tableau_constant.NAME], + fieldPath=field[c.NAME], type=SchemaFieldDataType(type=TypeClass()), nativeDataType=nativeDataType, - description=field.get(tableau_constant.DESCRIPTION), + description=field.get(c.DESCRIPTION), ) fields.append(schema_field) @@ -1391,28 +1365,25 @@ def _get_published_datasource_project_luid(self, ds: dict) -> Optional[str]: # This is fallback in case "get all datasources" query fails for some reason. # It is possible due to https://github.com/tableau/server-client-python/issues/1210 if ( - ds.get(tableau_constant.LUID) - and ds[tableau_constant.LUID] not in self.datasource_project_map.keys() + ds.get(c.LUID) + and ds[c.LUID] not in self.datasource_project_map.keys() and self.report.get_all_datasources_query_failed ): logger.debug( - f"published datasource {ds.get(tableau_constant.NAME)} project_luid not found." - f" Running get datasource query for {ds[tableau_constant.LUID]}" + f"published datasource {ds.get(c.NAME)} project_luid not found." 
+ f" Running get datasource query for {ds[c.LUID]}" ) # Query and update self.datasource_project_map with luid - self._query_published_datasource_for_project_luid(ds[tableau_constant.LUID]) + self._query_published_datasource_for_project_luid(ds[c.LUID]) if ( - ds.get(tableau_constant.LUID) - and ds[tableau_constant.LUID] in self.datasource_project_map.keys() - and self.datasource_project_map[ds[tableau_constant.LUID]] - in self.tableau_project_registry + ds.get(c.LUID) + and ds[c.LUID] in self.datasource_project_map.keys() + and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry ): - return self.datasource_project_map[ds[tableau_constant.LUID]] + return self.datasource_project_map[ds[c.LUID]] - logger.debug( - f"published datasource {ds.get(tableau_constant.NAME)} project_luid not found" - ) + logger.debug(f"published datasource {ds.get(c.NAME)} project_luid not found") return None @@ -1437,60 +1408,52 @@ def _query_published_datasource_for_project_luid(self, ds_luid: str) -> None: logger.debug("Error stack trace", exc_info=True) def _get_workbook_project_luid(self, wb: dict) -> Optional[str]: - if wb.get(tableau_constant.LUID) and self.workbook_project_map.get( - wb[tableau_constant.LUID] - ): - return self.workbook_project_map[wb[tableau_constant.LUID]] + if wb.get(c.LUID) and self.workbook_project_map.get(wb[c.LUID]): + return self.workbook_project_map[wb[c.LUID]] - logger.debug(f"workbook {wb.get(tableau_constant.NAME)} project_luid not found") + logger.debug(f"workbook {wb.get(c.NAME)} project_luid not found") return None def _get_embedded_datasource_project_luid(self, ds: dict) -> Optional[str]: - if ds.get(tableau_constant.WORKBOOK): + if ds.get(c.WORKBOOK): project_luid: Optional[str] = self._get_workbook_project_luid( - ds[tableau_constant.WORKBOOK] + ds[c.WORKBOOK] ) if project_luid and project_luid in self.tableau_project_registry: return project_luid - logger.debug( - f"embedded datasource {ds.get(tableau_constant.NAME)} project_luid not found" - ) + logger.debug(f"embedded datasource {ds.get(c.NAME)} project_luid not found") return None def _get_datasource_project_luid(self, ds: dict) -> Optional[str]: # Only published and embedded data-sources are supported - ds_type: Optional[str] = ds.get(tableau_constant.TYPE_NAME) + ds_type: Optional[str] = ds.get(c.TYPE_NAME) if ds_type not in ( - tableau_constant.PUBLISHED_DATA_SOURCE, - tableau_constant.EMBEDDED_DATA_SOURCE, + c.PUBLISHED_DATA_SOURCE, + c.EMBEDDED_DATA_SOURCE, ): logger.debug( - f"datasource {ds.get(tableau_constant.NAME)} type {ds.get(tableau_constant.TYPE_NAME)} is " + f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is " f"unsupported" ) return None func_selector: Any = { - tableau_constant.PUBLISHED_DATA_SOURCE: self._get_published_datasource_project_luid, - tableau_constant.EMBEDDED_DATA_SOURCE: self._get_embedded_datasource_project_luid, + c.PUBLISHED_DATA_SOURCE: self._get_published_datasource_project_luid, + c.EMBEDDED_DATA_SOURCE: self._get_embedded_datasource_project_luid, } return func_selector[ds_type](ds) @staticmethod def _get_datasource_project_name(ds: dict) -> Optional[str]: - if ds.get( - tableau_constant.TYPE_NAME - ) == tableau_constant.EMBEDDED_DATA_SOURCE and ds.get( - tableau_constant.WORKBOOK - ): - return ds[tableau_constant.WORKBOOK].get(tableau_constant.PROJECT_NAME) - if ds.get(tableau_constant.TYPE_NAME) == tableau_constant.PUBLISHED_DATA_SOURCE: - return ds.get(tableau_constant.PROJECT_NAME) + if ds.get(c.TYPE_NAME) == c.EMBEDDED_DATA_SOURCE and 
ds.get(c.WORKBOOK): + return ds[c.WORKBOOK].get(c.PROJECT_NAME) + if ds.get(c.TYPE_NAME) == c.PUBLISHED_DATA_SOURCE: + return ds.get(c.PROJECT_NAME) return None def _get_project_browse_path_name(self, ds: dict) -> Optional[str]: @@ -1502,7 +1465,7 @@ def _get_project_browse_path_name(self, ds: dict) -> Optional[str]: project_luid = self._get_datasource_project_luid(ds) if project_luid is None: logger.warning( - f"Could not load project hierarchy for datasource {ds.get(tableau_constant.NAME)}. Please check permissions." + f"Could not load project hierarchy for datasource {ds.get(c.NAME)}. Please check permissions." ) logger.debug(f"datasource = {ds}") return None @@ -1515,7 +1478,7 @@ def _create_lineage_to_upstream_tables( # This adds an edge to upstream DatabaseTables using `upstreamTables` upstream_tables, _ = self.get_upstream_tables( tables, - datasource.get(tableau_constant.NAME) or "", + datasource.get(c.NAME) or "", self._get_project_browse_path_name(datasource), is_custom_sql=True, ) @@ -1524,7 +1487,7 @@ def _create_lineage_to_upstream_tables( upstream_lineage = UpstreamLineage(upstreams=upstream_tables) yield self.get_metadata_change_proposal( csql_urn, - aspect_name=tableau_constant.UPSTREAM_LINEAGE, + aspect_name=c.UPSTREAM_LINEAGE, aspect=upstream_lineage, ) @@ -1547,22 +1510,19 @@ def parse_custom_sql( ] ], ) -> Optional["SqlParsingResult"]: - database_info = datasource.get(tableau_constant.DATABASE) or {} + database_info = datasource.get(c.DATABASE) or {} - if datasource.get(tableau_constant.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False): + if datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False): logger.debug(f"datasource {datasource_urn} is not created from custom sql") return None - if ( - tableau_constant.NAME not in database_info - or tableau_constant.CONNECTION_TYPE not in database_info - ): + if c.NAME not in database_info or c.CONNECTION_TYPE not in database_info: logger.debug( f"database information is missing from datasource {datasource_urn}" ) return None - query = datasource.get(tableau_constant.QUERY) + query = datasource.get(c.QUERY) if query is None: logger.debug( f"raw sql query is not available for datasource {datasource_urn}" @@ -1571,13 +1531,13 @@ def parse_custom_sql( logger.debug(f"Parsing sql={query}") - upstream_db = database_info.get(tableau_constant.NAME) + upstream_db = database_info.get(c.NAME) if func_overridden_info is not None: # Override the information as per configuration upstream_db, platform_instance, platform, _ = func_overridden_info( - database_info[tableau_constant.CONNECTION_TYPE], - database_info.get(tableau_constant.NAME), + database_info[c.CONNECTION_TYPE], + database_info.get(c.NAME), self.config.platform_instance_map, self.config.lineage_overrides, ) @@ -1631,7 +1591,7 @@ def _create_lineage_from_unsupported_csql( yield self.get_metadata_change_proposal( csql_urn, - aspect_name=tableau_constant.UPSTREAM_LINEAGE, + aspect_name=c.UPSTREAM_LINEAGE, aspect=upstream_lineage, ) @@ -1642,10 +1602,10 @@ def _get_schema_metadata_for_datasource( for field in datasource_fields: # check datasource - custom sql relations from a field being referenced self._track_custom_sql_ids(field) - if field.get(tableau_constant.NAME) is None: + if field.get(c.NAME) is None: self.report.num_upstream_table_skipped_no_name += 1 logger.warning( - f"Skipping field {field[tableau_constant.ID]} from schema since its name is none" + f"Skipping field {field[c.ID]} from schema since its name is none" ) continue @@ -1678,7 +1638,7 @@ def 
get_metadata_change_proposal( aspect: Union["UpstreamLineage", "SubTypesClass"], ) -> MetadataWorkUnit: return MetadataChangeProposalWrapper( - entityType=tableau_constant.DATASET, + entityType=c.DATASET, changeType=ChangeTypeClass.UPSERT, entityUrn=urn, aspectName=aspect_name, @@ -1696,10 +1656,8 @@ def emit_datasource( datasource_info = datasource browse_path = self._get_project_browse_path_name(datasource) - logger.debug( - f"datasource {datasource.get(tableau_constant.NAME)} browse-path {browse_path}" - ) - datasource_id = datasource[tableau_constant.ID] + logger.debug(f"datasource {datasource.get(c.NAME)} browse-path {browse_path}") + datasource_id = datasource[c.ID] datasource_urn = builder.make_dataset_urn_with_platform_instance( self.platform, datasource_id, self.config.platform_instance, self.config.env ) @@ -1713,13 +1671,10 @@ def emit_datasource( # Browse path - if ( - browse_path - and is_embedded_ds - and workbook - and workbook.get(tableau_constant.NAME) - ): - browse_path = f"{browse_path}/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}" + if browse_path and is_embedded_ds and workbook and workbook.get(c.NAME): + browse_path = ( + f"{browse_path}/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}" + ) if browse_path: browse_paths = BrowsePathsClass( @@ -1729,12 +1684,10 @@ def emit_datasource( # Ownership owner = ( - self._get_ownership( - datasource_info[tableau_constant.OWNER][tableau_constant.USERNAME] - ) + self._get_ownership(datasource_info[c.OWNER][c.USERNAME]) if datasource_info - and datasource_info.get(tableau_constant.OWNER) - and datasource_info[tableau_constant.OWNER].get(tableau_constant.USERNAME) + and datasource_info.get(c.OWNER) + and datasource_info[c.OWNER].get(c.USERNAME) else None ) if owner is not None: @@ -1742,24 +1695,22 @@ def emit_datasource( # Dataset properties dataset_props = DatasetPropertiesClass( - name=datasource.get(tableau_constant.NAME), - description=datasource.get(tableau_constant.DESCRIPTION), + name=datasource.get(c.NAME), + description=datasource.get(c.DESCRIPTION), customProperties=self.get_custom_props_from_dict( datasource, [ - tableau_constant.HAS_EXTRACTS, - tableau_constant.EXTRACT_LAST_REFRESH_TIME, - tableau_constant.EXTRACT_LAST_INCREMENTAL_UPDATE_TIME, - tableau_constant.EXTRACT_LAST_UPDATE_TIME, + c.HAS_EXTRACTS, + c.EXTRACT_LAST_REFRESH_TIME, + c.EXTRACT_LAST_INCREMENTAL_UPDATE_TIME, + c.EXTRACT_LAST_UPDATE_TIME, ], ), ) dataset_snapshot.aspects.append(dataset_props) # Upstream Tables - if datasource.get(tableau_constant.UPSTREAM_TABLES) or datasource.get( - tableau_constant.UPSTREAM_DATA_SOURCES - ): + if datasource.get(c.UPSTREAM_TABLES) or datasource.get(c.UPSTREAM_DATA_SOURCES): # datasource -> db table relations ( upstream_tables, @@ -1779,13 +1730,13 @@ def emit_datasource( ) yield self.get_metadata_change_proposal( datasource_urn, - aspect_name=tableau_constant.UPSTREAM_LINEAGE, + aspect_name=c.UPSTREAM_LINEAGE, aspect=upstream_lineage, ) # Datasource Fields schema_metadata = self._get_schema_metadata_for_datasource( - datasource.get(tableau_constant.FIELDS, []) + datasource.get(c.FIELDS, []) ) if schema_metadata is not None: dataset_snapshot.aspects.append(schema_metadata) @@ -1793,7 +1744,7 @@ def emit_datasource( yield self.get_metadata_change_event(dataset_snapshot) yield self.get_metadata_change_proposal( dataset_snapshot.urn, - aspect_name=tableau_constant.SUB_TYPES, + aspect_name=c.SUB_TYPES, aspect=SubTypesClass( typeNames=( ["Embedded Data Source"] @@ -1809,7 +1760,7 @@ def 
emit_datasource( if container_key is not None: yield from add_entity_to_container( container_key, - tableau_constant.DATASET, + c.DATASET, dataset_snapshot.urn, ) @@ -1822,10 +1773,10 @@ def _get_datasource_container_key( container_key: Optional[ContainerKey] = None if is_embedded_ds: # It is embedded then parent is container is workbook if workbook is not None: - container_key = self.gen_workbook_key(workbook[tableau_constant.ID]) + container_key = self.gen_workbook_key(workbook[c.ID]) else: logger.warning( - f"Parent container not set for embedded datasource {datasource[tableau_constant.ID]}" + f"Parent container not set for embedded datasource {datasource[c.ID]}" ) else: parent_project_luid = self._get_published_datasource_project_luid( @@ -1836,17 +1787,19 @@ def _get_datasource_container_key( container_key = self.gen_project_key(parent_project_luid) else: logger.warning( - f"Parent container not set for published datasource {datasource[tableau_constant.ID]}" + f"Parent container not set for published datasource {datasource[c.ID]}" ) return container_key def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]: - datasource_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.datasource_ids_being_used)}" + datasource_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(self.datasource_ids_being_used)}" + ) for datasource in self.get_connection_objects( published_datasource_graphql_query, - tableau_constant.PUBLISHED_DATA_SOURCES_CONNECTION, + c.PUBLISHED_DATA_SOURCES_CONNECTION, datasource_filter, ): yield from self.emit_datasource(datasource) @@ -1855,11 +1808,13 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]: database_table_id_to_urn_map: Dict[str, str] = dict() for urn, tbl in self.database_tables.items(): database_table_id_to_urn_map[tbl.id] = urn - tables_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(list(database_table_id_to_urn_map.keys()))}" + tables_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(list(database_table_id_to_urn_map.keys()))}" + ) for table in self.get_connection_objects( database_tables_graphql_query, - tableau_constant.DATABASE_TABLES_CONNECTION, + c.DATABASE_TABLES_CONNECTION, tables_filter, ): yield from self.emit_table(table, database_table_id_to_urn_map) @@ -1867,11 +1822,9 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]: def emit_table( self, table: dict, database_table_id_to_urn_map: Dict[str, str] ) -> Iterable[MetadataWorkUnit]: - database_table = self.database_tables[ - database_table_id_to_urn_map[table[tableau_constant.ID]] - ] - columns = table.get(tableau_constant.COLUMNS, []) - is_embedded = table.get(tableau_constant.IS_EMBEDDED) or False + database_table = self.database_tables[database_table_id_to_urn_map[table[c.ID]]] + columns = table.get(c.COLUMNS, []) + is_embedded = table.get(c.IS_EMBEDDED) or False if not is_embedded and not self.config.ingest_tables_external: logger.debug( f"Skipping external table {database_table.urn} as ingest_tables_external is set to False" @@ -1907,21 +1860,19 @@ def get_schema_metadata_for_table( if columns: fields = [] for field in columns: - if field.get(tableau_constant.NAME) is None: + if field.get(c.NAME) is None: self.report.num_table_field_skipped_no_name += 1 logger.warning( - f"Skipping field {field[tableau_constant.ID]} from schema since its name is none" + f"Skipping field {field[c.ID]} from schema since its name is none" ) continue - nativeDataType = field.get( - tableau_constant.REMOTE_TYPE, tableau_constant.UNKNOWN - ) + nativeDataType = 
field.get(c.REMOTE_TYPE, c.UNKNOWN) TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass) schema_field = SchemaField( - fieldPath=field[tableau_constant.NAME], + fieldPath=field[c.NAME], type=SchemaFieldDataType(type=TypeClass()), - description=field.get(tableau_constant.DESCRIPTION), + description=field.get(c.DESCRIPTION), nativeDataType=nativeDataType, ) @@ -1941,11 +1892,9 @@ def get_schema_metadata_for_table( def get_sheetwise_upstream_datasources(self, sheet: dict) -> set: sheet_upstream_datasources = set() - for field in sheet.get(tableau_constant.DATA_SOURCE_FIELDS) or []: - if field and field.get(tableau_constant.DATA_SOURCE): - sheet_upstream_datasources.add( - field[tableau_constant.DATA_SOURCE][tableau_constant.ID] - ) + for field in sheet.get(c.DATA_SOURCE_FIELDS) or []: + if field and field.get(c.DATA_SOURCE): + sheet_upstream_datasources.add(field[c.DATA_SOURCE][c.ID]) return sheet_upstream_datasources @@ -1961,20 +1910,20 @@ def _create_datahub_chart_usage_stat( def _get_chart_stat_wu( self, sheet: dict, sheet_urn: str ) -> Optional[MetadataWorkUnit]: - luid: Optional[str] = sheet.get(tableau_constant.LUID) + luid: Optional[str] = sheet.get(c.LUID) if luid is None: logger.debug( "stat:luid is none for sheet %s(id:%s)", - sheet.get(tableau_constant.NAME), - sheet.get(tableau_constant.ID), + sheet.get(c.NAME), + sheet.get(c.ID), ) return None usage_stat: Optional[UsageStat] = self.tableau_stat_registry.get(luid) if usage_stat is None: logger.debug( "stat:UsageStat is not available in tableau_stat_registry for sheet %s(id:%s)", - sheet.get(tableau_constant.NAME), - sheet.get(tableau_constant.ID), + sheet.get(c.NAME), + sheet.get(c.ID), ) return None @@ -1983,8 +1932,8 @@ def _get_chart_stat_wu( ) logger.debug( "stat: Chart usage stat work unit is created for %s(id:%s)", - sheet.get(tableau_constant.NAME), - sheet.get(tableau_constant.ID), + sheet.get(c.NAME), + sheet.get(c.ID), ) return MetadataChangeProposalWrapper( aspect=aspect, @@ -1992,22 +1941,20 @@ def _get_chart_stat_wu( ).as_workunit() def emit_sheets(self) -> Iterable[MetadataWorkUnit]: - sheets_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.sheet_ids)}" + sheets_filter = f"{c.ID_WITH_IN}: {json.dumps(self.sheet_ids)}" for sheet in self.get_connection_objects( sheet_graphql_query, - tableau_constant.SHEETS_CONNECTION, + c.SHEETS_CONNECTION, sheets_filter, ): - yield from self.emit_sheets_as_charts( - sheet, sheet.get(tableau_constant.WORKBOOK) - ) + yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK)) def emit_sheets_as_charts( self, sheet: dict, workbook: Optional[Dict] ) -> Iterable[MetadataWorkUnit]: sheet_urn: str = builder.make_chart_urn( - self.platform, sheet[tableau_constant.ID], self.config.platform_instance + self.platform, sheet[c.ID], self.config.platform_instance ) chart_snapshot = ChartSnapshot( urn=sheet_urn, @@ -2015,34 +1962,32 @@ def emit_sheets_as_charts( ) creator: Optional[str] = None - if workbook is not None and workbook.get(tableau_constant.OWNER) is not None: - creator = workbook[tableau_constant.OWNER].get(tableau_constant.USERNAME) - created_at = sheet.get(tableau_constant.CREATED_AT, datetime.now()) - updated_at = sheet.get(tableau_constant.UPDATED_AT, datetime.now()) + if workbook is not None and workbook.get(c.OWNER) is not None: + creator = workbook[c.OWNER].get(c.USERNAME) + created_at = sheet.get(c.CREATED_AT, datetime.now()) + updated_at = sheet.get(c.UPDATED_AT, datetime.now()) last_modified = self.get_last_modified(creator, created_at, 
updated_at) - if sheet.get(tableau_constant.PATH): + if sheet.get(c.PATH): site_part = f"/site/{self.config.site}" if self.config.site else "" - sheet_external_url = f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(tableau_constant.PATH)}" - elif ( - sheet.get(tableau_constant.CONTAINED_IN_DASHBOARDS) is not None - and len(sheet[tableau_constant.CONTAINED_IN_DASHBOARDS]) > 0 - and sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0] is not None - and sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0].get( - tableau_constant.PATH + sheet_external_url = ( + f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(c.PATH)}" ) + elif ( + sheet.get(c.CONTAINED_IN_DASHBOARDS) is not None + and len(sheet[c.CONTAINED_IN_DASHBOARDS]) > 0 + and sheet[c.CONTAINED_IN_DASHBOARDS][0] is not None + and sheet[c.CONTAINED_IN_DASHBOARDS][0].get(c.PATH) ): # sheet contained in dashboard site_part = f"/t/{self.config.site}" if self.config.site else "" - dashboard_path = sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0][ - tableau_constant.PATH - ] - sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get(tableau_constant.NAME, '')}" + dashboard_path = sheet[c.CONTAINED_IN_DASHBOARDS][0][c.PATH] + sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get(c.NAME, '')}" else: # hidden or viz-in-tooltip sheet sheet_external_url = None input_fields: List[InputField] = [] - if sheet.get(tableau_constant.DATA_SOURCE_FIELDS): + if sheet.get(c.DATA_SOURCE_FIELDS): self.populate_sheet_upstream_fields(sheet, input_fields) # datasource urn @@ -2060,15 +2005,13 @@ def emit_sheets_as_charts( # Chart Info chart_info = ChartInfoClass( description="", - title=sheet.get(tableau_constant.NAME) or "", + title=sheet.get(c.NAME) or "", lastModified=last_modified, externalUrl=sheet_external_url if self.config.ingest_external_links_for_charts else None, inputs=sorted(datasource_urn), - customProperties=self.get_custom_props_from_dict( - sheet, [tableau_constant.LUID] - ), + customProperties=self.get_custom_props_from_dict(sheet, [c.LUID]), ) chart_snapshot.aspects.append(chart_info) # chart_snapshot doesn't support the stat aspect as list element and hence need to emit MCP @@ -2083,7 +2026,7 @@ def emit_sheets_as_charts( chart_snapshot.aspects.append(browse_paths) else: logger.warning( - f"Could not set browse path for workbook {sheet[tableau_constant.ID]}. Please check permissions." + f"Could not set browse path for workbook {sheet[c.ID]}. Please check permissions." 
) # Ownership @@ -2107,9 +2050,7 @@ def emit_sheets_as_charts( ) if workbook is not None: yield from add_entity_to_container( - self.gen_workbook_key(workbook[tableau_constant.ID]), - tableau_constant.CHART, - chart_snapshot.urn, + self.gen_workbook_key(workbook[c.ID]), c.CHART, chart_snapshot.urn ) if input_fields: @@ -2134,14 +2075,12 @@ def _get_project_path(self, project: TableauProject) -> str: def populate_sheet_upstream_fields( self, sheet: dict, input_fields: List[InputField] ) -> None: - for field in sheet.get(tableau_constant.DATA_SOURCE_FIELDS): # type: ignore + for field in sheet.get(c.DATA_SOURCE_FIELDS): # type: ignore if not field: continue - name = field.get(tableau_constant.NAME) + name = field.get(c.NAME) upstream_ds_id = ( - field.get(tableau_constant.DATA_SOURCE)[tableau_constant.ID] - if field.get(tableau_constant.DATA_SOURCE) - else None + field.get(c.DATA_SOURCE)[c.ID] if field.get(c.DATA_SOURCE) else None ) if name and upstream_ds_id: input_fields.append( @@ -2162,10 +2101,8 @@ def populate_sheet_upstream_fields( ) def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]: - workbook_container_key = self.gen_workbook_key(workbook[tableau_constant.ID]) - creator = workbook.get(tableau_constant.OWNER, {}).get( - tableau_constant.USERNAME - ) + workbook_container_key = self.gen_workbook_key(workbook[c.ID]) + creator = workbook.get(c.OWNER, {}).get(c.USERNAME) owner_urn = ( builder.make_user_urn(creator) @@ -2191,17 +2128,17 @@ def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUni if project_luid and project_luid in self.tableau_project_registry.keys(): parent_key = self.gen_project_key(project_luid) else: - workbook_id: Optional[str] = workbook.get(tableau_constant.ID) - workbook_name: Optional[str] = workbook.get(tableau_constant.NAME) + workbook_id: Optional[str] = workbook.get(c.ID) + workbook_name: Optional[str] = workbook.get(c.NAME) logger.warning( f"Could not load project hierarchy for workbook {workbook_name}({workbook_id}). Please check permissions." 
) yield from gen_containers( container_key=workbook_container_key, - name=workbook.get(tableau_constant.NAME) or "", + name=workbook.get(c.NAME) or "", parent_container_key=parent_key, - description=workbook.get(tableau_constant.DESCRIPTION), + description=workbook.get(c.DESCRIPTION), sub_types=[BIContainerSubTypes.TABLEAU_WORKBOOK], owner_urn=owner_urn, external_url=workbook_external_url, @@ -2237,20 +2174,20 @@ def _create_datahub_dashboard_usage_stat( def _get_dashboard_stat_wu( self, dashboard: dict, dashboard_urn: str ) -> Optional[MetadataWorkUnit]: - luid: Optional[str] = dashboard.get(tableau_constant.LUID) + luid: Optional[str] = dashboard.get(c.LUID) if luid is None: logger.debug( "stat:luid is none for dashboard %s(id:%s)", - dashboard.get(tableau_constant.NAME), - dashboard.get(tableau_constant.ID), + dashboard.get(c.NAME), + dashboard.get(c.ID), ) return None usage_stat: Optional[UsageStat] = self.tableau_stat_registry.get(luid) if usage_stat is None: logger.debug( "stat:UsageStat is not available in tableau_stat_registry for dashboard %s(id:%s)", - dashboard.get(tableau_constant.NAME), - dashboard.get(tableau_constant.ID), + dashboard.get(c.NAME), + dashboard.get(c.ID), ) return None @@ -2259,8 +2196,8 @@ def _get_dashboard_stat_wu( ) logger.debug( "stat: Dashboard usage stat is created for %s(id:%s)", - dashboard.get(tableau_constant.NAME), - dashboard.get(tableau_constant.ID), + dashboard.get(c.NAME), + dashboard.get(c.ID), ) return MetadataChangeProposalWrapper( @@ -2288,26 +2225,20 @@ def new_work_unit(self, mcp: MetadataChangeProposalWrapper) -> MetadataWorkUnit: ) def emit_dashboards(self) -> Iterable[MetadataWorkUnit]: - dashboards_filter = ( - f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.dashboard_ids)}" - ) + dashboards_filter = f"{c.ID_WITH_IN}: {json.dumps(self.dashboard_ids)}" for dashboard in self.get_connection_objects( dashboard_graphql_query, - tableau_constant.DASHBOARDS_CONNECTION, + c.DASHBOARDS_CONNECTION, dashboards_filter, ): - yield from self.emit_dashboard( - dashboard, dashboard.get(tableau_constant.WORKBOOK) - ) + yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK)) def get_tags(self, obj: dict) -> Optional[List[str]]: - tag_list = obj.get(tableau_constant.TAGS, []) + tag_list = obj.get(c.TAGS, []) if tag_list and self.config.ingest_tags: tag_list_str = [ - t[tableau_constant.NAME] - for t in tag_list - if t is not None and t.get(tableau_constant.NAME) + t[c.NAME] for t in tag_list if t is not None and t.get(c.NAME) ] return tag_list_str @@ -2317,7 +2248,7 @@ def emit_dashboard( self, dashboard: dict, workbook: Optional[Dict] ) -> Iterable[MetadataWorkUnit]: dashboard_urn: str = builder.make_dashboard_urn( - self.platform, dashboard[tableau_constant.ID], self.config.platform_instance + self.platform, dashboard[c.ID], self.config.platform_instance ) dashboard_snapshot = DashboardSnapshot( urn=dashboard_urn, @@ -2325,26 +2256,28 @@ def emit_dashboard( ) creator: Optional[str] = None - if workbook is not None and workbook.get(tableau_constant.OWNER) is not None: - creator = workbook[tableau_constant.OWNER].get(tableau_constant.USERNAME) - created_at = dashboard.get(tableau_constant.CREATED_AT, datetime.now()) - updated_at = dashboard.get(tableau_constant.UPDATED_AT, datetime.now()) + if workbook is not None and workbook.get(c.OWNER) is not None: + creator = workbook[c.OWNER].get(c.USERNAME) + created_at = dashboard.get(c.CREATED_AT, datetime.now()) + updated_at = dashboard.get(c.UPDATED_AT, datetime.now()) last_modified = 
self.get_last_modified(creator, created_at, updated_at) site_part = f"/site/{self.config.site}" if self.config.site else "" - dashboard_external_url = f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(tableau_constant.PATH, '')}" + dashboard_external_url = ( + f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(c.PATH, '')}" + ) title = ( - dashboard[tableau_constant.NAME].replace("/", REPLACE_SLASH_CHAR) - if dashboard.get(tableau_constant.NAME) + dashboard[c.NAME].replace("/", REPLACE_SLASH_CHAR) + if dashboard.get(c.NAME) else "" ) chart_urns = [ builder.make_chart_urn( self.platform, - sheet.get(tableau_constant.ID), + sheet.get(c.ID), self.config.platform_instance, ) - for sheet in dashboard.get(tableau_constant.SHEETS, []) + for sheet in dashboard.get(c.SHEETS, []) ] dashboard_info_class = DashboardInfoClass( description="", @@ -2354,9 +2287,7 @@ def emit_dashboard( dashboardUrl=dashboard_external_url if self.config.ingest_external_links_for_dashboards else None, - customProperties=self.get_custom_props_from_dict( - dashboard, [tableau_constant.LUID] - ), + customProperties=self.get_custom_props_from_dict(dashboard, [c.LUID]), ) dashboard_snapshot.aspects.append(dashboard_info_class) @@ -2377,7 +2308,7 @@ def emit_dashboard( dashboard_snapshot.aspects.append(browse_paths) else: logger.warning( - f"Could not set browse path for dashboard {dashboard[tableau_constant.ID]}. Please check permissions." + f"Could not set browse path for dashboard {dashboard[c.ID]}. Please check permissions." ) # Ownership @@ -2397,8 +2328,8 @@ def emit_dashboard( if workbook is not None: yield from add_entity_to_container( - self.gen_workbook_key(workbook[tableau_constant.ID]), - tableau_constant.DASHBOARD, + self.gen_workbook_key(workbook[c.ID]), + c.DASHBOARD, dashboard_snapshot.urn, ) @@ -2406,38 +2337,40 @@ def get_browse_paths_aspect( self, workbook: Optional[Dict] ) -> Optional[BrowsePathsClass]: browse_paths: Optional[BrowsePathsClass] = None - if workbook and workbook.get(tableau_constant.NAME): + if workbook and workbook.get(c.NAME): project_luid: Optional[str] = self._get_workbook_project_luid(workbook) if project_luid in self.tableau_project_registry: browse_paths = BrowsePathsClass( paths=[ f"/{self.platform}/{self._project_luid_to_browse_path_name(project_luid)}" - f"/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}" + f"/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}" ] ) - elif workbook.get(tableau_constant.PROJECT_NAME): + elif workbook.get(c.PROJECT_NAME): # browse path browse_paths = BrowsePathsClass( paths=[ - f"/{self.platform}/{workbook[tableau_constant.PROJECT_NAME].replace('/', REPLACE_SLASH_CHAR)}" - f"/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}" + f"/{self.platform}/{workbook[c.PROJECT_NAME].replace('/', REPLACE_SLASH_CHAR)}" + f"/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}" ] ) return browse_paths def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]: - datasource_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.embedded_datasource_ids_being_used)}" + datasource_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(self.embedded_datasource_ids_being_used)}" + ) for datasource in self.get_connection_objects( embedded_datasource_graphql_query, - tableau_constant.EMBEDDED_DATA_SOURCES_CONNECTION, + c.EMBEDDED_DATA_SOURCES_CONNECTION, datasource_filter, ): yield from self.emit_datasource( datasource, - datasource.get(tableau_constant.WORKBOOK), + datasource.get(c.WORKBOOK), is_embedded_ds=True, ) @@ 
-2483,7 +2416,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: container_key=self.gen_project_key(_id), name=project.name, description=project.description, - sub_types=[tableau_constant.PROJECT], + sub_types=[c.PROJECT], parent_container_key=self.gen_project_key(project.parent_id) if project.parent_id else None, @@ -2498,7 +2431,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: yield from gen_containers( container_key=self.gen_project_key(project.parent_id), name=cast(str, project.parent_name), - sub_types=[tableau_constant.PROJECT], + sub_types=[c.PROJECT], ) def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index 7c4852042ce7c8..65d779b7f4516d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -8,7 +8,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel -from datahub.ingestion.source import tableau_constant as tc +from datahub.ingestion.source import tableau_constant as c from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageType, FineGrainedLineage, @@ -591,12 +591,12 @@ def create( cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None ) -> "TableauUpstreamReference": # Values directly from `table` object from Tableau - database = t_database = d.get(tc.DATABASE, {}).get(tc.NAME) - schema = t_schema = d.get(tc.SCHEMA) - table = t_table = d.get(tc.NAME) or "" - t_full_name = d.get(tc.FULL_NAME) - t_connection_type = d[tc.CONNECTION_TYPE] # required to generate urn - t_id = d[tc.ID] + database = t_database = d.get(c.DATABASE, {}).get(c.NAME) + schema = t_schema = d.get(c.SCHEMA) + table = t_table = d.get(c.NAME) or "" + t_full_name = d.get(c.FULL_NAME) + t_connection_type = d[c.CONNECTION_TYPE] # required to generate urn + t_id = d[c.ID] parsed_full_name = cls.parse_full_name(t_full_name) if parsed_full_name and len(parsed_full_name) == 3: From 9174301719122c2597db75c8bb6b60c4d1a74f77 Mon Sep 17 00:00:00 2001 From: sachinsaju <33017477+sachinsaju@users.noreply.github.com> Date: Thu, 9 Nov 2023 10:37:09 +0530 Subject: [PATCH 09/29] docs: update broken link in metadata-modelling (#9184) Co-authored-by: Hyejin Yoon <0327jane@gmail.com> Co-authored-by: John Joyce --- docs/modeling/metadata-model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index a8958985a0a724..4c97cadc88417e 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -625,7 +625,7 @@ curl --location --request POST 'http://localhost:8080/analytics?action=getTimese } } ``` -For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [ElasticSearchTimeseriesAspectServiceTest.java](https://github.com/datahub-project/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java). 
+For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [TimeseriesAspectServiceTestBase.java](https://github.com/datahub-project/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java). From e494a9cc102f863bc51fcf80674bd6d3d36d726c Mon Sep 17 00:00:00 2001 From: Kos Korchak <97058061+kkorchak@users.noreply.github.com> Date: Thu, 9 Nov 2023 00:23:17 -0500 Subject: [PATCH 10/29] test(): Test policy to create and manage privileges (#9173) --- .../tests/privileges/test_privileges.py | 112 +++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py index 740311754678ef..d0f00734ae9f37 100644 --- a/smoke-test/tests/privileges/test_privileges.py +++ b/smoke-test/tests/privileges/test_privileges.py @@ -114,6 +114,21 @@ def _ensure_can_create_access_token(session, json): assert ingestion_data["data"]["createAccessToken"]["__typename"] == "AccessToken" +@tenacity.retry( + stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec) +) +def _ensure_can_create_user_policy(session, json): + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + response.raise_for_status() + res_data = response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["createPolicy"] is not None + + return res_data["data"]["createPolicy"] + + @pytest.mark.dependency(depends=["test_healthchecks"]) def test_privilege_to_create_and_manage_secrets(): @@ -337,4 +352,99 @@ def test_privilege_to_create_and_manage_access_tokens(): # Ensure that user can't create access token after policy is removed - _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken") \ No newline at end of file + _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken") + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_privilege_to_create_and_manage_policies(): + + (admin_user, admin_pass) = get_admin_credentials() + admin_session = login_as(admin_user, admin_pass) + user_session = login_as("user", "user") + + + # Verify new user can't create a policy + create_policy = { + "query": """mutation createPolicy($input: PolicyUpdateInput!) {\n + createPolicy(input: $input) }""", + "variables": { + "input": { + "type": "PLATFORM", + "name": "Policy Name", + "description": "Policy Description", + "state": "ACTIVE", + "resources": {"filter":{"criteria":[]}}, + "privileges": ["MANAGE_POLICIES"], + "actors": { + "users": [], + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + }, + } + }, + } + + _ensure_cant_perform_action(user_session, create_policy,"createPolicy") + + + # Assign privileges to the new user to create and manage policies + admin_policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_POLICIES"], admin_session) + + + # Verify new user can create and manage policy(create, edit, delete) + # Create a policy + user_policy_urn = _ensure_can_create_user_policy(user_session, create_policy) + + # Edit a policy + edit_policy = { + "query": """mutation updatePolicy($urn: String!, $input: PolicyUpdateInput!) 
{\n + updatePolicy(urn: $urn, input: $input) }""", + "variables": { + "urn": user_policy_urn, + "input": { + "type": "PLATFORM", + "state": "INACTIVE", + "name": "Policy Name test", + "description": "Policy Description updated", + "privileges": ["MANAGE_POLICIES"], + "actors": { + "users": [], + "groups": None, + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + "resourceOwnersTypes": None, + }, + }, + }, + } + edit_policy_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=edit_policy) + edit_policy_response.raise_for_status() + res_data = edit_policy_response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["updatePolicy"] == user_policy_urn + + # Delete a policy + remove_user_policy = { + "query": "mutation deletePolicy($urn: String!) {\n deletePolicy(urn: $urn)\n}\n", + "variables":{"urn":user_policy_urn} + } + + remove_policy_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_user_policy) + remove_policy_response.raise_for_status() + res_data = remove_policy_response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["deletePolicy"] == user_policy_urn + + + # Remove the user privilege by admin + remove_policy(admin_policy_urn, admin_session) + + + # Ensure that user can't create a policy after privilege is removed by admin + _ensure_cant_perform_action(user_session, create_policy,"createPolicy") \ No newline at end of file From 2187d24b54493953ab66b70f9a4b4fe0fd8841e1 Mon Sep 17 00:00:00 2001 From: RyanHolstien Date: Thu, 9 Nov 2023 13:58:12 -0600 Subject: [PATCH 11/29] docs(security): add security doc to website (#9209) --- docs-website/generateDocsDir.ts | 1 - docs-website/sidebars.js | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index a321146e10efa9..e19f09530665a0 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -125,7 +125,6 @@ function list_markdown_files(): string[] { /^docker\/(?!README|datahub-upgrade|airflow\/local_airflow)/, // Drop all but a few docker docs. /^docs\/docker\/README\.md/, // This one is just a pointer to another file. /^docs\/README\.md/, // This one is just a pointer to the hosted docs site. - /^SECURITY\.md$/, /^\s*$/, //Empty string ]; diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 9cc035f3e29e05..4d2420256ebff3 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -546,6 +546,7 @@ module.exports = { "docs/CONTRIBUTING", "docs/links", "docs/rfc", + "SECURITY", ], }, { From 5911a7b45ed726292b2aa77c9e307d0e8683603a Mon Sep 17 00:00:00 2001 From: sachinsaju <33017477+sachinsaju@users.noreply.github.com> Date: Fri, 10 Nov 2023 01:54:53 +0530 Subject: [PATCH 12/29] docs(java-sdk-dataset): add dataset via java sdk example (#9136) Co-authored-by: Hyejin Yoon <0327jane@gmail.com> --- docs/api/tutorials/datasets.md | 7 ++ .../datahubproject/examples/DatasetAdd.java | 84 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetAdd.java diff --git a/docs/api/tutorials/datasets.md b/docs/api/tutorials/datasets.md index 7c6d4a88d4190e..39b0fdce1bdb55 100644 --- a/docs/api/tutorials/datasets.md +++ b/docs/api/tutorials/datasets.md @@ -28,6 +28,13 @@ For detailed steps, please refer to [Datahub Quickstart Guide](/docs/quickstart. 
> 🚫 Creating a dataset via `graphql` is currently not supported. > Please check out [API feature comparison table](/docs/api/datahub-apis.md#datahub-api-comparison) for more information. + + + +```java +{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetAdd.java show_path_as_comment }} +``` + diff --git a/metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetAdd.java b/metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetAdd.java new file mode 100644 index 00000000000000..ac368972e8dc90 --- /dev/null +++ b/metadata-integration/java/examples/src/main/java/io/datahubproject/examples/DatasetAdd.java @@ -0,0 +1,84 @@ +package io.datahubproject.examples; + +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.CorpuserUrn; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.schema.DateType; +import com.linkedin.schema.OtherSchema; +import com.linkedin.schema.SchemaField; +import com.linkedin.schema.SchemaFieldArray; +import com.linkedin.schema.SchemaFieldDataType; +import com.linkedin.schema.SchemaMetadata; +import com.linkedin.schema.StringType; +import datahub.client.MetadataWriteResponse; +import datahub.client.rest.RestEmitter; +import datahub.event.MetadataChangeProposalWrapper; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +public class DatasetAdd { + + private DatasetAdd() { + + } + + public static void main(String[] args) throws IOException, ExecutionException, InterruptedException { + DatasetUrn datasetUrn = UrnUtils.toDatasetUrn("hive", "fct_users_deleted", "PROD"); + CorpuserUrn userUrn = new CorpuserUrn("ingestion"); + AuditStamp lastModified = new AuditStamp().setTime(1640692800000L).setActor(userUrn); + + SchemaMetadata schemaMetadata = new SchemaMetadata() + .setSchemaName("customer") + .setPlatform(new DataPlatformUrn("hive")) + .setVersion(0L) + .setHash("") + .setPlatformSchema(SchemaMetadata.PlatformSchema.create(new OtherSchema().setRawSchema("__insert raw schema here__"))) + .setLastModified(lastModified); + + SchemaFieldArray fields = new SchemaFieldArray(); + + SchemaField field1 = new SchemaField() + .setFieldPath("address.zipcode") + .setType(new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType()))) + .setNativeDataType("VARCHAR(50)") + .setDescription("This is the zipcode of the address. 
Specified using extended form and limited to addresses in the United States")
+        .setLastModified(lastModified);
+    fields.add(field1);
+
+    SchemaField field2 = new SchemaField().setFieldPath("address.street")
+        .setType(new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new StringType())))
+        .setNativeDataType("VARCHAR(100)")
+        .setDescription("Street corresponding to the address")
+        .setLastModified(lastModified);
+    fields.add(field2);
+
+    SchemaField field3 = new SchemaField().setFieldPath("last_sold_date")
+        .setType(new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new DateType())))
+        .setNativeDataType("Date")
+        .setDescription("Date of the last sale date for this property")
+        .setLastModified(lastModified);
+    fields.add(field3);
+
+    schemaMetadata.setFields(fields);
+
+    MetadataChangeProposalWrapper mcpw = MetadataChangeProposalWrapper.builder()
+        .entityType("dataset")
+        .entityUrn(datasetUrn)
+        .upsert()
+        .aspect(schemaMetadata)
+        .build();
+
+    String token = "";
+    RestEmitter emitter = RestEmitter.create(
+        b -> b.server("http://localhost:8080")
+            .token(token)
+    );
+    Future<MetadataWriteResponse> response = emitter.emit(mcpw, null);
+    System.out.println(response.get().getResponseContent());
+  }
+
+}
\ No newline at end of file

From d6cb106fab4a4d49193afd0efd8ff7d90a8d3fa8 Mon Sep 17 00:00:00 2001
From: sachinsaju <33017477+sachinsaju@users.noreply.github.com>
Date: Fri, 10 Nov 2023 02:10:55 +0530
Subject: [PATCH 13/29] doc(java-sdk-example):example to create tag via
 java-sdk (#9151)

---
 docs/api/tutorials/tags.md                    |  8 ++++
 .../io/datahubproject/examples/TagCreate.java | 40 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 metadata-integration/java/examples/src/main/java/io/datahubproject/examples/TagCreate.java

diff --git a/docs/api/tutorials/tags.md b/docs/api/tutorials/tags.md
index b2234bf00bcb92..24d583dc26dac5 100644
--- a/docs/api/tutorials/tags.md
+++ b/docs/api/tutorials/tags.md
@@ -78,6 +78,14 @@ Expected Response:

+
+
+```java
+{{ inline /metadata-integration/java/examples/src/main/java/io/datahubproject/examples/TagCreate.java show_path_as_comment }}
+```
+
+
+
 ```python

diff --git a/metadata-integration/java/examples/src/main/java/io/datahubproject/examples/TagCreate.java b/metadata-integration/java/examples/src/main/java/io/datahubproject/examples/TagCreate.java
new file mode 100644
index 00000000000000..077489a9e02d9f
--- /dev/null
+++ b/metadata-integration/java/examples/src/main/java/io/datahubproject/examples/TagCreate.java
@@ -0,0 +1,40 @@
+package io.datahubproject.examples;
+
+import com.linkedin.tag.TagProperties;
+import datahub.client.MetadataWriteResponse;
+import datahub.client.rest.RestEmitter;
+import datahub.event.MetadataChangeProposalWrapper;
+
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+
+public class TagCreate {
+
+  private TagCreate() {
+
+  }
+
+  public static void main(String[] args) throws IOException, ExecutionException, InterruptedException {
+    TagProperties tagProperties = new TagProperties()
+        .setName("Deprecated")
+        .setDescription("Having this tag means this column or table is deprecated.");
+
+    MetadataChangeProposalWrapper mcpw = MetadataChangeProposalWrapper.builder()
+        .entityType("tag")
+        .entityUrn("urn:li:tag:deprecated")
+        .upsert()
+        .aspect(tagProperties)
+        .build();
+
+    String token = "";
+    RestEmitter emitter = RestEmitter.create(
+        b -> b.server("http://localhost:8080")
+            .token(token)
+    );
+    Future<MetadataWriteResponse> response = 
emitter.emit(mcpw, null); + System.out.println(response.get().getResponseContent()); + + + } +} From 107713846f56e761011fd811fd8ac3b0b87a40bd Mon Sep 17 00:00:00 2001 From: Teppo Naakka Date: Fri, 10 Nov 2023 02:48:06 +0200 Subject: [PATCH 14/29] fix(ingest/powerbi): use dataset workspace id as key for parent container (#8994) --- .../ingestion/source/powerbi/powerbi.py | 42 +- .../powerbi/golden_test_container.json | 1089 +++++++++++++---- ..._config_and_modified_since_admin_only.json | 210 +++- .../tests/integration/powerbi/test_powerbi.py | 4 + 4 files changed, 1004 insertions(+), 341 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 4611a8eed47827..dc4394efcf245b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -4,7 +4,7 @@ # ######################################################### import logging -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, List, Optional, Tuple, Union import datahub.emitter.mce_builder as builder import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes @@ -110,8 +110,7 @@ def __init__( self.__config = config self.__reporter = reporter self.__dataplatform_instance_resolver = dataplatform_instance_resolver - self.processed_datasets: Set[powerbi_data_classes.PowerBIDataset] = set() - self.workspace_key: ContainerKey + self.workspace_key: Optional[ContainerKey] = None @staticmethod def urn_to_lowercase(value: str, flag: bool) -> str: @@ -374,6 +373,9 @@ def to_datahub_dataset( f"Mapping dataset={dataset.name}(id={dataset.id}) to datahub dataset" ) + if self.__config.extract_datasets_to_containers: + dataset_mcps.extend(self.generate_container_for_dataset(dataset)) + for table in dataset.tables: # Create a URN for dataset ds_urn = builder.make_dataset_urn_with_platform_instance( @@ -461,7 +463,6 @@ def to_datahub_dataset( self.append_container_mcp( dataset_mcps, - workspace, ds_urn, dataset, ) @@ -473,8 +474,6 @@ def to_datahub_dataset( dataset.tags, ) - self.processed_datasets.add(dataset) - return dataset_mcps @staticmethod @@ -572,7 +571,6 @@ def tile_custom_properties(tile: powerbi_data_classes.Tile) -> dict: self.append_container_mcp( result_mcps, - workspace, chart_urn, ) @@ -695,7 +693,6 @@ def chart_custom_properties(dashboard: powerbi_data_classes.Dashboard) -> dict: self.append_container_mcp( list_of_mcps, - workspace, dashboard_urn, ) @@ -711,7 +708,6 @@ def chart_custom_properties(dashboard: powerbi_data_classes.Dashboard) -> dict: def append_container_mcp( self, list_of_mcps: List[MetadataChangeProposalWrapper], - workspace: powerbi_data_classes.Workspace, entity_urn: str, dataset: Optional[powerbi_data_classes.PowerBIDataset] = None, ) -> None: @@ -719,12 +715,8 @@ def append_container_mcp( dataset, powerbi_data_classes.PowerBIDataset ): container_key = dataset.get_dataset_key(self.__config.platform_name) - elif self.__config.extract_workspaces_to_containers: - container_key = workspace.get_workspace_key( - platform_name=self.__config.platform_name, - platform_instance=self.__config.platform_instance, - workspace_id_as_urn_part=self.__config.workspace_id_as_urn_part, - ) + elif self.__config.extract_workspaces_to_containers and self.workspace_key: + container_key = self.workspace_key else: return None @@ -743,6 +735,7 @@ def 
generate_container_for_workspace( ) -> Iterable[MetadataWorkUnit]: self.workspace_key = workspace.get_workspace_key( platform_name=self.__config.platform_name, + platform_instance=self.__config.platform_instance, workspace_id_as_urn_part=self.__config.workspace_id_as_urn_part, ) container_work_units = gen_containers( @@ -754,7 +747,7 @@ def generate_container_for_workspace( def generate_container_for_dataset( self, dataset: powerbi_data_classes.PowerBIDataset - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: dataset_key = dataset.get_dataset_key(self.__config.platform_name) container_work_units = gen_containers( container_key=dataset_key, @@ -762,7 +755,13 @@ def generate_container_for_dataset( parent_container_key=self.workspace_key, sub_types=[BIContainerSubTypes.POWERBI_DATASET], ) - return container_work_units + + # The if statement here is just to satisfy mypy + return [ + wu.metadata + for wu in container_work_units + if isinstance(wu.metadata, MetadataChangeProposalWrapper) + ] def append_tag_mcp( self, @@ -965,7 +964,6 @@ def to_chart_mcps( self.append_container_mcp( list_of_mcps, - workspace, chart_urn, ) @@ -1086,7 +1084,6 @@ def report_to_dashboard( self.append_container_mcp( list_of_mcps, - workspace, dashboard_urn, ) @@ -1220,10 +1217,6 @@ def validate_dataset_type_mapping(self): f"Dataset lineage would get ingested for data-platform = {self.source_config.dataset_type_mapping}" ) - def extract_datasets_as_containers(self): - for dataset in self.mapper.processed_datasets: - yield from self.mapper.generate_container_for_dataset(dataset) - def extract_independent_datasets( self, workspace: powerbi_data_classes.Workspace ) -> Iterable[MetadataWorkUnit]: @@ -1270,9 +1263,6 @@ def get_workspace_workunit( ): yield work_unit - if self.source_config.extract_datasets_to_containers: - yield from self.extract_datasets_as_containers() - yield from self.extract_independent_datasets(workspace) def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_container.json b/metadata-ingestion/tests/integration/powerbi/golden_test_container.json index 850816bf80807e..91b5499eaadcb4 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_container.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_container.json @@ -15,7 +15,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -30,7 +31,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -45,7 +47,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -62,7 +65,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -77,7 +81,44 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "powerbi", + "dataset": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "name": 
"library-dataset" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -94,7 +135,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -115,7 +157,79 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:powerbi" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", + "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -130,7 +244,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -148,7 +263,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -158,12 +274,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -177,13 +294,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -200,7 +322,8 @@ }, 
"systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -221,7 +344,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -236,7 +360,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -254,7 +379,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -264,12 +390,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -283,13 +410,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -306,7 +438,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -327,7 +460,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -342,7 +476,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -360,7 +495,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -370,12 +506,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -389,13 +526,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -412,7 +554,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -433,7 +576,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -448,7 +592,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -466,7 +611,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -476,12 +622,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -495,13 +642,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -518,7 +670,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -539,7 +692,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +708,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -572,7 +727,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -582,12 +738,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -601,13 +758,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -624,7 +786,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -645,7 +808,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -660,7 +824,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +843,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -688,12 +854,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -707,13 +874,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": 
"urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -730,7 +902,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -751,7 +924,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -766,7 +940,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -784,7 +959,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -794,12 +970,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -813,13 +990,54 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2", + "urn": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "powerbi", + "dataset": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "name": "hr_pbi_test" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -836,7 +1054,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -857,7 +1076,79 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:powerbi" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"container", + "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", + "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -872,7 +1163,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -890,7 +1182,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -900,12 +1193,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -919,13 +1213,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "urn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -942,7 +1241,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -963,7 +1263,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -978,7 +1279,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -996,7 +1298,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1006,12 +1309,13 @@ "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "container": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1025,13 +1329,18 @@ { "id": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9", "urn": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + }, + { + "id": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", + "urn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" } ] } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1046,7 +1355,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": 
"powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1061,7 +1371,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1089,6 +1400,9 @@ } }, "inputs": [ + { + "string": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" + }, { "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)" }, @@ -1115,7 +1429,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1130,7 +1445,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1146,7 +1462,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1163,7 +1480,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1178,7 +1496,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1198,7 +1517,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1226,6 +1546,9 @@ } }, "inputs": [ + { + "string": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" + }, { "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)" }, @@ -1237,7 +1560,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1252,7 +1576,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1268,7 +1593,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1285,7 +1611,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1300,7 +1627,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1320,7 +1648,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1337,7 +1666,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1374,7 +1704,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1389,7 +1720,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1405,7 +1737,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1433,7 +1766,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -1448,7 +1782,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1468,7 +1803,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1485,7 +1821,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1506,7 +1843,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1521,7 +1859,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1539,22 +1878,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1571,7 +1896,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1592,7 +1918,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1607,7 +1934,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1625,22 +1953,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1657,7 +1971,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1678,7 +1993,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1693,7 +2009,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1711,22 +2028,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -1743,7 +2046,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1764,7 +2068,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1779,7 +2084,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1797,46 +2103,33 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "viewProperties", "aspect": { "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]){[Name=\"GSL_TEST_DB\"]}[Data], \"select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on A.name = B.name where startswith(A.name, 'mo')\", null, [EnableFolding=true])\nin\n Source", + "viewLanguage": "m_query" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]){[Name=\"GSL_TEST_DB\"]}[Data], \"select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on A.name = B.name where startswith(A.name, 'mo')\", null, [EnableFolding=true])\nin\n Source", - "viewLanguage": "m_query" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "datasetProperties", "aspect": { "json": { "customProperties": { @@ -1850,7 +2143,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1865,7 +2159,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1883,22 +2178,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" - } - }, - "systemMetadata": 
{ - "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1915,7 +2196,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1936,7 +2218,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1951,7 +2234,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1969,22 +2253,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2001,7 +2271,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2022,7 +2293,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2037,7 +2309,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2055,22 +2328,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:a4ed52f9abd3ff9cc34960c0c41f72e9" - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2085,7 +2344,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2100,7 +2360,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2126,6 +2387,9 @@ } }, "inputs": [ + { + "string": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" + }, { "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)" }, @@ -2152,7 +2416,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2167,7 +2432,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2184,7 +2450,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2199,7 +2466,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2219,7 +2487,8 @@ }, "systemMetadata": { 
"lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2245,6 +2514,9 @@ } }, "inputs": [ + { + "string": "urn:li:container:6ac0662f0f2fc3a9196ac505da2182b2" + }, { "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)" }, @@ -2271,7 +2543,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2286,7 +2559,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2303,7 +2577,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2318,7 +2593,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2338,7 +2614,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2355,7 +2632,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2388,7 +2666,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2403,7 +2682,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2419,7 +2699,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2436,7 +2717,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2464,7 +2746,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2479,7 +2762,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2499,7 +2783,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2514,7 +2799,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2529,7 +2815,310 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User4@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User4@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User4@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User3@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User3@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User3@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "powerbi", + "workspace": "second-demo-workspace" + }, + "name": "second-demo-workspace" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:powerbi" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/second-demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": 
"urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", + "urn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "chartCount": "0", + "workspaceName": "second-demo-workspace", + "workspaceId": "64ED5CAD-7C22-4684-8180-826122881108" + }, + "title": "test_dashboard2", + "description": "", + "charts": [], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "dashboardUrl": "https://localhost/dashboards/web/1" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "dashboardId": "powerbi.linkedin.com/dashboards/7D668CAD-8FFC-4505-9215-655BCA5BEBAE" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:users.User3@foo.com", + "type": "NONE" + }, + { + "owner": "urn:li:corpuser:users.User4@foo.com", + "type": "NONE" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json b/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json index a4527b97157042..b301ca1c1b9886 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json @@ -15,7 +15,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -30,7 +31,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -45,7 +47,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -62,7 +65,8 @@ }, 
"systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -77,7 +81,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -94,7 +99,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -126,7 +132,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -147,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -162,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -180,7 +189,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +214,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -219,7 +230,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -238,7 +250,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -249,6 +262,10 @@ "aspect": { "json": { "path": [ + { + "id": "urn:li:container:e3dc21b5c79f9d594f639a9f57d7f2c3", + "urn": "urn:li:container:e3dc21b5c79f9d594f639a9f57d7f2c3" + }, { "id": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", "urn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" @@ -258,7 +275,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -275,7 +293,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -307,7 +326,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +348,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -343,7 +364,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -361,7 +383,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -385,7 +408,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -400,7 +424,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -419,7 +444,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -430,6 +456,10 @@ "aspect": { "json": { 
"path": [ + { + "id": "urn:li:container:e3dc21b5c79f9d594f639a9f57d7f2c3", + "urn": "urn:li:container:e3dc21b5c79f9d594f639a9f57d7f2c3" + }, { "id": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", "urn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" @@ -439,7 +469,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -456,7 +487,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -540,7 +572,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -561,7 +594,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -576,7 +610,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -594,7 +629,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -618,7 +654,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -633,7 +670,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -652,7 +690,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -663,6 +702,10 @@ "aspect": { "json": { "path": [ + { + "id": "urn:li:container:e3dc21b5c79f9d594f639a9f57d7f2c3", + "urn": "urn:li:container:e3dc21b5c79f9d594f639a9f57d7f2c3" + }, { "id": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", "urn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" @@ -672,7 +715,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -704,7 +748,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -719,7 +764,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -735,7 +781,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -752,7 +799,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -767,7 +815,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -787,7 +836,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -815,6 +865,9 @@ } }, "inputs": [ + { + "string": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc" + }, { "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)" }, @@ -829,7 +882,8 @@ }, "systemMetadata": { "lastObserved": 
1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -844,7 +898,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -860,7 +915,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -877,7 +933,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -892,7 +949,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -912,7 +970,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -929,7 +988,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -966,7 +1026,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -981,7 +1042,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -997,7 +1059,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1012,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1032,7 +1096,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1049,7 +1114,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1079,7 +1145,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1094,7 +1161,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1110,7 +1178,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1127,7 +1196,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1151,7 +1221,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1166,7 +1237,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1186,7 +1258,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1205,7 +1278,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1220,7 +1294,8 
@@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1235,7 +1310,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1252,7 +1328,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1267,7 +1344,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1287,7 +1365,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1302,7 +1381,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test" + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 7232d2a38da1d3..c9b0ded4337491 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1039,7 +1039,11 @@ def test_workspace_container( "type": "powerbi", "config": { **default_source_config(), + "workspace_id_pattern": { + "deny": ["64ED5CAD-7322-4684-8180-826122881108"], + }, "extract_workspaces_to_containers": True, + "extract_datasets_to_containers": True, "extract_reports": True, }, }, From bfa1769d4dd4f5281d751c6998c586e4e021897d Mon Sep 17 00:00:00 2001 From: John Joyce Date: Thu, 9 Nov 2023 17:56:33 -0800 Subject: [PATCH 15/29] refactor(schema tab): Remove last observed timestamps from schema tab (#9188) --- .../schema/SchemaTimeStamps.test.tsx | 23 ------- .../schema/components/SchemaHeader.tsx | 6 -- .../schema/components/SchemaTimeStamps.tsx | 64 ------------------- .../shared/tabs/Dataset/Schema/SchemaTab.tsx | 5 -- 4 files changed, 98 deletions(-) delete mode 100644 datahub-web-react/src/app/entity/dataset/profile/__tests__/schema/SchemaTimeStamps.test.tsx delete mode 100644 datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaTimeStamps.tsx diff --git a/datahub-web-react/src/app/entity/dataset/profile/__tests__/schema/SchemaTimeStamps.test.tsx b/datahub-web-react/src/app/entity/dataset/profile/__tests__/schema/SchemaTimeStamps.test.tsx deleted file mode 100644 index c8bb5d8100f2aa..00000000000000 --- a/datahub-web-react/src/app/entity/dataset/profile/__tests__/schema/SchemaTimeStamps.test.tsx +++ /dev/null @@ -1,23 +0,0 @@ -import { render } from '@testing-library/react'; -import React from 'react'; -import { toRelativeTimeString } from '../../../../../shared/time/timeUtils'; -import SchemaTimeStamps from '../../schema/components/SchemaTimeStamps'; - -describe('SchemaTimeStamps', () => { - it('should render last observed text if lastObserved is not null', () => { - const { getByText, queryByText } = render(); - expect(getByText(`Last observed ${toRelativeTimeString(123)}`)).toBeInTheDocument(); - expect(queryByText(`Reported ${toRelativeTimeString(123)}`)).toBeNull(); - }); - - it('should render last updated text if lastObserved is null', () => { - const { getByText, queryByText } = render(); - expect(queryByText(`Last observed ${toRelativeTimeString(123)}`)).toBeNull(); - expect(getByText(`Reported 
${toRelativeTimeString(123)}`)).toBeInTheDocument(); - }); - - it('should return null if lastUpdated and lastObserved are both null', () => { - const { container } = render(); - expect(container.firstChild).toBeNull(); - }); -}); diff --git a/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaHeader.tsx b/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaHeader.tsx index 9e9e0ede2a1cef..2fc8fc11cd1b29 100644 --- a/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaHeader.tsx +++ b/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaHeader.tsx @@ -17,7 +17,6 @@ import { SemanticVersionStruct } from '../../../../../../types.generated'; import { toRelativeTimeString } from '../../../../../shared/time/timeUtils'; import { ANTD_GRAY, REDESIGN_COLORS } from '../../../../shared/constants'; import { navigateToVersionedDatasetUrl } from '../../../../shared/tabs/Dataset/Schema/utils/navigateToVersionedDatasetUrl'; -import SchemaTimeStamps from './SchemaTimeStamps'; import getSchemaFilterFromQueryString from '../../../../shared/tabs/Dataset/Schema/utils/getSchemaFilterFromQueryString'; const SchemaHeaderContainer = styled.div` @@ -137,8 +136,6 @@ type Props = { hasKeySchema: boolean; showKeySchema: boolean; setShowKeySchema: (show: boolean) => void; - lastUpdated?: number | null; - lastObserved?: number | null; selectedVersion: string; versionList: Array; showSchemaAuditView: boolean; @@ -158,8 +155,6 @@ export default function SchemaHeader({ hasKeySchema, showKeySchema, setShowKeySchema, - lastUpdated, - lastObserved, selectedVersion, versionList, showSchemaAuditView, @@ -255,7 +250,6 @@ export default function SchemaHeader({ )} - - {lastObserved && ( - Last observed on {toLocalDateTimeString(lastObserved)}. - )} - {lastUpdated &&
First reported on {toLocalDateTimeString(lastUpdated)}.
} - - } - > - - {lastObserved && ( - - Last observed {toRelativeTimeString(lastObserved)} - - )} - {!lastObserved && lastUpdated && ( - - - Reported {toRelativeTimeString(lastUpdated)} - - )} - - - ); -} - -export default SchemaTimeStamps; diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/SchemaTab.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/SchemaTab.tsx index 4bdb2dac033e7b..75027e17b6d0c1 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/SchemaTab.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/SchemaTab.tsx @@ -151,9 +151,6 @@ export const SchemaTab = ({ properties }: { properties?: any }) => { return groupByFieldPath(filteredRows, { showKeySchema }); }, [showKeySchema, filteredRows]); - const lastUpdated = getSchemaBlameData?.getSchemaBlame?.version?.semanticVersionTimestamp; - const lastObserved = versionedDatasetData.data?.versionedDataset?.schema?.lastObserved; - const schemaFieldBlameList: Array = (getSchemaBlameData?.getSchemaBlame?.schemaFieldBlameList as Array) || []; @@ -167,8 +164,6 @@ export const SchemaTab = ({ properties }: { properties?: any }) => { hasKeySchema={hasKeySchema} showKeySchema={showKeySchema} setShowKeySchema={setShowKeySchema} - lastObserved={lastObserved} - lastUpdated={lastUpdated} selectedVersion={selectedVersion} versionList={versionList} showSchemaAuditView={showSchemaAuditView} From 9c0f4de38241477524682943c815d5c03259e1a5 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Fri, 10 Nov 2023 16:06:06 +0900 Subject: [PATCH 16/29] docs: adjust sidebar & create new admin section (#9064) --- docs-website/sidebars.js | 213 ++++++++++++++++++++------------------ docs/CODE_OF_CONDUCT.md | 2 +- docs/saas.md | 14 --- docs/townhall-history.md | 216 +++++++++++++++++++++++---------------- docs/townhalls.md | 11 +- 5 files changed, 253 insertions(+), 203 deletions(-) delete mode 100644 docs/saas.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 4d2420256ebff3..f15f2927379c56 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -9,17 +9,13 @@ module.exports = { overviewSidebar: [ { - label: "Getting Started", + label: "What Is DataHub?", type: "category", collapsed: true, + link: { type: "doc", id: "docs/features" }, items: [ // By the end of this section, readers should understand the core use cases that DataHub addresses, // target end-users, high-level architecture, & hosting options - { - type: "doc", - label: "Introduction", - id: "docs/features", - }, { type: "doc", label: "Quickstart", @@ -31,7 +27,6 @@ module.exports = { href: "https://demo.datahubproject.io/", }, "docs/what-is-datahub/datahub-concepts", - "docs/saas", ], }, { @@ -161,7 +156,15 @@ module.exports = { "docs/deploy/azure", "docker/README", "docs/deploy/kubernetes", + "docs/deploy/confluent-cloud", "docs/deploy/environment-vars", + "docs/how/extract-container-logs", + ], + }, + { + type: "category", + label: "Admin", + items: [ { Authentication: [ "docs/authentication/README", @@ -195,20 +198,91 @@ module.exports = { "docs/how/restore-indices", "docs/advanced/db-retention", "docs/advanced/monitoring", - "docs/how/extract-container-logs", "docs/deploy/telemetry", "docs/how/kafka-config", - "docs/deploy/confluent-cloud", "docs/advanced/no-code-upgrade", "docs/how/jattach-guide", ], }, - "docs/how/updating-datahub", ], }, { - API: [ - "docs/api/datahub-apis", + Developers: [ + // The purpose of this section is to provide developers & technical users 
with + // concrete tutorials for how to work with the DataHub CLI & APIs + { + Architecture: [ + "docs/architecture/architecture", + "docs/components", + "docs/architecture/metadata-ingestion", + "docs/architecture/metadata-serving", + "docs/architecture/docker-containers", + ], + }, + { + "Metadata Model": [ + "docs/modeling/metadata-model", + "docs/modeling/extending-the-metadata-model", + "docs/what/mxe", + { + Entities: [ + { + type: "autogenerated", + dirName: "docs/generated/metamodel/entities", // '.' means the current docs folder + }, + ], + }, + ], + }, + { + "Developing on DataHub": [ + "docs/developers", + "docs/docker/development", + "metadata-ingestion/developing", + "docs/api/graphql/graphql-endpoint-development", + { + Modules: [ + "datahub-web-react/README", + "datahub-frontend/README", + "datahub-graphql-core/README", + "metadata-service/README", + "metadata-jobs/mae-consumer-job/README", + "metadata-jobs/mce-consumer-job/README", + ], + }, + ], + }, + "docs/plugins", + { + Troubleshooting: [ + "docs/troubleshooting/quickstart", + "docs/troubleshooting/build", + "docs/troubleshooting/general", + ], + }, + { + Advanced: [ + "metadata-ingestion/docs/dev_guides/reporting_telemetry", + "docs/advanced/mcp-mcl", + "docker/datahub-upgrade/README", + "docs/advanced/no-code-modeling", + "datahub-web-react/src/app/analytics/README", + "docs/how/migrating-graph-service-implementation", + "docs/advanced/field-path-spec-v2", + "metadata-ingestion/adding-source", + "docs/how/add-custom-ingestion-source", + "docs/how/add-custom-data-platform", + "docs/advanced/browse-paths-upgrade", + "docs/browseV2/browse-paths-v2", + ], + }, + ], + }, + { + type: "category", + label: "API", + link: { type: "doc", id: "docs/api/datahub-apis" }, + items: [ { "GraphQL API": [ { @@ -466,92 +540,14 @@ module.exports = { ], }, { - Develop: [ - // The purpose of this section is to provide developers & technical users with - // concrete tutorials for how to work with the DataHub CLI & APIs - { - "DataHub Metadata Model": [ - "docs/modeling/metadata-model", - "docs/modeling/extending-the-metadata-model", - "docs/what/mxe", - { - Entities: [ - { - type: "autogenerated", - dirName: "docs/generated/metamodel/entities", // '.' 
means the current docs folder - }, - ], - }, - ], - }, - { - Architecture: [ - "docs/architecture/architecture", - "docs/components", - "docs/architecture/metadata-ingestion", - "docs/architecture/metadata-serving", - "docs/architecture/docker-containers", - ], - }, - { - "Developing on DataHub": [ - "docs/developers", - "docs/docker/development", - "metadata-ingestion/developing", - "docs/api/graphql/graphql-endpoint-development", - { - Modules: [ - "datahub-web-react/README", - "datahub-frontend/README", - "datahub-graphql-core/README", - "metadata-service/README", - "metadata-jobs/mae-consumer-job/README", - "metadata-jobs/mce-consumer-job/README", - ], - }, - ], - }, - "docs/plugins", - { - Troubleshooting: [ - "docs/troubleshooting/quickstart", - "docs/troubleshooting/build", - "docs/troubleshooting/general", - ], - }, - { - Advanced: [ - "metadata-ingestion/docs/dev_guides/reporting_telemetry", - "docs/advanced/mcp-mcl", - "docker/datahub-upgrade/README", - "docs/advanced/no-code-modeling", - "datahub-web-react/src/app/analytics/README", - "docs/how/migrating-graph-service-implementation", - "docs/advanced/field-path-spec-v2", - "metadata-ingestion/adding-source", - "docs/how/add-custom-ingestion-source", - "docs/how/add-custom-data-platform", - "docs/advanced/browse-paths-upgrade", - "docs/browseV2/browse-paths-v2", - ], - }, - ], - }, - { - Community: [ - "docs/slack", - "docs/townhalls", - "docs/townhall-history", - "docs/CODE_OF_CONDUCT", - "docs/CONTRIBUTING", - "docs/links", - "docs/rfc", - "SECURITY", - ], - }, - { - "Managed DataHub": [ - "docs/managed-datahub/managed-datahub-overview", + label: "Managed DataHub", + type: "category", + collapsed: true, + link: { + type: "doc", + id: "docs/managed-datahub/managed-datahub-overview", + }, + items: [ "docs/managed-datahub/welcome-acryl", { type: "doc", @@ -648,7 +644,26 @@ module.exports = { ], }, { - "Release History": ["releases"], + label: "Community", + type: "category", + collapsed: true, + link: { + type: "generated-index", + title: "Community", + description: "Learn about DataHub community.", + }, + items: [ + "docs/slack", + "docs/townhalls", + // "docs/townhall-history", + "docs/CODE_OF_CONDUCT", + "docs/CONTRIBUTING", + "docs/links", + "docs/rfc", + ], + }, + { + "Release History": ["releases", "docs/how/updating-datahub"], }, // "Candidates for Deprecation": [ diff --git a/docs/CODE_OF_CONDUCT.md b/docs/CODE_OF_CONDUCT.md index 1c4fd659f14e09..ca899dc26d5f71 100644 --- a/docs/CODE_OF_CONDUCT.md +++ b/docs/CODE_OF_CONDUCT.md @@ -1,4 +1,4 @@ -# Contributor Covenant Code of Conduct +# Code of Conduct ## Our Pledge diff --git a/docs/saas.md b/docs/saas.md deleted file mode 100644 index de57b5617e0626..00000000000000 --- a/docs/saas.md +++ /dev/null @@ -1,14 +0,0 @@ -# DataHub SaaS - -Sign up for fully managed, hassle-free and secure SaaS service for DataHub, provided by [Acryl Data](https://www.acryl.io/). - -
-[Sign up button image linking to Acryl Data]
- -Refer to [Managed Datahub Exclusives](/docs/managed-datahub/managed-datahub-overview.md) for more information. diff --git a/docs/townhall-history.md b/docs/townhall-history.md index d92905af0cd72c..0242e4ec2cee12 100644 --- a/docs/townhall-history.md +++ b/docs/townhall-history.md @@ -1,22 +1,55 @@ -# Town Hall History +# Town Hall History -A list of previous Town Halls, their planned schedule, and the recording of the meeting. +:::note +For the Town Hall meetings after June 2023, please refer to our [LinkedIn Live event history](https://www.linkedin.com/company/acryl-data/events/). +::: -## 03/23/2023 -[Full YouTube video](https://youtu.be/BTX8rIBe0yo) +### June 2023 +[Full YouTube video](https://www.youtube.com/watch?v=1QVcUmRQK5E) + +- Community & Project Updates - Maggie Hays & Shirshanka Das (Acryl Data) +- Community Case Study: Dataset Joins - Raj Tekal & Bobbie-Jean Nowak (Optum) +- DataHub 201: Column-Level Lineage - Hyejin Yoon (Acryl Data) +- Sneak Peek: BigQuery Column-Level Lineage with SQL Parsing - Harshal Sheth (Acryl Data) +- DataHub Performance Tuning – Indy Prentice (Acryl Data) + + +### May 2023 +[Full YouTube video](https://www.youtube.com/watch?v=KHNPjSbbZR8) + +**Agenda** +- Community - Maggie Hays & Shirshanka Das (Acryl Data) +- Community Case Study: Jira + DataHub for Access Requests - Joshua Garza (Sharp Healthcare) +- Sneak Peek: Use your own ownership types - Pedro Silva (Acryl Data) +- Sneak Peek: Data Contracts are coming! – John Joyce, Shirshanka (Acryl Data) +- Bring DataHub into your BI Tools — Chris Collins (Acryl Data) + +### Apr 2023 +[Full YouTube video](https://www.youtube.com/watch?v=D5YYGu-ZIBo) + +**Agenda** +- Community & Roadmap Updates - Maggie Hays & Shirshanka Das (Acryl Data) +- DataHub 201: Python SDK - Hyejin Yoon (Acryl Data) +- Streamlined Search & Browse Experience - Chris Collins (Acryl Data) +- Acryl's DataHub GitHub Actions - Harshal Sheth (Acryl Data) +- Data Products in DataHub - Shirshanka Das & Chris Collins (Acryl Data) +- DataHub Docs Bot - Maggie Hays (Acryl Data) + +### Mar 2023 -### Agenda +[Full YouTube video](https://youtu.be/BTX8rIBe0yo) +**Agenda** - Community & Roadmap Update - Recent Releases - Community Case Study — Jumio’s DataHub adoption journey - DataHub 201: Data Debugging - Sneak Peek: Streamlined Filtering Experience -## 02/23/2023 +### Feb 2023 [Full YouTube video](https://youtu.be/UItt4ppJSFc) -### Agenda +**Agenda** - Community & Roadmap Update - Recent Releases @@ -27,20 +60,20 @@ A list of previous Town Halls, their planned schedule, and the recording of the - Simplifying Metadata Ingestion - DataHub 201: Rolling Out DataHub -## 01/26/2023 +### Jan 2023 (26th) [Full YouTube video](https://youtu.be/A3mSiGHZ6Rc) -### Agenda +**Agenda** - What’s to Come - Q1 2023 Roadmap: Data Products, Data Contracts and more - Community Case Study - Notion: Automating annotations and metadata propagation - Community Contribution - Grab: Improvements to documentation editing - Simplifying DataHub - Removing Schema Registry requirement and introducing DataHub Lite -## 01/05/2023 +### Jan 2023 (5th) [Full YouTube video](https://youtu.be/ECxIMbKwuOY) -### Agenda +**Agenda** - DataHub Community: 2022 in Review - Our Community of Data Practitioners is one of a kind. We’ll take the time to celebrate who we are, what we’ve built, and how we’ve collaborated in the past 12 months. 
- Search Improvements - Learn how we’re making the Search experience smarter and faster to connect you with the most relevant resources during data discovery. @@ -49,13 +82,12 @@ A list of previous Town Halls, their planned schedule, and the recording of the - Sneak Peek: Time-based Lineage - Get a preview of how you’ll soon be able to trace lineage between datasets across different points in time to understand how interdependencies have evolved. - Sneak Peek: Chrome Extension - Soon, you’ll be able to quickly access rich metadata from DataHub while exploring resources in Looker via our upcoming Chrome Extension. -## 12/01/2022 +### Dec 2023 [Full YouTube video](https://youtu.be/BlCLhG8lGoY) -### Agenda +**Agenda** November Town Hall (in December!) - - Community Case Study - The Pinterest Team will share how they have integrated DataHub + Thrift and extended the Metadata Model with a Data Element entity to capture semantic types. - NEW! Ingestion Quickstart Guides - DataHub newbies, this one is for you! We’re rolling out ingestion quickstart guides to help you quickly get up and running with DataHub + Snowflake, BigQuery, and more! - NEW! In-App Product Tours - We’re making it easier than ever for end-users to get familiar with all that DataHub has to offer - hear all about the in-product onboarding resources we’re rolling out soon! @@ -64,10 +96,10 @@ November Town Hall (in December!) - NEW! Slack + Microsoft Teams Integrations - Send automated alerts to Slack and/or Teams to keep track of critical events and changes within DataHub. - Hacktoberfest Winners Announced - We’ll recap this year’s Hacktoberfest and announce three winners of a $250 Amazon gift card & DataHub Swag. -## 10/27/2022 +### Oct 2022 [Full YouTube video](https://youtu.be/B74WHxX5EMk) -### Agenda +**Agenda** - Conquer Data Governance with Acryl Data’s Metadata Tests - Learn how to tackle Data Governance with incremental, automation-driven governance using Metadata Tests provided in Acryl Data’s managed DataHub offering - Community Case Study - The Grab Team shares how they are using DataHub for data discoverability, automated classification and governance workflows, data quality observability, and beyond! @@ -75,20 +107,19 @@ November Town Hall (in December!) - Sneak Peek! Saved Views - Learn how you can soon use Saved Views to help end-users navigate entities in DataHub with more precision and focus - Performance Improvements - Hear about the latest upgrades to DataHub performance -## 9/29/2022 +### Sep 2022 [Full YouTube video](https://youtu.be/FjkNySWkghY) -### Agenda - +**Agenda** - Column Level Lineage is here! - Demo of column-level lineage and impact analysis in the DataHub UI - Community Case Study - The Stripe Team shares how they leverage DataHub to power observability within their Airflow-based ecosystem - Sneak Peek! Automated PII Classification - Preview upcoming functionality to automatically identify data fields that likely contain sensitive data - Ingestion Improvements Galore - Improved performance and functionality for dbt, Looker, Tableau, and Presto ingestion sources -## 8/25/2022 +### Aug 2022 [Full YouTube video](https://youtu.be/EJCKxKBvCwo) -### Agenda +**Agenda** - Community Case Study - The Etsy Team shares their journey of adopting DataHub - Looker & DataHub Improvements - surface the most relevant Looks and Dashboards @@ -97,10 +128,11 @@ November Town Hall (in December!) 
- Patch Support - Native support for PATCH in the metadata protocol to support efficient updates to add & remove owners, lineage, tags and more - Sneak Peek! Advanced Search -## 7/28/2022 +### Jul 2022 + [Full YouTube video](https://youtu.be/Zrkf3Mzcvc4) -### Agenda +**Agenda** - Community Updates - Project Updates @@ -109,21 +141,20 @@ November Town Hall (in December!) - Streamlined Metadata Ingestion - DataHub 201: Metadata Enrichment -## 6/30/2022 +### Jun 2022 [Full YouTube video](https://youtu.be/fAD53fEJ6m0) -### Agenda - +**Agenda** - Community Updates - Project Updates - dbt Integration Updates - CSV Ingestion Support - DataHub 201 - Glossary Term Deep Dive -## 5/26/2022 +### May 2022 [Full YouTube video](https://youtu.be/taKb_zyowEE) -### Agenda +**Agenda** - Community Case Study: Hear how the G-Research team is using Cassandra as DataHub’s Backend - Creating & Editing Glossary Terms from the DataHub UI @@ -132,20 +163,22 @@ November Town Hall (in December!) - Sneak Peek: Data Reliability with DataHub - Metadata Day Hackathon Winners -## 4/28/2022 +### Apr 2022 [Full YouTube video](https://www.youtube.com/watch?v=7iwNxHgqxtg) -### Agenda +**Agenda** + - Community Case Study: Hear from Included Health about how they are embedding external tools into the DataHub UI - New! Actions Framework: run custom code when changes happen within DataHub - UI Refresh for ML Entities - Improved deletion support for time-series aspects, tags, terms, & more - OpenAPI Improvements -## 3/31/2022 +### Mar 2022 [Full YouTube video](https://www.youtube.com/watch?v=IVazVgcNRdw) -### Agenda +**Agenda** + - Community Case Study: Hear from Zendesk about how they are applying “shift left” principles by authoring metadata in their Protobuf schemas - RBAC Functionality: View-Based Policies - Schema Version History - surfacing the history of schema changes in DataHub's UI @@ -154,20 +187,22 @@ November Town Hall (in December!) - Delete API -## 2/25/2022 +### Feb 2022 [Full YouTube video](https://www.youtube.com/watch?v=enBqB2Dbuv4) -### Agenda +**Agenda** + - Lineage Impact Analysis - using DataHub to understand the impact of changes on downstream dependencies - Displaying Data Quality Checks in the UI - Roadmap update: Schema Version History & Column-Level Lineage - Community Case Study: Managing Lineage via YAML -## 1/28/2022 +### Jan 2022 [Full YouTube video](https://youtu.be/ShlSR3dMUnE) -### Agenda +**Agenda** + - Community & Roadmap Updates by Maggie Hays (Acryl Data) - Project Updates by Shirshanka Das (Acryl Data) @@ -176,10 +211,11 @@ November Town Hall (in December!) - DataHub Basics — Data Profiling & Usage Stats 101 by Maggie Hays & Tamás Németh (Acryl Data) - Demo: Spark Lineage by Mugdha Hardikar (GS Lab) & Shirshanka Das -## 12/17/2021 +### Dec 2021 [Full YouTube video](https://youtu.be/rYInKCwxu7o) -### Agenda +**Agenda** + - Community & Roadmap Updates by Maggie Hays (Acryl Data) - Project Updates by Shirshanka Das (Acryl Data) - 2021 DataHub Community in Review by Maggie Hays @@ -189,10 +225,11 @@ November Town Hall (in December!) - Top DataHub Contributors of 2021 - Maggie Hays - Final Surprise! We Interviewed a 10yo and a 70yo about DataHub -## 11/19/2021 +### Nov 2021 [Full YouTube video](https://youtu.be/to80sEDZz7k) -### Agenda +**Agenda** + - Community & Roadmap Updates by Maggie Hays (Acryl Data) - Project Updates by Shirshanka Das (Acryl Data) - DataHub Basics -- Lineage 101 by John Joyce & Surya Lanka (Acryl Data) @@ -200,10 +237,11 @@ November Town Hall (in December!) 
- DataHub API Authentication by John Joyce (Acryl Data) - Case Study: LinkedIn pilot to extend the OSS UI by Aikepaer Abuduweili & Joshua Shinavier -## 10/29/2021 +### Oct 2021 [Full YouTube video](https://youtu.be/GrS_uZhYNm0) -### Agenda +**Agenda** + - DataHub Community & Roadmap Update - Maggie Hays (Acryl Data) - October Project Updates - Shirshanka Das (Acryl Data) - Introducing Recommendations - John Joyce & Dexter Lee (Acryl Data) @@ -211,10 +249,11 @@ November Town Hall (in December!) - Data Profiling Improvements - Surya Lanka & Harshal Sheth (Acryl Data) - Lineage Improvements & BigQuery Dataset Lineage by Gabe Lyons & Varun Bharill (Acryl Data) -## 9/24/2021 +### Sep 2021 [Full YouTube video](https://youtu.be/nQDiKPKnLLQ) -### Agenda +**Agenda** + - Project Updates and Callouts by Shirshanka - GraphQL Public API Annoucement - Demo: Faceted Search by Gabe Lyons (Acryl Data) @@ -224,10 +263,11 @@ November Town Hall (in December!) - Offline - Foreign Key and Related Term Mapping by Gabe Lyons (Acryl Data) [video](https://www.loom.com/share/79f27c2d9f6c4a3b8aacbc48c19add18) -## 8/27/2021 +### Aug 2021 [Full YouTube video](https://youtu.be/3joZINi3ti4) -### Agenda +**Agenda** + - Project Updates and Callouts by Shirshanka - Business Glossary Demo - 0.8.12 Upcoming Release Highlights @@ -239,12 +279,13 @@ November Town Hall (in December!) - Performance Monitoring by Dexter Lee (Acryl Data) [video](https://youtu.be/6Xfr_Y9abZo) -## 7/23/2021 +### Jul 2021 [Full YouTube video](https://www.youtube.com/watch?v=rZsiB8z5rG4) [Medium Post](https://medium.com/datahub-project/datahub-project-updates-f4299cd3602e?source=friends_link&sk=27af7637f7ae44786ede694c3af512a5) -### Agenda +**Agenda** + - Project Updates by Shirshanka - Release highlights @@ -253,12 +294,13 @@ November Town Hall (in December!) - Demo: AWS SageMaker integration for Models and Features by Kevin Hu (Acryl Data) -## 6/25/2021 +### Jun 2021 [Full YouTube video](https://www.youtube.com/watch?v=xUHOdDfdFpY) [Medium Post](https://medium.com/datahub-project/datahub-project-updates-ed3155476408?source=friends_link&sk=02816a16ff2acd688e6db8eb55808d31) -#### Agenda +**Agenda** + - Project Updates by Shirshanka - Release notes @@ -269,12 +311,13 @@ November Town Hall (in December!) - Developer Session: Simplified Deployment for DataHub by John Joyce, Gabe Lyons (Acryl Data) -## 5/27/2021 +### May 2021 [Full YouTube video](https://www.youtube.com/watch?v=qgW_xpIr1Ho) [Medium Post](https://medium.com/datahub-project/linkedin-datahub-project-updates-ed98cdf913c1?source=friends_link&sk=9930ec5579299b155ea87c747683d1ad) -#### Agenda +**Agenda** + - Project Updates by Shirshanka - 10 mins - 0.8.0 Release @@ -284,12 +327,13 @@ November Town Hall (in December!) - Deep Dive: No Code Metadata Engine by John Joyce (Acryl Data) - 20 mins - General Q&A and closing remarks -## 4/23/2021 +### Apr 2021 [Full YouTube video](https://www.youtube.com/watch?v=dlFa4ubJ9ho) [Medium Digest](https://medium.com/datahub-project/linkedin-datahub-project-updates-2b0d26066b8f?source=friends_link&sk=686c47219ed294e0838ae3e2fe29084d) -#### Agenda +**Agenda** + - Welcome - 5 mins - Project Updates by Shirshanka - 10 mins @@ -302,12 +346,13 @@ November Town Hall (in December!) 
- General Q&A and closing remarks - 5 mins -## 3/19/2021 +### Mar 2021 [YouTube video](https://www.youtube.com/watch?v=xE8Uc27VTG4) - + [Medium Digest](https://medium.com/datahub-project/linkedin-datahub-project-updates-697f0faddd10?source=friends_link&sk=9888633c5c7219b875125e87a703ec4d) -#### Agenda +**Agenda** + * Welcome - 5 mins * Project Updates ([slides](https://drive.google.com/file/d/1c3BTP3oDAzJr07l6pY6CkDZi5nT0cLRs/view?usp=sharing)) by [Shirshanka](https://www.linkedin.com/in/shirshankadas/) - 10 mins @@ -320,11 +365,11 @@ November Town Hall (in December!) * Closing remarks - 5 mins -## 2/19/2021 +### Feb 2021 [YouTube video](https://www.youtube.com/watch?v=Z9ImbcsAVl0) - + [Medium Digest](https://medium.com/datahub-project/linkedin-datahub-project-updates-february-2021-edition-338d2c6021f0) -#### Agenda +**Agenda** * Welcome - 5 mins * Latest React App Demo! ([video](https://www.youtube.com/watch?v=RQBEJhcen5E)) by John Joyce and Gabe Lyons - 5 mins @@ -334,12 +379,12 @@ November Town Hall (in December!) * Closing remarks - 5 mins -## 1/15/2021 +### Jan 2021 [Full Recording](https://youtu.be/r862MZTLAJ0) [Slide-deck](https://docs.google.com/presentation/d/e/2PACX-1vQ2B0iHb2uwege1wlkXHOgQer0myOMEE5EGnzRjyqw0xxS5SaAc8VMZ_1XVOHuTZCJYzZZW4i9YnzSN/pub?start=false&loop=false&delayms=3000) -Agenda +**Agenda** - Announcements - 2 mins - Community Updates ([video](https://youtu.be/r862MZTLAJ0?t=99)) - 10 mins @@ -349,10 +394,10 @@ Agenda - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 minutes -## 12/04/2020 +### Dec 2020 [Recording](https://linkedin.zoom.us/rec/share/8E7-lFnCi_kQ8OvXR9kW6fn-AjvV8VlqOO2xYR8b5Y_UeWI_ODcKFlxlHqYgBP7j.S-c8C1YMrz7d3Mjq) -Agenda +**Agenda** - Quick intro - 5 mins - [Why did Grofers choose DataHub for their data catalog?](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Datahub_at_Grofers.pdf) by [Shubham Gupta](https://www.linkedin.com/in/shubhamg931/) - 15 minutes @@ -360,11 +405,11 @@ Agenda - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 minutes -## 11/06/2020 +### Nov 2020 [Recording](https://linkedin.zoom.us/rec/share/0yvjZ2fOzVmD8aaDo3lC59fXivmYG3EnF0U9tMVgKs827595usvSoIhtFUPjZCsU.b915nLRkw6iQlnoD) -Agenda +**Agenda** - Quick intro - 5 mins - [Lightning talk on Metadata use-cases at LinkedIn](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Metadata_Use-Cases_at_LinkedIn_-_Lightning_Talk.pdf) by [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) (LinkedIn) - 5 mins @@ -374,11 +419,11 @@ Agenda - Closing remarks - 5 minutes -## 09/25/2020 +### Sep 2020 [Recording](https://linkedin.zoom.us/rec/share/uEQ2pRY0BHbVqk_sOTVRm05VXJ0xM_zKJ26yzfCBqNZItiBht__k_juCCahJ37QK.IKAU9qA_0qdURX4_) -Agenda +**Agenda** - Quick intro - 5 mins - [Data Discoverability at SpotHero](https://github.com/acryldata/static-assets-test/raw/master/imgs/demo/Data_Discoverability_at_SpotHero.pdf) by [Maggie Hays](https://www.linkedin.com/in/maggie-hays/) (SpotHero) - 20 mins @@ -386,23 +431,23 @@ Agenda - General Q&A from sign up sheet, slack, and participants - 15 mins - Closing remarks - 5 mins -## 08/28/2020 +### Aug 2020 [Recording](https://linkedin.zoom.us/rec/share/vMBfcb31825IBZ3T71_wffM_GNv3T6a8hicf8_dcfzQlhfFxl5i_CPVKcmYaZA) -Agenda +**Agenda** - Quick intro - 5 mins - [Data Governance look for a Digital Bank](https://www.slideshare.net/SheetalPratik/linkedinsaxobankdataworkbench) by [Sheetal Pratik](https://www.linkedin.com/in/sheetalpratik/) 
(Saxo Bank) - 20 mins - Column level lineage for datasets demo by [Nagarjuna Kanamarlapudi](https://www.linkedin.com/in/nagarjunak/) (LinkedIn) - 15 mins - General Q&A from sign up sheet and participants - 15 mins - Closing remarks - 5 mins -## 07/31/20 +### Jul 2020 [Recording](https://bluejeans.com/s/wjnDRJevi5z/) -Agenda +**Agenda** * Quick intro - 5 mins * Showcasing new entities onboarded to internal LinkedIn DataHub (Data Concepts, Schemas) by [Nagarjuna Kanamarlapudi](https://www.linkedin.com/in/nagarjunak) (LinkedIn) - 15 mins * Showcasing new Lineage UI in internal LinkedIn DataHub By [Ignacio Bona](https://www.linkedin.com/in/ignaciobona) (LinkedIn) - 10 mins @@ -410,12 +455,12 @@ Agenda * Answering questions from the signup sheet - 13 mins * Questions from the participants - 10 mins * Closing remarks - 5 mins - -## 06/26/20 + +### June 2020 [Recording](https://bluejeans.com/s/yILyR/) -Agenda +**Agenda** * Quick intro - 5 mins * Onboarding Data Process entity by [Liangjun Jiang](https://github.com/liangjun-jiang) (Expedia) - 15 mins * How to onboard a new relationship to metadata graph by [Kerem Sahin](https://github.com/keremsahin1) (Linkedin) - 15 mins @@ -423,11 +468,11 @@ Agenda * Questions from the participants - 10 mins * Closing remarks - 5 mins -## 05/29/20 +### May 2020 [Recording](https://bluejeans.com/s/GCAzY) -Agenda +**Agenda** * Quick intro - 5 mins * How to add a new aspect/feature for an existing entity in UI by [Charlie Tran](https://www.linkedin.com/in/charlie-tran/) (LinkedIn) - 10 mins * How to search over a new field by [Jyoti Wadhwani](https://www.linkedin.com/in/jyotiwadhwani/) (LinkedIn) - 10 mins @@ -435,11 +480,11 @@ Agenda * Questions from the participants - 10 mins * Closing remarks - 5 mins -## 04/17/20 +### Apr 2020 (17th) [Recording](https://bluejeans.com/s/eYRD4) -Agenda +**Agenda** * Quick intro - 5 mins * [DataHub Journey with Expedia Group](https://www.youtube.com/watch?v=ajcRdB22s5o&ab_channel=ArunVasudevan) by [Arun Vasudevan](https://www.linkedin.com/in/arun-vasudevan-55117368/) (Expedia) - 10 mins * Deploying DataHub using Nix by [Larry Luo](https://github.com/clojurians-org) (Shanghai HuaRui Bank) - 10 mins @@ -447,13 +492,13 @@ Agenda * Questions from the participants - 10 mins * Closing remarks - 5 mins -## 04/03/20 +### Apr 2020 (3rd) [Recording](https://bluejeans.com/s/vzYpa) [Q&A](https://docs.google.com/document/d/1ChF9jiJWv9wj3HLLkFYRg7NSYg8Kb0PT7COd7Hf9Zpk/edit?usp=sharing) -- Agenda +- **Agenda** * Quick intro - 5 mins * Creating Helm charts for deploying DataHub on Kubernetes by [Bharat Akkinepalli](https://www.linkedin.com/in/bharat-akkinepalli-ba0b7223/) (ThoughtWorks) - 10 mins * How to onboard a new metadata aspect by [Mars Lan](https://www.linkedin.com/in/marslan) (LinkedIn) - 10 mins @@ -461,13 +506,13 @@ Agenda * Questions from the participants - 10 mins * Closing remarks - 5 mins -## 03/20/20 +### Mar 2020 (20th) [Recording](https://bluejeans.com/s/FSKEF) [Q&A](https://docs.google.com/document/d/1vQ6tAGXsVafnPIcZv1GSYgnTJJXFOACa1aWzOQjiGHI/edit) -Agenda +**Agenda** * Quick intro - 5 mins * Internal DataHub demo - 10 mins * What's coming up next for DataHub (what roadmap items we are working on) - 10 mins @@ -475,9 +520,8 @@ Agenda * Questions from the participants - 10 mins * Closing remarks - 5 mins -## 03/06/20 +### Mar 2020 (6th) [Recording](https://bluejeans.com/s/vULMG) -[Q&A](https://docs.google.com/document/d/1N_VGqlH9CD-54LBsVlpcK2Cf2Mgmuzq79EvN9qgBqtQ/edit) - 
+[Q&A](https://docs.google.com/document/d/1N_VGqlH9CD-54LBsVlpcK2Cf2Mgmuzq79EvN9qgBqtQ/edit) \ No newline at end of file diff --git a/docs/townhalls.md b/docs/townhalls.md index f9c3bb16150cd8..c80d198e5184c7 100644 --- a/docs/townhalls.md +++ b/docs/townhalls.md @@ -7,8 +7,13 @@ From time to time we also use the opportunity to showcase upcoming features. ## Meeting Invite & Agenda -You can join with this link https://zoom.datahubproject.io, or [RSVP](https://rsvp.datahubproject.io/) to get a calendar invite - this will always have the most up-to-date agenda for upcoming sessions. +You can join with [this link](https://zoom.datahubproject.io) or [RSVP](https://rsvp.datahubproject.io/) to get a calendar invite - this will always have the most up-to-date agenda for upcoming sessions. + +## Town Hall History + +See our Town Hall history for the recordings and summaries of the past town halls. + +* [Town Hall Events (July 2023~)](https://www.linkedin.com/company/acryl-data/events/) +* [Town Hall Events (~June 2023)](townhall-history.md) -## Past Meetings -See [Town Hall History](townhall-history.md) for recordings of past town halls. From 179f103412d036212a1155d436a507def4f4928f Mon Sep 17 00:00:00 2001 From: Xuelei Li <115022112+lix-mms@users.noreply.github.com> Date: Fri, 10 Nov 2023 17:58:38 +0100 Subject: [PATCH 17/29] fix(metadata-io): in Neo4j service use proper algorithm to get lineage (#8687) Co-authored-by: RyanHolstien Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- build.gradle | 8 +- docker/neo4j/env/docker.env | 1 + .../docker-compose-m1.quickstart.yml | 1 + .../quickstart/docker-compose.quickstart.yml | 1 + docs/how/updating-datahub.md | 1 + metadata-io/build.gradle | 3 + .../graph/neo4j/Neo4jGraphService.java | 231 +++++++++--------- .../graph/neo4j/Neo4jGraphServiceTest.java | 130 ++++++++++ .../graph/neo4j/Neo4jTestServerBuilder.java | 6 +- 9 files changed, 267 insertions(+), 115 deletions(-) diff --git a/build.gradle b/build.gradle index 31e005e001cf05..54802917d05a52 100644 --- a/build.gradle +++ b/build.gradle @@ -7,6 +7,8 @@ buildscript { ext.springBootVersion = '2.7.14' ext.openTelemetryVersion = '1.18.0' ext.neo4jVersion = '4.4.9' + ext.neo4jTestVersion = '4.4.25' + ext.neo4jApocVersion = '4.4.0.20:all' ext.testContainersVersion = '1.17.4' ext.elasticsearchVersion = '2.9.0' // ES 7.10, Opensearch 1.x, 2.x ext.jacksonVersion = '2.15.2' @@ -154,8 +156,10 @@ project.ext.externalDependency = [ 'mockServer': 'org.mock-server:mockserver-netty:5.11.2', 'mockServerClient': 'org.mock-server:mockserver-client-java:5.11.2', 'mysqlConnector': 'mysql:mysql-connector-java:8.0.20', - 'neo4jHarness': 'org.neo4j.test:neo4j-harness:' + neo4jVersion, + 'neo4jHarness': 'org.neo4j.test:neo4j-harness:' + neo4jTestVersion, 'neo4jJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jVersion, + 'neo4jTestJavaDriver': 'org.neo4j.driver:neo4j-java-driver:' + neo4jTestVersion, + 'neo4jApoc': 'org.neo4j.procedure:apoc:' + neo4jApocVersion, 'opentelemetryApi': 'io.opentelemetry:opentelemetry-api:' + openTelemetryVersion, 'opentelemetryAnnotations': 'io.opentelemetry:opentelemetry-extension-annotations:' + openTelemetryVersion, 'opentracingJdbc':'io.opentracing.contrib:opentracing-jdbc:0.2.15', @@ -218,7 +222,7 @@ project.ext.externalDependency = [ 'common': 'commons-io:commons-io:2.7', 'jline':'jline:jline:1.4.1', 'jetbrains':' org.jetbrains.kotlin:kotlin-stdlib:1.6.0' - + ] allprojects { diff --git a/docker/neo4j/env/docker.env b/docker/neo4j/env/docker.env 
index 961a5ffcf54836..c8f2a4878900f4 100644
--- a/docker/neo4j/env/docker.env
+++ b/docker/neo4j/env/docker.env
@@ -1,3 +1,4 @@
 NEO4J_AUTH=neo4j/datahub
 NEO4J_dbms_default__database=graph.db
 NEO4J_dbms_allow__upgrade=true
+NEO4JLABS_PLUGINS="[\"apoc\"]"
diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml
index 613718306abef6..4df32395cf82d5 100644
--- a/docker/quickstart/docker-compose-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-m1.quickstart.yml
@@ -253,6 +253,7 @@ services:
       - NEO4J_AUTH=neo4j/datahub
       - NEO4J_dbms_default__database=graph.db
       - NEO4J_dbms_allow__upgrade=true
+      - NEO4JLABS_PLUGINS=["apoc"]
     healthcheck:
       interval: 1s
       retries: 5
diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml
index 30ccbae59be740..29c980532d46f6 100644
--- a/docker/quickstart/docker-compose.quickstart.yml
+++ b/docker/quickstart/docker-compose.quickstart.yml
@@ -253,6 +253,7 @@ services:
       - NEO4J_AUTH=neo4j/datahub
       - NEO4J_dbms_default__database=graph.db
       - NEO4J_dbms_allow__upgrade=true
+      - NEO4JLABS_PLUGINS=["apoc"]
     healthcheck:
       interval: 1s
       retries: 5
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index 28f11e4b6d7072..90b53161950e8d 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -16,6 +16,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 
 ### Breaking Changes
 
+- #8687 (datahub-helm #365 #353) - If Helm is used for installation and Neo4j is enabled, update the prerequisites Helm chart to version >=0.1.2 and adjust your value overrides in the `neo4j:` section according to the new structure.
 - #9044 - GraphQL APIs for adding ownership now expect either an `ownershipTypeUrn` referencing a custom ownership type or a (deprecated) `type`. Where before adding an ownership without a concrete type was allowed, this is no longer the case. For simplicity you can use the `type` parameter which will get translated to a custom ownership type internally if one exists for the type being added.
 - #9010 - In Redshift source's config, `incremental_lineage` now defaults to off.
 - #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now.
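Note on the changes that follow: the `NEO4JLABS_PLUGINS` entries above are needed because the reworked lineage query in `Neo4jGraphService` calls APOC's `apoc.path.spanningTree` procedure. Below is a minimal sketch of that traversal issued directly through the Neo4j Java driver. It assumes a local Neo4j 4.4 instance with APOC installed; the bolt URI, the urn, and the label/relationship filter values are illustrative placeholders rather than values taken from this patch (only the `neo4j/datahub` credentials mirror the docker env file above).

import java.util.Map;

import org.neo4j.driver.AuthTokens;
import org.neo4j.driver.Driver;
import org.neo4j.driver.GraphDatabase;
import org.neo4j.driver.Session;

// Minimal sketch of the APOC-based traversal used by the new lineage code.
// Assumes Neo4j 4.4 with the APOC plugin enabled (see NEO4JLABS_PLUGINS above);
// URI and all parameter values are illustrative.
public final class LineageQuerySketch {
  public static void main(String[] args) {
    // Same statement shape that generateLineageStatementAndParameters builds
    // for the case without time filtering.
    final String statement =
        "MATCH (a {urn: $urn}) "
            + "CALL apoc.path.spanningTree(a, { "
            + "  relationshipFilter: $relationshipFilter, "
            + "  labelFilter: $labelFilter, "
            + "  minLevel: 1, "
            + "  maxLevel: $maxHops "
            + "}) "
            + "YIELD path "
            + "WITH a, path AS path "
            + "RETURN a, path, last(nodes(path));";
    try (Driver driver =
            GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "datahub"));
        Session session = driver.session()) {
      session
          .run(
              statement,
              Map.of(
                  // hypothetical dataset urn
                  "urn", "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
                  // "+label|+label" format as built by getPathFindingLabelFilter
                  "labelFilter", "+dataset|+dataJob",
                  // "type>" / "<type" format as built by getPathFindingRelationshipFilter;
                  // the concrete relationship names here are assumptions
                  "relationshipFilter", "DownstreamOf>|Consumes>",
                  "maxHops", 3))
          .forEachRemaining(
              record ->
                  // third column is last(nodes(path)), i.e. the entity reached by this path
                  System.out.println(record.get(2).asNode().get("urn").asString()));
    }
  }
}

Compared with the previous shortestPath-based query, apoc.path.spanningTree expands each reachable node along exactly one path, which is why the Java-side deduplication of multiple paths per destination node could be removed.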
diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 740fed61f13d56..4b36f533476f76 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -57,6 +57,9 @@ dependencies { testImplementation externalDependency.h2 testImplementation externalDependency.mysqlConnector testImplementation externalDependency.neo4jHarness + testImplementation (externalDependency.neo4jApoc) { + exclude group: 'org.yaml', module: 'snakeyaml' + } testImplementation externalDependency.mockito testImplementation externalDependency.mockitoInline testImplementation externalDependency.iStackCommons diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java index 41d39cca4eddaa..ac57fb7db2b781 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java @@ -5,6 +5,7 @@ import com.datahub.util.exception.RetryLimitReached; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; import com.linkedin.common.UrnArray; import com.linkedin.common.UrnArrayArray; import com.linkedin.common.urn.Urn; @@ -25,17 +26,20 @@ import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.utils.metrics.MetricUtils; +import com.linkedin.util.Pair; import io.opentelemetry.extension.annotations.WithSpan; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.StringJoiner; -import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.StreamSupport; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.AllArgsConstructor; @@ -50,8 +54,7 @@ import org.neo4j.driver.Session; import org.neo4j.driver.SessionConfig; import org.neo4j.driver.exceptions.Neo4jException; -import org.neo4j.driver.internal.InternalRelationship; -import org.neo4j.driver.types.Node; +import org.neo4j.driver.types.Relationship; @Slf4j @@ -62,9 +65,6 @@ public class Neo4jGraphService implements GraphService { private final Driver _driver; private SessionConfig _sessionConfig; - private static final String SOURCE = "source"; - private static final String UI = "UI"; - public Neo4jGraphService(@Nonnull LineageRegistry lineageRegistry, @Nonnull Driver driver) { this(lineageRegistry, driver, SessionConfig.defaultConfig()); } @@ -234,53 +234,36 @@ public EntityLineageResult getLineage(@Nonnull Urn entityUrn, @Nonnull LineageDi @Nullable Long endTimeMillis) { log.debug(String.format("Neo4j getLineage maxHops = %d", maxHops)); - final String statement = - generateLineageStatement(entityUrn, direction, graphFilters, maxHops, startTimeMillis, endTimeMillis); + final var statementAndParams = + generateLineageStatementAndParameters(entityUrn, direction, graphFilters, maxHops, startTimeMillis, endTimeMillis); + + final var statement = statementAndParams.getFirst(); + final var parameters = statementAndParams.getSecond(); List neo4jResult = - statement != null ? 
-        statement != null ? runQuery(buildStatement(statement, new HashMap<>())).list() : new ArrayList<>();
-
-    // It is possible to have more than 1 path from node A to node B in the graph and previous query returns all the paths.
-    // We convert the List into Map with only the shortest paths. "item.get(i).size()" is the path size between two nodes in relation.
-    // The key for mapping is the destination node as the source node is always the same, and it is defined by parameter.
-    neo4jResult = neo4jResult.stream()
-        .collect(Collectors.toMap(item -> item.values().get(2).asNode().get("urn").asString(), Function.identity(),
-            (item1, item2) -> item1.get(1).size() < item2.get(1).size() ? item1 : item2))
-        .values()
-        .stream()
-        .collect(Collectors.toList());
+        statement != null ? runQuery(buildStatement(statement, parameters)).list() : new ArrayList<>();
 
     LineageRelationshipArray relations = new LineageRelationshipArray();
     neo4jResult.stream().skip(offset).limit(count).forEach(item -> {
       String urn = item.values().get(2).asNode().get("urn").asString();
-      String relationType = ((InternalRelationship) item.get(1).asList().get(0)).type().split("r_")[1];
-      int numHops = item.get(1).size();
       try {
-        // Generate path from r in neo4jResult
-        List<Urn> pathFromRelationships =
-            item.values().get(1).asList(Collections.singletonList(new ArrayList())).stream().map(t -> createFromString(
-                // Get real upstream node/downstream node by direction
-                ((InternalRelationship) t).get(direction == LineageDirection.UPSTREAM ? "startUrn" : "endUrn")
-                    .asString())).collect(Collectors.toList());
-        if (direction == LineageDirection.UPSTREAM) {
-          // For ui to show path correctly, reverse path for UPSTREAM direction
-          Collections.reverse(pathFromRelationships);
-          // Add missing original node to the end since we generate path from relationships
-          pathFromRelationships.add(Urn.createFromString(item.values().get(0).asNode().get("urn").asString()));
-        } else {
-          // Add missing original node to the beginning since we generate path from relationships
-          pathFromRelationships.add(0, Urn.createFromString(item.values().get(0).asNode().get("urn").asString()));
-        }
+        final var path = item.get(1).asPath();
+        final List<Urn> nodeListAsPath = StreamSupport.stream(
+            path.nodes().spliterator(), false)
+            .map(node -> createFromString(node.get("urn").asString()))
+            .collect(Collectors.toList());
+
+        final var firstRelationship = Optional.ofNullable(Iterables.getFirst(path.relationships(), null));
 
         relations.add(new LineageRelationship().setEntity(Urn.createFromString(urn))
-            .setType(relationType)
-            .setDegree(numHops)
-            .setPaths(new UrnArrayArray(new UrnArray(pathFromRelationships))));
+            // although firstRelationship should never be absent, provide "" as fallback value
+            .setType(firstRelationship.map(Relationship::type).orElse(""))
+            .setDegree(path.length())
+            .setPaths(new UrnArrayArray(new UrnArray(nodeListAsPath))));
       } catch (URISyntaxException ignored) {
         log.warn(String.format("Can't convert urn = %s, Error = %s", urn, ignored.getMessage()));
       }
     });
-
     EntityLineageResult result = new EntityLineageResult().setStart(offset)
         .setCount(relations.size())
         .setRelationships(relations)
@@ -290,31 +273,104 @@ public EntityLineageResult getLineage(@Nonnull Urn entityUrn, @Nonnull LineageDirection direction,
     return result;
   }
 
-  private String generateLineageStatement(@Nonnull Urn entityUrn, @Nonnull LineageDirection direction,
-      GraphFilters graphFilters, int maxHops, @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) {
-    String statement;
-    final String allowedEntityTypes =
String.join(" OR b:", graphFilters.getAllowedEntityTypes()); - - final String multiHopMatchTemplateIndirect = "MATCH p = shortestPath((a {urn: '%s'})<-[r*1..%d]-(b)) "; - final String multiHopMatchTemplateDirect = "MATCH p = shortestPath((a {urn: '%s'})-[r*1..%d]->(b)) "; - // directionFilterTemplate should apply to all condition. - final String multiHopMatchTemplate = - direction == LineageDirection.UPSTREAM ? multiHopMatchTemplateIndirect : multiHopMatchTemplateDirect; - final String fullQueryTemplate = generateFullQueryTemplate(multiHopMatchTemplate, startTimeMillis, endTimeMillis); - - if (startTimeMillis != null && endTimeMillis != null) { - statement = - String.format(fullQueryTemplate, startTimeMillis, endTimeMillis, entityUrn, maxHops, allowedEntityTypes, - entityUrn); - } else if (startTimeMillis != null) { - statement = String.format(fullQueryTemplate, startTimeMillis, entityUrn, maxHops, allowedEntityTypes, entityUrn); - } else if (endTimeMillis != null) { - statement = String.format(fullQueryTemplate, endTimeMillis, entityUrn, maxHops, allowedEntityTypes, entityUrn); + private String getPathFindingLabelFilter(List entityNames) { + return entityNames.stream().map(x -> String.format("+%s", x)).collect(Collectors.joining("|")); + } + + private String getPathFindingRelationshipFilter(@Nonnull List entityNames, @Nullable LineageDirection direction) { + // relationshipFilter supports mixing different directions for various relation types, + // so simply transform entries lineage registry into format of filter + final var filterComponents = new HashSet(); + for (final var entityName : entityNames) { + if (direction != null) { + for (final var edgeInfo : _lineageRegistry.getLineageRelationships(entityName, direction)) { + final var type = edgeInfo.getType(); + if (edgeInfo.getDirection() == RelationshipDirection.INCOMING) { + filterComponents.add("<" + type); + } else { + filterComponents.add(type + ">"); + } + } + } else { + // return disjunctive combination of edge types regardless of direction + for (final var direction1 : List.of(LineageDirection.UPSTREAM, LineageDirection.DOWNSTREAM)) { + for (final var edgeInfo : _lineageRegistry.getLineageRelationships(entityName, direction1)) { + filterComponents.add(edgeInfo.getType()); + } + } + } + } + return String.join("|", filterComponents); + } + + private Pair> generateLineageStatementAndParameters( + @Nonnull Urn entityUrn, @Nonnull LineageDirection direction, + GraphFilters graphFilters, int maxHops, + @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) { + + final var parameterMap = new HashMap(Map.of( + "urn", entityUrn.toString(), + "labelFilter", getPathFindingLabelFilter(graphFilters.getAllowedEntityTypes()), + "relationshipFilter", getPathFindingRelationshipFilter(graphFilters.getAllowedEntityTypes(), direction), + "maxHops", maxHops + )); + + if (startTimeMillis == null && endTimeMillis == null) { + // if no time filtering required, simply find all expansion paths to other nodes + final var statement = "MATCH (a {urn: $urn}) " + + "CALL apoc.path.spanningTree(a, { " + + " relationshipFilter: $relationshipFilter, " + + " labelFilter: $labelFilter, " + + " minLevel: 1, " + + " maxLevel: $maxHops " + + "}) " + + "YIELD path " + + "WITH a, path AS path " + + "RETURN a, path, last(nodes(path));"; + return Pair.of(statement, parameterMap); } else { - statement = String.format(fullQueryTemplate, entityUrn, maxHops, allowedEntityTypes, entityUrn); + // when needing time filtering, possibility on multiple paths between two + 
+      // nodes must be considered, and we need to construct a more complex query
+
+      // use r_ edges until they are no longer useful
+      final var relationFilter = getPathFindingRelationshipFilter(graphFilters.getAllowedEntityTypes(), null)
+          .replaceAll("(\\w+)", "r_$1");
+      final var relationshipPattern =
+          String.format(
+              (direction == LineageDirection.UPSTREAM ? "<-[:%s*1..%d]-" : "-[:%s*1..%d]->"),
+              relationFilter, maxHops);
+
+      // two steps:
+      //   1. find the list of nodes reachable within maxHops
+      //   2. find the shortest paths from the start node to every other node in that list
+      //      (note: according to the docs of shortestPath, WHERE conditions are applied during path exploration, not
+      //      after path exploration is done)
+      final var statement = "MATCH (a {urn: $urn}) "
+          + "CALL apoc.path.subgraphNodes(a, { "
+          + "  relationshipFilter: $relationshipFilter, "
+          + "  labelFilter: $labelFilter, "
+          + "  minLevel: 1, "
+          + "  maxLevel: $maxHops "
+          + "}) "
+          + "YIELD node AS b "
+          + "WITH a, b "
+          + "MATCH path = shortestPath((a)" + relationshipPattern + "(b)) "
+          + "WHERE a <> b "
+          + "  AND ALL(rt IN relationships(path) WHERE "
+          + "    (EXISTS(rt.source) AND rt.source = 'UI') OR "
+          + "    (NOT EXISTS(rt.createdOn) AND NOT EXISTS(rt.updatedOn)) OR "
+          + "    ($startTimeMillis <= rt.createdOn <= $endTimeMillis OR "
+          + "     $startTimeMillis <= rt.updatedOn <= $endTimeMillis) "
+          + "  ) "
+          + "RETURN a, path, b;";
+
+      // provide dummy start/end time when not provided, so there is no need to
+      // format the clause differently if either of them is missing
+      parameterMap.put("startTimeMillis", startTimeMillis == null ? 0 : startTimeMillis);
+      parameterMap.put("endTimeMillis", endTimeMillis == null ? System.currentTimeMillis() : endTimeMillis);
+
+      return Pair.of(statement, parameterMap);
     }
-
-    return statement;
   }
 
   @Nonnull
@@ -583,15 +639,6 @@ private Result runQuery(@Nonnull Statement statement) {
     }
   }
 
-  @Nonnull
-  private static String toCriterionWhereString(@Nonnull String key, @Nonnull Object value) {
-    if (ClassUtils.isPrimitiveOrWrapper(value.getClass())) {
-      return key + " = " + value;
-    }
-
-    return key + " = \"" + value.toString() + "\"";
-  }
-
   // Returns "key:value" String, if value is not primitive, then use toString() and double quote it
   @Nonnull
   private static String toCriterionString(@Nonnull String key, @Nonnull Object value) {
@@ -715,44 +762,4 @@ Urn createFromString(@Nonnull String rawUrn) {
       return null;
     }
   }
-
-  private String generateFullQueryTemplate(@Nonnull String multiHopMatchTemplate, @Nullable Long startTimeMillis, @Nullable Long endTimeMillis) {
-    final String sourceUiCheck = String.format("(EXISTS(rt.%s) AND rt.%s = '%s') ", SOURCE, SOURCE, UI);
-    final String whereTemplate = "WHERE (b:%s) AND b.urn <> '%s' ";
-    final String returnTemplate = "RETURN a,r,b";
-    String withTimeTemplate = "";
-    String timeFilterConditionTemplate = "AND ALL(rt IN relationships(p) WHERE left(type(rt), 2)='r_')";
-
-    if (startTimeMillis != null && endTimeMillis != null) {
-      withTimeTemplate = "WITH %d as startTimeMillis, %d as endTimeMillis ";
-      timeFilterConditionTemplate =
-          "AND ALL(rt IN relationships(p) WHERE " + sourceUiCheck + "OR "
-              + "(NOT EXISTS(rt.createdOn) AND NOT EXISTS(rt.updatedOn)) OR "
-              + "((rt.createdOn >= startTimeMillis AND rt.createdOn <= endTimeMillis) OR "
-              + "(rt.updatedOn >= startTimeMillis AND rt.updatedOn <= endTimeMillis))) "
-              + "AND ALL(rt IN relationships(p) WHERE left(type(rt), 2)='r_')";
-    } else if (startTimeMillis != null) {
-      withTimeTemplate = "WITH %d as startTimeMillis ";
-      timeFilterConditionTemplate =
-          "AND ALL(rt IN relationships(p) WHERE " + sourceUiCheck + "OR "
-              + "(NOT EXISTS(rt.createdOn) AND NOT EXISTS(rt.updatedOn)) OR "
-              + "(rt.createdOn >= startTimeMillis OR rt.updatedOn >= startTimeMillis)) "
-              + "AND ALL(rt IN relationships(p) WHERE left(type(rt), 2)='r_')";
-    } else if (endTimeMillis != null) {
-      withTimeTemplate = "WITH %d as endTimeMillis ";
-      timeFilterConditionTemplate =
-          "AND ALL(rt IN relationships(p) WHERE " + sourceUiCheck + "OR "
-              + "(NOT EXISTS(rt.createdOn) AND NOT EXISTS(rt.updatedOn)) OR "
-              + "(rt.createdOn <= endTimeMillis OR rt.updatedOn <= endTimeMillis)) "
-              + "AND ALL(rt IN relationships(p) WHERE left(type(rt), 2)='r_')";
-    }
-    final StringJoiner fullQueryTemplateJoiner = new StringJoiner(" ");
-    fullQueryTemplateJoiner.add(withTimeTemplate);
-    fullQueryTemplateJoiner.add(multiHopMatchTemplate);
-    fullQueryTemplateJoiner.add(whereTemplate);
-    fullQueryTemplateJoiner.add(timeFilterConditionTemplate);
-    fullQueryTemplateJoiner.add(returnTemplate);
-
-    return fullQueryTemplateJoiner.toString();
-  }
 }
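For reference, the two `getPathFinding*Filter` helpers above emit APOC's path-expander mini-syntax: `+Label` whitelists a node label, `<TYPE` restricts a relationship type to incoming traversal, `TYPE>` to outgoing, and `|` separates alternatives. A small sketch of the same string construction outside Java — the type and label names are illustrative examples, not values from the lineage registry:

```python
# Sketch of the APOC filter-string formats built by the Java helpers above.
from typing import Iterable, Tuple


def label_filter(entity_names: Iterable[str]) -> str:
    # labelFilter: only expand into nodes carrying one of these labels
    return "|".join(f"+{name}" for name in entity_names)


def relationship_filter(edges: Iterable[Tuple[str, bool]]) -> str:
    # relationshipFilter: "<TYPE" follows incoming edges, "TYPE>" outgoing ones
    return "|".join(f"<{t}" if incoming else f"{t}>" for t, incoming in edges)


print(label_filter(["dataset", "dataJob"]))
# +dataset|+dataJob
print(relationship_filter([("Consumes", True), ("Produces", False)]))
# <Consumes|Produces>
```

The time-filtered branch then rewrites each type `T` to `r_T` via `replaceAll("(\\w+)", "r_$1")`, apparently so that `shortestPath` only traverses the duplicated `r_`-prefixed edges carrying `createdOn`/`updatedOn` properties, as the removed `left(type(rt), 2)='r_'` guard in the old query template suggests.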
+    assertEquals(upstreamLineageDataset3Hop3.getTotal().intValue(), 3);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageDataset3Hop3),
+        Set.of(
+            new UrnArray(datasetThreeUrn, datasetTwoUrn),
+            new UrnArray(datasetThreeUrn, datasetTwoUrn, dataJobOneUrn),
+            new UrnArray(datasetThreeUrn, datasetTwoUrn, dataJobOneUrn, datasetOneUrn)));
+
+    // simple path finding
+    final var upstreamLineageDatasetFiveHop2 = service.getLineage(datasetFiveUrn, LineageDirection.UPSTREAM, 0, 1000, 2);
+    assertEquals(upstreamLineageDatasetFiveHop2.getTotal().intValue(), 4);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageDatasetFiveHop2),
+        Set.of(
+            new UrnArray(datasetFiveUrn, datasetThreeUrn),
+            new UrnArray(datasetFiveUrn, datasetThreeUrn, datasetTwoUrn),
+            new UrnArray(datasetFiveUrn, datasetFourUrn),
+            new UrnArray(datasetFiveUrn, datasetFourUrn, datasetOneUrn)));
+
+    // there are two paths from d5 to d1, one longer and one shorter, and the longer one is discarded from the result
+    final var upstreamLineageDataset5Hop5 = service.getLineage(datasetFiveUrn, LineageDirection.UPSTREAM, 0, 1000, 5);
+    assertEquals(upstreamLineageDataset5Hop5.getTotal().intValue(), 5);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageDataset5Hop5),
+        Set.of(
+            new UrnArray(datasetFiveUrn, datasetThreeUrn),
+            new UrnArray(datasetFiveUrn, datasetThreeUrn, datasetTwoUrn),
+            new UrnArray(datasetFiveUrn, datasetThreeUrn, datasetTwoUrn, dataJobOneUrn),
+            new UrnArray(datasetFiveUrn, datasetFourUrn),
+            new UrnArray(datasetFiveUrn, datasetFourUrn, datasetOneUrn)));
+
+    // downstream lookup
+    final var downstreamLineageDataset1Hop2 = service.getLineage(datasetOneUrn, LineageDirection.DOWNSTREAM, 0, 1000, 2);
+    assertEquals(downstreamLineageDataset1Hop2.getTotal().intValue(), 4);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(downstreamLineageDataset1Hop2),
+        Set.of(
+            new UrnArray(datasetOneUrn, dataJobOneUrn),
+            new UrnArray(datasetOneUrn, dataJobOneUrn, datasetTwoUrn),
+            new UrnArray(datasetOneUrn, datasetFourUrn),
+            new UrnArray(datasetOneUrn, datasetFourUrn, datasetFiveUrn)));
+  }
+
   @Test
   public void testGetLineageTimeFilterQuery() throws Exception {
     GraphService service = getGraphService();
 
     List<Edge> edges = Arrays.asList(
+        // d1 <-Consumes- dj1 -Produces-> d2 <-DownstreamOf- d3 <-DownstreamOf- d4
         new Edge(dataJobOneUrn, datasetOneUrn, consumes, 1L, null, 3L, null, null),
         new Edge(dataJobOneUrn, datasetTwoUrn, produces, 5L, null, 7L, null, null),
         new Edge(datasetThreeUrn, datasetTwoUrn, downstreamOf, 9L, null, null, null, null),
@@ -206,21 +281,76 @@ public void testGetLineageTimeFilterQuery() throws Exception {
     );
     edges.forEach(service::addEdge);
 
+    // no time filtering
     EntityLineageResult upstreamLineageTwoHops = service.getLineage(datasetFourUrn, LineageDirection.UPSTREAM, 0, 1000, 2);
     assertEquals(upstreamLineageTwoHops.getTotal().intValue(), 2);
     assertEquals(upstreamLineageTwoHops.getRelationships().size(), 2);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageTwoHops),
+        Set.of(
+            new UrnArray(datasetFourUrn, datasetThreeUrn),
+            new UrnArray(datasetFourUrn, datasetThreeUrn, datasetTwoUrn)));
 
+    // with time filtering
     EntityLineageResult upstreamLineageTwoHopsWithTimeFilter = service.getLineage(datasetFourUrn, LineageDirection.UPSTREAM, 0, 1000, 2, 10L, 12L);
     assertEquals(upstreamLineageTwoHopsWithTimeFilter.getTotal().intValue(), 1);
     assertEquals(upstreamLineageTwoHopsWithTimeFilter.getRelationships().size(), 1);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageTwoHopsWithTimeFilter),
+        Set.of(
+            new UrnArray(datasetFourUrn, datasetThreeUrn)));
 
+    // with time filtering
     EntityLineageResult upstreamLineageTimeFilter = service.getLineage(datasetTwoUrn, LineageDirection.UPSTREAM, 0, 1000, 4, 2L, 6L);
     assertEquals(upstreamLineageTimeFilter.getTotal().intValue(), 2);
     assertEquals(upstreamLineageTimeFilter.getRelationships().size(), 2);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageTimeFilter),
+        Set.of(
+            new UrnArray(datasetTwoUrn, dataJobOneUrn),
+            new UrnArray(datasetTwoUrn, dataJobOneUrn, datasetOneUrn)));
 
+    // with time filtering
     EntityLineageResult downstreamLineageTimeFilter = service.getLineage(datasetOneUrn, LineageDirection.DOWNSTREAM, 0, 1000, 4, 0L, 4L);
     assertEquals(downstreamLineageTimeFilter.getTotal().intValue(), 1);
     assertEquals(downstreamLineageTimeFilter.getRelationships().size(), 1);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(downstreamLineageTimeFilter),
+        Set.of(
+            new UrnArray(datasetOneUrn, dataJobOneUrn)));
+  }
+
+  @Test
+  public void testGetLineageTimeFilteringSkipsShorterButNonMatchingPaths() {
+    GraphService service = getGraphService();
+    List<Edge> edges = Arrays.asList(
+        // d1 <-Consumes- dj1 -Produces-> d2 <-DownstreamOf- d3
+        new Edge(dataJobOneUrn, datasetOneUrn, consumes, 5L, null, 5L, null, null),
+        new Edge(dataJobOneUrn, datasetTwoUrn, produces, 7L, null, 7L, null, null),
+        new Edge(datasetThreeUrn, datasetTwoUrn, downstreamOf, 9L, null, null, null, null),
+
+        // d1 <-DownstreamOf- d3 (shorter path from d3 to d1, but with a very old time)
+        new Edge(datasetThreeUrn, datasetOneUrn, downstreamOf, 1L, null, 2L, null, null)
+    );
+    edges.forEach(service::addEdge);
+
+    // no time filtering, the shorter path from d3 to d1 is returned
+    EntityLineageResult upstreamLineageNoTimeFiltering = service.getLineage(datasetThreeUrn, LineageDirection.UPSTREAM, 0, 1000, 3);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageNoTimeFiltering),
+        Set.of(
+            new UrnArray(datasetThreeUrn, datasetTwoUrn),
+            new UrnArray(datasetThreeUrn, datasetTwoUrn, dataJobOneUrn),
+            new UrnArray(datasetThreeUrn, datasetOneUrn)));
+
+    // with time filtering, the shorter path from d3 to d1 is excluded, so the longer path is returned
+    EntityLineageResult upstreamLineageTimeFiltering = service.getLineage(datasetThreeUrn, LineageDirection.UPSTREAM, 0, 1000, 3, 3L, 17L);
+    assertEquals(
+        getPathUrnArraysFromLineageResult(upstreamLineageTimeFiltering),
+        Set.of(
+            new UrnArray(datasetThreeUrn, datasetTwoUrn),
+            new UrnArray(datasetThreeUrn, datasetTwoUrn, dataJobOneUrn),
+            new UrnArray(datasetThreeUrn, datasetTwoUrn, dataJobOneUrn, datasetOneUrn)));
+  }
 }
diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jTestServerBuilder.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jTestServerBuilder.java
index 4d6d15255b9222..ba4e4cec379144 100644
--- a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jTestServerBuilder.java
+++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jTestServerBuilder.java
@@ -2,6 +2,8 @@
 
 import java.io.File;
 import java.net.URI;
+
+import apoc.path.PathExplorer;
 import org.neo4j.graphdb.GraphDatabaseService;
 import org.neo4j.harness.Neo4j;
 import org.neo4j.harness.Neo4jBuilder;
@@ -17,7 +19,9 @@ private Neo4jTestServerBuilder(Neo4jBuilder builder) {
   }
 
   public Neo4jTestServerBuilder() {
-    this(new InProcessNeo4jBuilder());
+    this(new InProcessNeo4jBuilder()
+        .withProcedure(PathExplorer.class)
+    );
   }
 
   public Neo4jTestServerBuilder(File workingDirectory) {

From b851d59e208d6f1f9c33f90d43d49933f6e557be Mon Sep 17 00:00:00 2001
From: purnimagarg1 <139125209+purnimagarg1@users.noreply.github.com>
Date: Fri, 10 Nov 2023 22:52:04 +0530
Subject: [PATCH 18/29] Managed Ingestion UX Improvements (#9216)

---
 .../source/IngestionSourceTableColumns.tsx    | 12 ++++--
 .../ExecutionRequestDetailsModal.tsx          | 38 ++++++++++++++-----
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceTableColumns.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceTableColumns.tsx
index c47d08d5b60035..155e75f1895f53 100644
--- a/datahub-web-react/src/app/ingest/source/IngestionSourceTableColumns.tsx
+++ b/datahub-web-react/src/app/ingest/source/IngestionSourceTableColumns.tsx
@@ -61,6 +61,14 @@ const CliBadge = styled.span`
         margin-right: 5px;
     }
 `;
+const StatusText = styled(Typography.Text)`
+    font-weight: bold;
+    margin-left: 8px;
+    color: ${(props) => props.color};
+    &:hover {
+        text-decoration: underline;
+    },
+`;
 interface TypeColumnProps {
     type: string;
     record: any;
@@ -124,9 +132,7 @@ export function LastStatusColumn({ status, record, setFocusExecutionUrn }: LastStatusColumnProps) {
             type="link"
             onClick={() => setFocusExecutionUrn(record.lastExecUrn)}
         >
-
-                {text || 'Pending...'}
-
+            {text || 'Pending...'}
     );
diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
index 00fdc89964f88a..96dfc05e391532 100644
--- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
+++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx
@@ -83,6 +83,17 @@ const ShowMoreButton = styled(Button)`
     padding: 0px;
 `;
 
+const LogsContainer = styled.div<LogsContainerProps>`
+    margin-bottom: -25px;
+    ${(props) =>
+        props.areLogsExpandable &&
+        !props.showExpandedLogs &&
+        `
+        -webkit-mask-image: linear-gradient(to bottom, rgba(0,0,0,1) 50%, rgba(255,0,0,0.5) 60%, rgba(255,0,0,0) 90% );
+        mask-image: linear-gradient(to bottom, rgba(0,0,0,1) 50%, rgba(255,0,0,0.5) 60%, rgba(255,0,0,0) 90%);
+    `}
+`;
+
 const modalStyle = {
     top: 100,
 };
@@ -91,6 +102,11 @@ const modalBodyStyle = {
     padding: 0,
 };
 
+type LogsContainerProps = {
+    showExpandedLogs: boolean;
+    areLogsExpandable: boolean;
+};
+
 type Props = {
     urn: string;
     visible: boolean;
@@ -108,7 +124,7 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
         downloadFile(output, `exec-${urn}.log`);
     };
 
-    const logs = (showExpandedLogs && output) || output.slice(0, 100);
+    const logs = (showExpandedLogs && output) || output.slice(0, 250);
     const result = data?.executionRequest?.result?.status;
 
     useEffect(() => {
@@ -140,7 +156,7 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
     }
 
     const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 1).join('\n');
-    const areLogsExpandable = output.length > 100;
+    const areLogsExpandable = output.length > 250;
    const isRecipeExpandable = recipeYaml?.includes('\n');
 
     return (
@@ -181,14 +197,16 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => {
                     Download
-
{`${logs}${!showExpandedLogs && areLogsExpandable ? '...' : ''}`}
- {areLogsExpandable && ( - setShowExpandedLogs(!showExpandedLogs)}> - {showExpandedLogs ? 'Hide' : 'Show More'} - - )} -
+ + +
{`${logs}${!showExpandedLogs && areLogsExpandable ? '...' : ''}`}
+
+
+ {areLogsExpandable && ( + setShowExpandedLogs(!showExpandedLogs)}> + {showExpandedLogs ? 'Hide' : 'Show More'} + + )} {recipe && ( From 89dff8f7bddee15d578170f5c1db586c628cabf4 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 10 Nov 2023 09:34:08 -0800 Subject: [PATCH 19/29] chore(ingest): start working on pydantic v2 support (#9220) --- metadata-ingestion/scripts/avro_codegen.py | 2 +- metadata-ingestion/setup.cfg | 1 + .../src/datahub/cli/cli_utils.py | 2 +- .../src/datahub/cli/lite_cli.py | 2 +- .../src/datahub/configuration/_config_enum.py | 26 ++++-- .../src/datahub/configuration/common.py | 27 ++++-- .../src/datahub/configuration/oauth.py | 6 +- .../pydantic_migration_helpers.py | 30 +++++++ .../src/datahub/emitter/mcp_builder.py | 2 +- .../ingestion/extractor/mce_extractor.py | 6 +- .../datahub/ingestion/run/pipeline_config.py | 2 +- .../source/bigquery_v2/bigquery_config.py | 2 +- .../source/data_lake_common/path_spec.py | 2 +- .../ingestion/source/datahub/config.py | 2 +- .../datahub/ingestion/source/dbt/dbt_cloud.py | 1 + .../ingestion/source/dbt/dbt_common.py | 6 +- .../src/datahub/ingestion/source/kafka.py | 4 +- .../ingestion/source/looker/lookml_source.py | 7 +- .../src/datahub/ingestion/source/nifi.py | 4 +- .../ingestion/source/powerbi/config.py | 5 +- .../report_server_domain.py | 84 ++++++++++--------- .../ingestion/source/redshift/config.py | 6 +- .../src/datahub/ingestion/source/s3/config.py | 2 +- .../source/s3/datalake_profiler_config.py | 2 +- .../datahub/ingestion/source/salesforce.py | 2 +- .../source/snowflake/snowflake_usage_v2.py | 20 ++--- .../ingestion/source/sql/clickhouse.py | 11 +-- .../src/datahub/ingestion/source/sql/druid.py | 2 +- .../src/datahub/ingestion/source/sql/hive.py | 2 +- .../src/datahub/ingestion/source/sql/mysql.py | 4 +- .../datahub/ingestion/source/sql/postgres.py | 6 +- .../datahub/ingestion/source/sql/presto.py | 2 +- .../datahub/ingestion/source/sql/redshift.py | 2 +- .../ingestion/source/sql/sql_config.py | 2 +- .../datahub/ingestion/source/sql/teradata.py | 2 +- .../src/datahub/ingestion/source/sql/trino.py | 2 +- .../source/state/stateful_ingestion_base.py | 10 +-- .../src/datahub/ingestion/source/superset.py | 2 +- .../datahub/ingestion/source/unity/config.py | 2 +- .../ingestion/source_config/sql/snowflake.py | 3 +- .../source_config/usage/bigquery_usage.py | 2 +- .../src/datahub/upgrade/upgrade.py | 8 +- .../datahub/utilities/lossy_collections.py | 12 +++ .../src/datahub/utilities/sqlglot_lineage.py | 26 +++--- 44 files changed, 216 insertions(+), 139 deletions(-) create mode 100644 metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 021ebd4a31eb3a..de8836559217b3 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -192,7 +192,7 @@ def add_avro_python3_warning(filepath: Path) -> None: # This means that installation order matters, which is a pretty unintuitive outcome. # See https://github.com/pypa/pip/issues/4625 for details. try: - from avro.schema import SchemaFromJSONData + from avro.schema import SchemaFromJSONData # type: ignore import warnings warnings.warn("It seems like 'avro-python3' is installed, which conflicts with the 'avro' package used by datahub. 
" diff --git a/metadata-ingestion/setup.cfg b/metadata-ingestion/setup.cfg index 8b78e4d3c9c6f9..b3fc53ccfaf584 100644 --- a/metadata-ingestion/setup.cfg +++ b/metadata-ingestion/setup.cfg @@ -88,6 +88,7 @@ filterwarnings = ignore:Deprecated call to \`pkg_resources.declare_namespace:DeprecationWarning ignore:pkg_resources is deprecated as an API:DeprecationWarning ignore:Did not recognize type:sqlalchemy.exc.SAWarning + ignore::datahub.configuration.pydantic_migration_helpers.PydanticDeprecatedSince20 [coverage:run] # Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index a7ea5b4f65785d..8ac9a101121be6 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -47,7 +47,7 @@ class GmsConfig(BaseModel): server: str - token: Optional[str] + token: Optional[str] = None class DatahubConfig(BaseModel): diff --git a/metadata-ingestion/src/datahub/cli/lite_cli.py b/metadata-ingestion/src/datahub/cli/lite_cli.py index b49284bb627f28..8636187a51d09f 100644 --- a/metadata-ingestion/src/datahub/cli/lite_cli.py +++ b/metadata-ingestion/src/datahub/cli/lite_cli.py @@ -40,7 +40,7 @@ class DuckDBLiteConfigWrapper(DuckDBLiteConfig): class LiteCliConfig(DatahubConfig): lite: LiteLocalConfig = LiteLocalConfig( - type="duckdb", config=DuckDBLiteConfigWrapper() + type="duckdb", config=DuckDBLiteConfigWrapper().dict() ) diff --git a/metadata-ingestion/src/datahub/configuration/_config_enum.py b/metadata-ingestion/src/datahub/configuration/_config_enum.py index b4fb93dae4439f..190a006b077d9f 100644 --- a/metadata-ingestion/src/datahub/configuration/_config_enum.py +++ b/metadata-ingestion/src/datahub/configuration/_config_enum.py @@ -4,6 +4,8 @@ import pydantic.types import pydantic.validators +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 + class ConfigEnum(Enum): # Ideally we would use @staticmethod here, but some versions of Python don't support it. @@ -15,11 +17,25 @@ def _generate_next_value_( # type: ignore # From https://stackoverflow.com/a/44785241/5004662. return name - @classmethod - def __get_validators__(cls) -> "pydantic.types.CallableGenerator": - # We convert the text to uppercase before attempting to match it to an enum value. - yield cls.validate - yield pydantic.validators.enum_member_validator + if PYDANTIC_VERSION_2: + # if TYPE_CHECKING: + # from pydantic import GetCoreSchemaHandler + + @classmethod + def __get_pydantic_core_schema__(cls, source_type, handler): # type: ignore + from pydantic_core import core_schema + + return core_schema.no_info_before_validator_function( + cls.validate, handler(source_type) + ) + + else: + + @classmethod + def __get_validators__(cls) -> "pydantic.types.CallableGenerator": + # We convert the text to uppercase before attempting to match it to an enum value. 
+ yield cls.validate + yield pydantic.validators.enum_member_validator @classmethod def validate(cls, v): # type: ignore[no-untyped-def] diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 73ac4baac48c0f..f225856ca43ce4 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -11,6 +11,7 @@ from typing_extensions import Protocol, runtime_checkable from datahub.configuration._config_enum import ConfigEnum +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.utilities.dedup_list import deduplicate_list _ConfigSelf = TypeVar("_ConfigSelf", bound="ConfigModel") @@ -71,14 +72,8 @@ def redact_raw_config(obj: Any) -> Any: class ConfigModel(BaseModel): class Config: - extra = Extra.forbid - underscore_attrs_are_private = True - keep_untouched = ( - cached_property, - ) # needed to allow cached_property to work. See https://github.com/samuelcolvin/pydantic/issues/1241 for more info. - @staticmethod - def schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: + def _schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: # We use the custom "hidden_from_docs" attribute to hide fields from the # autogenerated docs. remove_fields = [] @@ -89,6 +84,19 @@ def schema_extra(schema: Dict[str, Any], model: Type["ConfigModel"]) -> None: for key in remove_fields: del schema["properties"][key] + # This is purely to suppress pydantic's warnings, since this class is used everywhere. + if PYDANTIC_VERSION_2: + extra = "forbid" + ignored_types = (cached_property,) + json_schema_extra = _schema_extra + else: + extra = Extra.forbid + underscore_attrs_are_private = True + keep_untouched = ( + cached_property, + ) # needed to allow cached_property to work. See https://github.com/samuelcolvin/pydantic/issues/1241 for more info. + schema_extra = _schema_extra + @classmethod def parse_obj_allow_extras(cls: Type[_ConfigSelf], obj: Any) -> _ConfigSelf: with unittest.mock.patch.object(cls.Config, "extra", pydantic.Extra.allow): @@ -102,7 +110,10 @@ class PermissiveConfigModel(ConfigModel): # It is usually used for argument bags that are passed through to third-party libraries. 
class Config: - extra = Extra.allow + if PYDANTIC_VERSION_2: + extra = "allow" + else: + extra = Extra.allow class TransformerSemantics(ConfigEnum): diff --git a/metadata-ingestion/src/datahub/configuration/oauth.py b/metadata-ingestion/src/datahub/configuration/oauth.py index 9a1ddbf437913f..61a06580299db6 100644 --- a/metadata-ingestion/src/datahub/configuration/oauth.py +++ b/metadata-ingestion/src/datahub/configuration/oauth.py @@ -24,11 +24,11 @@ class OAuthConfiguration(ConfigModel): default=False, ) client_secret: Optional[SecretStr] = Field( - description="client secret of the application if use_certificate = false" + None, description="client secret of the application if use_certificate = false" ) encoded_oauth_public_key: Optional[str] = Field( - description="base64 encoded certificate content if use_certificate = true" + None, description="base64 encoded certificate content if use_certificate = true" ) encoded_oauth_private_key: Optional[str] = Field( - description="base64 encoded private key content if use_certificate = true" + None, description="base64 encoded private key content if use_certificate = true" ) diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py new file mode 100644 index 00000000000000..f1876b500598ba --- /dev/null +++ b/metadata-ingestion/src/datahub/configuration/pydantic_migration_helpers.py @@ -0,0 +1,30 @@ +import pydantic.version +from packaging.version import Version + +PYDANTIC_VERSION_2: bool +if Version(pydantic.version.VERSION) >= Version("2.0"): + PYDANTIC_VERSION_2 = True +else: + PYDANTIC_VERSION_2 = False + + +# This can be used to silence deprecation warnings while we migrate. +if PYDANTIC_VERSION_2: + from pydantic import PydanticDeprecatedSince20 # type: ignore +else: + + class PydanticDeprecatedSince20(Warning): # type: ignore + pass + + +if PYDANTIC_VERSION_2: + from pydantic import BaseModel as GenericModel +else: + from pydantic.generics import GenericModel # type: ignore + + +__all__ = [ + "PYDANTIC_VERSION_2", + "PydanticDeprecatedSince20", + "GenericModel", +] diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index d50feba8b119c8..a7578e39374ac5 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -127,7 +127,7 @@ class BucketKey(ContainerKey): class NotebookKey(DatahubKey): notebook_id: int platform: str - instance: Optional[str] + instance: Optional[str] = None def as_urn(self) -> str: return make_dataset_urn_with_platform_instance( diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py index 36450dda153d72..7ad68c0fcf8ea2 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py @@ -26,11 +26,11 @@ def _try_reformat_with_black(code: str) -> str: class WorkUnitRecordExtractorConfig(ConfigModel): - set_system_metadata = True - set_system_metadata_pipeline_name = ( + set_system_metadata: bool = True + set_system_metadata_pipeline_name: bool = ( False # false for now until the models are available in OSS ) - unpack_mces_into_mcps = False + unpack_mces_into_mcps: bool = False class WorkUnitRecordExtractor( diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py 
b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py index da3cee8ad9c1b8..f22f94c9e93514 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py @@ -72,7 +72,7 @@ class PipelineConfig(ConfigModel): source: SourceConfig sink: DynamicTypedConfig - transformers: Optional[List[DynamicTypedConfig]] + transformers: Optional[List[DynamicTypedConfig]] = None flags: FlagsConfig = Field(default=FlagsConfig(), hidden_from_docs=True) reporting: List[ReporterConfig] = [] run_id: str = DEFAULT_RUN_ID diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index f762d451849ab7..cbe68a454ea436 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -265,7 +265,7 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="Option to exclude empty projects from being ingested.", ) - @root_validator(pre=False) + @root_validator(skip_on_failure=True) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py index a35fb94614f722..05b1b6b7cc0403 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py @@ -214,7 +214,7 @@ def glob_include(self): logger.debug(f"Setting _glob_include: {glob_include}") return glob_include - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def validate_path_spec(cls, values: Dict) -> Dict[str, Any]: # validate that main fields are populated required_fields = ["include", "file_types", "default_extension"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index 83958dc76754fc..a2bd6fd1e5558e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -80,7 +80,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): hidden_from_docs=True, ) - @root_validator + @root_validator(skip_on_failure=True) def check_ingesting_data(cls, values): if ( not values.get("database_connection") diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index da1ea8ecb4678a..a9685b2554553d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -46,6 +46,7 @@ class DBTCloudConfig(DBTCommonConfig): description="The ID of the job to ingest metadata from.", ) run_id: Optional[int] = Field( + None, description="The ID of the run to ingest metadata from. 
If not specified, we'll default to the latest run.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index c4de24bf192f16..76cb82aaa5b4be 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -150,7 +150,7 @@ class DBTEntitiesEnabled(ConfigModel): description="Emit metadata for test results when set to Yes or Only", ) - @root_validator + @root_validator(skip_on_failure=True) def process_only_directive(cls, values): # Checks that at most one is set to ONLY, and then sets the others to NO. @@ -229,7 +229,7 @@ class DBTCommonConfig( default={}, description="mapping rules that will be executed against dbt column meta properties. Refer to the section below on dbt meta automated mappings.", ) - enable_meta_mapping = Field( + enable_meta_mapping: bool = Field( default=True, description="When enabled, applies the mappings that are defined through the meta_mapping directives.", ) @@ -237,7 +237,7 @@ class DBTCommonConfig( default={}, description="mapping rules that will be executed against dbt query_tag meta properties. Refer to the section below on dbt meta automated mappings.", ) - enable_query_tag_mapping = Field( + enable_query_tag_mapping: bool = Field( default=True, description="When enabled, applies the mappings that are defined through the `query_tag_mapping` directives.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index 23770ff3cf8122..25520e7aa66fff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -100,11 +100,11 @@ class KafkaSourceConfig( default="datahub.ingestion.source.confluent_schema_registry.ConfluentSchemaRegistry", description="The fully qualified implementation class(custom) that implements the KafkaSchemaRegistryBase interface.", ) - schema_tags_field = pydantic.Field( + schema_tags_field: str = pydantic.Field( default="tags", description="The field name in the schema metadata that contains the tags to be added to the dataset.", ) - enable_meta_mapping = pydantic.Field( + enable_meta_mapping: bool = pydantic.Field( default=True, description="When enabled, applies the mappings that are defined through the meta_mapping directives.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index e6b78cc7a77450..9d7c9726127779 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -275,7 +275,7 @@ def convert_string_to_connection_def(cls, conn_map): ) return conn_map - @root_validator() + @root_validator(skip_on_failure=True) def check_either_connection_map_or_connection_provided(cls, values): """Validate that we must either have a connection map or an api credential""" if not values.get("connection_to_platform_map", {}) and not values.get( @@ -286,7 +286,7 @@ def check_either_connection_map_or_connection_provided(cls, values): ) return values - @root_validator() + @root_validator(skip_on_failure=True) def check_either_project_name_or_api_provided(cls, values): """Validate that we must either have a project name or an api credential to fetch project names""" if not values.get("project_name") and not 
values.get("api"): @@ -1070,7 +1070,6 @@ def _get_fields( def determine_view_file_path( cls, base_folder_path: str, absolute_file_path: str ) -> str: - splits: List[str] = absolute_file_path.split(base_folder_path, 1) if len(splits) != 2: logger.debug( @@ -1104,7 +1103,6 @@ def from_looker_dict( populate_sql_logic_in_descriptions: bool = False, process_isolation_for_sql_parsing: bool = False, ) -> Optional["LookerView"]: - view_name = looker_view["name"] logger.debug(f"Handling view {view_name} in model {model_name}") # The sql_table_name might be defined in another view and this view is extending that view, @@ -2087,7 +2085,6 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 ) if looker_viewfile is not None: - for raw_view in looker_viewfile.views: raw_view_name = raw_view["name"] if LookerRefinementResolver.is_refinement(raw_view_name): diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py index bc05edbb3c623a..ab418b1705956f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py @@ -126,7 +126,7 @@ class NifiSourceConfig(EnvConfigMixin): description="Path to PEM file containing certs for the root CA(s) for the NiFi", ) - @root_validator + @root_validator(skip_on_failure=True) def validate_auth_params(cla, values): if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get( "client_cert_file" @@ -143,7 +143,7 @@ def validate_auth_params(cla, values): ) return values - @root_validator(pre=False) + @root_validator(skip_on_failure=True) def validator_site_url_to_site_name(cls, values): site_url_to_site_name = values.get("site_url_to_site_name") site_url = values.get("site_url") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 96729f4c60c6c4..b8cc34c234ffa4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -405,8 +405,7 @@ class PowerBiDashboardSourceConfig( "Works for M-Query where native SQL is used for transformation.", ) - @root_validator - @classmethod + @root_validator(skip_on_failure=True) def validate_extract_column_level_lineage(cls, values: Dict) -> Dict: flags = [ "native_query_parsing", @@ -445,7 +444,7 @@ def map_data_platform(cls, value): return value - @root_validator(pre=False) + @root_validator(skip_on_failure=True) def workspace_id_backward_compatibility(cls, values: Dict) -> Dict: workspace_id = values.get("workspace_id") workspace_id_pattern = values.get("workspace_id_pattern") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py index 60426fc5bd660d..ee87d93774b3dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi_report_server/report_server_domain.py @@ -12,21 +12,21 @@ class CatalogItem(BaseModel): id: str = Field(alias="Id") name: str = Field(alias="Name") - description: Optional[str] = Field(alias="Description") + description: Optional[str] = Field(None, alias="Description") path: str = Field(alias="Path") - type: Any = Field(alias="Type") + type: Any = Field(None, alias="Type") hidden: bool = 
Field(alias="Hidden") size: int = Field(alias="Size") - modified_by: Optional[str] = Field(alias="ModifiedBy") - modified_date: Optional[datetime] = Field(alias="ModifiedDate") - created_by: Optional[str] = Field(alias="CreatedBy") - created_date: Optional[datetime] = Field(alias="CreatedDate") - parent_folder_id: Optional[str] = Field(alias="ParentFolderId") - content_type: Optional[str] = Field(alias="ContentType") + modified_by: Optional[str] = Field(None, alias="ModifiedBy") + modified_date: Optional[datetime] = Field(None, alias="ModifiedDate") + created_by: Optional[str] = Field(None, alias="CreatedBy") + created_date: Optional[datetime] = Field(None, alias="CreatedDate") + parent_folder_id: Optional[str] = Field(None, alias="ParentFolderId") + content_type: Optional[str] = Field(None, alias="ContentType") content: str = Field(alias="Content") is_favorite: bool = Field(alias="IsFavorite") - user_info: Any = Field(alias="UserInfo") - display_name: Optional[str] = Field(alias="DisplayName") + user_info: Any = Field(None, alias="UserInfo") + display_name: Optional[str] = Field(None, alias="DisplayName") has_data_sources: bool = Field(default=False, alias="HasDataSources") data_sources: Optional[List["DataSource"]] = Field( default_factory=list, alias="DataSources" @@ -72,12 +72,12 @@ def __hash__(self): class DataModelDataSource(BaseModel): - auth_type: Optional[str] = Field(alias="AuthType") + auth_type: Optional[str] = Field(None, alias="AuthType") supported_auth_types: List[Optional[str]] = Field(alias="SupportedAuthTypes") kind: str = Field(alias="Kind") model_connection_name: str = Field(alias="ModelConnectionName") secret: str = Field(alias="Secret") - type: Optional[str] = Field(alias="Type") + type: Optional[str] = Field(None, alias="Type") username: str = Field(alias="Username") @@ -135,21 +135,23 @@ class DataSource(CatalogItem): is_enabled: bool = Field(alias="IsEnabled") connection_string: str = Field(alias="ConnectionString") data_model_data_source: Optional[DataModelDataSource] = Field( - alias="DataModelDataSource" + None, alias="DataModelDataSource" ) - data_source_sub_type: Optional[str] = Field(alias="DataSourceSubType") - data_source_type: Optional[str] = Field(alias="DataSourceType") + data_source_sub_type: Optional[str] = Field(None, alias="DataSourceSubType") + data_source_type: Optional[str] = Field(None, alias="DataSourceType") is_original_connection_string_expression_based: bool = Field( alias="IsOriginalConnectionStringExpressionBased" ) is_connection_string_overridden: bool = Field(alias="IsConnectionStringOverridden") - credentials_by_user: Optional[CredentialsByUser] = Field(alias="CredentialsByUser") + credentials_by_user: Optional[CredentialsByUser] = Field( + None, alias="CredentialsByUser" + ) credentials_in_server: Optional[CredentialsInServer] = Field( - alias="CredentialsInServer" + None, alias="CredentialsInServer" ) is_reference: bool = Field(alias="IsReference") - subscriptions: Optional[Subscription] = Field(alias="Subscriptions") - meta_data: Optional[MetaData] = Field(alias="MetaData") + subscriptions: Optional[Subscription] = Field(None, alias="Subscriptions") + meta_data: Optional[MetaData] = Field(None, alias="MetaData") def __members(self): return (self.id,) @@ -274,15 +276,15 @@ def __hash__(self): class CorpUserEditableInfo(BaseModel): display_name: str = Field(alias="displayName") title: str - about_me: Optional[str] = Field(alias="aboutMe") - teams: Optional[List[str]] - skills: Optional[List[str]] - picture_link: Optional[str] = 
Field(alias="pictureLink") + about_me: Optional[str] = Field(None, alias="aboutMe") + teams: Optional[List[str]] = None + skills: Optional[List[str]] = None + picture_link: Optional[str] = Field(None, alias="pictureLink") class CorpUserEditableProperties(CorpUserEditableInfo): - slack: Optional[str] - phone: Optional[str] + slack: Optional[str] = None + phone: Optional[str] = None email: str @@ -305,21 +307,21 @@ class EntityRelationshipsResult(BaseModel): start: int count: int total: int - relationships: Optional[EntityRelationship] + relationships: Optional[EntityRelationship] = None class CorpUserProperties(BaseModel): active: bool display_name: str = Field(alias="displayName") email: str - title: Optional[str] - manager: Optional["CorpUser"] - department_id: Optional[int] = Field(alias="departmentId") - department_name: Optional[str] = Field(alias="departmentName") - first_name: Optional[str] = Field(alias="firstName") - last_name: Optional[str] = Field(alias="lastName") - full_name: Optional[str] = Field(alias="fullName") - country_code: Optional[str] = Field(alias="countryCode") + title: Optional[str] = None + manager: Optional["CorpUser"] = None + department_id: Optional[int] = Field(None, alias="departmentId") + department_name: Optional[str] = Field(None, alias="departmentName") + first_name: Optional[str] = Field(None, alias="firstName") + last_name: Optional[str] = Field(None, alias="lastName") + full_name: Optional[str] = Field(None, alias="fullName") + country_code: Optional[str] = Field(None, alias="countryCode") class CorpUser(BaseModel): @@ -328,13 +330,13 @@ class CorpUser(BaseModel): username: str properties: CorpUserProperties editable_properties: Optional[CorpUserEditableProperties] = Field( - alias="editableProperties" + None, alias="editableProperties" ) - status: Optional[CorpUserStatus] - tags: Optional[GlobalTags] - relationships: Optional[EntityRelationshipsResult] - editableInfo: Optional[CorpUserEditableInfo] = Field(alias="editableInfo") - global_tags: Optional[GlobalTags] = Field(alias="globalTags") + status: Optional[CorpUserStatus] = None + tags: Optional[GlobalTags] = None + relationships: Optional[EntityRelationshipsResult] = None + editableInfo: Optional[CorpUserEditableInfo] = Field(None, alias="editableInfo") + global_tags: Optional[GlobalTags] = Field(None, alias="globalTags") def get_urn_part(self): return "{}".format(self.username) @@ -353,7 +355,7 @@ def __hash__(self): class OwnershipData(BaseModel): existing_owners: Optional[List[OwnerClass]] = [] - owner_to_add: Optional[CorpUser] + owner_to_add: Optional[CorpUser] = None class Config: arbitrary_types_allowed = True diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 79b044841e0541..9cbf1823db9395 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -81,7 +81,7 @@ class RedshiftConfig( # Because of this behavior, it uses dramatically fewer round trips for # large Redshift warehouses. As an example, see this query for the columns: # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745. 
- scheme = Field( + scheme: str = Field( default="redshift+psycopg2", description="", hidden_from_schema=True, @@ -150,14 +150,14 @@ def check_email_is_set_on_usage(cls, values): ), "email_domain needs to be set if usage is enabled" return values - @root_validator() + @root_validator(skip_on_failure=True) def check_database_or_database_alias_set(cls, values): assert values.get("database") or values.get( "database_alias" ), "either database or database_alias must be set" return values - @root_validator(pre=False) + @root_validator(skip_on_failure=True) def backward_compatibility_configs_set(cls, values: Dict) -> Dict: match_fully_qualified_names = values.get("match_fully_qualified_names") diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index 3ef6476078f6fb..f752a33b42d9c5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -144,7 +144,7 @@ def platform_not_empty(cls, platform: str, values: dict) -> str: raise ValueError("platform must not be empty") return platform - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py index 9f6d13a08b182e..89c092875e4490 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/datalake_profiler_config.py @@ -72,7 +72,7 @@ class DataLakeProfilerConfig(ConfigModel): description="Whether to profile for the sample values for all columns.", ) - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def ensure_field_level_settings_are_normalized( cls: "DataLakeProfilerConfig", values: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py index 3475c9f2881c16..6d52646f85d0a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py +++ b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py @@ -83,7 +83,7 @@ class SalesforceProfilingConfig(ConfigModel): class SalesforceConfig(DatasetSourceConfigMixin): - platform = "salesforce" + platform: str = "salesforce" auth: SalesforceAuthType = SalesforceAuthType.USERNAME_PASSWORD diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 1cbd4a3b3ea244..8f571313f18883 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -79,30 +79,30 @@ class SnowflakeColumnReference(PermissiveModel): class SnowflakeObjectAccessEntry(PermissiveModel): - columns: Optional[List[SnowflakeColumnReference]] + columns: Optional[List[SnowflakeColumnReference]] = None objectDomain: str objectName: str # Seems like it should never be null, but in practice have seen null objectIds - objectId: Optional[int] - stageKind: Optional[str] + objectId: Optional[int] = None + stageKind: Optional[str] = None class SnowflakeJoinedAccessEvent(PermissiveModel): query_start_time: datetime 
query_text: str query_type: str - rows_inserted: Optional[int] - rows_updated: Optional[int] - rows_deleted: Optional[int] + rows_inserted: Optional[int] = None + rows_updated: Optional[int] = None + rows_deleted: Optional[int] = None base_objects_accessed: List[SnowflakeObjectAccessEntry] direct_objects_accessed: List[SnowflakeObjectAccessEntry] objects_modified: List[SnowflakeObjectAccessEntry] user_name: str - first_name: Optional[str] - last_name: Optional[str] - display_name: Optional[str] - email: Optional[str] + first_name: Optional[str] = None + last_name: Optional[str] = None + display_name: Optional[str] = None + email: Optional[str] = None role_name: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 8873038079bada..30893fd03be226 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -5,12 +5,11 @@ from enum import Enum from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union -import clickhouse_driver # noqa: F401 +import clickhouse_driver import clickhouse_sqlalchemy.types as custom_types import pydantic from clickhouse_sqlalchemy.drivers import base from clickhouse_sqlalchemy.drivers.base import ClickHouseDialect -from pydantic.class_validators import root_validator from pydantic.fields import Field from sqlalchemy import create_engine, text from sqlalchemy.engine import reflection @@ -59,6 +58,8 @@ UpstreamClass, ) +assert clickhouse_driver + # adding extra types not handled by clickhouse-sqlalchemy 0.1.8 base.ischema_names["DateTime64(0)"] = DATETIME base.ischema_names["DateTime64(1)"] = DATETIME @@ -126,8 +127,8 @@ class ClickHouseConfig( TwoTierSQLAlchemyConfig, BaseTimeWindowConfig, DatasetLineageProviderConfigBase ): # defaults - host_port = Field(default="localhost:8123", description="ClickHouse host URL.") - scheme = Field(default="clickhouse", description="", hidden_from_docs=True) + host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.") + scheme: str = Field(default="clickhouse", description="", hidden_from_docs=True) password: pydantic.SecretStr = Field( default=pydantic.SecretStr(""), description="password" ) @@ -165,7 +166,7 @@ def get_sql_alchemy_url(self, current_db=None): return str(url) # pre = True because we want to take some decision before pydantic initialize the configuration to default values - @root_validator(pre=True) + @pydantic.root_validator(pre=True) def projects_backward_compatibility(cls, values: Dict) -> Dict: secure = values.get("secure") protocol = values.get("protocol") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py index 1dfa44f5491353..3f20e0a0f18b65 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py @@ -32,7 +32,7 @@ def get_table_names(self, connection, schema=None, **kwargs): class DruidConfig(BasicSQLAlchemyConfig): # defaults - scheme = "druid" + scheme: str = "druid" schema_pattern: AllowDenyPattern = Field( default=AllowDenyPattern(deny=["^(lookup|sysgit|view).*"]), description="regex patterns for schemas to filter in ingestion.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index d081acb6c1effa..003732236ba80c 100644 
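The type annotations added to `scheme`, `host_port`, and `platform` above, and to the Hive, MySQL, Postgres, Presto, Redshift, Teradata, and Trino configs that follow, address another v2 behavior change: pydantic v2 only treats annotated class attributes as model fields, and errors out when an unannotated attribute is assigned a `Field(...)`. A short sketch, assuming pydantic v2 semantics, with an invented `MyConfig` model:

    from pydantic import BaseModel, Field

    class MyConfig(BaseModel):  # hypothetical
        # v1 accepted a bare `scheme = Field(default="https")`; under v2 the
        # annotation is what marks this as a field rather than a class variable.
        scheme: str = Field(default="https")

    assert MyConfig().scheme == "https"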
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -122,7 +122,7 @@ def get_view_definition_patched(self, connection, view_name, schema=None, **kw): class HiveConfig(TwoTierSQLAlchemyConfig): # defaults - scheme = Field(default="hive", hidden_from_docs=True) + scheme: str = Field(default="hive", hidden_from_docs=True) @validator("host_port") def clean_host_port(cls, v): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py index e4969ce946f787..891b64066721bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mysql.py @@ -48,8 +48,8 @@ class MySQLConnectionConfig(SQLAlchemyConnectionConfig): # defaults - host_port = Field(default="localhost:3306", description="MySQL host URL.") - scheme = "mysql+pymysql" + host_port: str = Field(default="localhost:3306", description="MySQL host URL.") + scheme: str = "mysql+pymysql" class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index 4f133c6459a0ff..c8418075928efa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -98,8 +98,10 @@ class ViewLineageEntry(BaseModel): class BasePostgresConfig(BasicSQLAlchemyConfig): - scheme = Field(default="postgresql+psycopg2", description="database scheme") - schema_pattern = Field(default=AllowDenyPattern(deny=["information_schema"])) + scheme: str = Field(default="postgresql+psycopg2", description="database scheme") + schema_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern(deny=["information_schema"]) + ) class PostgresConfig(BasePostgresConfig): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py b/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py index c7331b4e53e5ee..9333c6edd1fa5d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/presto.py @@ -85,7 +85,7 @@ def _get_full_table( # type: ignore class PrestoConfig(TrinoConfig): # defaults - scheme = Field(default="presto", description="", hidden_from_docs=True) + scheme: str = Field(default="presto", description="", hidden_from_docs=True) @platform_name("Presto", doc_order=1) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py index cdab52ebc39356..33d517c8589e91 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py @@ -145,7 +145,7 @@ def get_identifier(self, schema: str, table: str) -> str: # Because of this behavior, it uses dramatically fewer round trips for # large Redshift warehouses. As an example, see this query for the columns: # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745. 
- scheme = Field( + scheme: str = Field( default="redshift+psycopg2", description="", hidden_from_docs=True, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 095b8e64431719..6a76ae847218d5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -107,7 +107,7 @@ def view_pattern_is_table_pattern_unless_specified( values["view_pattern"] = table_pattern return values - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index 899a7b6697c0a5..8aeb1e50cd0b30 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -70,7 +70,7 @@ class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowRep class BaseTeradataConfig(TwoTierSQLAlchemyConfig): - scheme = Field(default="teradatasql", description="database scheme") + scheme: str = Field(default="teradatasql", description="database scheme") class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py index 3b80cbed86c027..2b693d9d80d91a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/trino.py @@ -133,7 +133,7 @@ def _get_columns(self, connection, table_name, schema: str = None, **kw): # typ class TrinoConfig(BasicSQLAlchemyConfig): # defaults - scheme = Field(default="trino", description="", hidden_from_docs=True) + scheme: str = Field(default="trino", description="", hidden_from_docs=True) def get_identifier(self: BasicSQLAlchemyConfig, schema: str, table: str) -> str: regular = f"{schema}.{table}" diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index d11b1f9ad6a537..b1d2b276130a96 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -5,13 +5,13 @@ import pydantic from pydantic import root_validator from pydantic.fields import Field -from pydantic.generics import GenericModel from datahub.configuration.common import ( ConfigModel, ConfigurationError, DynamicTypedConfig, ) +from datahub.configuration.pydantic_migration_helpers import GenericModel from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.common import PipelineContext @@ -77,7 +77,7 @@ class StatefulIngestionConfig(ConfigModel): hidden_from_docs=True, ) - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("enabled"): if values.get("state_provider") is None: @@ -112,7 +112,7 @@ class StatefulLineageConfigMixin: "store_last_lineage_extraction_timestamp", "enable_stateful_lineage_ingestion" ) - 
@root_validator(pre=False) + @root_validator(skip_on_failure=True) def lineage_stateful_option_validator(cls, values: Dict) -> Dict: sti = values.get("stateful_ingestion") if not sti or not sti.enabled: @@ -137,7 +137,7 @@ class StatefulProfilingConfigMixin(ConfigModel): "store_last_profiling_timestamps", "enable_stateful_profiling" ) - @root_validator(pre=False) + @root_validator(skip_on_failure=True) def profiling_stateful_option_validator(cls, values: Dict) -> Dict: sti = values.get("stateful_ingestion") if not sti or not sti.enabled: @@ -161,7 +161,7 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig): "store_last_usage_extraction_timestamp", "enable_stateful_usage_ingestion" ) - @root_validator(pre=False) + @root_validator(skip_on_failure=True) def last_usage_extraction_stateful_option_validator(cls, values: Dict) -> Dict: sti = values.get("stateful_ingestion") if not sti or not sti.enabled: diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index e491a1e8b82fa3..1ae971e4a82d0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -105,7 +105,7 @@ class SupersetConfig(StatefulIngestionConfigBase, ConfigModel): def remove_trailing_slash(cls, v): return config_clean.remove_trailing_slashes(v) - @root_validator + @root_validator(skip_on_failure=True) def default_display_uri_to_connect_uri(cls, values): base = values.get("display_uri") if base is None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 16820c37d546ef..7073830318abe4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -76,7 +76,7 @@ class UnityCatalogProfilerConfig(ConfigModel): description="Number of worker threads to use for profiling. Set to 1 to disable.", ) - @pydantic.root_validator + @pydantic.root_validator(skip_on_failure=True) def warehouse_id_required_for_profiling( cls, values: Dict[str, Any] ) -> Dict[str, Any]: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 9fc697018ecd6b..2e9a15063661e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -340,7 +340,6 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection: class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig): - include_table_lineage: bool = pydantic.Field( default=True, description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. 
Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", @@ -357,7 +356,7 @@ class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig ignore_start_time_lineage: bool = False upstream_lineage_in_report: bool = False - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def validate_include_view_lineage(cls, values): if ( "include_table_lineage" in values diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py index 6037490acb267d..5eb9c83236e4f9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py @@ -44,7 +44,7 @@ class BigQueryCredential(ConfigModel): description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", ) - @pydantic.root_validator() + @pydantic.root_validator(skip_on_failure=True) def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("client_x509_cert_url") is None: values[ diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index acc7954ad25a63..075bfd29008f64 100644 --- a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -23,18 +23,18 @@ class VersionStats(BaseModel, arbitrary_types_allowed=True): version: Version - release_date: Optional[datetime] + release_date: Optional[datetime] = None class ServerVersionStats(BaseModel): current: VersionStats - latest: Optional[VersionStats] - current_server_type: Optional[str] + latest: Optional[VersionStats] = None + current_server_type: Optional[str] = None class ClientVersionStats(BaseModel): current: VersionStats - latest: Optional[VersionStats] + latest: Optional[VersionStats] = None class DataHubVersionStats(BaseModel): diff --git a/metadata-ingestion/src/datahub/utilities/lossy_collections.py b/metadata-ingestion/src/datahub/utilities/lossy_collections.py index f0c1e0da405528..0542a9dfd51f9f 100644 --- a/metadata-ingestion/src/datahub/utilities/lossy_collections.py +++ b/metadata-ingestion/src/datahub/utilities/lossy_collections.py @@ -1,6 +1,8 @@ import random from typing import Dict, Iterator, List, Set, TypeVar, Union +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 + T = TypeVar("T") _KT = TypeVar("_KT") _VT = TypeVar("_VT") @@ -41,6 +43,16 @@ def __repr__(self) -> str: def __str__(self) -> str: return repr(self) + if PYDANTIC_VERSION_2: + # With pydantic 2, it doesn't recognize that this is a list subclass, + # so we need to make it explicit. 
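The hook added just below gives pydantic v2 an explicit recipe for the list subclass: validate the input as a plain `list`, then convert the result into the subclass. The same pattern, sketched on an invented `TruncList` standing in for `LossyList`:

    from pydantic import BaseModel
    from pydantic_core import core_schema

    class TruncList(list):  # hypothetical stand-in for LossyList
        @classmethod
        def __get_pydantic_core_schema__(cls, source_type, handler):
            # Validate as a plain list first, then wrap the result in this class.
            return core_schema.no_info_after_validator_function(cls, handler(list))

    class Report(BaseModel):
        items: TruncList

    assert isinstance(Report(items=[1, 2]).items, TruncList)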
+ + @classmethod + def __get_pydantic_core_schema__(cls, source_type, handler): # type: ignore + from pydantic_core import core_schema + + return core_schema.no_info_after_validator_function(cls, handler(list)) + def as_obj(self) -> List[Union[T, str]]: base_list: List[Union[T, str]] = list(self.__iter__()) if self.sampled: diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index 6413275ac63a6f..cdffb684d958e5 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -17,6 +17,7 @@ from pydantic import BaseModel from typing_extensions import TypedDict +from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2 from datahub.emitter.mce_builder import ( DEFAULT_ENV, make_dataset_urn_with_platform_instance, @@ -122,12 +123,17 @@ class _ParserBaseModel( SchemaFieldDataTypeClass: lambda v: v.to_obj(), }, ): - pass + def json(self, *args: Any, **kwargs: Any) -> str: + if PYDANTIC_VERSION_2: + return super().model_dump_json(*args, **kwargs) # type: ignore + else: + return super().json(*args, **kwargs) @functools.total_ordering class _FrozenModel(_ParserBaseModel, frozen=True): def __lt__(self, other: "_FrozenModel") -> bool: + # TODO: The __fields__ attribute is deprecated in Pydantic v2. for field in self.__fields__: self_v = getattr(self, field) other_v = getattr(other, field) @@ -138,8 +144,8 @@ def __lt__(self, other: "_FrozenModel") -> bool: class _TableName(_FrozenModel): - database: Optional[str] - db_schema: Optional[str] + database: Optional[str] = None + db_schema: Optional[str] = None table: str def as_sqlglot_table(self) -> sqlglot.exp.Table: @@ -187,16 +193,16 @@ class ColumnRef(_ParserBaseModel): class _DownstreamColumnRef(_ParserBaseModel): - table: Optional[_TableName] + table: Optional[_TableName] = None column: str - column_type: Optional[sqlglot.exp.DataType] + column_type: Optional[sqlglot.exp.DataType] = None class DownstreamColumnRef(_ParserBaseModel): - table: Optional[Urn] + table: Optional[Urn] = None column: str - column_type: Optional[SchemaFieldDataTypeClass] - native_column_type: Optional[str] + column_type: Optional[SchemaFieldDataTypeClass] = None + native_column_type: Optional[str] = None @pydantic.validator("column_type", pre=True) def _load_column_type( @@ -213,7 +219,7 @@ class _ColumnLineageInfo(_ParserBaseModel): downstream: _DownstreamColumnRef upstreams: List[_ColumnRef] - logic: Optional[str] + logic: Optional[str] = None class ColumnLineageInfo(_ParserBaseModel): @@ -244,7 +250,7 @@ class SqlParsingResult(_ParserBaseModel): in_tables: List[Urn] out_tables: List[Urn] - column_lineage: Optional[List[ColumnLineageInfo]] + column_lineage: Optional[List[ColumnLineageInfo]] = None # TODO include formatted original sql logic # TODO include list of referenced columns From a187127ac5e5a3aebd9ef217e3facadc159f59fa Mon Sep 17 00:00:00 2001 From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com> Date: Sat, 11 Nov 2023 04:06:00 +0530 Subject: [PATCH 20/29] feat(ingestion): file-based state checkpoint provider (#9029) --- metadata-ingestion/setup.py | 1 + ...gestion_job_checkpointing_provider_base.py | 2 +- .../src/datahub/ingestion/graph/client.py | 2 +- .../source/state/stateful_ingestion_base.py | 17 +- ...atahub_ingestion_checkpointing_provider.py | 27 +- .../file_ingestion_checkpointing_provider.py | 108 +++ .../integration/lookml/golden_test_state.json | 26 + 
.../lookml_mces_golden_deleted_stateful.json | 650 ------------------ .../tests/integration/lookml/test_lookml.py | 116 +--- ...atahub_ingestion_checkpointing_provider.py | 170 ----- .../provider/test_provider.py | 183 +++++ .../state/golden_test_checkpoint_state.json | 26 + ...n_test_checkpoint_state_after_deleted.json | 26 + .../state/golden_test_stateful_ingestion.json | 50 ++ ...test_stateful_ingestion_after_deleted.json | 50 ++ .../state/test_stateful_ingestion.py | 227 ++++++ .../unit/stateful_ingestion/test_configs.py | 15 +- 17 files changed, 739 insertions(+), 957 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py create mode 100644 metadata-ingestion/tests/integration/lookml/golden_test_state.json delete mode 100644 metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json delete mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py create mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/provider/test_provider.py create mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json create mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json create mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json create mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json create mode 100644 metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 2392fce0580613..5f44f14c3d74cc 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -666,6 +666,7 @@ ], "datahub.ingestion.checkpointing_provider.plugins": [ "datahub = datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider:DatahubIngestionCheckpointingProvider", + "file = datahub.ingestion.source.state_provider.file_ingestion_checkpointing_provider:FileIngestionCheckpointingProvider", ], "datahub.ingestion.reporting_provider.plugins": [ "datahub = datahub.ingestion.reporting.datahub_ingestion_run_summary_provider:DatahubIngestionRunSummaryProvider", diff --git a/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py b/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py index ca02b88ab63245..285ad9c0884474 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +++ b/metadata-ingestion/src/datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py @@ -35,7 +35,7 @@ def __init__( @classmethod @abstractmethod def create( - cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext, name: str + cls: Type[_Self], config_dict: Dict[str, Any], ctx: PipelineContext ) -> "_Self": pass diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index ccff677c3a4716..d91165ac9777ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -756,7 +756,7 @@ def get_latest_pipeline_checkpoint( DatahubIngestionCheckpointingProvider, ) - checkpoint_provider = DatahubIngestionCheckpointingProvider(self, "graph") + checkpoint_provider = 
DatahubIngestionCheckpointingProvider(self) job_name = StaleEntityRemovalHandler.compute_job_id(platform) raw_checkpoint = checkpoint_provider.get_latest_checkpoint( diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index b1d2b276130a96..8a448f40e95b4b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass -from typing import Any, Dict, Generic, Optional, Type, TypeVar, cast +from typing import Any, Dict, Generic, Optional, Type, TypeVar import pydantic from pydantic import root_validator @@ -39,10 +39,8 @@ class DynamicTypedStateProviderConfig(DynamicTypedConfig): type: str = Field( description="The type of the state provider to use. For DataHub use `datahub`", ) - # This config type is declared Optional[Any] here. The eventual parser for the - # specified type is responsible for further validation. - config: Optional[Any] = Field( - default=None, + config: Dict[str, Any] = Field( + default={}, description="The configuration required for initializing the state provider. Default: The datahub_api config if set at pipeline level. Otherwise, the default DatahubClientConfig. See the defaults (https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/src/datahub/ingestion/graph/client.py#L19).", ) @@ -82,7 +80,7 @@ def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: if values.get("enabled"): if values.get("state_provider") is None: values["state_provider"] = DynamicTypedStateProviderConfig( - type="datahub", config=None + type="datahub", config={} ) return values @@ -252,15 +250,10 @@ def _initialize_checkpointing_state_provider(self) -> None: f"Cannot find checkpoint provider class of type={self.stateful_ingestion_config.state_provider.type} " " in the registry! Please check the type of the checkpointing provider in your config." 
) - config_dict: Dict[str, Any] = cast( - Dict[str, Any], - self.stateful_ingestion_config.state_provider.dict().get("config", {}), - ) self.ingestion_checkpointing_state_provider = ( checkpointing_state_provider_class.create( - config_dict=config_dict, + config_dict=self.stateful_ingestion_config.state_provider.config, ctx=self.ctx, - name=checkpointing_state_provider_class.__name__, ) ) assert self.ingestion_checkpointing_state_provider diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py index d7ebcba2c6695a..442abb3aaf4cf8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py @@ -17,14 +17,17 @@ class DatahubIngestionStateProviderConfig(IngestionCheckpointingProviderConfig): - datahub_api: Optional[DatahubClientConfig] = DatahubClientConfig() + datahub_api: DatahubClientConfig = DatahubClientConfig() class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase): orchestrator_name: str = "datahub" - def __init__(self, graph: DataHubGraph, name: str): - super().__init__(name) + def __init__( + self, + graph: DataHubGraph, + ): + super().__init__(self.__class__.__name__) self.graph = graph if not self._is_server_stateful_ingestion_capable(): raise ConfigurationError( @@ -34,24 +37,14 @@ def __init__(self, graph: DataHubGraph, name: str): @classmethod def create( - cls, config_dict: Dict[str, Any], ctx: PipelineContext, name: str + cls, config_dict: Dict[str, Any], ctx: PipelineContext ) -> "DatahubIngestionCheckpointingProvider": + config = DatahubIngestionStateProviderConfig.parse_obj(config_dict) if ctx.graph: # Use the pipeline-level graph if set - return cls(ctx.graph, name) - elif config_dict is None: - raise ConfigurationError("Missing provider configuration.") + return cls(ctx.graph) else: - provider_config = ( - DatahubIngestionStateProviderConfig.parse_obj_allow_extras(config_dict) - ) - if provider_config.datahub_api: - graph = DataHubGraph(provider_config.datahub_api) - return cls(graph, name) - else: - raise ConfigurationError( - "Missing datahub_api. Provide either a global one or under the state_provider." 
- ) + return cls(DataHubGraph(config.datahub_api)) def _is_server_stateful_ingestion_capable(self) -> bool: server_config = self.graph.get_config() if self.graph else None diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py new file mode 100644 index 00000000000000..a37774773b84d7 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py @@ -0,0 +1,108 @@ +import logging +import pathlib +from datetime import datetime +from typing import Any, Dict, List, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import ( + IngestionCheckpointingProviderBase, + IngestionCheckpointingProviderConfig, + JobId, +) +from datahub.ingestion.sink.file import write_metadata_file +from datahub.ingestion.source.file import read_metadata_file +from datahub.metadata.schema_classes import DatahubIngestionCheckpointClass + +logger = logging.getLogger(__name__) + + +class FileIngestionStateProviderConfig(IngestionCheckpointingProviderConfig): + filename: str + + +class FileIngestionCheckpointingProvider(IngestionCheckpointingProviderBase): + orchestrator_name: str = "file" + + def __init__(self, config: FileIngestionStateProviderConfig): + super().__init__(self.__class__.__name__) + self.config = config + + @classmethod + def create( + cls, config_dict: Dict[str, Any], ctx: PipelineContext + ) -> "FileIngestionCheckpointingProvider": + config = FileIngestionStateProviderConfig.parse_obj(config_dict) + return cls(config) + + def get_latest_checkpoint( + self, + pipeline_name: str, + job_name: JobId, + ) -> Optional[DatahubIngestionCheckpointClass]: + logger.debug( + f"Querying for the latest ingestion checkpoint for pipelineName:'{pipeline_name}'," + f" job_name:'{job_name}'" + ) + + data_job_urn = self.get_data_job_urn( + self.orchestrator_name, pipeline_name, job_name + ) + latest_checkpoint: Optional[DatahubIngestionCheckpointClass] = None + try: + for obj in read_metadata_file(pathlib.Path(self.config.filename)): + if ( + isinstance(obj, MetadataChangeProposalWrapper) + and obj.entityUrn == data_job_urn + and obj.aspect + and isinstance(obj.aspect, DatahubIngestionCheckpointClass) + and obj.aspect.get("pipelineName", "") == pipeline_name + ): + latest_checkpoint = obj.aspect + break + except FileNotFoundError: + logger.debug(f"File {self.config.filename} not found") + + if latest_checkpoint: + logger.debug( + f"The last committed ingestion checkpoint for pipelineName:'{pipeline_name}'," + f" job_name:'{job_name}' found with start_time:" + f" {datetime.utcfromtimestamp(latest_checkpoint.timestampMillis/1000)}" + ) + return latest_checkpoint + else: + logger.debug( + f"No committed ingestion checkpoint for pipelineName:'{pipeline_name}'," + f" job_name:'{job_name}' found" + ) + + return None + + def commit(self) -> None: + if not self.state_to_commit: + logger.warning(f"No state available to commit for {self.name}") + return None + + checkpoint_workunits: List[MetadataChangeProposalWrapper] = [] + for job_name, checkpoint in self.state_to_commit.items(): + # Emit the ingestion state for each job + logger.debug( + f"Committing ingestion checkpoint for pipeline:'{checkpoint.pipelineName}', " + f"job:'{job_name}'" + ) + datajob_urn = 
self.get_data_job_urn( + self.orchestrator_name, + checkpoint.pipelineName, + job_name, + ) + checkpoint_workunits.append( + MetadataChangeProposalWrapper( + entityUrn=datajob_urn, + aspect=checkpoint, + ) + ) + write_metadata_file(pathlib.Path(self.config.filename), checkpoint_workunits) + self.committed = True + logger.debug( + f"Committed all ingestion checkpoints for pipeline:'{checkpoint.pipelineName}'" + ) diff --git a/metadata-ingestion/tests/integration/lookml/golden_test_state.json b/metadata-ingestion/tests/integration/lookml/golden_test_state.json new file mode 100644 index 00000000000000..c62106ac100890 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/golden_test_state.json @@ -0,0 +1,26 @@ +[ +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(file,lookml_stateful,prod),lookml_stale_entity_removal)", + "changeType": "UPSERT", + "aspectName": "datahubIngestionCheckpoint", + "aspect": { + "json": { + "timestampMillis": 1586847600000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "pipelineName": "lookml_stateful", + "platformInstanceId": "", + "config": "", + "state": { + "formatVersion": "1.0", + "serde": "base85-bz2-json", + "payload": "LRx4!F+o`-Q(4)<4JiNuUmt)_WdINa0@Mn>@BivB0a-v1sF;Ar&}h&A0K-EjK*+=xnKU%Oib;?JVrrXB7?aRqCarWwpZm8v5Yh+DsN{|c*msMh9%WJXjKPvIPsDn^@g3;DD9Q9kBh?*|=8M4uRW$_0HKn3XhN;RhAcLIBhLnO2%UA@Ykl;h&Xx(^@2;Y9C#d4g3K_2CA-I*M)h{NMA8Nu4C3XjEQYdh{nR--&lfRUsTL}OOkOO435f=1nKzYJ^9)mbBljM0}gaqy26URw1=q<80Eb9y)y?Vl88kG;g~MToq#r%6trK9U`U?k}RS<@^?i@1M1@9*%tk}1N3hRzUaNB" + }, + "runId": "lookml-test" + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json deleted file mode 100644 index a3231186669408..00000000000000 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json +++ /dev/null @@ -1,650 +0,0 @@ -[ -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/prod/looker/lkml_samples/views" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", - "type": "VIEW" - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "country", - "nullable": false, - "description": "The country", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "city", - "nullable": false, - "description": "City", - "label": 
"", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "is_latest", - "nullable": false, - "description": "Is latest data", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "yesno", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "timestamp", - "nullable": false, - "description": "Timestamp of measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.TimeType": {} - } - }, - "nativeDataType": "time", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - }, - { - "tag": "urn:li:tag:Temporal" - } - ] - }, - "isPartOfKey": false - }, - { - "fieldPath": "average_measurement", - "nullable": false, - "description": "My measurement", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "average", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Measure" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "foo.view.lkml" - }, - "name": "my_view", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", - "viewLanguage": "sql" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "looker" - }, - { - "id": "lkml_samples" - }, - { - "id": "views" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.common.BrowsePaths": { - "paths": [ - "/prod/looker/lkml_samples/views" - ] - } - }, - { - "com.linkedin.pegasus2avro.common.Status": { - "removed": false - } - }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": 
"FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),id)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),owner_name)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD),owner_name)" - ], - "confidenceScore": 1.0 - } - ] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "owners", - "platform": "urn:li:dataPlatform:looker", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": true - }, - { - "fieldPath": "owner_name", - "nullable": false, - "description": "", - "label": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "globalTags": { - "tags": [ - { - "tag": "urn:li:tag:Dimension" - } - ] - }, - "isPartOfKey": false - } - ], - "primaryKeys": [ - "id" - ] - } - }, - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "looker.file.path": "owners.view.lkml" - }, - "name": "owners", - "tags": [] - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": false, - "viewLogic": "view: owners {\n dimension: id {\n primary_key: yes\n sql: ${TABLE}.id ;;\n }\n dimension: owner_name {\n sql: ${TABLE}.owner_name ;;\n }\n}", - "viewLanguage": "lookml" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "looker" - }, - { - "id": "lkml_samples" - }, - { - "id": "views" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Dimension", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "Dimension" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" 
- } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Measure", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "Measure" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "tag", - "entityUrn": "urn:li:tag:Temporal", - "changeType": "UPSERT", - "aspectName": "tagKey", - "aspect": { - "json": { - "name": "Temporal" - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": true - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test" - } -} -] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 21a0b19849d975..b1853cfa2b3c0a 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -1,6 +1,6 @@ import logging import pathlib -from typing import Any, Dict, List, cast +from typing import Any, List from unittest import mock import pydantic @@ -17,17 +17,13 @@ LookerRefinementResolver, LookMLSourceConfig, ) -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.metadata.schema_classes import ( DatasetSnapshotClass, MetadataChangeEventClass, UpstreamLineageClass, ) from tests.test_helpers import mce_helpers -from tests.test_helpers.state_helpers import ( - get_current_checkpoint_from_pipeline, - validate_all_providers_have_committed_successfully, -) +from tests.test_helpers.state_helpers import get_current_checkpoint_from_pipeline logging.getLogger("lkml").setLevel(logging.INFO) @@ -728,11 +724,10 @@ def test_hive_platform_drops_ids(pytestconfig, tmp_path, mock_time): @freeze_time(FROZEN_TIME) -def test_lookml_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): +def test_lookml_stateful_ingestion(pytestconfig, tmp_path, mock_time): output_file_name: str = "lookml_mces.json" - golden_file_name: str = "expected_output.json" - output_file_deleted_name: str = "lookml_mces_deleted_stateful.json" - golden_file_deleted_name: str = "lookml_mces_golden_deleted_stateful.json" + state_file_name: str = "lookml_state_mces.json" + golden_file_name: str = "golden_test_state.json" test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" @@ -754,106 +749,37 @@ def test_lookml_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_ "remove_stale_metadata": True, "fail_safe_threshold": 100.0, "state_provider": { - "type": "datahub", - "config": {"datahub_api": {"server": GMS_SERVER}}, + "type": "file", + "config": { + "filename": f"{tmp_path}/{state_file_name}", + }, }, }, }, }, "sink": { "type": "file", - "config": {}, + "config": { + "filename": f"{tmp_path}/{output_file_name}", + }, }, } - pipeline_run1 = None - with mock.patch( - "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", - mock_datahub_graph, - ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph - pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict( # type: ignore - base_pipeline_config # type: ignore - ) - # Set the special properties for this run - pipeline_run1_config["source"]["config"]["emit_reachable_views_only"] = False - pipeline_run1_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_name}" - pipeline_run1 = Pipeline.create(pipeline_run1_config) - pipeline_run1.run() - 
pipeline_run1.raise_from_status() - pipeline_run1.pretty_print_summary() + pipeline_run1 = Pipeline.create(base_pipeline_config) + pipeline_run1.run() + pipeline_run1.raise_from_status() + pipeline_run1.pretty_print_summary() - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / output_file_name, - golden_path=f"{test_resources_dir}/{golden_file_name}", - ) + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/{state_file_name}", + golden_path=f"{test_resources_dir}/{golden_file_name}", + ) checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1) assert checkpoint1 assert checkpoint1.state - pipeline_run2 = None - with mock.patch( - "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", - mock_datahub_graph, - ) as mock_checkpoint: - mock_checkpoint.return_value = mock_datahub_graph - pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config) # type: ignore - # Set the special properties for this run - pipeline_run2_config["source"]["config"]["emit_reachable_views_only"] = True - pipeline_run2_config["sink"]["config"][ - "filename" - ] = f"{tmp_path}/{output_file_deleted_name}" - pipeline_run2 = Pipeline.create(pipeline_run2_config) - pipeline_run2.run() - pipeline_run2.raise_from_status() - pipeline_run2.pretty_print_summary() - - mce_helpers.check_golden_file( - pytestconfig, - output_path=tmp_path / output_file_deleted_name, - golden_path=f"{test_resources_dir}/{golden_file_deleted_name}", - ) - checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2) - assert checkpoint2 - assert checkpoint2.state - - # Validate that all providers have committed successfully. - validate_all_providers_have_committed_successfully( - pipeline=pipeline_run1, expected_providers=1 - ) - validate_all_providers_have_committed_successfully( - pipeline=pipeline_run2, expected_providers=1 - ) - - # Perform all assertions on the states. 
The deleted table should not be - # part of the second state - state1 = cast(GenericCheckpointState, checkpoint1.state) - state2 = cast(GenericCheckpointState, checkpoint2.state) - - difference_dataset_urns = list( - state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2) - ) - # the difference in dataset urns are all the views that are not reachable from the model file - assert len(difference_dataset_urns) == 11 - deleted_dataset_urns: List[str] = [ - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", - ] - assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns) - def test_lookml_base_folder(): fake_api = { diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py deleted file mode 100644 index 600985266043b1..00000000000000 --- a/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_datahub_ingestion_checkpointing_provider.py +++ /dev/null @@ -1,170 +0,0 @@ -import types -import unittest -from typing import Dict, List, Optional, Type -from unittest.mock import MagicMock, patch - -from avrogen.dict_wrapper import DictWrapper - -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import ( - CheckpointJobStateType, - JobId, -) -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) -from datahub.ingestion.source.state.usage_common_state import ( - BaseTimeWindowCheckpointState, -) -from datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider import ( - DatahubIngestionCheckpointingProvider, -) -from tests.test_helpers.type_helpers import assert_not_null - - -class TestDatahubIngestionCheckpointProvider(unittest.TestCase): - # Static members for the tests - pipeline_name: str = "test_pipeline" - job_names: List[JobId] = [JobId("job1"), JobId("job2")] - run_id: str = "test_run" - - def setUp(self) -> None: - self._setup_mock_graph() - self.provider = self._create_provider() - assert self.provider - - def _setup_mock_graph(self) -> None: - """ - Setup monkey-patched graph client. 
- """ - self.patcher = patch( - "datahub.ingestion.graph.client.DataHubGraph", autospec=True - ) - self.addCleanup(self.patcher.stop) - self.mock_graph = self.patcher.start() - # Make server stateful ingestion capable - self.mock_graph.get_config.return_value = {"statefulIngestionCapable": True} - # Bind mock_graph's emit_mcp to testcase's monkey_patch_emit_mcp so that we can emulate emits. - self.mock_graph.emit_mcp = types.MethodType( - self.monkey_patch_emit_mcp, self.mock_graph - ) - # Bind mock_graph's get_latest_timeseries_value to monkey_patch_get_latest_timeseries_value - self.mock_graph.get_latest_timeseries_value = types.MethodType( - self.monkey_patch_get_latest_timeseries_value, self.mock_graph - ) - # Tracking for emitted mcps. - self.mcps_emitted: Dict[str, MetadataChangeProposalWrapper] = {} - - def _create_provider(self) -> DatahubIngestionCheckpointingProvider: - ctx: PipelineContext = PipelineContext( - run_id=self.run_id, pipeline_name=self.pipeline_name - ) - ctx.graph = self.mock_graph - return DatahubIngestionCheckpointingProvider.create( - {}, ctx, name=DatahubIngestionCheckpointingProvider.__name__ - ) - - def monkey_patch_emit_mcp( - self, graph_ref: MagicMock, mcpw: MetadataChangeProposalWrapper - ) -> None: - """ - Mockey patched implementation of DatahubGraph.emit_mcp that caches the mcp locally in memory. - """ - self.assertIsNotNone(graph_ref) - if mcpw.aspectName != "status": - self.assertEqual(mcpw.entityType, "dataJob") - self.assertEqual(mcpw.aspectName, "datahubIngestionCheckpoint") - # Cache the mcpw against the entityUrn - assert mcpw.entityUrn is not None - self.mcps_emitted[mcpw.entityUrn] = mcpw - - def monkey_patch_get_latest_timeseries_value( - self, - graph_ref: MagicMock, - entity_urn: str, - aspect_type: Type[DictWrapper], - filter_criteria_map: Dict[str, str], - ) -> Optional[DictWrapper]: - """ - Monkey patched implementation of DatahubGraph.get_latest_timeseries_value that returns the latest cached aspect - for a given entity urn. - """ - self.assertIsNotNone(graph_ref) - self.assertEqual(aspect_type, CheckpointJobStateType) - self.assertEqual( - filter_criteria_map, - { - "pipelineName": self.pipeline_name, - }, - ) - # Retrieve the cached mcpw and return its aspect value. - mcpw = self.mcps_emitted.get(entity_urn) - if mcpw: - return mcpw.aspect - return None - - def test_provider(self): - # 1. Create the individual job checkpoints with appropriate states. - # Job1 - Checkpoint with a BaseSQLAlchemyCheckpointState state - job1_state_obj = BaseSQLAlchemyCheckpointState() - job1_checkpoint = Checkpoint( - job_name=self.job_names[0], - pipeline_name=self.pipeline_name, - run_id=self.run_id, - state=job1_state_obj, - ) - # Job2 - Checkpoint with a BaseTimeWindowCheckpointState state - job2_state_obj = BaseTimeWindowCheckpointState( - begin_timestamp_millis=10, end_timestamp_millis=100 - ) - job2_checkpoint = Checkpoint( - job_name=self.job_names[1], - pipeline_name=self.pipeline_name, - run_id=self.run_id, - state=job2_state_obj, - ) - - # 2. Set the provider's state_to_commit. - self.provider.state_to_commit = { - # NOTE: state_to_commit accepts only the aspect version of the checkpoint. - self.job_names[0]: assert_not_null( - job1_checkpoint.to_checkpoint_aspect(max_allowed_state_size=2**20) - ), - self.job_names[1]: assert_not_null( - job2_checkpoint.to_checkpoint_aspect(max_allowed_state_size=2**20) - ), - } - - # 3. Perform the commit - # NOTE: This will commit the state to the in-memory self.mcps_emitted because of the monkey-patching. 
- self.provider.commit() - self.assertTrue(self.provider.committed) - - # 4. Get last committed state. This must match what has been committed earlier. - # NOTE: This will retrieve from in-memory self.mcps_emitted because of the monkey-patching. - job1_last_state = self.provider.get_latest_checkpoint( - self.pipeline_name, self.job_names[0] - ) - job2_last_state = self.provider.get_latest_checkpoint( - self.pipeline_name, self.job_names[1] - ) - - # 5. Validate individual job checkpoint state values that have been committed and retrieved - # against the original values. - self.assertIsNotNone(job1_last_state) - job1_last_checkpoint = Checkpoint.create_from_checkpoint_aspect( - job_name=self.job_names[0], - checkpoint_aspect=job1_last_state, - state_class=type(job1_state_obj), - ) - self.assertEqual(job1_last_checkpoint, job1_checkpoint) - - self.assertIsNotNone(job2_last_state) - job2_last_checkpoint = Checkpoint.create_from_checkpoint_aspect( - job_name=self.job_names[1], - checkpoint_aspect=job2_last_state, - state_class=type(job2_state_obj), - ) - self.assertEqual(job2_last_checkpoint, job2_checkpoint) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_provider.py b/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_provider.py new file mode 100644 index 00000000000000..4387e5a17790f7 --- /dev/null +++ b/metadata-ingestion/tests/unit/stateful_ingestion/provider/test_provider.py @@ -0,0 +1,183 @@ +import tempfile +import types +import unittest +from typing import Dict, List, Optional, Type +from unittest.mock import MagicMock, patch + +from avrogen.dict_wrapper import DictWrapper + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import ( + CheckpointJobStateType, + IngestionCheckpointingProviderBase, + JobId, +) +from datahub.ingestion.source.state.checkpoint import Checkpoint +from datahub.ingestion.source.state.sql_common_state import ( + BaseSQLAlchemyCheckpointState, +) +from datahub.ingestion.source.state.usage_common_state import ( + BaseTimeWindowCheckpointState, +) +from datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider import ( + DatahubIngestionCheckpointingProvider, +) +from datahub.ingestion.source.state_provider.file_ingestion_checkpointing_provider import ( + FileIngestionCheckpointingProvider, +) +from tests.test_helpers.type_helpers import assert_not_null + + +class TestIngestionCheckpointProviders(unittest.TestCase): + # Static members for the tests + pipeline_name: str = "test_pipeline" + job_names: List[JobId] = [JobId("job1"), JobId("job2")] + run_id: str = "test_run" + + def setUp(self) -> None: + self._setup_mock_graph() + self._create_providers() + + def _setup_mock_graph(self) -> None: + """ + Setup monkey-patched graph client. + """ + self.patcher = patch( + "datahub.ingestion.graph.client.DataHubGraph", autospec=True + ) + self.addCleanup(self.patcher.stop) + self.mock_graph = self.patcher.start() + # Make server stateful ingestion capable + self.mock_graph.get_config.return_value = {"statefulIngestionCapable": True} + # Bind mock_graph's emit_mcp to testcase's monkey_patch_emit_mcp so that we can emulate emits. 
+        self.mock_graph.emit_mcp = types.MethodType(
+            self.monkey_patch_emit_mcp, self.mock_graph
+        )
+        # Bind mock_graph's get_latest_timeseries_value to monkey_patch_get_latest_timeseries_value
+        self.mock_graph.get_latest_timeseries_value = types.MethodType(
+            self.monkey_patch_get_latest_timeseries_value, self.mock_graph
+        )
+        # Tracking for emitted mcps.
+        self.mcps_emitted: Dict[str, MetadataChangeProposalWrapper] = {}
+
+    def _create_providers(self) -> None:
+        ctx: PipelineContext = PipelineContext(
+            run_id=self.run_id, pipeline_name=self.pipeline_name
+        )
+        ctx.graph = self.mock_graph
+        self.providers: List[IngestionCheckpointingProviderBase] = [
+            DatahubIngestionCheckpointingProvider.create({}, ctx),
+            FileIngestionCheckpointingProvider.create(
+                {"filename": f"{tempfile.mkdtemp()}/checkpoint_mces.json"},
+                ctx,
+            ),
+        ]
+
+    def monkey_patch_emit_mcp(
+        self, graph_ref: MagicMock, mcpw: MetadataChangeProposalWrapper
+    ) -> None:
+        """
+        Monkey patched implementation of DatahubGraph.emit_mcp that caches the mcp locally in memory.
+        """
+        self.assertIsNotNone(graph_ref)
+        if mcpw.aspectName != "status":
+            self.assertEqual(mcpw.entityType, "dataJob")
+            self.assertEqual(mcpw.aspectName, "datahubIngestionCheckpoint")
+        # Cache the mcpw against the entityUrn
+        assert mcpw.entityUrn is not None
+        self.mcps_emitted[mcpw.entityUrn] = mcpw
+
+    def monkey_patch_get_latest_timeseries_value(
+        self,
+        graph_ref: MagicMock,
+        entity_urn: str,
+        aspect_type: Type[DictWrapper],
+        filter_criteria_map: Dict[str, str],
+    ) -> Optional[DictWrapper]:
+        """
+        Monkey patched implementation of DatahubGraph.get_latest_timeseries_value that returns the latest cached aspect
+        for a given entity urn.
+        """
+        self.assertIsNotNone(graph_ref)
+        self.assertEqual(aspect_type, CheckpointJobStateType)
+        self.assertEqual(
+            filter_criteria_map,
+            {
+                "pipelineName": self.pipeline_name,
+            },
+        )
+        # Retrieve the cached mcpw and return its aspect value.
+        mcpw = self.mcps_emitted.get(entity_urn)
+        if mcpw:
+            return mcpw.aspect
+        return None
+
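For reference, the commit-and-restore round trip that test_providers exercises below can be reproduced standalone with the file-based provider, with no graph connection involved. A minimal sketch using only the APIs shown in this patch (the filename, job id, pipeline name, and run id are illustrative):

import tempfile

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId
from datahub.ingestion.source.state.checkpoint import Checkpoint
from datahub.ingestion.source.state.usage_common_state import (
    BaseTimeWindowCheckpointState,
)
from datahub.ingestion.source.state_provider.file_ingestion_checkpointing_provider import (
    FileIngestionCheckpointingProvider,
)

# Build a file-backed provider; the checkpoint file path is illustrative.
ctx = PipelineContext(run_id="demo_run", pipeline_name="demo_pipeline")
provider = FileIngestionCheckpointingProvider.create(
    {"filename": f"{tempfile.mkdtemp()}/checkpoint_mces.json"}, ctx
)

# Stage a checkpoint for commit, in its aspect form.
state = BaseTimeWindowCheckpointState(begin_timestamp_millis=10, end_timestamp_millis=100)
checkpoint = Checkpoint(
    job_name=JobId("demo_job"),
    pipeline_name="demo_pipeline",
    run_id="demo_run",
    state=state,
)
aspect = checkpoint.to_checkpoint_aspect(max_allowed_state_size=2**20)
assert aspect is not None
provider.state_to_commit = {JobId("demo_job"): aspect}

# Persist to the JSON file, then read the latest committed state back.
provider.commit()
assert provider.committed
latest = provider.get_latest_checkpoint("demo_pipeline", JobId("demo_job"))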
+    def test_providers(self):
+        self.assertEqual(len(self.providers), 2)
+        for provider in self.providers:
+            assert provider
+            # 1. Create the individual job checkpoints with appropriate states.
+            # Job1 - Checkpoint with a BaseSQLAlchemyCheckpointState state
+            job1_state_obj = BaseSQLAlchemyCheckpointState()
+            job1_checkpoint = Checkpoint(
+                job_name=self.job_names[0],
+                pipeline_name=self.pipeline_name,
+                run_id=self.run_id,
+                state=job1_state_obj,
+            )
+            # Job2 - Checkpoint with a BaseTimeWindowCheckpointState state
+            job2_state_obj = BaseTimeWindowCheckpointState(
+                begin_timestamp_millis=10, end_timestamp_millis=100
+            )
+            job2_checkpoint = Checkpoint(
+                job_name=self.job_names[1],
+                pipeline_name=self.pipeline_name,
+                run_id=self.run_id,
+                state=job2_state_obj,
+            )
+
+            # 2. Set the provider's state_to_commit.
+            provider.state_to_commit = {
+                # NOTE: state_to_commit accepts only the aspect version of the checkpoint.
+                self.job_names[0]: assert_not_null(
+                    job1_checkpoint.to_checkpoint_aspect(max_allowed_state_size=2**20)
+                ),
+                self.job_names[1]: assert_not_null(
+                    job2_checkpoint.to_checkpoint_aspect(max_allowed_state_size=2**20)
+                ),
+            }
+
+            # 3. Perform the commit.
+            # NOTE: This commits the state to the in-memory self.mcps_emitted (via the
+            # monkey-patching) for the DataHub ingestion checkpointing provider, and to
+            # a JSON file in a temp directory for the file ingestion checkpointing provider.
+            provider.commit()
+            self.assertTrue(provider.committed)
+
+            # 4. Get last committed state. This must match what has been committed earlier.
+            # NOTE: This will retrieve the state from where it was committed.
+            job1_last_state = provider.get_latest_checkpoint(
+                self.pipeline_name, self.job_names[0]
+            )
+            job2_last_state = provider.get_latest_checkpoint(
+                self.pipeline_name, self.job_names[1]
+            )
+
+            # 5. Validate individual job checkpoint state values that have been committed and retrieved
+            # against the original values.
+            self.assertIsNotNone(job1_last_state)
+            job1_last_checkpoint = Checkpoint.create_from_checkpoint_aspect(
+                job_name=self.job_names[0],
+                checkpoint_aspect=job1_last_state,
+                state_class=type(job1_state_obj),
+            )
+            self.assertEqual(job1_last_checkpoint, job1_checkpoint)
+
+            self.assertIsNotNone(job2_last_state)
+            job2_last_checkpoint = Checkpoint.create_from_checkpoint_aspect(
+                job_name=self.job_names[1],
+                checkpoint_aspect=job2_last_state,
+                state_class=type(job2_state_obj),
+            )
+            self.assertEqual(job2_last_checkpoint, job2_checkpoint)
diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json
new file mode 100644
index 00000000000000..4e62492918bfb9
--- /dev/null
+++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state.json
@@ -0,0 +1,26 @@
+[
+{
+    "entityType": "dataJob",
+    "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(file,dummy_stateful,prod),default_stale_entity_removal)",
+    "changeType": "UPSERT",
+    "aspectName": "datahubIngestionCheckpoint",
+    "aspect": {
+        "json": {
+            "timestampMillis": 1586847600000,
+            "partitionSpec": {
+                "type": "FULL_TABLE",
+                "partition": "FULL_TABLE_SNAPSHOT"
+            },
+            "pipelineName": "dummy_stateful",
+            "platformInstanceId": "",
+            "config": "",
+            "state": {
+                "formatVersion": "1.0",
+                "serde": "base85-bz2-json",
+                "payload": "LRx4!F+o`-Q(1w>5G4QrYoCBnWH=B60MH7jr`{?c0BA?5L)2-AGyu>6y;V<9hz%Mv0Bt1*)lOMzr>a0|Iq-4VtTsYONQsFPLn1EpdQS;HIy|&CvSAlRvAJwmtCEM+Rx(v_)~sVvkx3V@WX4O`=losC6yZWb2OL0@"
+            },
+            "runId": "dummy-test-stateful-ingestion"
+        }
+    }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json
new file mode 100644
index 00000000000000..6ecd43483d9483
--- /dev/null
+++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_checkpoint_state_after_deleted.json
@@ -0,0 +1,26 @@
+[
+{
+    "entityType": "dataJob",
+    "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(file,dummy_stateful,prod),default_stale_entity_removal)",
+    "changeType": "UPSERT",
+    "aspectName": "datahubIngestionCheckpoint",
+    "aspect": {
+        "json": {
+            "timestampMillis": 1586847600000,
+            "partitionSpec": {
+                "type": "FULL_TABLE",
+                "partition": "FULL_TABLE_SNAPSHOT"
+            },
+            "pipelineName": "dummy_stateful",
+            "platformInstanceId": "",
+            "config": "",
+            "state": {
+                "formatVersion": "1.0",
+                "serde": "base85-bz2-json",
+                "payload": "LRx4!F+o`-Q(317h`0a%NgsevWH1l}0MH7jr`{?c0B9vdZ9%mLfYG4P6;f$2G%+v`9z&~6n|e(JEPC2_Iix~CA_im)jR-zsjEK*yo|HQz#IUUHtf@DYVEme-lUW9{Xmmt~y^2jCdyY95az!{$kf#WUxB"
+            },
+            "runId": "dummy-test-stateful-ingestion"
+        }
+    }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json
new file mode 100644
index
00000000000000..4a77651c930667 --- /dev/null +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion.json @@ -0,0 +1,50 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json new file mode 100644 index 00000000000000..9d6f755374462b --- /dev/null +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/golden_test_stateful_ingestion_after_deleted.json @@ -0,0 +1,50 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "dummy-test-stateful-ingestion", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py new file mode 100644 index 00000000000000..2b811d5e5e3a33 --- /dev/null +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_stateful_ingestion.py @@ -0,0 +1,227 @@ +from dataclasses import dataclass, field as dataclass_field +from typing import Any, Dict, Iterable, List, Optional, cast + +import pydantic +from freezegun import freeze_time +from pydantic import Field + +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from 
datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.run.pipeline import Pipeline
+from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
+from datahub.metadata.schema_classes import StatusClass
+from datahub.utilities.urns.dataset_urn import DatasetUrn
+from tests.test_helpers import mce_helpers
+from tests.test_helpers.state_helpers import (
+    get_current_checkpoint_from_pipeline,
+    validate_all_providers_have_committed_successfully,
+)
+
+FROZEN_TIME = "2020-04-14 07:00:00"
+
+dummy_datasets: List = ["dummy_dataset1", "dummy_dataset2", "dummy_dataset3"]
+
+
+@dataclass
+class DummySourceReport(StaleEntityRemovalSourceReport):
+    datasets_scanned: int = 0
+    filtered_datasets: List[str] = dataclass_field(default_factory=list)
+
+    def report_datasets_scanned(self, count: int = 1) -> None:
+        self.datasets_scanned += count
+
+    def report_datasets_dropped(self, model: str) -> None:
+        self.filtered_datasets.append(model)
+
+
+class DummySourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
+    dataset_patterns: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for datasets to filter in ingestion.",
+    )
+    # Configuration for stateful ingestion
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
+        default=None, description="Dummy source Ingestion Config."
+    )
+
+
+class DummySource(StatefulIngestionSourceBase):
+    """
+    This is a dummy source which only extracts dummy datasets.
+    """
+
+    source_config: DummySourceConfig
+    reporter: DummySourceReport
+
+    def __init__(self, config: DummySourceConfig, ctx: PipelineContext):
+        super(DummySource, self).__init__(config, ctx)
+        self.source_config = config
+        self.reporter = DummySourceReport()
+        # Create and register the stateful ingestion use-case handler.
+        self.stale_entity_removal_handler = StaleEntityRemovalHandler.create(
+            self, self.source_config, self.ctx
+        )
+
+    @classmethod
+    def create(cls, config_dict, ctx):
+        config = DummySourceConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            self.stale_entity_removal_handler.workunit_processor,
+        ]
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        for dataset in dummy_datasets:
+            if not self.source_config.dataset_patterns.allowed(dataset):
+                self.reporter.report_datasets_dropped(dataset)
+                continue
+            else:
+                self.reporter.report_datasets_scanned()
+            dataset_urn = DatasetUrn.create_from_ids(
+                platform_id="postgres",
+                table_name=dataset,
+                env=DEFAULT_ENV,
+            )
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(dataset_urn),
+                aspect=StatusClass(removed=False),
+            ).as_workunit()
+
+    def get_report(self) -> SourceReport:
+        return self.reporter
+
+
+@freeze_time(FROZEN_TIME)
+def test_stateful_ingestion(pytestconfig, tmp_path, mock_time):
+    # Test stateful ingestion using the dummy source.
+    state_file_name: str = "checkpoint_state_mces.json"
+    golden_state_file_name: str = "golden_test_checkpoint_state.json"
+    golden_state_file_name_after_deleted: str = (
+        "golden_test_checkpoint_state_after_deleted.json"
+    )
+    output_file_name: str = "dummy_mces.json"
+    golden_file_name: str = "golden_test_stateful_ingestion.json"
+    output_file_name_after_deleted: str = "dummy_mces_stateful_after_deleted.json"
+    golden_file_name_after_deleted: str = (
+        "golden_test_stateful_ingestion_after_deleted.json"
+    )
+
+    test_resources_dir = pytestconfig.rootpath / "tests/unit/stateful_ingestion/state"
+
+    base_pipeline_config = {
+        "run_id": "dummy-test-stateful-ingestion",
+        "pipeline_name": "dummy_stateful",
+        "source": {
+            "type": "tests.unit.stateful_ingestion.state.test_stateful_ingestion.DummySource",
+            "config": {
+                "stateful_ingestion": {
+                    "enabled": True,
+                    "remove_stale_metadata": True,
+                    "state_provider": {
+                        "type": "file",
+                        "config": {
+                            "filename": f"{tmp_path}/{state_file_name}",
+                        },
+                    },
+                },
+            },
+        },
+        "sink": {
+            "type": "file",
+            "config": {},
+        },
+    }
+
+    pipeline_run1 = None
+    pipeline_run1_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(  # type: ignore
+        base_pipeline_config  # type: ignore
+    )
+    pipeline_run1_config["sink"]["config"][
+        "filename"
+    ] = f"{tmp_path}/{output_file_name}"
+    pipeline_run1 = Pipeline.create(pipeline_run1_config)
+    pipeline_run1.run()
+    pipeline_run1.raise_from_status()
+    pipeline_run1.pretty_print_summary()
+
+    # Validate both the dummy source mces and the checkpoint state mces files.
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=tmp_path / output_file_name,
+        golden_path=f"{test_resources_dir}/{golden_file_name}",
+    )
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=tmp_path / state_file_name,
+        golden_path=f"{test_resources_dir}/{golden_state_file_name}",
+    )
+    checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)
+    assert checkpoint1
+    assert checkpoint1.state
+
+    pipeline_run2 = None
+    pipeline_run2_config: Dict[str, Dict[str, Dict[str, Any]]] = dict(base_pipeline_config)  # type: ignore
+    pipeline_run2_config["source"]["config"]["dataset_patterns"] = {
+        "allow": ["dummy_dataset1", "dummy_dataset2"],
+    }
+    pipeline_run2_config["sink"]["config"][
+        "filename"
+    ] = f"{tmp_path}/{output_file_name_after_deleted}"
+    pipeline_run2 = Pipeline.create(pipeline_run2_config)
+    pipeline_run2.run()
+    pipeline_run2.raise_from_status()
+    pipeline_run2.pretty_print_summary()
+
+    # Validate both the updated dummy source mces and the checkpoint state mces
+    # files after the dataset has been deleted.
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=tmp_path / output_file_name_after_deleted,
+        golden_path=f"{test_resources_dir}/{golden_file_name_after_deleted}",
+    )
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=tmp_path / state_file_name,
+        golden_path=f"{test_resources_dir}/{golden_state_file_name_after_deleted}",
+    )
+    checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)
+    assert checkpoint2
+    assert checkpoint2.state
+
+    # Validate that all providers have committed successfully.
+    validate_all_providers_have_committed_successfully(
+        pipeline=pipeline_run1, expected_providers=1
+    )
+    validate_all_providers_have_committed_successfully(
+        pipeline=pipeline_run2, expected_providers=1
+    )
+
+    # Perform all assertions on the states. The deleted table should not be
+    # part of the second state.
+    state1 = cast(GenericCheckpointState, checkpoint1.state)
+    state2 = cast(GenericCheckpointState, checkpoint2.state)
+
+    difference_dataset_urns = list(
+        state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2)
+    )
+    # The difference in dataset urns is exactly the dataset that the second run
+    # was not allowed to ingest.
+    assert len(difference_dataset_urns) == 1
+    deleted_dataset_urns: List[str] = [
+        "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
+    ]
+    assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns)
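For intuition, the stale-entity assertion above reduces to a set difference over the URNs recorded in each run's checkpoint. A rough standalone sketch of the same mechanics (the URNs are illustrative, and add_checkpoint_urn is assumed to be the registration call that the stale-entity handler uses internally; only get_urns_not_in is exercised by the test itself):

from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState

state1 = GenericCheckpointState()  # first run: saw all three datasets
state2 = GenericCheckpointState()  # second run: dummy_dataset3 filtered out
for urn in [
    "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)",
    "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)",
    "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)",
]:
    state1.add_checkpoint_urn(type="dataset", urn=urn)
for urn in [
    "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset1,PROD)",
    "urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset2,PROD)",
]:
    state2.add_checkpoint_urn(type="dataset", urn=urn)

# Everything in state1 but not in state2 is stale; the handler soft-deletes it
# by emitting status.removed = True (see the *_after_deleted goldens above).
stale = list(state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2))
assert stale == ["urn:li:dataset:(urn:li:dataPlatform:postgres,dummy_dataset3,PROD)"]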
diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py b/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py
index 9edfe8c4a957b4..0e6d60e3440b20 100644
--- a/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py
+++ b/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py
@@ -3,9 +3,10 @@
 import pytest
 from pydantic import ValidationError
 
-from datahub.configuration.common import ConfigModel, DynamicTypedConfig
+from datahub.configuration.common import ConfigModel
 from datahub.ingestion.graph.client import DatahubClientConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
+    DynamicTypedStateProviderConfig,
     StatefulIngestionConfig,
 )
 from datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider import (
@@ -23,7 +24,6 @@
     },
     "simple": {},
     "default": {},
-    "none": None,
 }
 
 
@@ -81,13 +81,6 @@
         ),
         False,
     ),
-    # None
-    "checkpointing_bad_config": (
-        DatahubIngestionStateProviderConfig,
-        datahub_client_configs["none"],
-        None,
-        True,
-    ),
 }
 
 
@@ -119,7 +112,7 @@
                 max_checkpoint_state_size=1024,
                 ignore_old_state=True,
                 ignore_new_state=True,
-                state_provider=DynamicTypedConfig(
+                state_provider=DynamicTypedStateProviderConfig(
                     type="datahub",
                     config=datahub_client_configs["full"],
                 ),
@@ -148,7 +141,7 @@
                 max_checkpoint_state_size=2**24,
                 ignore_old_state=False,
                 ignore_new_state=False,
-                state_provider=DynamicTypedConfig(type="datahub", config=None),
+                state_provider=DynamicTypedStateProviderConfig(type="datahub"),
             ),
             False,
         ),

From bb7300251f6d65024b3440379d8eef3e2413a2b0 Mon Sep 17 00:00:00 2001
From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com>
Date: Sat, 11 Nov 2023 05:29:18 +0530
Subject: [PATCH 21/29] feat(ingestion/airflow): support datajobs as task inlets (#9211)

Co-authored-by: Harshal Sheth
---
 .../datahub_listener.py                       | 14 ++--
 .../src/datahub_airflow_plugin/entities.py    | 28 ++++++--
 .../example_dags/lineage_backend_demo.py      |  1 +
 .../lineage_backend_taskflow_demo.py          |  1 +
 .../lineage/_lineage_core.py                  | 18 ++---
 .../tests/integration/dags/basic_iolets.py    |  1 +
 .../tests/integration/dags/simple_dag.py      |  1 +
 .../integration/goldens/v1_basic_iolets.json  | 56 ++++++++++++---
 .../integration/goldens/v1_simple_dag.json    | 70 ++++++++++++++-----
 .../integration/goldens/v2_basic_iolets.json  | 26 ++++---
 .../v2_basic_iolets_no_dag_listener.json      | 24 ++++---
 .../integration/goldens/v2_simple_dag.json    | 38 +++++-----
 .../v2_simple_dag_no_dag_listener.json        | 32 +++++----
 .../goldens/v2_snowflake_operator.json        |  2 +-
 .../goldens/v2_sqlite_operator.json           |  2 +-
 .../v2_sqlite_operator_no_dag_listener.json   | 40 +++++------
 .../airflow-plugin/tests/unit/test_airflow.py | 31 ++++++--
 17 files changed, 261 insertions(+), 124 deletions(-)

diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py
index a3f5cb489e29fb..d00b10bbe1756f 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py
@@ -17,7 +17,6 @@
 )
 from datahub.telemetry import telemetry
 from datahub.utilities.sqlglot_lineage import SqlParsingResult
-from datahub.utilities.urns.dataset_urn import DatasetUrn
 from openlineage.airflow.listener import TaskHolder
 from openlineage.airflow.utils import redact_with_exclusions
 from openlineage.client.serde import Serde
@@ -32,7 +31,11 @@
 from datahub_airflow_plugin._datahub_ol_adapter import translate_ol_to_datahub_urn
 from datahub_airflow_plugin._extractors import SQL_PARSING_RESULT_KEY, ExtractorManager
 from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator
-from datahub_airflow_plugin.entities import _Entity
+from datahub_airflow_plugin.entities import (
+    _Entity,
+    entities_to_datajob_urn_list,
+    entities_to_dataset_urn_list,
+)
 
 _F = TypeVar("_F", bound=Callable[..., None])
 if TYPE_CHECKING:
@@ -272,10 +275,9 @@ def _extract_lineage(
         )
 
         # Write the lineage to the datajob object.
-        datajob.inlets.extend(DatasetUrn.create_from_string(urn) for urn in input_urns)
-        datajob.outlets.extend(
-            DatasetUrn.create_from_string(urn) for urn in output_urns
-        )
+        datajob.inlets.extend(entities_to_dataset_urn_list(input_urns))
+        datajob.outlets.extend(entities_to_dataset_urn_list(output_urns))
+        datajob.upstream_urns.extend(entities_to_datajob_urn_list(input_urns))
 
         datajob.fine_grained_lineages.extend(fine_grained_lineages)
 
         # Merge in extra stuff that was present in the DataJob we constructed
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py
index 69f667cad3241d..5a4bcb0097a8c4 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/entities.py
@@ -1,8 +1,10 @@
 from abc import abstractmethod
-from typing import Optional
+from typing import List, Optional
 
 import attr
 
 import datahub.emitter.mce_builder as builder
+from datahub.utilities.urns.data_job_urn import DataJobUrn
+from datahub.utilities.urns.dataset_urn import DatasetUrn
 from datahub.utilities.urns.urn import guess_entity_type
 
 
@@ -38,10 +40,28 @@ class Urn(_Entity):
     def _validate_urn(self, attribute, value):
         if not value.startswith("urn:"):
             raise ValueError("invalid urn provided: urns must start with 'urn:'")
-        if guess_entity_type(value) != "dataset":
-            # This is because DataJobs only support Dataset lineage.
-            raise ValueError("Airflow lineage currently only supports datasets")
+        if guess_entity_type(value) not in ["dataset", "dataJob"]:
+            # This is because DataJobs only support Dataset and upstream Datajob lineage.
+            raise ValueError(
+                "Airflow lineage currently only supports datasets and upstream datajobs"
+            )
 
     @property
     def urn(self):
         return self._urn
+
+
+def entities_to_dataset_urn_list(iolets: List[str]) -> List[DatasetUrn]:
+    dataset_urn_list: List[DatasetUrn] = []
+    for let in iolets:
+        if guess_entity_type(let) == "dataset":
+            dataset_urn_list.append(DatasetUrn.create_from_string(let))
+    return dataset_urn_list
+
+
+def entities_to_datajob_urn_list(inlets: List[str]) -> List[DataJobUrn]:
+    datajob_urn_list: List[DataJobUrn] = []
+    for let in inlets:
+        if guess_entity_type(let) == "dataJob":
+            datajob_urn_list.append(DataJobUrn.create_from_string(let))
+    return datajob_urn_list
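Taken together, these helpers let the listener split a mixed inlet list: dataset URNs still populate datajob.inlets, while dataJob URNs now feed datajob.upstream_urns. A small sketch of the partitioning, with illustrative URNs mirroring the example DAG changes below:

from datahub_airflow_plugin.entities import (
    entities_to_datajob_urn_list,
    entities_to_dataset_urn_list,
)

inlet_urns = [
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
    "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)",
]

# The dataset urn becomes a regular inlet; the dataJob urn becomes an
# upstream task dependency instead of being rejected by Urn validation.
dataset_urns = entities_to_dataset_urn_list(inlet_urns)  # [DatasetUrn for tableA]
datajob_urns = entities_to_datajob_urn_list(inlet_urns)  # [DataJobUrn for task1]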
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py
index 3caea093b932d4..ce161d6a415e9e 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py
@@ -46,6 +46,7 @@
         Urn(
             "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
         ),
+        Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"),
     ],
     outlets=[Dataset("snowflake", "mydb.schema.tableD")],
 )
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py
index ceb0f452b540a0..80df7053a49f9a 100644
---
a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py @@ -37,6 +37,7 @@ def datahub_lineage_backend_taskflow_demo(): Urn( "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" ), + Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,dag1,prod),task1)"), ], outlets=[Dataset("snowflake", "mydb.schema.tableD")], ) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py index f5f519fa23b11e..75fc79443e49e0 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py @@ -2,11 +2,14 @@ from typing import TYPE_CHECKING, Dict, List from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub_airflow_plugin._config import DatahubLineageConfig from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator -from datahub_airflow_plugin.entities import _Entity +from datahub_airflow_plugin.entities import ( + _Entity, + entities_to_datajob_urn_list, + entities_to_dataset_urn_list, +) if TYPE_CHECKING: from airflow import DAG @@ -16,10 +19,6 @@ from datahub_airflow_plugin._airflow_shims import Operator -def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: - return [DatasetUrn.create_from_string(let.urn) for let in iolets] - - def send_lineage_to_datahub( config: DatahubLineageConfig, operator: "Operator", @@ -53,8 +52,11 @@ def send_lineage_to_datahub( capture_tags=config.capture_tags_info, capture_owner=config.capture_ownership_info, ) - datajob.inlets.extend(_entities_to_urn_list(inlets)) - datajob.outlets.extend(_entities_to_urn_list(outlets)) + datajob.inlets.extend(entities_to_dataset_urn_list([let.urn for let in inlets])) + datajob.outlets.extend(entities_to_dataset_urn_list([let.urn for let in outlets])) + datajob.upstream_urns.extend( + entities_to_datajob_urn_list([let.urn for let in inlets]) + ) datajob.emit(emitter) operator.log.info(f"Emitted from Lineage: {datajob}") diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py index 8b0803ab98422b..11b3731c52bca8 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py @@ -26,6 +26,7 @@ Urn( "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" ), + Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)"), ], outlets=[ Dataset("snowflake", "mydb.schema.tableD"), diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py index 1dd047f0a6dccb..71b462159ac603 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py @@ -21,6 +21,7 @@ Urn( "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" ), + 
Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)"), ], outlets=[Dataset("snowflake", "mydb.schema.tableD")], ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json index a4c17c73e9c7e6..6b460e99b1f281 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json @@ -11,7 +11,7 @@ "catchup": "False", "description": "None", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "fileloc": "'/home/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "None", @@ -95,7 +95,8 @@ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -150,6 +151,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -245,7 +257,8 @@ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -300,6 +313,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -365,9 +389,9 @@ "json": { "customProperties": { "run_id": "manual_run_test", - "duration": "0.176536", - "start_date": "2023-09-30 00:49:56.670239+00:00", - "end_date": "2023-09-30 00:49:56.846775+00:00", + "duration": "0.143271", + "start_date": "2023-11-08 09:55:05.801617+00:00", + "end_date": "2023-11-08 09:55:05.944888+00:00", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "1", "max_tries": "0", @@ -384,7 +408,7 @@ "name": "basic_iolets_run_data_task_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696034996670, + "time": 1699437305801, "actor": "urn:li:corpuser:datahub" } } @@ -413,7 +437,8 @@ 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" ] } } @@ -476,6 +501,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -505,7 +541,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696034996670, + "timestampMillis": 1699437305801, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -522,7 +558,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696034996846, + "timestampMillis": 1699437305944, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json index a0a95716a09931..7ec172e3678dcf 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json @@ -11,7 +11,7 @@ "catchup": "False", "description": "'A simple DAG that runs a few fake data tasks.'", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "fileloc": "'/home/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "None", @@ -94,7 +94,8 @@ "json": { "inputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" @@ -126,6 +127,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -208,7 +220,8 @@ "json": { "inputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" ], "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" @@ -240,6 +253,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -294,9 +318,9 @@ "json": { "customProperties": { "run_id": "manual_run_test", - "duration": "0.175983", - "start_date": "2023-09-30 00:48:58.943850+00:00", - "end_date": "2023-09-30 00:48:59.119833+00:00", + "duration": "0.120524", + "start_date": "2023-11-08 09:54:06.065112+00:00", + "end_date": "2023-11-08 09:54:06.185636+00:00", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "1", "max_tries": "0", @@ -313,7 +337,7 @@ "name": "simple_dag_task_1_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696034938943, + "time": 1699437246065, "actor": "urn:li:corpuser:datahub" } } @@ -340,7 +364,8 @@ "json": { "inputs": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" ] } } @@ -380,6 +405,17 @@ } } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", @@ -398,7 +434,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696034938943, + "timestampMillis": 1699437246065, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -415,7 +451,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696034939119, + "timestampMillis": 1699437246185, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -440,7 +476,7 @@ "catchup": "False", "description": "'A simple DAG that runs a few fake data tasks.'", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "fileloc": "'/home/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "None", @@ -651,9 +687,9 @@ "json": { "customProperties": { "run_id": "manual_run_test", - "duration": "0.129888", - "start_date": "2023-09-30 00:49:02.158752+00:00", - "end_date": "2023-09-30 00:49:02.288640+00:00", + "duration": "0.099975", + "start_date": "2023-11-08 09:54:09.744583+00:00", + "end_date": "2023-11-08 09:54:09.844558+00:00", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "1", "max_tries": "0", @@ -670,7 +706,7 @@ "name": "simple_dag_run_another_data_task_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696034942158, + "time": 1699437249744, "actor": "urn:li:corpuser:datahub" } } @@ -695,7 +731,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696034942158, + "timestampMillis": 1699437249744, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -712,7 +748,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - 
"timestampMillis": 1696034942288, + "timestampMillis": 1699437249844, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json index 1974f1f085df0b..6767a368f366ae 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json @@ -11,7 +11,7 @@ "catchup": "False", "description": "None", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "fileloc": "'/Users/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "[]", @@ -73,9 +73,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "[]", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, 
\"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_is_setup\": false, \"_is_teardown\": false, \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", 
\"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": {}, \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", "name": "run_data_task", @@ -102,7 +102,9 @@ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -217,7 +219,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 01:13:14.266272+00:00", + "start_date": "2023-10-30 13:07:55.311482+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -235,7 +237,7 @@ "name": "basic_iolets_run_data_task_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696036394266, + "time": 1698671275311, "actor": "urn:li:corpuser:datahub" } } @@ -356,7 +358,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696036394266, + "timestampMillis": 1698671275311, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -383,9 +385,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "[]", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", 
\"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_is_setup\": false, \"_is_teardown\": false, \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], 
\"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": {}, \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", "name": "run_data_task", @@ -412,7 +414,9 @@ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -524,7 +528,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696036394833, + "timestampMillis": 1698671276777, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json index d02951bc9e82dd..63b0a059355541 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json @@ -73,9 +73,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "[]", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', 
env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": 
\"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", "name": "run_data_task", @@ -102,7 +102,9 @@ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -217,7 +219,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:59:52.401211+00:00", + "start_date": "2023-11-10 19:11:17.444435+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -235,7 +237,7 @@ "name": "basic_iolets_run_data_task_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057192401, + "time": 1699643477444, "actor": "urn:li:corpuser:datahub" } } @@ -356,7 +358,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057192401, + "timestampMillis": 1699643477444, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -383,9 +385,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "[]", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), 
Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", "name": "run_data_task", @@ -412,7 +414,9 @@ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -524,7 +528,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057192982, + "timestampMillis": 1699643478123, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json index 9acc47ec1321e5..c558f79c32e150 100644 --- 
a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json @@ -11,7 +11,7 @@ "catchup": "False", "description": "'A simple DAG that runs a few fake data tasks.'", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "fileloc": "'/Users/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "[]", @@ -74,9 +74,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "['run_another_data_task']", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_is_setup\": false, \"_is_teardown\": false, \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": {}, \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", "name": "task_1", @@ -100,7 +100,9 @@ "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -182,7 +184,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:53:58.219003+00:00", + "start_date": "2023-10-30 13:06:07.193282+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -200,7 +202,7 @@ "name": "simple_dag_task_1_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696056838219, + "time": 1698671167193, "actor": "urn:li:corpuser:datahub" } } @@ -285,7 +287,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056838219, + "timestampMillis": 1698671167193, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -312,9 +314,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "['run_another_data_task']", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', 
name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_is_setup\": false, \"_is_teardown\": false, \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": 
true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": {}, \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", "name": "task_1", @@ -338,7 +340,9 @@ "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -417,7 +421,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056838648, + "timestampMillis": 1698671168726, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -449,7 +453,7 @@ "downstream_task_ids": "[]", "inlets": "[]", "outlets": "[]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", 
\"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_is_setup\": false, \"_is_teardown\": false, \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": {}, \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", "name": "run_another_data_task", @@ -519,7 +523,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:54:02.407515+00:00", + "start_date": "2023-10-30 13:06:19.970466+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -537,7 +541,7 @@ "name": "simple_dag_run_another_data_task_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696056842407, + "time": 1698671179970, "actor": "urn:li:corpuser:datahub" } } @@ -562,7 +566,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056842407, + "timestampMillis": 1698671179970, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -591,7 +595,7 @@ "downstream_task_ids": "[]", "inlets": "[]", "outlets": "[]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": 
false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_is_setup\": false, \"_is_teardown\": false, \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": {}, \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", "name": "run_another_data_task", @@ -658,7 +662,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696056842831, + "timestampMillis": 1698671180730, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json index 03299c483f57fd..ec0f3cab1e81f3 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json @@ -74,9 +74,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "['run_another_data_task']", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": 
\"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": 
\"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", "name": "task_1", @@ -100,7 +100,9 @@ "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -182,7 +184,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:58:56.105026+00:00", + "start_date": "2023-11-10 19:10:10.856995+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -200,7 +202,7 @@ "name": "simple_dag_task_1_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057136105, + "time": 1699643410856, "actor": "urn:li:corpuser:datahub" } } @@ -285,7 +287,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057136105, + "timestampMillis": 1699643410856, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -312,9 +314,9 @@ "trigger_rule": "", "wait_for_downstream": "False", "downstream_task_ids": "['run_another_data_task']", - "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)'), Urn(_urn='urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)')]", "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", - "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, 
\"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}, {\"_urn\": \"urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" }, "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", "name": "task_1", @@ -338,7 +340,9 @@ "outputDatasets": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" ], - "inputDatajobs": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_dag,PROD),test_task)" + ], "fineGrainedLineages": [] } } @@ -417,7 +421,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057136612, + "timestampMillis": 1699643411390, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -577,7 +581,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-09-30 06:58:59.567004+00:00", + "start_date": "2023-11-10 19:10:15.128009+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -595,7 +599,7 @@ "name": "simple_dag_run_another_data_task_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1696057139567, + "time": 1699643415128, "actor": 
"urn:li:corpuser:datahub" } } @@ -620,7 +624,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057139567, + "timestampMillis": 1699643415128, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -716,7 +720,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1696057140164, + "timestampMillis": 1699643415856, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json index 11a0b17b45b95c..0a704ed10c911e 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json @@ -11,7 +11,7 @@ "catchup": "False", "description": "None", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py'", + "fileloc": "'/Users/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "[]", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json index 19e4aac9fb95e1..3b4b60174f99f1 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json @@ -11,7 +11,7 @@ "catchup": "False", "description": "None", "doc_md": "None", - "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "fileloc": "'/Users/shubham/airflow1/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", "is_paused_upon_creation": "None", "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", "tags": "[]", diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json index b67464b385335c..99a8aadb7fd9c1 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json @@ -194,7 +194,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-10-15 20:27:26.883178+00:00", + "start_date": "2023-11-10 19:12:17.805860+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -212,7 +212,7 @@ "name": "sqlite_operator_create_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1697401646883, + "time": 1699643537805, "actor": "urn:li:corpuser:datahub" } } @@ -261,7 +261,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401646883, + "timestampMillis": 1699643537805, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" 
@@ -442,7 +442,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401647826, + "timestampMillis": 1699643538759, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -615,7 +615,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-10-15 20:27:31.398799+00:00", + "start_date": "2023-11-10 19:12:22.560376+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -633,7 +633,7 @@ "name": "sqlite_operator_populate_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1697401651398, + "time": 1699643542560, "actor": "urn:li:corpuser:datahub" } } @@ -682,7 +682,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401651398, + "timestampMillis": 1699643542560, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -792,7 +792,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401652651, + "timestampMillis": 1699643543925, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1035,7 +1035,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-10-15 20:27:37.697995+00:00", + "start_date": "2023-11-10 19:12:29.429032+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1053,7 +1053,7 @@ "name": "sqlite_operator_transform_cost_table_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1697401657697, + "time": 1699643549429, "actor": "urn:li:corpuser:datahub" } } @@ -1126,7 +1126,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401657697, + "timestampMillis": 1699643549429, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1362,7 +1362,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401659496, + "timestampMillis": 1699643551423, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1537,7 +1537,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-10-15 20:27:45.670215+00:00", + "start_date": "2023-11-10 19:12:37.423556+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1555,7 +1555,7 @@ "name": "sqlite_operator_cleanup_costs_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 1697401665670, + "time": 1699643557423, "actor": "urn:li:corpuser:datahub" } } @@ -1604,7 +1604,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401665670, + "timestampMillis": 1699643557423, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1716,7 +1716,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401667670, + "timestampMillis": 1699643559607, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -1891,7 +1891,7 @@ "customProperties": { "run_id": "manual_run_test", "duration": "None", - "start_date": "2023-10-15 20:27:51.559194+00:00", + "start_date": "2023-11-10 19:12:43.792375+00:00", "end_date": "None", "execution_date": "2023-09-27 21:34:38+00:00", "try_number": "0", @@ -1909,7 +1909,7 @@ "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", "type": "BATCH_AD_HOC", "created": { - "time": 
1697401671559, + "time": 1699643563792, "actor": "urn:li:corpuser:datahub" } } @@ -1958,7 +1958,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401671559, + "timestampMillis": 1699643563792, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" @@ -2070,7 +2070,7 @@ "aspectName": "dataProcessInstanceRunEvent", "aspect": { "json": { - "timestampMillis": 1697401673788, + "timestampMillis": 1699643566350, "partitionSpec": { "type": "FULL_TABLE", "partition": "FULL_TABLE_SNAPSHOT" diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index d8620e74d7e305..7fbf7079959942 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -188,10 +188,17 @@ def test_entities(): == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" ) + assert ( + Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)").urn + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)" + ) + with pytest.raises(ValueError, match="invalid"): Urn("not a URN") - with pytest.raises(ValueError, match="only supports datasets"): + with pytest.raises( + ValueError, match="only supports datasets and upstream datajobs" + ): Urn("urn:li:mlModel:(urn:li:dataPlatform:science,scienceModel,PROD)") @@ -199,13 +206,19 @@ def test_entities(): ["inlets", "outlets", "capture_executions"], [ pytest.param( - [Dataset("snowflake", "mydb.schema.tableConsumed")], + [ + Dataset("snowflake", "mydb.schema.tableConsumed"), + Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)"), + ], [Dataset("snowflake", "mydb.schema.tableProduced")], False, id="airflow-lineage-no-executions", ), pytest.param( - [Dataset("snowflake", "mydb.schema.tableConsumed")], + [ + Dataset("snowflake", "mydb.schema.tableConsumed"), + Urn("urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)"), + ], [Dataset("snowflake", "mydb.schema.tableProduced")], True, id="airflow-lineage-capture-executions", @@ -293,9 +306,13 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): # Verify that the inlets and outlets are registered and recognized by Airflow correctly, # or that our lineage backend forces it to. - assert len(op2.inlets) == 1 + assert len(op2.inlets) == 2 assert len(op2.outlets) == 1 - assert all(map(lambda let: isinstance(let, Dataset), op2.inlets)) + assert all( + map( + lambda let: isinstance(let, Dataset) or isinstance(let, Urn), op2.inlets + ) + ) assert all(map(lambda let: isinstance(let, Dataset), op2.outlets)) # Check that the right things were emitted. 
@@ -338,6 +355,10 @@ def test_lineage_backend(mock_emit, inlets, outlets, capture_executions): mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0] == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)" ) + assert ( + mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[1] + == "urn:li:dataJob:(urn:li:dataFlow:(airflow,testDag,PROD),testTask)" + ) assert ( mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0] == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)" From 0e3efabd2c19e24bcfb81602f897802be1cb1d06 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 10 Nov 2023 16:00:20 -0800 Subject: [PATCH 22/29] fix(build): set `@cliMajorVersion@` correctly (#9228) --- .github/workflows/build-and-test.yml | 6 +-- .github/workflows/check-datahub-jars.yml | 4 +- .github/workflows/code-checks.yml | 4 +- .github/workflows/docker-ingestion-smoke.yml | 4 +- .github/workflows/docker-postgres-setup.yml | 4 +- .github/workflows/docker-unified.yml | 52 +++++-------------- .github/workflows/publish-datahub-jars.yml | 4 +- .github/workflows/spark-smoke-test.yml | 5 +- metadata-service/configuration/build.gradle | 7 ++- .../src/main/resources/application.yml | 9 ++-- metadata-service/factories/build.gradle | 5 -- .../tests/read_only/test_services_up.py | 8 +++ 12 files changed, 39 insertions(+), 73 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 25f3957e8f0861..10c137a206531a 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -27,7 +27,7 @@ jobs: command: [ # metadata-ingestion and airflow-plugin each have dedicated build jobs "except_metadata_ingestion", - "frontend" + "frontend", ] timezone: ["UTC", "America/New_York"] runs-on: ubuntu-latest @@ -36,9 +36,7 @@ jobs: - uses: szenius/set-timezone@v1.0 with: timezoneLinux: ${{ matrix.timezone }} - - uses: actions/checkout@v3 - with: - fetch-depth: 800 + - uses: hsheth2/sane-checkout-action@v1 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index 41f9ea91a94e20..8e507ea40fd963 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -27,9 +27,7 @@ jobs: command: ["datahub-client", "datahub-protobuf", "spark-lineage"] runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 800 + - uses: hsheth2/sane-checkout-action@v1 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index e12971b8a62084..38f0946678034b 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -31,9 +31,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - uses: actions/setup-python@v4 with: python-version: "3.10" diff --git a/.github/workflows/docker-ingestion-smoke.yml b/.github/workflows/docker-ingestion-smoke.yml index 82b57d23609a56..803ddc6fcec751 100644 --- a/.github/workflows/docker-ingestion-smoke.yml +++ b/.github/workflows/docker-ingestion-smoke.yml @@ -50,9 +50,7 @@ jobs: if: ${{ needs.setup.outputs.publish == 'true' }} steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Build and push uses: 
./.github/actions/docker-custom-build-and-push with: diff --git a/.github/workflows/docker-postgres-setup.yml b/.github/workflows/docker-postgres-setup.yml index fda4349f90bf7c..e4d6cfc106f812 100644 --- a/.github/workflows/docker-postgres-setup.yml +++ b/.github/workflows/docker-postgres-setup.yml @@ -46,9 +46,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 5f5a62de6288c8..18cb946b951dd1 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -74,9 +74,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image run: | ./gradlew :metadata-service:war:build -x test --parallel @@ -132,9 +130,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image run: | ./gradlew :metadata-jobs:mae-consumer-job:build -x test --parallel @@ -190,9 +186,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image run: | ./gradlew :metadata-jobs:mce-consumer-job:build -x test --parallel @@ -248,9 +242,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image run: | ./gradlew :datahub-upgrade:build -x test --parallel @@ -306,9 +298,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Pre-build artifacts for docker image run: | ./gradlew :datahub-frontend:dist -x test -x yarnTest -x yarnLint --parallel @@ -366,9 +356,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: @@ -388,9 +376,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: @@ -410,9 +396,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - name: Build and push uses: ./.github/actions/docker-custom-build-and-push with: @@ -434,9 +418,7 @@ jobs: needs: setup steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 id: filter with: @@ -468,9 +450,7 @@ jobs: needs: [setup, datahub_ingestion_base_build] steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 id: filter with: @@ -510,9 +490,7 @@ jobs: needs: [setup, datahub_ingestion_base_build] steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 
800 + uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 id: filter with: @@ -554,9 +532,7 @@ jobs: needs: [setup, datahub_ingestion_base_slim_build] steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 id: filter with: @@ -637,9 +613,7 @@ jobs: needs: [setup, datahub_ingestion_base_full_build] steps: - name: Check out the repo - uses: actions/checkout@v3 - with: - fetch-depth: 800 + uses: hsheth2/sane-checkout-action@v1 - uses: dorny/paths-filter@v2 id: filter with: diff --git a/.github/workflows/publish-datahub-jars.yml b/.github/workflows/publish-datahub-jars.yml index 7cd07b130dd800..ec7985ef3b3d03 100644 --- a/.github/workflows/publish-datahub-jars.yml +++ b/.github/workflows/publish-datahub-jars.yml @@ -48,9 +48,7 @@ jobs: needs: ["check-secret", "setup"] if: ${{ needs.check-secret.outputs.publish-enabled == 'true' }} steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 800 + - uses: hsheth2/sane-checkout-action@v1 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 541b2019b93ef1..70b66d6452b266 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -29,10 +29,7 @@ jobs: spark-smoke-test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 800 - fetch-tags: true + - uses: hsheth2/sane-checkout-action@v1 - name: Set up JDK 11 uses: actions/setup-java@v3 with: diff --git a/metadata-service/configuration/build.gradle b/metadata-service/configuration/build.gradle index bf79469633b0f1..80cf6541261c23 100644 --- a/metadata-service/configuration/build.gradle +++ b/metadata-service/configuration/build.gradle @@ -1,6 +1,7 @@ plugins { id 'java' } +apply from: "../../gradle/versioning/versioning.gradle" dependencies { implementation externalDependency.jacksonDataBind @@ -12,4 +13,8 @@ dependencies { compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok -} \ No newline at end of file +} + +processResources.configure { + finalizedBy printVersionDetails // always print version details +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 46aa02d98572e6..40674e13e647f9 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -44,6 +44,7 @@ authorization: restApiAuthorization: ${REST_API_AUTHORIZATION_ENABLED:false} ingestion: + # The value of cliMajorVersion is substituted in by the processResources Gradle task. 
enabled: ${UI_INGESTION_ENABLED:true} defaultCliVersion: "${UI_INGESTION_DEFAULT_CLI_VERSION:@cliMajorVersion@}" maxSerializedStringLength: "${INGESTION_MAX_SERIALIZED_STRING_LENGTH:16000000}" # Indicates the maximum allowed JSON String length Jackson will handle, impacts the maximum size of ingested aspects @@ -74,7 +75,7 @@ datahub: plugin: pluginSecurityMode: ${PLUGIN_SECURITY_MODE:RESTRICTED} # Possible value RESTRICTED or LENIENT, default to RESTRICTED entityRegistry: - path: ${ENTITY_REGISTRY_PLUGIN_PATH:/etc/datahub/plugins/models} + path: ${ENTITY_REGISTRY_PLUGIN_PATH:/etc/datahub/plugins/models} retention: path: ${RETENTION_PLUGIN_PATH:/etc/datahub/plugins/retention} auth: @@ -280,14 +281,13 @@ updateIndices: ingestionScheduler: enabled: ${ENABLE_INGESTION_SCHEDULER_HOOK:true} # enable to execute ingestion scheduling - bootstrap: upgradeDefaultBrowsePaths: enabled: ${UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED:false} # enable to run the upgrade to migrate legacy default browse paths to new ones backfillBrowsePathsV2: enabled: ${BACKFILL_BROWSE_PATHS_V2:false} # Enables running the backfill of browsePathsV2 upgrade step. There are concerns about the load of this step so hiding it behind a flag. Deprecating in favor of running through SystemUpdate reprocessDefaultBrowsePathsV2: - enabled: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:false} # reprocess V2 browse paths which were set to the default: {"path":[{"id":"Default"}]} + enabled: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:false} # reprocess V2 browse paths which were set to the default: {"path":[{"id":"Default"}]} policies: file: ${BOOTSTRAP_POLICIES_FILE:classpath:boot/policies.json} # eg for local file @@ -295,7 +295,6 @@ bootstrap: servlets: waitTimeout: ${BOOTSTRAP_SERVLETS_WAITTIMEOUT:60} # Total waiting time in seconds for servlets to initialize - systemUpdate: initialBackOffMs: ${BOOTSTRAP_SYSTEM_UPDATE_INITIAL_BACK_OFF_MILLIS:5000} maxBackOffs: ${BOOTSTRAP_SYSTEM_UPDATE_MAX_BACK_OFFS:50} @@ -371,4 +370,4 @@ cache: corpUserCredentials: 20 corpUserSettings: 20 -springdoc.api-docs.groups.enabled: true \ No newline at end of file +springdoc.api-docs.groups.enabled: true diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index f848a5e3397817..2e99def17c3c50 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -1,5 +1,4 @@ apply plugin: 'java-library' -apply from: "../../gradle/versioning/versioning.gradle" dependencies { api project(':metadata-io') @@ -65,7 +64,3 @@ configurations.all{ exclude group: "commons-io", module:"commons-io" exclude group: "jline", module:"jline" } - -processResources.configure { - finalizedBy printVersionDetails // always print version details -} diff --git a/smoke-test/tests/read_only/test_services_up.py b/smoke-test/tests/read_only/test_services_up.py index e48df52bb98642..cbe92625f4689a 100644 --- a/smoke-test/tests/read_only/test_services_up.py +++ b/smoke-test/tests/read_only/test_services_up.py @@ -23,3 +23,11 @@ def test_gms_config_accessible(): assert gms_config["versions"]["linkedin/datahub"]["version"] == DATAHUB_VERSION else: print("[WARN] TEST_DATAHUB_VERSION is not set") + + # Make sure that the default CLI version gets generated properly. + # While we don't want to hardcode the actual value, we can make + # sure it mostly looks like a version string. 
+ default_cli_version: str = gms_config["managedIngestion"]["defaultCliVersion"] + print(f"Default CLI version: {default_cli_version}") + assert not default_cli_version.startswith("@") + assert "." in default_cli_version From ebd2e2312bdd23a92bbf403a26f64194807f70e6 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Fri, 10 Nov 2023 22:10:00 -0600 Subject: [PATCH 23/29] fix(datahub-ingestion): remove old jars, sync pyspark version (#9217) --- docker/datahub-ingestion-base/build.gradle | 7 ++++--- docker/datahub-ingestion/Dockerfile | 16 ++++++++++++++-- docker/datahub-ingestion/pyspark_jars.sh | 22 ++++++++++++++++++++++ metadata-ingestion/setup.py | 4 ++-- 4 files changed, 42 insertions(+), 7 deletions(-) create mode 100755 docker/datahub-ingestion/pyspark_jars.sh diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index bbd8242553cc56..64635671343ef4 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -9,20 +9,21 @@ ext { docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry docker_repo = 'datahub-ingestion-base' docker_dir = 'datahub-ingestion-base' + docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") revision = 2 // increment to trigger rebuild } docker { - name "${docker_registry}/${docker_repo}:v${version}-slim" - version "v${version}-slim" + name "${docker_registry}/${docker_repo}:v${version}-${docker_target}" + version "v${version}-${docker_target}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { include "docker/${docker_dir}/*" }.exclude { i -> i.file.isHidden() || i.file == buildDir } - buildArgs([APP_ENV: 'slim']) + buildArgs([APP_ENV: docker_target]) } tasks.getByName('docker').dependsOn('build') diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 0132ceaa9b1a95..2abd4e2f33befd 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -22,10 +22,22 @@ ENV PATH="/datahub-ingestion/.local/bin:$PATH" FROM base as slim-install RUN pip install --no-cache --user ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" -FROM base as full-install +FROM base as full-install-build + +USER 0 +RUN apt-get update && apt-get install -y -qq maven + +USER datahub +COPY ./docker/datahub-ingestion/pyspark_jars.sh . + RUN pip install --no-cache --user ".[base]" && \ pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \ - pip install --no-cache --user ".[all]" + pip install --no-cache --user ".[all]" && \ + ./pyspark_jars.sh + +FROM base as full-install + +COPY --from=full-install-build /datahub-ingestion/.local /datahub-ingestion/.local FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
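
The pyspark_jars.sh script introduced just below locates pyspark's bundled jars via `python -m site --user-site`. The equivalent lookup from Python, which can be handy for sanity-checking the jar swap inside the built image, is roughly the following (an illustrative sketch, not part of the patch):

    import pathlib
    import site

    # `python -m site --user-site` prints site.getusersitepackages().
    jars_dir = pathlib.Path(site.getusersitepackages()) / "pyspark" / "jars"

    # After the swap, this should list only the pinned zookeeper version.
    print(sorted(p.name for p in jars_dir.glob("zookeeper-*.jar")))
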
diff --git a/docker/datahub-ingestion/pyspark_jars.sh b/docker/datahub-ingestion/pyspark_jars.sh new file mode 100755 index 00000000000000..ecd24e78c41057 --- /dev/null +++ b/docker/datahub-ingestion/pyspark_jars.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +set -ex + +HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}" +ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" +PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars" + +# Remove conflicting versions +echo "Removing version conflicts from $PYSPARK_JARS" +CONFLICTS="zookeeper hadoop- slf4j-" +for jar in $CONFLICTS; do + rm "$PYSPARK_JARS/$jar"*.jar +done + +# Fetch dependencies +mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY" +mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY" + +# Move to pyspark location +echo "Moving jars to $PYSPARK_JARS" +find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \; diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 5f44f14c3d74cc..f3782abe576d32 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -242,7 +242,7 @@ } data_lake_profiling = { - "pydeequ==1.1.0", + "pydeequ~=1.1.0", "pyspark~=3.3.0", } @@ -256,7 +256,7 @@ databricks = { # 0.1.11 appears to have authentication issues with azure databricks "databricks-sdk>=0.9.0", - "pyspark", + "pyspark~=3.3.0", "requests", } From 7ba54fdb9820cd79296801c48d05dc177e3739f1 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Mon, 13 Nov 2023 12:33:34 +0900 Subject: [PATCH 24/29] fix: re-add security.md to sidebar (#9229) --- SECURITY.md | 2 +- docs-website/sidebars.js | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 3ca87b08d844df..0e301d37483739 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,4 +1,4 @@ -# Reporting security issues +# Reporting Security Issues If you think you have found a security vulnerability, please send a report to security@datahubproject.io. This address can be used for all of Acryl Data’s open source and commercial products (including but not limited to DataHub and Acryl Data). We can accept only vulnerability reports at this address. 
diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index f15f2927379c56..801e0fbd07d365 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -660,6 +660,7 @@ module.exports = { "docs/CONTRIBUTING", "docs/links", "docs/rfc", + "SECURITY", ], }, { From 582eebe739ef7f8fc7651a78eee9306143360b68 Mon Sep 17 00:00:00 2001 From: RyanHolstien Date: Mon, 13 Nov 2023 10:57:49 -0600 Subject: [PATCH 25/29] feat(policies): reduce default access for all users (#9067) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- .github/scripts/check_policies.py | 28 ++++-- .../war/src/main/resources/boot/policies.json | 89 ++++++++++--------- .../cypress/e2e/settings/managing_groups.js | 2 +- 3 files changed, 70 insertions(+), 49 deletions(-) diff --git a/.github/scripts/check_policies.py b/.github/scripts/check_policies.py index 2ad5f7fff015b2..cc3576e05413c4 100644 --- a/.github/scripts/check_policies.py +++ b/.github/scripts/check_policies.py @@ -20,7 +20,7 @@ elif urn == "urn:li:dataHubPolicy:editor-platform-policy": editor_platform_policy_privileges = policy["info"]["privileges"] elif urn == "urn:li:dataHubPolicy:7": - all_user_platform_policy_privilges = policy["info"]["privileges"] + all_user_platform_policy_privileges = policy["info"]["privileges"] try: doc_type = policy["info"]["type"] privileges = policy["info"]["privileges"] @@ -54,10 +54,22 @@ ) assert len(diff_policies) == 0, f"Missing privileges for root user are {diff_policies}" -diff_policies = set(editor_platform_policy_privileges).difference( - set(all_user_platform_policy_privilges) -) -assert "MANAGE_POLICIES" not in all_user_platform_policy_privilges -assert ( - len(diff_policies) == 0 -), f"Missing privileges for all user policies are {diff_policies}" +# All users privileges checks +assert "MANAGE_POLICIES" not in all_user_platform_policy_privileges +assert "MANAGE_USERS_AND_GROUPS" not in all_user_platform_policy_privileges +assert "MANAGE_SECRETS" not in all_user_platform_policy_privileges +assert "MANAGE_USER_CREDENTIALS" not in all_user_platform_policy_privileges +assert "MANAGE_ACCESS_TOKENS" not in all_user_platform_policy_privileges +assert "EDIT_ENTITY" not in all_user_platform_policy_privileges +assert "DELETE_ENTITY" not in all_user_platform_policy_privileges + +# Editor checks +assert "MANAGE_POLICIES" not in editor_platform_policy_privileges +assert "MANAGE_USERS_AND_GROUPS" not in editor_platform_policy_privileges +assert "MANAGE_SECRETS" not in editor_platform_policy_privileges +assert "MANAGE_USER_CREDENTIALS" not in editor_platform_policy_privileges +assert "MANAGE_ACCESS_TOKENS" not in editor_platform_policy_privileges +# These don't prevent a user from modifying entities they are an asset owner of, i.e. 
their own profile info +assert "EDIT_CONTACT_INFO" not in editor_platform_policy_privileges +assert "EDIT_USER_PROFILE" not in editor_platform_policy_privileges +assert "EDIT_ENTITY_OWNERS" not in editor_platform_policy_privileges diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json index b7ffc11c08f055..32e68e7b133430 100644 --- a/metadata-service/war/src/main/resources/boot/policies.json +++ b/metadata-service/war/src/main/resources/boot/policies.json @@ -74,21 +74,6 @@ "editable":false } }, - { - "urn": "urn:li:dataHubPolicy:2" - }, - { - "urn": "urn:li:dataHubPolicy:3" - }, - { - "urn": "urn:li:dataHubPolicy:4" - }, - { - "urn": "urn:li:dataHubPolicy:5" - }, - { - "urn": "urn:li:dataHubPolicy:6" - }, { "urn": "urn:li:dataHubPolicy:7", "info": { @@ -99,18 +84,8 @@ "users":[] }, "privileges":[ - "MANAGE_INGESTION", - "MANAGE_SECRETS", - "MANAGE_USERS_AND_GROUPS", "VIEW_ANALYTICS", - "GENERATE_PERSONAL_ACCESS_TOKENS", - "MANAGE_DOMAINS", - "MANAGE_GLOBAL_ANNOUNCEMENTS", - "MANAGE_TESTS", - "MANAGE_GLOSSARIES", - "MANAGE_TAGS", - "MANAGE_GLOBAL_VIEWS", - "MANAGE_GLOBAL_OWNERSHIP_TYPES" + "GENERATE_PERSONAL_ACCESS_TOKENS" ], "displayName":"All Users - Base Platform Privileges", "description":"Grants base platform privileges to ALL users of DataHub. Change this policy to alter that behavior.", @@ -119,15 +94,6 @@ "editable":true } }, - { - "urn": "urn:li:dataHubPolicy:8" - }, - { - "urn": "urn:li:dataHubPolicy:9" - }, - { - "urn": "urn:li:dataHubPolicy:10" - }, { "urn": "urn:li:dataHubPolicy:view-entity-page-all", "info": { @@ -313,7 +279,6 @@ "VIEW_ENTITY_PAGE", "EDIT_ENTITY_TAGS", "EDIT_ENTITY_GLOSSARY_TERMS", - "EDIT_ENTITY_OWNERS", "EDIT_ENTITY_DOCS", "EDIT_ENTITY_DOC_LINKS", "EDIT_ENTITY_STATUS", @@ -321,16 +286,12 @@ "EDIT_ENTITY_DATA_PRODUCTS", "EDIT_DEPRECATION_PRIVILEGE", "EDIT_ENTITY_ASSERTIONS", - "EDIT_ENTITY", "EDIT_DATASET_COL_TAGS", "EDIT_DATASET_COL_GLOSSARY_TERMS", "EDIT_DATASET_COL_DESCRIPTION", "VIEW_DATASET_USAGE", "VIEW_DATASET_PROFILE", "EDIT_TAG_COLOR", - "EDIT_GROUP_MEMBERS", - "EDIT_USER_PROFILE", - "EDIT_CONTACT_INFO", "EDIT_LINEAGE", "EDIT_ENTITY_QUERIES", "SEARCH_PRIVILEGE", @@ -348,6 +309,54 @@ "editable":false } }, + { + "urn": "urn:li:dataHubPolicy:editor-metadata-policy-entities", + "info": { + "actors":{ + "resourceOwners":false, + "allUsers":false, + "allGroups":false, + "users":[], + "groups":[], + "roles":[ + "urn:li:dataHubRole:Editor" + ] + }, + "resources": { + "allResources": false, + "filter": { + "criteria": [ + { + "field": "RESOURCE_TYPE", + "values": [ + "dataset", + "chart", + "dashboard", + "dataFlow", + "dataJob", + "tag", + "container", + "domain", + "glossaryTerm", + "glossaryNode", + "notebook", + "dataProduct" + ], + "condition": "EQUALS" + } + ] + } + }, + "privileges":[ + "EDIT_ENTITY" + ], + "displayName":"Editors - Edit Metadata Entities", + "description":"Editors can edit primary metadata entities.", + "state":"ACTIVE", + "type":"METADATA", + "editable":true + } + }, { "urn": "urn:li:dataHubPolicy:reader-platform-policy", "info": { diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 8d689c7e2303c4..70219a550cd8bb 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -96,7 +96,7 @@ describe("create and manage group", () => { }); it("test user verify group 
participation", () => { - cy.loginWithCredentials(email,password); + cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.hideOnboardingTour(); cy.clickOptionWithText(`Test group EDITED ${test_id}`); From 4461b60583235c27bfe6244e6e0f12d08638aee5 Mon Sep 17 00:00:00 2001 From: Yuriy Gavrilov <44679014+YuriyGavrilov@users.noreply.github.com> Date: Mon, 13 Nov 2023 21:15:13 +0300 Subject: [PATCH 26/29] Update add new company s7 airlines (#9019) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 79f85433fbc184..6b8fa520e432ef 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to - [SpotHero](https://spothero.com) - [Stash](https://www.stash.com) - [Shanghai HuaRui Bank](https://www.shrbank.com) +- [s7 Airlines](https://www.s7.ru/) - [ThoughtWorks](https://www.thoughtworks.com) - [TypeForm](http://typeform.com) - [Udemy](https://www.udemy.com/) From 3844b78fa220a92cb2ec9dd8599d9109106f8a24 Mon Sep 17 00:00:00 2001 From: RyanHolstien Date: Mon, 13 Nov 2023 16:19:41 -0600 Subject: [PATCH 27/29] docs(debug): add debug information for cli (#9208) --- docs/actions/README.md | 3 ++- docs/cli.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/actions/README.md b/docs/actions/README.md index 23596ec67514e5..4fa44eec588bed 100644 --- a/docs/actions/README.md +++ b/docs/actions/README.md @@ -162,7 +162,8 @@ datahub actions -c -c ### Running in debug mode -Simply append the `--debug` flag to the CLI to run your action in debug mode. +Simply append the `--debug` flag to the CLI to run your action in debug mode. NOTE: This will reveal sensitive information in the logs, do not share the logs with outside resources and ensure untrusted +users will not have access to logs through UI ingestions before enabling on instances. ``` datahub actions -c --debug diff --git a/docs/cli.md b/docs/cli.md index 267f289d9f54a6..7dfac1e9b2bffc 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -134,7 +134,7 @@ The environment variables listed below take precedence over the DataHub CLI conf - `DATAHUB_GMS_TOKEN` (default `None`) - Used for communicating with DataHub Cloud. - `DATAHUB_TELEMETRY_ENABLED` (default `true`) - Set to `false` to disable telemetry. If CLI is being run in an environment with no access to public internet then this should be disabled. - `DATAHUB_TELEMETRY_TIMEOUT` (default `10`) - Set to a custom integer value to specify timeout in secs when sending telemetry. -- `DATAHUB_DEBUG` (default `false`) - Set to `true` to enable debug logging for CLI. Can also be achieved through `--debug` option of the CLI. +- `DATAHUB_DEBUG` (default `false`) - Set to `true` to enable debug logging for CLI. Can also be achieved through `--debug` option of the CLI. This exposes sensitive information in logs, enabling on production instances should be avoided especially if UI ingestion is in use as logs can be made available for runs through the UI. - `DATAHUB_VERSION` (default `head`) - Set to a specific version to run quickstart with the particular version of docker images. - `ACTIONS_VERSION` (default `head`) - Set to a specific version to run quickstart with that image tag of `datahub-actions` container. - `DATAHUB_ACTIONS_IMAGE` (default `acryldata/datahub-actions`) - Set to `-slim` to run a slimmer actions container without pyspark/deequ features. 
From ff90fb633da78a25f19f33dca0dae58df2b5ff82 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Mon, 13 Nov 2023 16:26:53 -0600 Subject: [PATCH 28/29] =?UTF-8?q?fix(datahub-ingestion):=20prevent=20trans?= =?UTF-8?q?itive=20deps,=20bump=20addtional=20pyspa=E2=80=A6=20(#9233)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .dockerignore | 5 +-- datahub-frontend/build.gradle | 3 +- datahub-upgrade/build.gradle | 3 +- docker/datahub-ingestion-base/build.gradle | 8 +++-- docker/datahub-ingestion/Dockerfile | 4 +-- docker/datahub-ingestion/README.md | 7 ++++ docker/datahub-ingestion/build.gradle | 16 ++++----- docker/datahub-ingestion/pyspark_jars.sh | 40 +++++++++++++-------- docker/elasticsearch-setup/build.gradle | 5 +-- docker/kafka-setup/build.gradle | 3 +- docker/mysql-setup/build.gradle | 3 +- docker/postgres-setup/build.gradle | 3 +- metadata-jobs/mae-consumer-job/build.gradle | 3 +- metadata-jobs/mce-consumer-job/build.gradle | 3 +- metadata-service/war/build.gradle | 3 +- 15 files changed, 70 insertions(+), 39 deletions(-) diff --git a/.dockerignore b/.dockerignore index 29c6c45bb06536..701263f5fedded 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,7 @@ **/node_modules/ -datahub-frontend/build/ -metadata-ingestion/venv/ +*/build/ +*/*/build/ +*/venv/ out **/*.class # Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars diff --git a/datahub-frontend/build.gradle b/datahub-frontend/build.gradle index fdf13bac0accc0..eb81b317455361 100644 --- a/datahub-frontend/build.gradle +++ b/datahub-frontend/build.gradle @@ -77,10 +77,11 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/datahub-upgrade/build.gradle b/datahub-upgrade/build.gradle index 5d0edf3ee8427c..81e6e79c2a5e52 100644 --- a/datahub-upgrade/build.gradle +++ b/datahub-upgrade/build.gradle @@ -88,10 +88,11 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index 64635671343ef4..c4d8a962dcd325 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -10,18 +10,20 @@ ext { docker_repo = 'datahub-ingestion-base' docker_dir = 'datahub-ingestion-base' docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") + docker_version = "${version}${docker_target == 'slim' ? 
'-slim' : ''}" revision = 2 // increment to trigger rebuild } docker { - name "${docker_registry}/${docker_repo}:v${version}-${docker_target}" - version "v${version}-${docker_target}" + name "${docker_registry}/${docker_repo}:v${docker_version}" + version "v${docker_version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } buildArgs([APP_ENV: docker_target]) } diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 2abd4e2f33befd..1aee79a428a98a 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -32,8 +32,8 @@ COPY ./docker/datahub-ingestion/pyspark_jars.sh . RUN pip install --no-cache --user ".[base]" && \ pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \ - pip install --no-cache --user ".[all]" && \ - ./pyspark_jars.sh + pip install --no-cache --user ".[all]" +RUN ./pyspark_jars.sh FROM base as full-install diff --git a/docker/datahub-ingestion/README.md b/docker/datahub-ingestion/README.md index 6580199bcce216..ed856314c5cc0f 100644 --- a/docker/datahub-ingestion/README.md +++ b/docker/datahub-ingestion/README.md @@ -2,3 +2,10 @@ [![datahub-ingestion docker](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml/badge.svg)](https://github.com/datahub-project/datahub/actions/workflows/docker-ingestion.yml) Refer to the [metadata ingestion framework](../../metadata-ingestion) to understand the architecture and responsibilities of this service. + +## Slim vs Full Image Build + +There are two versions of this image. One includes pyspark and Oracle dependencies and is larger due to the java dependencies. + +Running the standard build results in the `slim` image without pyspark being generated by default. In order to build the full +image with pyspark use the following project property `-PdockerTarget=full`. diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index fed33752a4b816..247b896d6955cb 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -9,6 +9,8 @@ ext { docker_registry = rootProject.ext.docker_registry == 'linkedin' ? 'acryldata' : docker_registry docker_repo = 'datahub-ingestion' docker_dir = 'datahub-ingestion' + docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") + docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" revision = 2 // increment to trigger rebuild } @@ -19,21 +21,19 @@ dependencies { } docker { - name "${docker_registry}/${docker_repo}:v${version}-slim" - version "v${version}-slim" - dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile-slim-only") + name "${docker_registry}/${docker_repo}:v${docker_version}" + version "v${docker_version}" + dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile${docker_target == "slim" ? 
"-slim-only" : ""}") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" include "metadata-ingestion/**" include "metadata-ingestion-modules/**" }.exclude { - i -> i.file.isHidden() || - i.file == buildDir || - i.file == project(':metadata-ingestion').buildDir || - i.file == project(':metadata-ingestion-modules').buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } buildArgs([DOCKER_VERSION: version, - RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace('-slim', '')]) + RELEASE_VERSION: version.replace('-SNAPSHOT', '').replace('v', '').replace("-slim", '')]) } tasks.getByName('docker').dependsOn(['build', ':docker:datahub-ingestion-base:docker', diff --git a/docker/datahub-ingestion/pyspark_jars.sh b/docker/datahub-ingestion/pyspark_jars.sh index ecd24e78c41057..ab4b223f0358a5 100755 --- a/docker/datahub-ingestion/pyspark_jars.sh +++ b/docker/datahub-ingestion/pyspark_jars.sh @@ -2,21 +2,33 @@ set -ex -HADOOP_CLIENT_DEPENDENCY="${HADOOP_CLIENT_DEPENDENCY:-org.apache.hadoop:hadoop-client:3.3.6}" -ZOOKEEPER_DEPENDENCY="${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars" -# Remove conflicting versions -echo "Removing version conflicts from $PYSPARK_JARS" -CONFLICTS="zookeeper hadoop- slf4j-" -for jar in $CONFLICTS; do - rm "$PYSPARK_JARS/$jar"*.jar -done +function replace_jar { + JAR_PREFIX=$1 + TRANSITIVE=$2 + DEPENDENCY=$3 -# Fetch dependencies -mvn dependency:get -Dtransitive=true -Dartifact="$HADOOP_CLIENT_DEPENDENCY" -mvn dependency:get -Dtransitive=true -Dartifact="$ZOOKEEPER_DEPENDENCY" + echo "Removing version conflicts for $PYSPARK_JARS/$JAR_PREFIX*.jar" + ls "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true + rm "$PYSPARK_JARS/$JAR_PREFIX"*.jar || true + rm -r "$HOME/.m2" || true -# Move to pyspark location -echo "Moving jars to $PYSPARK_JARS" -find "$HOME/.m2" -type f -name "*.jar" -exec mv {} "$PYSPARK_JARS/" \; + if [ ! 
-z "$DEPENDENCY" ]; then + echo "Resolving $DEPENDENCY" + mvn dependency:get -Dtransitive=$TRANSITIVE -Dartifact="$DEPENDENCY" >/dev/null + + echo "Moving jars to $PYSPARK_JARS" + find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec echo "{}" \; + find "$HOME/.m2" -type f -name "$JAR_PREFIX*.jar" -exec cp {} "$PYSPARK_JARS/" \; + fi +} + +replace_jar "zookeeper-" "false" "${ZOOKEEPER_DEPENDENCY:-org.apache.zookeeper:zookeeper:3.7.2}" +replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_API_DEPENDENCY:-org.apache.hadoop:hadoop-client-api:3.3.6}" +replace_jar "hadoop-client-" "true" "${HADOOP_CLIENT_RUNTIME_DEPENDENCY:-org.apache.hadoop:hadoop-client-runtime:3.3.6}" +replace_jar "hadoop-yarn-" "true" "${HADOOP_YARN_DEPENDENCY:-org.apache.hadoop:hadoop-yarn-server-web-proxy:3.3.6}" +replace_jar "snappy-java-" "false" "${SNAPPY_JAVA_DEPENDENCY:-org.xerial.snappy:snappy-java:1.1.10.5}" +replace_jar "libthrift-" "false" "${LIBTHRIFT_DEPENDENCY:-org.apache.thrift:libthrift:0.19.0}" +replace_jar "ivy-" "false" "${IVY_DEPENDENCY:-org.apache.ivy:ivy:2.5.2}" +replace_jar "parquet-jackson-" "false" "${PARQUET_JACKSON_DEPENDENCY:-org.apache.parquet:parquet-jackson:1.13.1}" diff --git a/docker/elasticsearch-setup/build.gradle b/docker/elasticsearch-setup/build.gradle index ffee3b9c65cf4f..ac935ca42fd12a 100644 --- a/docker/elasticsearch-setup/build.gradle +++ b/docker/elasticsearch-setup/build.gradle @@ -15,10 +15,11 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" - include "metadata-service/restli-servlet-impl/src/main/resources/index/**" + include 'metadata-service/restli-servlet-impl/src/main/resources/index/**' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/kafka-setup/build.gradle b/docker/kafka-setup/build.gradle index 573ef21c88bf91..25f9847190de3c 100644 --- a/docker/kafka-setup/build.gradle +++ b/docker/kafka-setup/build.gradle @@ -15,9 +15,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/mysql-setup/build.gradle b/docker/mysql-setup/build.gradle index 0d8941cce48339..1598866914c0ee 100644 --- a/docker/mysql-setup/build.gradle +++ b/docker/mysql-setup/build.gradle @@ -16,9 +16,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/docker/postgres-setup/build.gradle b/docker/postgres-setup/build.gradle index 8a026be09d2b4c..e24e206c99145c 100644 --- a/docker/postgres-setup/build.gradle +++ b/docker/postgres-setup/build.gradle @@ -16,9 +16,10 @@ docker { version "v${version}" dockerfile file("${rootProject.projectDir}/docker/${docker_dir}/Dockerfile") files 
fileTree(rootProject.projectDir) { + include '.dockerignore' include "docker/${docker_dir}/*" }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mae-consumer-job/build.gradle b/metadata-jobs/mae-consumer-job/build.gradle index 51c758f4343280..5e735e118493cd 100644 --- a/metadata-jobs/mae-consumer-job/build.gradle +++ b/metadata-jobs/mae-consumer-job/build.gradle @@ -45,11 +45,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-jobs/mce-consumer-job/build.gradle b/metadata-jobs/mce-consumer-job/build.gradle index daf41a1e0303ee..ef042188bc3d83 100644 --- a/metadata-jobs/mce-consumer-job/build.gradle +++ b/metadata-jobs/mce-consumer-job/build.gradle @@ -56,11 +56,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files bootJar.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 54e95fdcfe5798..35730ad6dfa9f3 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -70,11 +70,12 @@ docker { dockerfile file("${rootProject.projectDir}/docker/${docker_repo}/Dockerfile") files war.outputs.files files fileTree(rootProject.projectDir) { + include '.dockerignore' include 'docker/monitoring/*' include "docker/${docker_repo}/*" include 'metadata-models/src/main/resources/*' }.exclude { - i -> i.file.isHidden() || i.file == buildDir + i -> (!i.file.name.endsWith(".dockerignore") && i.file.isHidden()) } tag("Debug", "${docker_registry}/${docker_repo}:debug") From 19aa21506886692ff221f1b859e0633df995fb43 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 13 Nov 2023 19:00:30 -0500 Subject: [PATCH 29/29] feat(ingest/dbt): dbt column-level lineage (#8991) --- .../airflow-plugin/setup.py | 2 +- metadata-ingestion/setup.py | 4 +- .../api/incremental_lineage_helper.py | 16 +- .../ingestion/source/dbt/dbt_common.py | 499 ++++++++++-- .../datahub/ingestion/source/dbt/dbt_core.py | 3 +- .../src/datahub/utilities/sqlglot_lineage.py | 81 +- .../src/datahub/utilities/topological_sort.py | 49 ++ .../dbt_enabled_with_schemas_mces_golden.json | 255 ++++-- .../dbt_test_column_meta_mapping_golden.json | 283 +++++-- .../dbt/dbt_test_events_golden.json | 731 ++++++++++++------ ...th_complex_owner_patterns_mces_golden.json | 240 +++++- ...th_data_platform_instance_mces_golden.json | 246 ++++-- ...h_non_incremental_lineage_mces_golden.json | 198 ++++- ..._target_platform_instance_mces_golden.json | 246 ++++-- .../tests/integration/dbt/test_dbt.py | 8 +- .../test_snowflake_cte_name_collision.json | 47 ++ 
...owflake_full_table_name_col_reference.json | 55 ++ .../goldens/test_snowflake_unused_cte.json | 39 + .../unit/sql_parsing/test_sqlglot_lineage.py | 128 ++- .../tests/unit/test_topological_sort.py | 33 + 20 files changed, 2550 insertions(+), 613 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/topological_sort.py create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_cte_name_collision.json create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_full_table_name_col_reference.json create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_unused_cte.json create mode 100644 metadata-ingestion/tests/unit/test_topological_sort.py diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index e88fc870cb3331..838322f83833bb 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -14,7 +14,7 @@ def get_long_description(): return pathlib.Path(os.path.join(root, "README.md")).read_text() -_version = package_metadata["__version__"] +_version: str = package_metadata["__version__"] _self_pin = f"=={_version}" if not _version.endswith("dev0") else "" diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index f3782abe576d32..ebe180703051f7 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -305,8 +305,8 @@ "datahub-lineage-file": set(), "datahub-business-glossary": set(), "delta-lake": {*data_lake_profiling, *delta_lake}, - "dbt": {"requests"} | aws_common, - "dbt-cloud": {"requests"}, + "dbt": {"requests"} | sqlglot_lib | aws_common, + "dbt-cloud": {"requests"} | sqlglot_lib, "druid": sql_common | {"pydruid>=0.6.2"}, "dynamodb": aws_common, # Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. If its not it throws diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py index 945b201ca5758c..479486ce228998 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -15,7 +15,7 @@ from datahub.specific.dataset import DatasetPatchBuilder -def _convert_upstream_lineage_to_patch( +def convert_upstream_lineage_to_patch( urn: str, aspect: UpstreamLineageClass, system_metadata: Optional[SystemMetadataClass], @@ -86,16 +86,11 @@ def _merge_upstream_lineage( def _lineage_wu_via_read_modify_write( - graph: Optional[DataHubGraph], + graph: DataHubGraph, urn: str, aspect: UpstreamLineageClass, system_metadata: Optional[SystemMetadataClass], ) -> MetadataWorkUnit: - if graph is None: - raise ValueError( - "Failed to handle incremental lineage, DataHubGraph is missing. " - "Use `datahub-rest` sink OR provide `datahub-api` config in recipe. " - ) gms_aspect = graph.get_aspect(urn, UpstreamLineageClass) if gms_aspect: new_aspect = _merge_upstream_lineage(aspect, gms_aspect) @@ -131,11 +126,16 @@ def auto_incremental_lineage( yield wu if lineage_aspect.fineGrainedLineages: + if graph is None: + raise ValueError( + "Failed to handle incremental lineage, DataHubGraph is missing. " + "Use `datahub-rest` sink OR provide `datahub-api` config in recipe. 
" + ) yield _lineage_wu_via_read_modify_write( graph, urn, lineage_aspect, wu.metadata.systemMetadata ) elif lineage_aspect.upstreams: - yield _convert_upstream_lineage_to_patch( + yield convert_upstream_lineage_to_patch( urn, lineage_aspect, wu.metadata.systemMetadata ) else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 76cb82aaa5b4be..94df0a4f8a166e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1,3 +1,4 @@ +import itertools import logging import re from abc import abstractmethod @@ -30,6 +31,9 @@ platform_name, support_status, ) +from datahub.ingestion.api.incremental_lineage_helper import ( + convert_upstream_lineage_to_patch, +) from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes @@ -67,6 +71,9 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageTypeClass, + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, UpstreamClass, UpstreamLineage, ) @@ -100,9 +107,17 @@ UpstreamLineageClass, ViewPropertiesClass, ) -from datahub.specific.dataset import DatasetPatchBuilder from datahub.utilities.mapping import Constants, OperationProcessor +from datahub.utilities.sqlglot_lineage import ( + SchemaInfo, + SchemaResolver, + SqlParsingDebugInfo, + SqlParsingResult, + detach_ctes, + sqlglot_lineage, +) from datahub.utilities.time import datetime_to_ts_millis +from datahub.utilities.topological_sort import topological_sort logger = logging.getLogger(__name__) DBT_PLATFORM = "dbt" @@ -280,10 +295,19 @@ class DBTCommonConfig( default=False, description="When enabled, dbt test warnings will be treated as failures.", ) - # override fault value to True. + infer_dbt_schemas: bool = Field( + default=True, + description="When enabled, schemas will be inferred from the dbt node definition.", + ) + include_column_lineage: bool = Field( + default=False, + description="When enabled, column-level lineage will be extracted from the dbt node definition. Requires `infer_dbt_schemas` to be enabled. " + "If you run into issues where the column name casing does not match up with properly, providing a datahub_api or using the rest sink will improve accuracy.", + ) + # override default value to True. incremental_lineage: bool = Field( default=True, - description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.", + description="When enabled, emits incremental/patch lineage for non-dbt entities. 
When disabled, re-states lineage on each run.", ) @validator("target_platform") @@ -340,6 +364,17 @@ def meta_mapping_validator( ) return meta_mapping + @validator("include_column_lineage") + def validate_include_column_lineage( + cls, include_column_lineage: bool, values: Dict + ) -> bool: + if include_column_lineage and not values.get("infer_dbt_schemas"): + raise ValueError( + "`infer_dbt_schemas` must be enabled to use `include_column_lineage`" + ) + + return include_column_lineage + @dataclass class DBTColumn: @@ -352,6 +387,16 @@ class DBTColumn: meta: Dict[str, Any] = field(default_factory=dict) tags: List[str] = field(default_factory=list) + datahub_data_type: Optional[SchemaFieldDataType] = None + + +@dataclass +class DBTColumnLineageInfo: + upstream_dbt_name: str + + upstream_col: str + downstream_col: str + @dataclass class DBTNode: @@ -383,7 +428,9 @@ class DBTNode: owner: Optional[str] columns: List[DBTColumn] = field(default_factory=list) - upstream_nodes: List[str] = field(default_factory=list) + upstream_nodes: List[str] = field(default_factory=list) # list of upstream dbt_name + upstream_cll: List[DBTColumnLineageInfo] = field(default_factory=list) + cll_debug_info: Optional[SqlParsingDebugInfo] = None meta: Dict[str, Any] = field(default_factory=dict) query_tag: Dict[str, Any] = field(default_factory=dict) @@ -394,17 +441,23 @@ class DBTNode: test_info: Optional["DBTTest"] = None # only populated if node_type == 'test' test_result: Optional["DBTTestResult"] = None + @staticmethod + def _join_parts(parts: List[Optional[str]]) -> str: + joined = ".".join([part for part in parts if part]) + assert joined + return joined + def get_db_fqn(self) -> str: - if self.database: - fqn = f"{self.database}.{self.schema}.{self.name}" - else: - fqn = f"{self.schema}.{self.name}" + # Database might be None, but schema and name should always be present. + fqn = self._join_parts([self.database, self.schema, self.name]) return fqn.replace('"', "") def get_urn( self, target_platform: str, env: str, + # If target_platform = dbt, this is the dbt platform instance. + # Otherwise, it's the target platform instance. data_platform_instance: Optional[str], ) -> str: db_fqn = self.get_db_fqn() @@ -417,6 +470,80 @@ def get_urn( env=env, ) + def is_ephemeral_model(self) -> bool: + return self.materialization == "ephemeral" + + def get_fake_ephemeral_table_name(self) -> str: + assert self.is_ephemeral_model() + + # Similar to get_db_fqn. + fqn = self._join_parts( + [self.database, self.schema, f"__datahub__dbt__ephemeral__{self.name}"] + ) + return fqn.replace('"', "") + + def get_urn_for_upstream_lineage( + self, + dbt_platform_instance: Optional[str], + target_platform: str, + target_platform_instance: Optional[str], + env: str, + ) -> str: + """ + Get the urn to use when referencing this node in a dbt node's upstream lineage. + + If the node is a source or an ephemeral dbt node, we should point at the dbt node. + Otherwise, the node is materialized in the target platform, and so lineage should + point there. + """ + # TODO: This logic shouldn't live in the DBTNode class. It should be moved to the source. 
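+        # Start from the dbt platform; nodes materialized in the warehouse switch to the target platform below.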
+ + platform_value = DBT_PLATFORM + platform_instance_value = dbt_platform_instance + + materialized = self.materialization + if materialized in { + "view", + "materialized_view", + "table", + "incremental", + "snapshot", + }: + # upstream urns point to the target platform + platform_value = target_platform + platform_instance_value = target_platform_instance + + return self.get_urn( + target_platform=platform_value, + env=env, + data_platform_instance=platform_instance_value, + ) + + @property + def exists_in_target_platform(self): + return not (self.is_ephemeral_model() or self.node_type == "test") + + def columns_setdefault(self, schema_fields: List[SchemaField]) -> None: + """ + Update the column list if they are not already set. + """ + + if self.columns: + # If we already have columns, don't overwrite them. + return + + self.columns = [ + DBTColumn( + name=schema_field.fieldPath, + comment="", + description="", + index=i, + data_type=schema_field.nativeDataType, + datahub_data_type=schema_field.type, + ) + for i, schema_field in enumerate(schema_fields) + ] + def get_custom_properties(node: DBTNode) -> Dict[str, str]: # initialize custom properties to node's meta props @@ -442,6 +569,31 @@ def get_custom_properties(node: DBTNode) -> Dict[str, str]: return custom_properties +def _get_dbt_cte_names(name: str, target_platform: str) -> List[str]: + # Match the dbt CTE naming scheme: + # The default is defined here https://github.com/dbt-labs/dbt-core/blob/4122f6c308c88be4a24c1ea490802239a4c1abb8/core/dbt/adapters/base/relation.py#L222 + # However, since this PR https://github.com/dbt-labs/dbt-core/pull/2712, it's also possible + # for adapters to override this default. Only a handful actually do though: + # https://github.com/search?type=code&q=add_ephemeral_prefix+path:/%5Edbt%5C/adapters%5C// + + # Regardless, we need to keep the original name to work with older dbt versions. + default_cte_name = f"__dbt__cte__{name}" + + adapter_cte_names = { + "hive": f"tmp__dbt__cte__{name}", + "oracle": f"dbt__cte__{name}__", + "netezza": f"dbt__cte__{name}", + "exasol": f"dbt__CTE__{name}", + "db2": f"DBT_CTE__{name}", # ibm db2 + } + + cte_names = [default_cte_name] + if target_platform in adapter_cte_names: + cte_names.append(adapter_cte_names[target_platform]) + + return cte_names + + def get_upstreams( upstreams: List[str], all_nodes: Dict[str, DBTNode], @@ -462,21 +614,12 @@ def get_upstreams( upstream_manifest_node = all_nodes[upstream] # This logic creates lineages among dbt nodes. 
- platform_value = DBT_PLATFORM - platform_instance_value = platform_instance - - materialized = upstream_manifest_node.materialization - - if materialized in {"view", "table", "incremental", "snapshot"}: - # upstream urns point to the target platform - platform_value = target_platform - platform_instance_value = target_platform_instance - upstream_urns.append( - upstream_manifest_node.get_urn( - platform_value, - environment, - platform_instance_value, + upstream_manifest_node.get_urn_for_upstream_lineage( + dbt_platform_instance=platform_instance, + target_platform=target_platform, + target_platform_instance=target_platform_instance, + env=environment, ) ) return upstream_urns @@ -553,7 +696,7 @@ def get_column_type( @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") -@capability(SourceCapability.USAGE_STATS, "", supported=False) +@capability(SourceCapability.LINEAGE_FINE, "Enabled using `include_column_lineage`") class DBTSourceBase(StatefulIngestionSourceBase): def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str): super().__init__(config, ctx) @@ -614,9 +757,10 @@ def create_test_entity_mcps( target_platform=self.config.target_platform, target_platform_instance=self.config.target_platform_instance, environment=self.config.env, - platform_instance=None, + platform_instance=self.config.platform_instance, ) + # In case a dbt test depends on multiple tables, we create separate assertions for each. for upstream_urn in sorted(upstream_urns): if self.config.entities_enabled.can_emit_node_type("test"): yield make_assertion_from_test( @@ -651,23 +795,24 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - if self.config.write_semantics == "PATCH" and not self.ctx.graph: - raise ConfigurationError( - "With PATCH semantics, dbt source requires a datahub_api to connect to. " - "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe." - ) + if self.config.write_semantics == "PATCH": + self.ctx.require_graph("Using dbt with write_semantics=PATCH") all_nodes, additional_custom_props = self.load_nodes() all_nodes_map = {node.dbt_name: node for node in all_nodes} - nodes = self.filter_nodes(all_nodes) - additional_custom_props_filtered = { key: value for key, value in additional_custom_props.items() if value is not None } + # We need to run this before filtering nodes, because the info generated + # for a filtered node may be used by an unfiltered node. + # NOTE: This method mutates the DBTNode objects directly. 
+ self._infer_schemas_and_update_cll(all_nodes_map) + + nodes = self._filter_nodes(all_nodes) non_test_nodes = [ dataset_node for dataset_node in nodes if dataset_node.node_type != "test" ] @@ -695,7 +840,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: all_nodes_map, ) - def filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]: + def _filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]: nodes = [] for node in all_nodes: key = node.dbt_name @@ -707,6 +852,193 @@ def filter_nodes(self, all_nodes: List[DBTNode]) -> List[DBTNode]: return nodes + @staticmethod + def _to_schema_info(schema_fields: List[SchemaField]) -> SchemaInfo: + return {column.fieldPath: column.nativeDataType for column in schema_fields} + + def _infer_schemas_and_update_cll(self, all_nodes_map: Dict[str, DBTNode]) -> None: + """Annotate the DBTNode objects with schema information and column-level lineage. + + Note that this mutates the DBTNode objects directly. + + This method does the following: + 1. Iterate over the dbt nodes in topological order. + 2. For each node, either load the schema from the graph or from the dbt catalog info. + We also add this schema to the schema resolver. + 3. Run sql parser to infer the schema + generate column lineage. + 4. Write the schema and column lineage back to the DBTNode object. + 5. If we haven't already added the node's schema to the schema resolver, do that. + """ + + if not self.config.infer_dbt_schemas: + if self.config.include_column_lineage: + raise ConfigurationError( + "`infer_dbt_schemas` must be enabled to use `include_column_lineage`" + ) + return + + graph = self.ctx.graph + + schema_resolver = SchemaResolver( + platform=self.config.target_platform, + platform_instance=self.config.target_platform_instance, + env=self.config.env, + ) + + target_platform_urn_to_dbt_name: Dict[str, str] = {} + + # Iterate over the dbt nodes in topological order. + # This ensures that we process upstream nodes before downstream nodes. + for dbt_name in topological_sort( + list(all_nodes_map.keys()), + edges=list( + (upstream, node.dbt_name) + for node in all_nodes_map.values() + for upstream in node.upstream_nodes + ), + ): + node = all_nodes_map[dbt_name] + + target_node_urn = None + should_fetch_target_node_schema = False + if node.exists_in_target_platform: + target_node_urn = node.get_urn( + self.config.target_platform, + self.config.env, + self.config.target_platform_instance, + ) + should_fetch_target_node_schema = True + elif node.is_ephemeral_model(): + # For ephemeral nodes, we "pretend" that they exist in the target platform + # for schema resolution purposes. + target_node_urn = mce_builder.make_dataset_urn_with_platform_instance( + platform=self.config.target_platform, + name=node.get_fake_ephemeral_table_name(), + platform_instance=self.config.target_platform_instance, + env=self.config.env, + ) + if target_node_urn: + target_platform_urn_to_dbt_name[target_node_urn] = node.dbt_name + + # Our schema resolver preference is: + # 1. graph + # 2. dbt catalog + # 3. inferred + # Exception: if convert_column_urns_to_lowercase is enabled, swap 1 and 2. + # Cases 1 and 2 are handled here, and case 3 is handled after schema inference has occurred. + schema_fields: Optional[List[SchemaField]] = None + + # Fetch the schema from the graph. 
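+            # The graph lookup is best-effort: if no graph is configured or the schema aspect is missing, we fall back to the dbt catalog below.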
+            if target_node_urn and should_fetch_target_node_schema and graph:
+                schema_metadata = graph.get_aspect(target_node_urn, SchemaMetadata)
+                if schema_metadata:
+                    schema_fields = schema_metadata.fields
+
+            # Otherwise, load the schema from the dbt catalog.
+            # Note that this might get the casing wrong relative to DataHub, but
+            # has a more up-to-date column list.
+            if node.columns and (
+                not schema_fields or self.config.convert_column_urns_to_lowercase
+            ):
+                schema_fields = [
+                    SchemaField(
+                        fieldPath=column.name.lower()
+                        if self.config.convert_column_urns_to_lowercase
+                        else column.name,
+                        type=column.datahub_data_type
+                        or SchemaFieldDataType(type=NullTypeClass()),
+                        nativeDataType=column.data_type,
+                    )
+                    for column in node.columns
+                ]
+
+            # Add the node to the schema resolver, so that we can get column
+            # casing to match the upstream platform.
+            added_to_schema_resolver = False
+            if target_node_urn and schema_fields:
+                schema_resolver.add_raw_schema_info(
+                    target_node_urn, self._to_schema_info(schema_fields)
+                )
+                added_to_schema_resolver = True
+
+            # Run sql parser to infer the schema + generate column lineage.
+            sql_result = None
+            if node.compiled_code:
+                try:
+                    # Add CTE stops based on the upstreams list.
+                    preprocessed_sql = detach_ctes(
+                        node.compiled_code,
+                        platform=schema_resolver.platform,
+                        cte_mapping={
+                            cte_name: upstream_node.get_fake_ephemeral_table_name()
+                            for upstream_node in [
+                                all_nodes_map[upstream_node_name]
+                                for upstream_node_name in node.upstream_nodes
+                                if upstream_node_name in all_nodes_map
+                            ]
+                            if upstream_node.is_ephemeral_model()
+                            for cte_name in _get_dbt_cte_names(
+                                upstream_node.name, schema_resolver.platform
+                            )
+                        },
+                    )
+                except Exception as e:
+                    sql_result = SqlParsingResult.make_from_error(e)
+                else:
+                    sql_result = sqlglot_lineage(
+                        preprocessed_sql, schema_resolver=schema_resolver
+                    )
+
+            # Save the column lineage.
+            if self.config.include_column_lineage and sql_result:
+                # We only save the debug info here. We'll report errors based on it later, after
+                # applying the configured node filters.
+                node.cll_debug_info = sql_result.debug_info
+
+                if sql_result.column_lineage:
+                    node.upstream_cll = [
+                        DBTColumnLineageInfo(
+                            upstream_dbt_name=target_platform_urn_to_dbt_name[
+                                upstream_column.table
+                            ],
+                            upstream_col=upstream_column.column,
+                            downstream_col=column_lineage_info.downstream.column,
+                        )
+                        for column_lineage_info in sql_result.column_lineage
+                        for upstream_column in column_lineage_info.upstreams
+                        # Only include the CLL if the table is in the upstream list.
+                        if target_platform_urn_to_dbt_name.get(upstream_column.table)
+                        in node.upstream_nodes
+                    ]
+
+            # If we didn't fetch the schema from the graph, use the inferred schema.
+            inferred_schema_fields = None
+            if sql_result and sql_result.column_lineage:
+                inferred_schema_fields = [
+                    SchemaField(
+                        fieldPath=column_lineage.downstream.column,
+                        type=column_lineage.downstream.column_type
+                        or SchemaFieldDataType(type=NullTypeClass()),
+                        nativeDataType=column_lineage.downstream.native_column_type
+                        or "",
+                    )
+                    for column_lineage in sql_result.column_lineage
+                ]
+
+            # Conditionally add the inferred schema to the schema resolver.
+            if (
+                not added_to_schema_resolver
+                and target_node_urn
+                and inferred_schema_fields
+            ):
+                schema_resolver.add_raw_schema_info(
+                    target_node_urn, self._to_schema_info(inferred_schema_fields)
+                )
+
+            # Save the inferred schema fields into the dbt node.
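+            # columns_setdefault is a no-op when columns were already loaded from the catalog, so inferred schemas never overwrite catalog ones.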
+ if inferred_schema_fields: + node.columns_setdefault(inferred_schema_fields) + def create_platform_mces( self, dbt_nodes: List[DBTNode], @@ -762,7 +1094,7 @@ def create_platform_mces( ) # mutates meta_aspects if mce_platform == DBT_PLATFORM: - aspects = self._generate_base_aspects( + aspects = self._generate_base_dbt_aspects( node, additional_custom_props_filtered, mce_platform, meta_aspects ) @@ -786,7 +1118,7 @@ def create_platform_mces( else: # We are creating empty node for platform and only add lineage/keyaspect. aspects = [] - if node.materialization == "ephemeral" or node.node_type == "test": + if not node.exists_in_target_platform: continue # This code block is run when we are generating entities of platform type. @@ -799,19 +1131,15 @@ def create_platform_mces( self.config.platform_instance, ) upstreams_lineage_class = get_upstream_lineage([upstream_dbt_urn]) - if self.config.incremental_lineage: - patch_builder: DatasetPatchBuilder = DatasetPatchBuilder( - urn=node_datahub_urn + if not is_primary_source and self.config.incremental_lineage: + # We only generate incremental lineage for non-dbt nodes. + wu = convert_upstream_lineage_to_patch( + urn=node_datahub_urn, + aspect=upstreams_lineage_class, + system_metadata=None, ) - for upstream in upstreams_lineage_class.upstreams: - patch_builder.add_upstream_lineage(upstream) - - for mcp in patch_builder.build(): - yield MetadataWorkUnit( - id=f"upstreamLineage-for-{node_datahub_urn}", - mcp_raw=mcp, - is_primary_source=is_primary_source, - ) + wu.is_primary_source = is_primary_source + yield wu else: aspects.append(upstreams_lineage_class) @@ -918,7 +1246,7 @@ def _create_view_properties_aspect( ) return view_properties - def _generate_base_aspects( + def _generate_base_dbt_aspects( self, node: DBTNode, additional_custom_props_filtered: Dict[str, str], @@ -926,8 +1254,7 @@ def _generate_base_aspects( meta_aspects: Dict[str, Any], ) -> List[Any]: """ - There are some common aspects that get generated for both dbt node and platform node depending on whether dbt - node creation is enabled or not. + Some common aspects that get generated for dbt nodes. """ # create an empty list of aspects and keep adding to it. Initializing with Any to avoid a @@ -987,6 +1314,8 @@ def get_schema_metadata( self.config.strip_user_ids_from_email, ) + # TODO if infer_dbt_schemas, load from saved schemas too + canonical_schema: List[SchemaField] = [] for column in node.columns: description = None @@ -1034,7 +1363,8 @@ def get_schema_metadata( field = SchemaField( fieldPath=field_name, nativeDataType=column.data_type, - type=get_column_type( + type=column.datahub_data_type + or get_column_type( report, node.dbt_name, column.data_type, node.dbt_adapter ), description=description, @@ -1140,27 +1470,78 @@ def _create_lineage_aspect_for_dbt_node( """ This method creates lineage amongst dbt nodes. A dbt node can be linked to other dbt nodes or a platform node. """ - upstream_urns = get_upstreams( - node.upstream_nodes, - all_nodes_map, - self.config.target_platform, - self.config.target_platform_instance, - self.config.env, - self.config.platform_instance, - ) # if a node is of type source in dbt, its upstream lineage should have the corresponding table/view # from the platform. This code block is executed when we are generating entities of type "dbt". 
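+        # A source node gets exactly one upstream (the warehouse table it mirrors) and no column-level lineage, hence cll is None in that branch.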
if node.node_type == "source": - upstream_urns.append( + upstream_urns = [ node.get_urn( self.config.target_platform, self.config.env, self.config.target_platform_instance, ) + ] + cll = None + else: + upstream_urns = get_upstreams( + node.upstream_nodes, + all_nodes_map, + self.config.target_platform, + self.config.target_platform_instance, + self.config.env, + self.config.platform_instance, + ) + + node_urn = node.get_urn( + target_platform=DBT_PLATFORM, + env=self.config.env, + data_platform_instance=self.config.platform_instance, ) + + def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: + return all_nodes_map[dbt_name].get_urn_for_upstream_lineage( + dbt_platform_instance=self.config.platform_instance, + target_platform=self.config.target_platform, + target_platform_instance=self.config.target_platform_instance, + env=self.config.env, + ) + + if node.cll_debug_info and node.cll_debug_info.error: + self.report.report_warning( + node.dbt_name, + f"Error parsing column lineage: {node.cll_debug_info.error}", + ) + cll = [ + FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + downstreamType=FineGrainedLineageDownstreamType.FIELD_SET, + upstreams=[ + mce_builder.make_schema_field_urn( + _translate_dbt_name_to_upstream_urn( + upstream_column.upstream_dbt_name + ), + upstream_column.upstream_col, + ) + for upstream_column in upstreams + ], + downstreams=[ + mce_builder.make_schema_field_urn(node_urn, downstream) + ], + confidenceScore=node.cll_debug_info.confidence + if node.cll_debug_info + else None, + ) + for downstream, upstreams in itertools.groupby( + node.upstream_cll, lambda x: x.downstream_col + ) + ] + if upstream_urns: upstreams_lineage_class = get_upstream_lineage(upstream_urns) + + if self.config.include_column_lineage and cll: + upstreams_lineage_class.fineGrainedLineages = cll + return upstreams_lineage_class return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index dc3a84847beb24..a7703b203bceee 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -171,7 +171,8 @@ def extract_dbt_entities( catalog_type = None if catalog_node is None: - if materialization != "test": + if materialization not in {"test", "ephemeral"}: + # Test and ephemeral nodes will never show up in the catalog. 
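+                # For any other node type, a missing catalog entry typically means `dbt docs generate` produced a stale or partial catalog, so we surface a warning.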
report.report_warning( key, f"Entity {key} ({name}) is in manifest but missing from catalog", diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index cdffb684d958e5..d1209f3ec7b755 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -260,6 +260,16 @@ class SqlParsingResult(_ParserBaseModel): exclude=True, ) + @classmethod + def make_from_error(cls, error: Exception) -> "SqlParsingResult": + return cls( + in_tables=[], + out_tables=[], + debug_info=SqlParsingDebugInfo( + table_error=error, + ), + ) + def _parse_statement(sql: sqlglot.exp.ExpOrStr, dialect: str) -> sqlglot.Expression: statement: sqlglot.Expression = sqlglot.maybe_parse( @@ -1154,14 +1164,60 @@ def sqlglot_lineage( default_schema=default_schema, ) except Exception as e: - return SqlParsingResult( - in_tables=[], - out_tables=[], - column_lineage=None, - debug_info=SqlParsingDebugInfo( - table_error=e, - ), - ) + return SqlParsingResult.make_from_error(e) + + +def detach_ctes( + sql: sqlglot.exp.ExpOrStr, platform: str, cte_mapping: Dict[str, str] +) -> sqlglot.exp.Expression: + """Replace CTE references with table references. + + For example, with cte_mapping = {"__cte_0": "_my_cte_table"}, the following SQL + + WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id + + is transformed into + + WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table ON table2.id = _my_cte_table.id + + Note that the original __cte_0 definition remains in the query, but is simply not referenced. + The query optimizer should be able to remove it. + + This method makes a major assumption: that no other table/column has the same name as a + key in the cte_mapping. + """ + + dialect = _get_dialect(platform) + statement = _parse_statement(sql, dialect=dialect) + + def replace_cte_refs(node: sqlglot.exp.Expression) -> sqlglot.exp.Expression: + if ( + isinstance(node, sqlglot.exp.Identifier) + and node.parent + and not isinstance(node.parent.parent, sqlglot.exp.CTE) + and node.name in cte_mapping + ): + full_new_name = cte_mapping[node.name] + table_expr = sqlglot.maybe_parse( + full_new_name, dialect=dialect, into=sqlglot.exp.Table + ) + + # We expect node.parent to be a Table or Column. + # Either way, it should support catalog/db/name. 
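+            # sqlglot parses the mapped name into catalog/db/name parts; copying each part onto the parent keeps fully qualified replacements (e.g. db.schema.table) intact.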
+                parent = node.parent
+
+                if "catalog" in parent.arg_types:
+                    parent.set("catalog", table_expr.catalog)
+                if "db" in parent.arg_types:
+                    parent.set("db", table_expr.db)
+
+                new_node = sqlglot.exp.Identifier(this=table_expr.name)
+
+                return new_node
+            else:
+                return node
+
+    return statement.transform(replace_cte_refs, copy=False)


 def create_lineage_sql_parsed_result(
@@ -1197,14 +1253,7 @@ def create_lineage_sql_parsed_result(
             default_schema=schema,
         )
     except Exception as e:
-        return SqlParsingResult(
-            in_tables=[],
-            out_tables=[],
-            column_lineage=None,
-            debug_info=SqlParsingDebugInfo(
-                table_error=e,
-            ),
-        )
+        return SqlParsingResult.make_from_error(e)
     finally:
         if needs_close:
             schema_resolver.close()
diff --git a/metadata-ingestion/src/datahub/utilities/topological_sort.py b/metadata-ingestion/src/datahub/utilities/topological_sort.py
new file mode 100644
index 00000000000000..f807dfe96063a7
--- /dev/null
+++ b/metadata-ingestion/src/datahub/utilities/topological_sort.py
@@ -0,0 +1,49 @@
+from collections import deque
+from typing import Dict, Iterable, List, Tuple, TypeVar
+
+_K = TypeVar("_K")
+
+
+def topological_sort(nodes: List[_K], edges: List[Tuple[_K, _K]]) -> Iterable[_K]:
+    """Topological sort of a directed acyclic graph or forest.
+
+    This is an implementation of Kahn's algorithm.
+
+    Args:
+        nodes: List of nodes.
+        edges: List of edges, as tuples of (source, target).
+
+    Yields:
+        Nodes in topological order.
+    """
+
+    # Build adjacency list.
+    adj_list: Dict[_K, List[_K]] = {node: [] for node in nodes}
+    for source, target in edges:
+        adj_list[source].append(target)
+
+    # Build in-degree map.
+    in_degrees: Dict[_K, int] = {node: 0 for node in nodes}
+    for _source, target in edges:
+        in_degrees[target] += 1
+
+    # Initialize queue with nodes with in-degree 0.
+    queue = deque(node for node in nodes if in_degrees[node] == 0)
+
+    results = 0
+    while queue:
+        node = queue.popleft()
+
+        results += 1
+        yield node
+
+        # Decrement in-degree of each neighbor.
+        for neighbor in adj_list[node]:
+            in_degrees[neighbor] -= 1
+
+            # If in-degree is 0, add to queue.
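+            # (Each edge is visited exactly once in this loop and each node is enqueued at most once, so the whole sort is O(V + E).)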
+ if in_degrees[neighbor] == 0: + queue.append(neighbor) + + if results != len(nodes): + raise ValueError("Graph contains cycles.") diff --git a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json index 16df7b8e51b24f..e4f01ef7a6c537 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_enabled_with_schemas_mces_golden.json @@ -14,7 +14,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -131,7 +132,92 @@ "tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + } + ] } }, { @@ -176,7 +262,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -195,7 +282,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -355,7 +443,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -373,7 +462,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +665,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -594,7 +685,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": 
"dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -712,7 +804,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -730,7 +823,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -882,7 +976,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -900,7 +995,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1070,7 +1166,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1088,7 +1185,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1198,7 +1296,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1216,7 +1315,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1338,7 +1438,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1356,7 +1457,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1486,7 +1588,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1504,7 +1607,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1698,7 +1802,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1716,7 +1821,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1862,7 +1968,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -1880,7 +1987,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2047,7 +2155,8 @@ }, "systemMetadata": { 
"lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2065,7 +2174,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2211,7 +2321,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2229,7 +2340,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2375,7 +2487,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2393,7 +2506,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2539,7 +2653,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2557,7 +2672,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2703,7 +2819,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2712,12 +2829,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2726,12 +2856,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)\", \"type\": 
\"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2740,12 +2883,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2760,7 +2916,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2775,7 +2932,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2790,7 +2948,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2805,7 +2964,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } }, { @@ -2820,7 +2980,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-schemas-dbt-enabled" + "runId": "dbt-test-with-schemas-dbt-enabled", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json index 4557cb03248291..4d5b008b695f97 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json @@ -14,7 +14,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -65,7 +66,104 @@ 
"tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "initial_full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + } + ] } }, { @@ -118,7 +216,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -137,7 +236,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -275,7 +375,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -293,7 +394,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +589,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -506,7 +609,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -652,7 +756,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -670,7 +775,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -920,7 +1026,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -938,7 +1045,8 @@ }, "systemMetadata": 
{ "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1084,7 +1192,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1102,7 +1211,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1272,7 +1382,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1290,7 +1401,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1400,7 +1512,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1531,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1540,7 +1654,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1558,7 +1673,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1688,7 +1804,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1706,7 +1823,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1900,7 +2018,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -1918,7 +2037,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2064,7 +2184,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2082,7 +2203,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2249,7 +2371,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2267,7 +2390,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2413,7 +2537,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2431,7 +2556,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2577,7 +2703,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2595,7 +2722,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2741,7 +2869,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2759,7 +2888,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2905,7 +3035,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2914,12 +3045,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.an-aliased-view-for-monthly-billing%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.an-aliased-view-for-monthly-billing%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an-aliased-view-for-monthly-billing,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2928,12 +3072,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.an_aliased_view_for_payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.an_aliased_view_for_payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.an_aliased_view_for_payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2942,12 +3099,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": 
\"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.payments_by_customer_by_month%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.payments_by_customer_by_month%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payments_by_customer_by_month,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2956,12 +3126,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.customer_snapshot%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.public.customer_snapshot%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer_snapshot,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2976,7 +3159,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } }, { @@ -2991,7 +3175,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-column-meta-mapping" + "runId": "dbt-column-meta-mapping", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_events_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_events_golden.json index 086c5a78e92a45..3e8ddf317f387e 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_events_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_events_golden.json @@ -15,7 +15,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -203,7 +204,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -222,7 +224,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -429,7 +432,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": 
"no-run-id-provided" } }, { @@ -447,7 +451,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -565,7 +570,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -583,7 +589,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -713,7 +720,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -731,7 +739,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -861,7 +870,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -878,7 +888,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -975,7 +986,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -992,7 +1004,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1101,7 +1114,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1118,7 +1132,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1227,7 +1242,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1236,12 +1252,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.customers%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.customers,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.customers%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.customers,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1250,12 +1279,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": 
"[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.orders%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.orders,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.orders%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.orders,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1264,12 +1306,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.stg_customers%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.stg_customers,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.stg_customers%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.stg_customers,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1278,12 +1333,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.stg_orders%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.stg_orders,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.stg_orders%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.stg_orders,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1292,12 +1360,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.stg_payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": 
\"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.stg_payments,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.stg_payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.stg_payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1306,12 +1387,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.raw_customers%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.raw_customers,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.raw_customers%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.raw_customers,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1320,12 +1414,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.raw_orders%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.raw_orders,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.raw_orders%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.raw_orders,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1334,12 +1441,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.raw_payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.raw_payments,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": 
"application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Ccalm-pagoda-323403.jaffle_shop.raw_payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.raw_payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1354,7 +1474,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1397,7 +1518,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1408,23 +1530,24 @@ "aspect": { "json": { "timestampMillis": 1655565131058, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:b052a324c05327985f3b579a19ad7579", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:b052a324c05327985f3b579a19ad7579", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1439,7 +1562,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1482,7 +1606,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1493,23 +1618,24 @@ "aspect": { "json": { "timestampMillis": 1655565131075, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:da743330013b7e3e3707ac6e526ab408", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:da743330013b7e3e3707ac6e526ab408", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1524,7 +1650,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1567,7 +1694,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1578,23 +1706,24 @@ "aspect": { "json": { "timestampMillis": 1655565131073, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": 
"c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:2887b9c826e0be6296a37833bdc380bd", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_payments,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:2887b9c826e0be6296a37833bdc380bd", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1609,7 +1738,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1640,7 +1770,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1651,23 +1782,24 @@ "aspect": { "json": { "timestampMillis": 1655565131077, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:591d8dc8939e0cf9bf0fd03264ad1a0e", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:591d8dc8939e0cf9bf0fd03264ad1a0e", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1682,7 +1814,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1732,7 +1865,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1747,7 +1881,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1791,7 +1926,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1802,12 +1938,7 @@ "aspect": { "json": { "timestampMillis": 1655565137668, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:bf7fd2b46d2c32ee9bb036acd1559782", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)", "status": "COMPLETE", "result": { @@ -1815,12 +1946,18 @@ "nativeResults": { "message": "Database Error in test dbt_expectations_expect_column_values_to_be_in_set_customers_customer_id__customer_id_is_not_null__0__1__2 (models/schema.yml)\n No matching signature for operator = for argument types: INT64, STRING. 
Supported signature: ANY = ANY at [46:25]\n compiled SQL at target/run/jaffle_shop/models/schema.yml/dbt_expectations_expect_column_e42202dc29e1149de0f5c3966219796d.sql" } + }, + "assertionUrn": "urn:li:assertion:bf7fd2b46d2c32ee9bb036acd1559782", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1835,7 +1972,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1874,7 +2012,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1885,12 +2024,7 @@ "aspect": { "json": { "timestampMillis": 1655565137668, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:1c217b7587a0cad47a07a09bfe154055", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { @@ -1898,12 +2032,18 @@ "nativeResults": { "message": "Database Error in test dbt_expectations_expect_column_values_to_not_be_in_set_orders_credit_card_amount__credit_card_amount_is_not_null__0 (models/schema.yml)\n No matching signature for operator = for argument types: FLOAT64, STRING. Supported signature: ANY = ANY at [36:25]\n compiled SQL at target/run/jaffle_shop/models/schema.yml/dbt_expectations_expect_column_fdf581b1071168614662824120d65b90.sql" } + }, + "assertionUrn": "urn:li:assertion:1c217b7587a0cad47a07a09bfe154055", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1918,7 +2058,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1954,7 +2095,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1965,23 +2107,24 @@ "aspect": { "json": { "timestampMillis": 1655565132560, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:44519aa345bf3ea896179f9f352ae946", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:44519aa345bf3ea896179f9f352ae946", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1996,7 +2139,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2032,7 +2176,8 @@ }, "systemMetadata": { 
"lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2043,23 +2188,24 @@ "aspect": { "json": { "timestampMillis": 1655565133585, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:bbd78a070092f54313153abec49f6f31", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:bbd78a070092f54313153abec49f6f31", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2074,7 +2220,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2110,7 +2257,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2121,23 +2269,24 @@ "aspect": { "json": { "timestampMillis": 1655565133591, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:52d06197762e3608d94609e96f03a0a7", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:52d06197762e3608d94609e96f03a0a7", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2152,7 +2301,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2188,7 +2338,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2199,23 +2350,24 @@ "aspect": { "json": { "timestampMillis": 1655565133595, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:ca065a99637630468f688717590beeab", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:ca065a99637630468f688717590beeab", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2230,7 +2382,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2266,7 +2419,8 @@ }, 
"systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2277,23 +2431,24 @@ "aspect": { "json": { "timestampMillis": 1655565134031, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:7a305acc5fc049dc9bbd141b814461d0", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:7a305acc5fc049dc9bbd141b814461d0", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2308,7 +2463,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2344,7 +2500,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2355,23 +2512,24 @@ "aspect": { "json": { "timestampMillis": 1655565134482, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:11087a3d7ae178df22c42922ac8ef8ad", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:11087a3d7ae178df22c42922ac8ef8ad", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2386,7 +2544,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2422,7 +2581,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2433,23 +2593,24 @@ "aspect": { "json": { "timestampMillis": 1655565134485, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:b301bb47cc4ebce4e78a194b3de11f25", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:b301bb47cc4ebce4e78a194b3de11f25", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2464,7 +2625,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2500,7 
+2662,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2511,23 +2674,24 @@ "aspect": { "json": { "timestampMillis": 1655565134493, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:2e9117138dcc9facda66f1efd55a8cd7", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:2e9117138dcc9facda66f1efd55a8cd7", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2542,7 +2706,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2578,7 +2743,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2589,23 +2755,24 @@ "aspect": { "json": { "timestampMillis": 1655565134966, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:25ebf4faa9b1654ef54c46d975ca0a81", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_customers,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:25ebf4faa9b1654ef54c46d975ca0a81", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2620,7 +2787,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2656,7 +2824,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2667,23 +2836,24 @@ "aspect": { "json": { "timestampMillis": 1655565135368, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:b03abcc447aac70bbebb22a8a9d7dbbe", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:b03abcc447aac70bbebb22a8a9d7dbbe", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2698,7 +2868,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": 
"no-run-id-provided" } }, { @@ -2734,7 +2905,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2745,23 +2917,24 @@ "aspect": { "json": { "timestampMillis": 1655565135377, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:c1eebc71f36690e4523adca30314e927", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_payments,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:c1eebc71f36690e4523adca30314e927", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2776,7 +2949,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2821,7 +2995,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2832,23 +3007,24 @@ "aspect": { "json": { "timestampMillis": 1655565135510, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:b210dbd31c2ee4efc0c24a9e4cf125ef", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:b210dbd31c2ee4efc0c24a9e4cf125ef", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2893,7 +3069,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2904,23 +3081,24 @@ "aspect": { "json": { "timestampMillis": 1655565135510, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:b210dbd31c2ee4efc0c24a9e4cf125ef", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:b210dbd31c2ee4efc0c24a9e4cf125ef", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2935,7 +3113,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2977,7 +3156,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": 
"dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2988,23 +3168,24 @@ "aspect": { "json": { "timestampMillis": 1655565135836, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:c51ca9c4b5a1f964bef748f0b8968e71", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:c51ca9c4b5a1f964bef748f0b8968e71", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3019,7 +3200,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3061,7 +3243,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3072,23 +3255,24 @@ "aspect": { "json": { "timestampMillis": 1655565136269, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:caa9b8060e214cecab88a92dc39c2e60", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:caa9b8060e214cecab88a92dc39c2e60", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3103,7 +3287,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3145,7 +3330,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3156,23 +3342,24 @@ "aspect": { "json": { "timestampMillis": 1655565136230, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:54bac90e6785bdefd8685ebf8814c429", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_customers,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:54bac90e6785bdefd8685ebf8814c429", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3187,7 +3374,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3229,7 +3417,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3240,23 +3429,24 @@ "aspect": { "json": { "timestampMillis": 1655565136395, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:815963e1332b46a203504ba46ebfab24", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_orders,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:815963e1332b46a203504ba46ebfab24", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3271,7 +3461,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3313,7 +3504,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3324,23 +3516,24 @@ "aspect": { "json": { "timestampMillis": 1655565136719, - "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" - }, "runId": "c7a6b778-0e0f-4789-b567-ca7e124a6840", - "assertionUrn": "urn:li:assertion:fac27f352406b941125292413afa8096", "asserteeUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.stg_payments,PROD)", "status": "COMPLETE", "result": { "type": "SUCCESS", "nativeResults": {} + }, + "assertionUrn": "urn:li:assertion:fac27f352406b941125292413afa8096", + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" } } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3355,7 +3548,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3370,7 +3564,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3385,7 +3580,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3400,7 +3596,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3415,7 +3612,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3430,7 +3628,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3445,7 +3644,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3460,7 +3660,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3475,7 +3676,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3490,7 +3692,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3505,7 +3708,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3520,7 +3724,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3535,7 +3740,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3550,7 +3756,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3565,7 +3772,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3580,7 +3788,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3595,7 +3804,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3610,7 +3820,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3625,7 +3836,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3640,7 +3852,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3655,7 +3868,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3670,7 +3884,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3685,7 +3900,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3700,7 +3916,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-2022_02_03-07_00_00" + "runId": "dbt-2022_02_03-07_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json index 19bfb60e62a08d..0bdd5e3c895c27 100644 --- 
a/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_complex_owner_patterns_mces_golden.json @@ -14,7 +14,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -95,7 +96,92 @@ "tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + } + ] } }, { @@ -140,7 +226,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -159,7 +246,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -302,7 +390,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -320,7 +409,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -522,7 +612,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -541,7 +632,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -659,7 +751,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -677,7 +770,8 @@ }, 
"systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -826,7 +920,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -844,7 +939,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1014,7 +1110,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1032,7 +1129,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1142,7 +1240,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1160,7 +1259,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1282,7 +1382,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1300,7 +1401,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1427,7 +1529,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1445,7 +1548,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1639,7 +1743,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1657,7 +1762,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1803,7 +1909,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1821,7 +1928,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -1985,7 +2093,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2003,7 +2112,8 @@ }, "systemMetadata": { "lastObserved": 
1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2149,7 +2259,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2167,7 +2278,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2313,7 +2425,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2331,7 +2444,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2477,7 +2591,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2486,12 +2601,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2500,12 +2628,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": 
"dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2514,12 +2655,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2534,7 +2688,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } }, { @@ -2549,7 +2704,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-complex-owner-patterns" + "runId": "dbt-test-with-complex-owner-patterns", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json index 242c83003b1811..5ab0b11e377716 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_data_platform_instance_mces_golden.json @@ -14,7 +14,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -96,7 +97,92 @@ "tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + } + ] } }, { @@ -141,7 +227,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -160,7 +247,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -303,7 +391,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -321,7 +410,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -523,7 +613,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -542,7 +633,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -660,7 +752,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +771,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -827,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -845,7 +940,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1015,7 +1111,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1033,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1143,7 +1241,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1161,7 +1260,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1283,7 +1383,8 @@ }, "systemMetadata": { "lastObserved": 
1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1301,7 +1402,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1428,7 +1530,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1446,7 +1549,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1640,7 +1744,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1658,7 +1763,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1804,7 +1910,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1822,7 +1929,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1986,7 +2094,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2004,7 +2113,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2150,7 +2260,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2168,7 +2279,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2314,7 +2426,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2332,7 +2445,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2478,7 +2592,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2496,7 +2611,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2642,7 +2758,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2651,12 +2768,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cdbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cdbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2665,12 +2795,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cdbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cdbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.an-aliased-view-for-payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2679,12 +2822,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cdbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cdbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:dbt,dbt-instance-1.pagila.dbt_postgres.payments_by_customer_by_month,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2699,7 +2855,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2714,7 +2871,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-data-platform-instance" + "runId": "dbt-test-with-data-platform-instance", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json index d98b63b9da62fe..3725e590fee9e4 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_non_incremental_lineage_mces_golden.json @@ -14,7 +14,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -96,7 +97,92 @@ "tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + } + ] } }, { @@ -141,7 +227,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -160,7 +247,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -303,7 +391,8 @@ }, "systemMetadata": { 
"lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -321,7 +410,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -523,7 +613,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -542,7 +633,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -660,7 +752,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +771,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -827,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -845,7 +940,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1015,7 +1111,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1033,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1143,7 +1241,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1161,7 +1260,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1283,7 +1383,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1301,7 +1402,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1428,7 +1530,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1446,7 +1549,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1640,7 +1744,8 @@ }, "systemMetadata": { "lastObserved": 
1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1658,7 +1763,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1804,7 +1910,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1822,7 +1929,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -1986,7 +2094,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2004,7 +2113,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2150,7 +2260,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2168,7 +2279,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2314,7 +2426,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2332,7 +2445,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2478,7 +2592,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2496,7 +2611,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2642,7 +2758,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2669,7 +2786,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2696,7 +2814,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2723,7 +2842,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2738,7 +2858,8 @@ }, "systemMetadata": { "lastObserved": 
1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } }, { @@ -2753,7 +2874,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-non-incremental-lineage" + "runId": "dbt-test-with-non-incremental-lineage", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json index 7c024f93641b9b..a47abab6b40f7a 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_with_target_platform_instance_mces_golden.json @@ -14,7 +14,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -96,7 +97,92 @@ "tableSchema": "" } }, - "fields": [] + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "full_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "TEXT", + "recursive": false, + "isPartOfKey": false + } + ] } }, { @@ -141,7 +227,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -160,7 +247,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -303,7 +391,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -321,7 +410,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" 
} }, { @@ -523,7 +613,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -542,7 +633,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -660,7 +752,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +771,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -827,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -845,7 +940,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1015,7 +1111,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1033,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1143,7 +1241,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1161,7 +1260,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1283,7 +1383,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1301,7 +1402,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1428,7 +1530,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1446,7 +1549,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1640,7 +1744,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1658,7 +1763,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": 
"no-run-id-provided" } }, { @@ -1804,7 +1910,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1822,7 +1929,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -1986,7 +2094,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2004,7 +2113,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2150,7 +2260,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2168,7 +2279,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2314,7 +2426,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2332,7 +2445,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2478,7 +2592,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2496,7 +2611,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2642,7 +2758,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2651,12 +2768,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-monthly-billing%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-monthly-billing,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": 
"dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2665,12 +2795,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.an-aliased-view-for-payments%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.an-aliased-view-for-payments,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2679,12 +2822,25 @@ "changeType": "PATCH", "aspectName": "upstreamLineage", "aspect": { - "value": "[{\"op\": \"add\", \"path\": \"/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29\", \"value\": {\"auditStamp\": {\"time\": 1643871600000, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD)\", \"type\": \"TRANSFORMED\"}}]", - "contentType": "application/json-patch+json" + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Adbt%2Cpagila.dbt_postgres.payments_by_customer_by_month%2CPROD%29", + "value": { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.dbt_postgres.payments_by_customer_by_month,PROD)", + "type": "TRANSFORMED" + } + } + ] }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2699,7 +2855,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } }, { @@ -2714,7 +2871,8 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "dbt-test-with-target-platform-instance" + "runId": "dbt-test-with-target-platform-instance", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index a970ff6a5de7ad..95b5374bbb41df 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -361,11 +361,11 @@ def test_dbt_tests_only_assertions(pytestconfig, tmp_path, mock_time, **kwargs): test_results_path=str( (test_resources_dir / "jaffle_shop_test_results.json").resolve() ), - # this is just here to avoid needing to access datahub server - write_semantics="OVERRIDE", 
entities_enabled=DBTEntitiesEnabled( test_results=EmitDirective.ONLY ), + # this is just here to avoid needing to access datahub server + write_semantics="OVERRIDE", ), ), sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), @@ -440,13 +440,13 @@ def test_dbt_only_test_definitions_and_results( test_results_path=str( (test_resources_dir / "jaffle_shop_test_results.json").resolve() ), - # this is just here to avoid needing to access datahub server - write_semantics="OVERRIDE", entities_enabled=DBTEntitiesEnabled( sources=EmitDirective.NO, seeds=EmitDirective.NO, models=EmitDirective.NO, ), + # this is just here to avoid needing to access datahub server + write_semantics="OVERRIDE", ), ), sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_cte_name_collision.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_cte_name_collision.json new file mode 100644 index 00000000000000..44f1075c058ad6 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_cte_name_collision.json @@ -0,0 +1,47 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table3,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR(16777216)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)", + "column": "col2" + } + ] + }, + { + "downstream": { + "table": null, + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR(16777216)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table3,PROD)", + "column": "col1" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_full_table_name_col_reference.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_full_table_name_col_reference.json new file mode 100644 index 00000000000000..f8301f1e8189ee --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_full_table_name_col_reference.json @@ -0,0 +1,55 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "id", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "id" + } + ] + }, + { + "downstream": { + "table": null, + "column": "id_gt_100", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "id" + } + ] + }, + { + "downstream": { + "table": null, + "column": "struct_field1", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "struct_field.field1" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_unused_cte.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_unused_cte.json new file mode 100644 index 00000000000000..3916c6dc7c5ef7 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_unused_cte.json @@ -0,0 +1,39 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,table2,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,table3,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "col1", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,table1,PROD)", + "column": "col1" + } + ] + }, + { + "downstream": { + "table": null, + "column": "col6", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,table3,PROD)", + "column": "col6" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index c420f2b8438ce0..7f69e358f8f119 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -3,11 +3,59 @@ import pytest from datahub.testing.check_sql_parser_result import assert_sql_result -from datahub.utilities.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT +from datahub.utilities.sqlglot_lineage import ( + _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT, + detach_ctes, +) RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens" +def test_detach_ctes_simple(): + original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id" + detached_expr = detach_ctes( + original, + platform="snowflake", + cte_mapping={"__cte_0": "_my_cte_table"}, + ) + detached = detached_expr.sql(dialect="snowflake") + + assert ( + detached + == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table ON table2.id = _my_cte_table.id" + ) + + +def test_detach_ctes_with_alias(): + original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 AS tablealias ON table2.id = tablealias.id" + detached_expr = detach_ctes( + original, + platform="snowflake", + cte_mapping={"__cte_0": "_my_cte_table"}, + ) + detached = detached_expr.sql(dialect="snowflake") + + assert ( + detached + == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN _my_cte_table AS tablealias ON table2.id = tablealias.id" + ) + + +def test_detach_ctes_with_multipart_replacement(): + original = "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN __cte_0 ON table2.id = __cte_0.id" + detached_expr = detach_ctes( + original, + platform="snowflake", + cte_mapping={"__cte_0": "my_db.my_schema.my_table"}, + ) + detached = detached_expr.sql(dialect="snowflake") + + assert ( + detached + == "WITH __cte_0 AS (SELECT * FROM table1) SELECT * FROM table2 JOIN my_db.my_schema.my_table ON table2.id = my_db.my_schema.my_table.id" + ) + + def test_select_max(): # The COL2 should get normalized to col2. 
     assert_sql_result(
@@ -630,6 +678,84 @@ def test_snowflake_column_cast():
     )
+
+def test_snowflake_unused_cte():
+    # For this, we expect table level lineage to include table1, but CLL should not.
+    assert_sql_result(
+        """
+WITH cte1 AS (
+    SELECT col1, col2
+    FROM table1
+    WHERE col1 = 'value1'
+), cte2 AS (
+    SELECT col3, col4
+    FROM table2
+    WHERE col2 = 'value2'
+)
+SELECT cte1.col1, table3.col6
+FROM cte1
+JOIN table3 ON table3.col5 = cte1.col2
+""",
+        dialect="snowflake",
+        expected_file=RESOURCE_DIR / "test_snowflake_unused_cte.json",
+    )
+
+
+def test_snowflake_cte_name_collision():
+    # In this example, output col1 should come from table3 and not table1, since the cte is unused.
+    # We'll still generate table-level lineage that includes table1.
+    assert_sql_result(
+        """
+WITH cte_alias AS (
+    SELECT col1, col2
+    FROM table1
+)
+SELECT table2.col2, cte_alias.col1
+FROM table2
+JOIN table3 AS cte_alias ON cte_alias.col2 = cte_alias.col2
+""",
+        dialect="snowflake",
+        default_db="my_db",
+        default_schema="my_schema",
+        schemas={
+            "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)": {
+                "col1": "NUMBER(38,0)",
+                "col2": "VARCHAR(16777216)",
+            },
+            "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)": {
+                "col2": "VARCHAR(16777216)",
+            },
+            "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table3,PROD)": {
+                "col1": "VARCHAR(16777216)",
+                "col2": "VARCHAR(16777216)",
+            },
+        },
+        expected_file=RESOURCE_DIR / "test_snowflake_cte_name_collision.json",
+    )
+
+
+def test_snowflake_full_table_name_col_reference():
+    assert_sql_result(
+        """
+SELECT
+    my_db.my_schema.my_table.id,
+    case when my_db.my_schema.my_table.id > 100 then 1 else 0 end as id_gt_100,
+    my_db.my_schema.my_table.struct_field.field1 as struct_field1,
+FROM my_db.my_schema.my_table
+""",
+        dialect="snowflake",
+        default_db="my_db",
+        default_schema="my_schema",
+        schemas={
+            "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_db.my_schema.my_table,PROD)": {
+                "id": "NUMBER(38,0)",
+                "struct_field": "struct<field1: string>",
+            },
+        },
+        expected_file=RESOURCE_DIR
+        / "test_snowflake_full_table_name_col_reference.json",
+    )
+
+
 # TODO: Add a test for setting platform_instance or env
diff --git a/metadata-ingestion/tests/unit/test_topological_sort.py b/metadata-ingestion/tests/unit/test_topological_sort.py
new file mode 100644
index 00000000000000..4300816b6c48fc
--- /dev/null
+++ b/metadata-ingestion/tests/unit/test_topological_sort.py
@@ -0,0 +1,33 @@
+import pytest
+
+from datahub.utilities.topological_sort import topological_sort
+
+
+def test_topological_sort_valid():
+    nodes = ["a", "b", "c", "d", "e", "f"]
+    edges = [
+        ("a", "d"),
+        ("f", "b"),
+        ("b", "d"),
+        ("f", "a"),
+        ("d", "c"),
+    ]
+
+    # This isn't the only valid topological sort order.
+    expected_order = ["e", "f", "b", "a", "d", "c"]
+    assert list(topological_sort(nodes, edges)) == expected_order
+
+
+def test_topological_sort_invalid():
+    nodes = ["a", "b", "c", "d", "e", "f"]
+    edges = [
+        ("a", "d"),
+        ("f", "b"),
+        ("b", "d"),
+        ("f", "a"),
+        ("d", "c"),
+        ("c", "f"),
+    ]
+
+    with pytest.raises(ValueError, match="cycle"):
+        list(topological_sort(nodes, edges))
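
Note: the new unit tests above exercise datahub.utilities.topological_sort, whose implementation is not included in this patch. For reference, a minimal Kahn's-algorithm sketch consistent with these tests is shown below. The function name and signature are taken from the test imports; everything else is an assumption, and the actual utility in the repository may differ in its details.

from collections import defaultdict, deque
from typing import Iterable, List, Tuple, TypeVar

_T = TypeVar("_T")


def topological_sort(nodes: List[_T], edges: List[Tuple[_T, _T]]) -> Iterable[_T]:
    # Kahn's algorithm: repeatedly emit a node with no remaining incoming edges.
    # This is a hypothetical sketch, not the implementation shipped with DataHub.
    indegree = {node: 0 for node in nodes}
    outgoing = defaultdict(list)
    for source, target in edges:
        outgoing[source].append(target)
        indegree[target] += 1

    # Seed the queue with nodes that have no prerequisites, in input order.
    queue = deque(node for node in nodes if indegree[node] == 0)
    emitted = 0
    while queue:
        node = queue.popleft()
        emitted += 1
        yield node
        for target in outgoing[node]:
            indegree[target] -= 1
            if indegree[target] == 0:
                queue.append(target)

    # Any node never emitted has an indegree that never reached zero,
    # which means the graph contains a cycle.
    if emitted != len(nodes):
        raise ValueError("graph contains a cycle")

With the edges from test_topological_sort_valid, this sketch yields ["e", "f", "b", "a", "d", "c"]: "e" and "f" start with indegree zero, and popping "f" then releases "b" and "a" in edge order. Consuming the generator via list() inside pytest.raises also surfaces the ValueError that test_topological_sort_invalid asserts on.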