From 9e35cfed35466a09cc00608004ea8d025664c1b6 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Sun, 14 Dec 2025 16:20:09 +0300 Subject: [PATCH 01/13] Store example data directly inside the datafusion-examples (#19141) --- datafusion-examples/data/README.md | 27 +++++ .../data/csv/aggregate_test_100.csv | 101 ++++++++++++++++++ datafusion-examples/data/csv/cars.csv | 26 +++++ datafusion-examples/data/csv/regex.csv | 12 +++ datafusion-examples/data/csv/window_1.csv | 101 ++++++++++++++++++ .../data/parquet/alltypes_plain.parquet | Bin 0 -> 1851 bytes .../examples/builtin_functions/regexp.rs | 32 ++---- .../custom_data_source/csv_json_opener.rs | 9 +- .../custom_data_source/csv_sql_streaming.rs | 10 +- .../examples/data_io/parquet_encrypted.rs | 10 +- .../examples/data_io/parquet_exec_visitor.rs | 8 +- .../examples/dataframe/dataframe.rs | 24 +++-- .../dataframe/deserialize_to_struct.rs | 11 +- .../examples/execution_monitoring/tracing.rs | 30 ++++-- datafusion-examples/examples/flight/client.rs | 8 +- datafusion-examples/examples/flight/server.rs | 8 +- .../examples/flight/sql_server.rs | 8 +- .../examples/query_planning/parse_sql_expr.rs | 32 +++--- .../examples/query_planning/plan_to_sql.rs | 39 ++++--- .../examples/query_planning/planner_api.rs | 13 ++- .../examples/query_planning/thread_pools.rs | 10 +- datafusion-examples/examples/sql_ops/query.rs | 21 ++-- .../examples/udf/advanced_udwf.rs | 48 ++------- .../examples/udf/simple_udtf.rs | 17 ++- 24 files changed, 443 insertions(+), 162 deletions(-) create mode 100644 datafusion-examples/data/README.md create mode 100644 datafusion-examples/data/csv/aggregate_test_100.csv create mode 100644 datafusion-examples/data/csv/cars.csv create mode 100644 datafusion-examples/data/csv/regex.csv create mode 100644 datafusion-examples/data/csv/window_1.csv create mode 100644 datafusion-examples/data/parquet/alltypes_plain.parquet diff --git a/datafusion-examples/data/README.md b/datafusion-examples/data/README.md new file mode 100644 index 0000000000000..38ce020a4a1de --- /dev/null +++ b/datafusion-examples/data/README.md @@ -0,0 +1,27 @@ + + +## Example datasets +| Filename | Path | Description | +| ------------------------ | ------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `aggregate_test_100.csv` | [`data/csv/aggregate_test_100.csv`](./csv/aggregate_test_100.csv) | Multi-column CSV dataset with mixed data types (strings, signed/unsigned integers, floats, doubles). Used for aggregation, projection, filtering, and query planning examples. Derived from Arrow-style aggregate test data. | +| `cars.csv` | [`data/csv/cars.csv`](./csv/cars.csv) | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames). | +| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. | +| `window_1.csv` | [`data/csv/window_1.csv`](./csv/window_1.csv) | Numeric dataset designed for window function demonstrations. Includes ordering keys and incremental values suitable for running totals, ranking, and frame-based calculations. | +| `alltypes_plain.parquet` | [`data/parquet/alltypes_plain.parquet`](./parquet/alltypes_plain.parquet) | Parquet file containing columns of many Arrow/DataFusion-supported types (boolean, integers, floating-point, strings, timestamps). Used to demonstrate Parquet scanning, schema inference, and typed execution. | diff --git a/datafusion-examples/data/csv/aggregate_test_100.csv b/datafusion-examples/data/csv/aggregate_test_100.csv new file mode 100644 index 0000000000000..e548b758bf54e --- /dev/null +++ b/datafusion-examples/data/csv/aggregate_test_100.csv @@ -0,0 +1,101 @@ +c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13 +c,2,1,18109,2033001162,-6513304855495910254,25,43062,1491205016,5863949479783605708,0.110830784,0.9294097332465232,6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW +d,5,-40,22614,706441268,-7542719935673075327,155,14337,3373581039,11720144131976083864,0.69632107,0.3114712539863804,C2GT5KVyOPZpgKVl110TyZO0NcJ434 +b,1,29,-18218,994303988,5983957848665088916,204,9489,3275293996,14857091259186476033,0.53840446,0.17909035118828576,AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz +a,1,-85,-15154,1171968280,1919439543497968449,77,52286,774637006,12101411955859039553,0.12285209,0.6864391962767343,0keZ5G8BffGwgF2RwQD59TFzMStxCB +b,5,-82,22080,1824882165,7373730676428214987,208,34331,3342719438,3330177516592499461,0.82634634,0.40975383525297016,Ig1QcuKsjHXkproePdERo2w0mYzIqd +b,4,-111,-1967,-4229382,1892872227362838079,67,9832,1243785310,8382489916947120498,0.06563997,0.152498292971736,Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH +e,3,104,-25136,1738331255,300633854973581194,139,20807,3577318119,13079037564113702254,0.40154034,0.7764360990307122,DuJNG8tufSqW0ZstHqWj3aGvFLMg4A +a,3,13,12613,1299719633,2020498574254265315,191,17835,3998790955,14881411008939145569,0.041445434,0.8813167497816289,Amn2K87Db5Es3dFQO9cw9cvpAM6h35 +d,1,38,18384,-335410409,-1632237090406591229,26,57510,2712615025,1842662804748246269,0.6064476,0.6404495093354053,4HX6feIvmNXBN7XGqgO4YVBkhu8GDI +a,4,-38,20744,762932956,308913475857409919,7,45465,1787652631,878137512938218976,0.7459874,0.02182578039211991,ydkwycaISlYSlEq3TlkS2m15I2pcp8 +d,1,57,28781,-1143802338,2662536767954229885,202,62167,879082834,4338034436871150616,0.7618384,0.42950521730777025,VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4 +a,4,-54,-2376,434021400,5502271306323260832,113,15777,2502326480,7966148640299601101,0.5720931,0.30585375151301186,KJFcmTVjdkCMv94wYCtfHMFhzyRsmH +e,3,112,-6823,-421042466,8535335158538929274,129,32712,3759340273,9916295859593918600,0.6424343,0.6316565296547284,BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE +d,2,113,3917,-108973366,-7220140168410319165,197,24380,63044568,4225581724448081782,0.11867094,0.2944158618048994,90gAtmGEeIqUTbo1ZrxCvWtsseukXC +b,1,54,-18410,1413111008,-7145106120930085900,249,5382,1842680163,17818611040257178339,0.8881188,0.24899794314659673,6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ +c,1,103,-22186,431378678,1346564663822463162,146,12393,3766999078,10901819591635583995,0.064453244,0.7784918983501654,2T3wSlHdEmASmO0xcXHnndkKEt6bz8 +e,2,49,24495,-587831330,9178511478067509438,129,12757,1289293657,10948666249269100825,0.5610077,0.5991138115095911,bgK1r6v3BCTh0aejJUhkA1Hn6idXGp +d,1,-98,13630,-1991133944,1184110014998006843,220,2986,225513085,9634106610243643486,0.89651865,0.1640882545084913,y7C453hRWd4E7ImjNDWlpexB8nUqjh +d,3,77,15091,-1302295658,8795481303066536947,154,35477,2093538928,17419098323248948387,0.11952883,0.7035635283169166,O66j6PaYuZhEUtqV6fuU7TyjM2WxC5 +e,2,97,18167,1593800404,-9112448817105133638,163,45185,3188005828,2792105417953811674,0.38175434,0.4094218353587008,ukOiFGGFnQJDHFgZxHMpvhD3zybF0M +e,4,-56,-31500,1544188174,3096047390018154410,220,417,557517119,2774306934041974261,0.15459597,0.19113293583306745,IZTkHMLvIKuiLjhDjYMmIHxh166we4 +d,1,-99,5613,1213926989,-8863698443222021480,19,18736,4216440507,14933742247195536130,0.6067944,0.33639590659276175,aDxBtor7Icd9C5hnTvvw5NrIre740e +a,5,36,-16974,623103518,6834444206535996609,71,29458,141047417,17448660630302620693,0.17100024,0.04429073092078406,OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh +e,4,-53,13788,2064155045,-691093532952651300,243,35106,2778168728,9463973906560740422,0.34515214,0.27159190516490006,0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm +c,2,-29,25305,-537142430,-7683452043175617798,150,31648,598822671,11759014161799384683,0.8315913,0.946325164889271,9UbObCsVkmYpJGcGrgfK90qOnwb2Lj +a,1,-25,15295,383352709,4980135132406487265,231,102,3276123488,12763583666216333412,0.53796273,0.17592486905979987,XemNcT1xp61xcM1Qz3wZ1VECCnq06O +c,4,123,16620,852509237,-3087630526856906991,196,33715,3566741189,4546434653720168472,0.07606989,0.819715865079681,8LIh0b6jmDGm87BmIyjdxNIpX4ugjD +a,5,-31,-12907,586844478,-4862189775214031241,170,28086,1013876852,11005002152861474932,0.35319167,0.05573662213439634,MeSTAXq8gVxVjbEjgkvU9YLte0X9uE +a,2,45,15673,-1899175111,398282800995316041,99,2555,145294611,8554426087132697832,0.17333257,0.6405262429561641,b3b9esRhTzFEawbs6XhpKnD9ojutHB +b,3,17,14457,670497898,-2390782464845307388,255,24770,1538863055,12662506238151717757,0.34077626,0.7614304100703713,6x93sxYioWuq5c9Kkk8oTAAORM7cH0 +e,4,97,-13181,2047637360,6176835796788944083,158,53000,2042457019,9726016502640071617,0.7085086,0.12357539988406441,oHJMNvWuunsIMIWFnYG31RCfkOo2V7 +c,2,-60,-16312,-1808210365,-3368300253197863813,71,39635,2844041986,7045482583778080653,0.805363,0.6425694115212065,BJqx5WokrmrrezZA0dUbleMYkG5U2O +e,1,36,-21481,-928766616,-3471238138418013024,150,52569,2610290479,7788847578701297242,0.2578469,0.7670021786149205,gpo8K5qtYePve6jyPt6xgJx4YOVjms +b,5,-5,24896,1955646088,2430204191283109071,118,43655,2424630722,11429640193932435507,0.87989986,0.7328050041291218,JafwVLSVk5AVoXFuzclesQ000EE2k1 +a,3,13,32064,912707948,3826618523497875379,42,21463,2214035726,10771380284714693539,0.6133468,0.7325106678655877,i6RQVXKUh7MzuGMDaNclUYnFUAireU +c,1,41,-4667,-644225469,7049620391314639084,196,48099,2125812933,15419512479294091215,0.5780736,0.9255031346434324,mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS +d,2,93,-12642,2053379412,6468763445799074329,147,50842,1000948272,5536487915963301239,0.4279275,0.28534428578703896,lqhzgLsXZ8JhtpeeUWWNbMz8PHI705 +c,3,73,-9565,-382483011,1765659477910680019,186,1535,1088543984,2906943497598597237,0.680652,0.6009475544728957,Ow5PGpfTm4dXCfTDsXAOTatXRoAydR +c,3,-2,-18655,-2141999138,-3154042970870838072,251,34970,3862393166,13062025193350212516,0.034291923,0.7697753383420857,IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr +c,3,22,13741,-2098805236,8604102724776612452,45,2516,1362369177,196777795886465166,0.94669616,0.0494924465469434,6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE +b,2,63,21456,-2138770630,-2380041687053733364,181,57594,2705709344,13144161537396946288,0.09683716,0.3051364088814128,nYVJnVicpGRqKZibHyBAmtmzBXAFfT +d,4,102,-24558,1991172974,-7823479531661596016,14,36599,1534194097,2240998421986827216,0.028003037,0.8824879447595726,0og6hSkhbX8AC1ktFS4kounvTzy8Vo +d,1,-8,27138,-1383162419,7682021027078563072,36,64517,2861376515,9904216782086286050,0.80954456,0.9463098243875633,AFGCj7OWlEB5QfniEFgonMq90Tq5uH +a,3,17,-22796,1337043149,-1282905594104562444,167,2809,754775609,732272194388185106,0.3884129,0.658671129040488,VDhtJkYjAYPykCgOU9x3v7v3t4SO1a +e,2,52,23388,715235348,605432070100399212,165,56980,3314983189,7386391799827871203,0.46076488,0.980809631269599,jQimhdepw3GKmioWUlVSWeBVRKFkY3 +b,5,68,21576,1188285940,5717755781990389024,224,27600,974297360,9865419128970328044,0.80895734,0.7973920072996036,ioEncce3mPOXD2hWhpZpCPWGATG6GU +b,2,31,23127,-800561771,-8706387435232961848,153,27034,1098639440,3343692635488765507,0.35692692,0.5590205548347534,okOkcWflkNXIy4R8LzmySyY1EC3sYd +c,1,-24,-24085,-1882293856,7385529783747709716,41,48048,520189543,2402288956117186783,0.39761502,0.3600766362333053,Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u +a,4,65,-28462,-1813935549,7602389238442209730,18,363,1865307672,11378396836996498283,0.09130204,0.5593249815276734,WHmjWk2AY4c6m7DA4GitUx6nmb1yYS +d,1,125,31106,-1176490478,-4306856842351827308,90,17910,3625286410,17869394731126786457,0.8882508,0.7631239070049998,dVdvo6nUD5FgCgsbOZLds28RyGTpnx +b,4,17,-28070,-673237643,1904316899655860234,188,27744,933879086,3732692885824435932,0.41860116,0.40342283197779727,JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ +c,2,-106,-1114,-1927628110,1080308211931669384,177,20421,141680161,7464432081248293405,0.56749094,0.565352842229935,Vp3gmWunM5A7wOC9YW2JroFqTWjvTi +d,5,-59,2045,-2117946883,1170799768349713170,189,63353,1365198901,2501626630745849169,0.75173044,0.18628859265874176,F7NSTjWvQJyBburN7CXRUlbgp2dIrA +d,4,55,-1471,1902023838,1252101628560265705,157,3691,811650497,1524771507450695976,0.2968701,0.5437595540422571,f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX +b,2,-60,-21739,-1908480893,-8897292622858103761,59,50009,2525744318,1719090662556698549,0.52930677,0.560333188635217,l7uwDoTepWwnAP0ufqtHJS3CRi7RfP +d,3,-76,8809,141218956,-9110406195556445909,58,5494,1824517658,12046662515387914426,0.8557294,0.6668423897406515,Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK +e,4,73,-22501,1282464673,2541794052864382235,67,21119,538589788,9575476605699527641,0.48515016,0.296036538664718,4JznSdBajNWhu4hRQwjV1FjTTxY68i +b,4,-117,19316,2051224722,-5534418579506232438,133,52046,3023531799,13684453606722360110,0.62608826,0.8506721053047003,mhjME0zBHbrK6NMkytMTQzOssOa1gF +a,4,-101,11640,1993193190,2992662416070659899,230,40566,466439833,16778113360088370541,0.3991115,0.574210838214554,NEhyk8uIx4kEULJGa8qIyFjjBcP2G6 +b,5,62,16337,41423756,-2274773899098124524,121,34206,2307004493,10575647935385523483,0.23794776,0.1754261586710173,qnPOOmslCJaT45buUisMRnM0rc77EK +c,4,-79,5281,-237425046,373011991904079451,121,55620,2818832252,2464584078983135763,0.49774808,0.9237877978193884,t6fQUjJejPcjc04wHvHTPe55S65B4V +b,2,68,15874,49866617,1179733259727844435,121,23948,3455216719,3898128009708892708,0.6306253,0.9185813970744787,802bgTGl6Bk5TlkPYYTxp5JkKyaYUA +c,1,70,27752,1325868318,1241882478563331892,63,61637,473294098,4976799313755010034,0.13801557,0.5081765563442366,Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn +e,2,-61,-2888,-1660426473,2553892468492435401,126,35429,4144173353,939909697866979632,0.4405142,0.9231889896940375,BPtQMxnuSPpxMExYV9YkDa6cAN7GP3 +e,4,74,-12612,-1885422396,1702850374057819332,130,3583,3198969145,10767179755613315144,0.5518061,0.5614503754617461,QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv +d,2,122,10130,-168758331,-3179091803916845592,30,794,4061635107,15695681119022625322,0.69592506,0.9748360509016578,OPwBqCEK5PWTjWaiOyL45u2NLTaDWv +e,3,71,194,1436496767,-5639533800082367925,158,44507,3105312559,3998472996619161534,0.930117,0.6108938307533,pTeu0WMjBRTaNRT15rLCuEh3tBJVc5 +c,5,-94,-15880,2025611582,-3348824099853919681,5,40622,4268716378,12849419495718510869,0.34163946,0.4830878559436823,RilTlL1tKkPOUFuzmLydHAVZwv1OGl +d,1,-72,25590,1188089983,3090286296481837049,241,832,3542840110,5885937420286765261,0.41980565,0.21535402343780985,wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +e,1,71,-5479,-1339586153,-3920238763788954243,123,53012,4229654142,10297218950720052365,0.73473036,0.5773498217058918,cBGc0kSm32ylBDnxogG727C0uhZEYZ +e,4,96,-30336,427197269,7506304308750926996,95,48483,3521368277,5437030162957481122,0.58104324,0.42073125331890115,3BEOHQsMEFZ58VcNTOJYShTBpAPzbt +a,2,-48,-18025,439738328,-313657814587041987,222,13763,3717551163,9135746610908713318,0.055064857,0.9800193410444061,ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 +a,1,-56,8692,2106705285,-7811675384226570375,231,15573,1454057357,677091006469429514,0.42794758,0.2739938529235548,JN0VclewmjwYlSl8386MlWv5rEhWCz +e,2,52,-12056,-1090239422,9011500141803970147,238,4168,2013662838,12565360638488684051,0.6694766,0.39144436569161134,xipQ93429ksjNcXPX5326VSg1xJZcW +a,1,-5,12636,794623392,2909750622865366631,15,24022,2669374863,4776679784701509574,0.29877836,0.2537253407987472,waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs +b,1,12,7652,-1448995523,-5332734971209541785,136,49283,4076864659,15449267433866484283,0.6214579,0.05636955101974106,akiiY5N0I44CMwEnBL6RTBk7BRkxEj +e,5,64,-26526,1689098844,8950618259486183091,224,45253,662099130,16127995415060805595,0.2897315,0.5759450483859969,56MZa5O1hVtX4c5sbnCfxuX5kDChqI +c,4,-90,-2935,1579876740,6733733506744649678,254,12876,3593959807,4094315663314091142,0.5708688,0.5603062368164834,Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV +e,5,-86,32514,-467659022,-8012578250188146150,254,2684,2861911482,2126626171973341689,0.12559289,0.01479305307777301,gxfHWUF8XgY2KdFxigxvNEXe2V2XMl +c,2,-117,-30187,-1222533990,-191957437217035800,136,47061,2293105904,12659011877190539078,0.2047385,0.9706712283358269,pLk3i59bZwd5KBZrI1FiweYTd5hteG +a,3,14,28162,397430452,-452851601758273256,57,14722,431948861,8164671015278284913,0.40199697,0.07260475960924484,TtDKUZxzVxsq758G6AWPSYuZgVgbcl +c,2,29,-3855,1354539333,4742062657200940467,81,53815,3398507249,562977550464243101,0.7124534,0.991517828651004,Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0 +b,4,-59,25286,1423957796,2646602445954944051,0,61069,3570297463,15100310750150419896,0.49619365,0.04893135681998029,fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG +a,1,83,-14704,2143473091,-4387559599038777245,37,829,4015442341,4602675983996931623,0.89542526,0.9567595541247681,ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU +a,3,-12,-9168,1489733240,-1569376002217735076,206,33821,3959216334,16060348691054629425,0.9488028,0.9293883502480845,oLZ21P2JEDooxV1pU31cIxQHEeeoLu +c,4,3,-30508,659422734,-6455460736227846736,133,59663,2306130875,8622584762448622224,0.16999894,0.4273123318932347,EcCuckwsF3gV1Ecgmh5v4KM8g1ozif +a,3,-72,-11122,-2141451704,-2578916903971263854,83,30296,1995343206,17452974532402389080,0.94209343,0.3231750610081745,e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG +c,2,-107,-2904,-1011669561,782342092880993439,18,29527,1157161427,4403623840168496677,0.31988364,0.36936304600612724,QYlaIAnJA6r8rlAb6f59wcxvcPcWFf +c,5,118,19208,-134213907,-2120241105523909127,86,57751,1229567292,16493024289408725403,0.5536642,0.9723580396501548,TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX +c,3,97,29106,-903316089,2874859437662206732,207,42171,3473924576,8188072741116415408,0.32792538,0.2667177795079635,HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g +b,3,-101,-13217,-346989627,5456800329302529236,26,54276,243203849,17929716297117857676,0.05422181,0.09465635123783445,MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ +a,2,-43,13080,370975815,5881039805148485053,2,20120,2939920218,906367167997372130,0.42733806,0.16301110515739792,m6jD0LBIQWaMfenwRCTANI9eOdyyto +a,5,-101,-12484,-842693467,-6140627905445351305,57,57885,2496054700,2243924747182709810,0.59520596,0.9491397432856566,QJYm7YRA3YetcBHI5wkMZeLXVmfuNy +b,5,-44,15788,-629486480,5822642169425315613,13,11872,3457053821,2413406423648025909,0.44318348,0.32869374687050157,ALuRhobVWbnQTTWZdSOk0iVe8oYFhW +d,4,5,-7688,702611616,6239356364381313700,4,39363,3126475872,35363005357834672,0.3766935,0.061029375346466685,H5j5ZHy1FGesOAHjkQEDYCucbpKWRu +e,1,120,10837,-1331533190,6342019705133850847,245,3975,2830981072,16439861276703750332,0.6623719,0.9965400387585364,LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW +e,3,-95,13611,2030965207,927403809957470678,119,59134,559847112,10966649192992996919,0.5301289,0.047343434291126085,gTpyQnEODMcpsPnJMZC66gh33i3m0b +d,3,123,29533,240273900,1176001466590906949,117,30972,2592330556,12883447461717956514,0.39075065,0.38870280983958583,1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO +b,4,47,20690,-1009656194,-2027442591571700798,200,7781,326151275,2881913079548128905,0.57360977,0.2145232647388039,52mKlRE3aHCBZtjECq6sY9OqVf8Dze +e,4,30,-16110,61035129,-3356533792537910152,159,299,28774375,13526465947516666293,0.6999775,0.03968347085780355,cq4WSAIFwx3wwTUS5bp1wCe71R6U5I \ No newline at end of file diff --git a/datafusion-examples/data/csv/cars.csv b/datafusion-examples/data/csv/cars.csv new file mode 100644 index 0000000000000..bc40f3b01e7a5 --- /dev/null +++ b/datafusion-examples/data/csv/cars.csv @@ -0,0 +1,26 @@ +car,speed,time +red,20.0,1996-04-12T12:05:03.000000000 +red,20.3,1996-04-12T12:05:04.000000000 +red,21.4,1996-04-12T12:05:05.000000000 +red,21.5,1996-04-12T12:05:06.000000000 +red,19.0,1996-04-12T12:05:07.000000000 +red,18.0,1996-04-12T12:05:08.000000000 +red,17.0,1996-04-12T12:05:09.000000000 +red,7.0,1996-04-12T12:05:10.000000000 +red,7.1,1996-04-12T12:05:11.000000000 +red,7.2,1996-04-12T12:05:12.000000000 +red,3.0,1996-04-12T12:05:13.000000000 +red,1.0,1996-04-12T12:05:14.000000000 +red,0.0,1996-04-12T12:05:15.000000000 +green,10.0,1996-04-12T12:05:03.000000000 +green,10.3,1996-04-12T12:05:04.000000000 +green,10.4,1996-04-12T12:05:05.000000000 +green,10.5,1996-04-12T12:05:06.000000000 +green,11.0,1996-04-12T12:05:07.000000000 +green,12.0,1996-04-12T12:05:08.000000000 +green,14.0,1996-04-12T12:05:09.000000000 +green,15.0,1996-04-12T12:05:10.000000000 +green,15.1,1996-04-12T12:05:11.000000000 +green,15.2,1996-04-12T12:05:12.000000000 +green,8.0,1996-04-12T12:05:13.000000000 +green,2.0,1996-04-12T12:05:14.000000000 diff --git a/datafusion-examples/data/csv/regex.csv b/datafusion-examples/data/csv/regex.csv new file mode 100644 index 0000000000000..b249c39522b60 --- /dev/null +++ b/datafusion-examples/data/csv/regex.csv @@ -0,0 +1,12 @@ +values,patterns,replacement,flags +abc,^(a),bb\1bb,i +ABC,^(A).*,B,i +aBc,(b|d),e,i +AbC,(B|D),e, +aBC,^(b|c),d, +4000,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz, +4010,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz, +Düsseldorf,[\p{Letter}-]+,München, +Москва,[\p{L}-]+,Moscow, +Köln,[a-zA-Z]ö[a-zA-Z]{2},Koln, +اليوم,^\p{Arabic}+$,Today, \ No newline at end of file diff --git a/datafusion-examples/data/csv/window_1.csv b/datafusion-examples/data/csv/window_1.csv new file mode 100644 index 0000000000000..588af16d06863 --- /dev/null +++ b/datafusion-examples/data/csv/window_1.csv @@ -0,0 +1,101 @@ +ts,inc_col,desc_col +1,1,100 +1,5,98 +5,10,93 +9,15,91 +10,20,86 +11,21,84 +16,26,81 +21,29,77 +22,30,75 +26,33,71 +26,37,70 +28,40,69 +31,43,64 +33,44,62 +38,45,59 +42,49,55 +47,51,50 +51,53,45 +53,58,41 +53,61,40 +58,65,39 +63,70,36 +67,75,31 +68,78,28 +70,83,23 +72,88,22 +72,90,17 +76,91,13 +81,95,10 +85,97,6 +86,100,5 +88,105,2 +91,109,1 +96,111,-1 +97,115,-4 +98,119,-5 +100,120,-6 +101,124,-8 +102,126,-12 +104,129,-16 +104,131,-17 +108,135,-19 +112,140,-24 +113,143,-25 +113,144,-29 +114,147,-34 +114,148,-37 +117,149,-42 +122,151,-47 +126,155,-48 +131,156,-49 +131,159,-53 +136,160,-57 +136,163,-58 +136,165,-61 +139,170,-65 +141,172,-67 +146,177,-68 +147,181,-71 +147,182,-73 +152,186,-75 +154,187,-76 +159,192,-78 +161,196,-83 +163,197,-87 +164,199,-91 +167,203,-95 +172,207,-98 +173,209,-101 +177,213,-105 +180,214,-106 +185,216,-111 +186,219,-114 +191,221,-116 +195,222,-120 +195,225,-125 +199,226,-128 +203,231,-129 +207,236,-134 +210,237,-139 +213,242,-142 +218,245,-143 +221,247,-146 +224,248,-150 +226,253,-154 +230,254,-158 +232,259,-163 +235,261,-168 +238,266,-172 +238,269,-176 +239,272,-181 +244,275,-184 +245,278,-189 +247,283,-193 +250,286,-196 +254,289,-201 +258,291,-203 +262,296,-208 +264,301,-210 +264,305,-213 diff --git a/datafusion-examples/data/parquet/alltypes_plain.parquet b/datafusion-examples/data/parquet/alltypes_plain.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a63f5dca7c3821909748f34752966a0d7e08d47f GIT binary patch literal 1851 zcmb7F&ui0g6#pje+AM2l7<*qVv_zM;YQD;X!1G_`Ye@W}TbqmweOL$NPTX=lg!8G{0y<66Rp8 z2pS|Aqlfj;PSH-&mT4zwizU$p1|0Y`VGJoyS_YbwNId;`jBg|z+DN8!b`S=|Sr$Ee51+|8u<)E>*X#arx$Xz24Q}8O`6X`}Xhr%F1Zjm_hG6In z7b&rg?-Cs*15K~?$g4Hmpi6uSk7fKqck31Rd$NO@X;dxW?*`sZ;$!02EAVEj1Dx*0 z{MLuNloZ0uLp~A&5eQYhXi-eh3&y9k4#_aQs_m^t;cL8xuhMu#`94GW^Wlpd7r_2h zbWlRre%G&Crz3oz;A@fee~~T(>&n~(=)0;8>IvyeeZ%&hb^-l_Dsj zFvuM<3gd=3Zp;SqWJI2b$YdaF$o()3pD7?YQ98!mj1HO5|D}r6be0>LFTr05hN163Gw zN`^s(6x}&&X(N%RnM7u%??nSFr{~`PZ?MG}V6i6>#vU;kXJ%l`=Er#5j4|7?$M(UP z-GIH6Am3BD#zq&s>YC+S`G?MW!>iZw=2&6OxPE8h?#;!8`C@+5-thcNe#V-dsZ?y! oaow58so4p;;FgVPyFBeqnNHc9FdWl$-SX^Jc0`|z5`8xR0vs|-V*mgE literal 0 HcmV?d00001 diff --git a/datafusion-examples/examples/builtin_functions/regexp.rs b/datafusion-examples/examples/builtin_functions/regexp.rs index 4c6b6c67ac4d5..708924f224fee 100644 --- a/datafusion-examples/examples/builtin_functions/regexp.rs +++ b/datafusion-examples/examples/builtin_functions/regexp.rs @@ -18,12 +18,11 @@ //! See `main.rs` for how to run it. -use std::{fs::File, io::Write}; +use std::path::PathBuf; use datafusion::common::{assert_batches_eq, assert_contains}; use datafusion::error::Result; use datafusion::prelude::*; -use tempfile::tempdir; /// This example demonstrates how to use the regexp_* functions /// @@ -35,29 +34,12 @@ use tempfile::tempdir; /// https://docs.rs/regex/latest/regex/#grouping-and-flags pub async fn regexp() -> Result<()> { let ctx = SessionContext::new(); - // content from file 'datafusion/physical-expr/tests/data/regex.csv' - let csv_data = r#"values,patterns,replacement,flags -abc,^(a),bb\1bb,i -ABC,^(A).*,B,i -aBc,(b|d),e,i -AbC,(B|D),e, -aBC,^(b|c),d, -4000,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz, -4010,\b4([1-9]\d\d|\d[1-9]\d|\d\d[1-9])\b,xyz, -Düsseldorf,[\p{Letter}-]+,München, -Москва,[\p{L}-]+,Moscow, -Köln,[a-zA-Z]ö[a-zA-Z]{2},Koln, -اليوم,^\p{Arabic}+$,Today,"#; - let dir = tempdir()?; - let file_path = dir.path().join("regex.csv"); - { - let mut file = File::create(&file_path)?; - // write CSV data - file.write_all(csv_data.as_bytes())?; - } // scope closes the file - let file_path = file_path.to_str().unwrap(); - - ctx.register_csv("examples", file_path, CsvReadOptions::new()) + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("regex.csv"); + + ctx.register_csv("examples", path.to_str().unwrap(), CsvReadOptions::new()) .await?; // diff --git a/datafusion-examples/examples/custom_data_source/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs index 48b885839f137..fcc7434a2a1f8 100644 --- a/datafusion-examples/examples/custom_data_source/csv_json_opener.rs +++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs @@ -17,6 +17,7 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema}; @@ -52,10 +53,10 @@ async fn csv_opener() -> Result<()> { let object_store = Arc::new(LocalFileSystem::new()); let schema = aggr_test_schema(); - let testdata = datafusion::test_util::arrow_test_data(); - let path = format!("{testdata}/csv/aggregate_test_100.csv"); - - let path = std::path::Path::new(&path).canonicalize()?; + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("aggregate_test_100.csv"); let options = CsvOptions { has_header: Some(true), diff --git a/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs index 554382ea9549e..45150199c9430 100644 --- a/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs +++ b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs @@ -17,7 +17,8 @@ //! See `main.rs` for how to run it. -use datafusion::common::test_util::datafusion_test_data; +use std::path::PathBuf; + use datafusion::error::Result; use datafusion::prelude::*; @@ -27,7 +28,10 @@ pub async fn csv_sql_streaming() -> Result<()> { // create local execution context let ctx = SessionContext::new(); - let testdata = datafusion_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("window_1.csv"); // Register a table source and tell DataFusion the file is ordered by `ts ASC`. // Note it is the responsibility of the user to make sure @@ -38,7 +42,7 @@ pub async fn csv_sql_streaming() -> Result<()> { // register csv file with the execution context ctx.register_csv( "ordered_table", - &format!("{testdata}/window_1.csv"), + path.to_str().unwrap(), CsvReadOptions::new().file_sort_order(vec![sort_expr]), ) .await?; diff --git a/datafusion-examples/examples/data_io/parquet_encrypted.rs b/datafusion-examples/examples/data_io/parquet_encrypted.rs index e3070cdddeac6..5eb58caa8b51d 100644 --- a/datafusion-examples/examples/data_io/parquet_encrypted.rs +++ b/datafusion-examples/examples/data_io/parquet_encrypted.rs @@ -24,6 +24,7 @@ use datafusion::logical_expr::{col, lit}; use datafusion::parquet::encryption::decrypt::FileDecryptionProperties; use datafusion::parquet::encryption::encrypt::FileEncryptionProperties; use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use std::path::PathBuf; use std::sync::Arc; use tempfile::TempDir; @@ -32,13 +33,14 @@ pub async fn parquet_encrypted() -> datafusion::common::Result<()> { // The SessionContext is the main high level API for interacting with DataFusion let ctx = SessionContext::new(); - // Find the local path of "alltypes_plain.parquet" - let testdata = datafusion::test_util::parquet_test_data(); - let filename = &format!("{testdata}/alltypes_plain.parquet"); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // Read the sample parquet file let parquet_df = ctx - .read_parquet(filename, ParquetReadOptions::default()) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await?; // Show information from the dataframe diff --git a/datafusion-examples/examples/data_io/parquet_exec_visitor.rs b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs index 925c202eac456..ffc68d69fe35e 100644 --- a/datafusion-examples/examples/data_io/parquet_exec_visitor.rs +++ b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs @@ -17,6 +17,7 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; use std::sync::Arc; use datafusion::datasource::file_format::parquet::ParquetFormat; @@ -35,7 +36,10 @@ use futures::StreamExt; pub async fn parquet_exec_visitor() -> datafusion::common::Result<()> { let ctx = SessionContext::new(); - let test_data = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(true); @@ -45,7 +49,7 @@ pub async fn parquet_exec_visitor() -> datafusion::common::Result<()> { let _ = ctx .register_listing_table( "my_table", - &format!("file://{test_data}/alltypes_plain.parquet"), + path.to_str().unwrap(), listing_options.clone(), None, None, diff --git a/datafusion-examples/examples/dataframe/dataframe.rs b/datafusion-examples/examples/dataframe/dataframe.rs index 6953170191724..bf2e19ca627fb 100644 --- a/datafusion-examples/examples/dataframe/dataframe.rs +++ b/datafusion-examples/examples/dataframe/dataframe.rs @@ -30,6 +30,7 @@ use datafusion::functions_aggregate::min_max::max; use datafusion::prelude::*; use std::fs::{create_dir_all, File}; use std::io::Write; +use std::path::PathBuf; use std::sync::Arc; use tempfile::{tempdir, TempDir}; @@ -77,13 +78,14 @@ pub async fn dataframe_example() -> Result<()> { /// 2. Show the schema /// 3. Select columns and rows async fn read_parquet(ctx: &SessionContext) -> Result<()> { - // Find the local path of "alltypes_plain.parquet" - let testdata = datafusion::test_util::parquet_test_data(); - let filename = &format!("{testdata}/alltypes_plain.parquet"); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // Read the parquet files and show its schema using 'describe' let parquet_df = ctx - .read_parquet(filename, ParquetReadOptions::default()) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await?; // show its schema using 'describe' @@ -331,12 +333,12 @@ async fn where_exist_subquery(ctx: &SessionContext) -> Result<()> { } async fn register_aggregate_test_data(name: &str, ctx: &SessionContext) -> Result<()> { - let testdata = datafusion::test_util::arrow_test_data(); - ctx.register_csv( - name, - &format!("{testdata}/csv/aggregate_test_100.csv"), - CsvReadOptions::default(), - ) - .await?; + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("aggregate_test_100.csv"); + + ctx.register_csv(name, path.to_str().unwrap(), CsvReadOptions::default()) + .await?; Ok(()) } diff --git a/datafusion-examples/examples/dataframe/deserialize_to_struct.rs b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs index e19d45554131a..af169a2e28f80 100644 --- a/datafusion-examples/examples/dataframe/deserialize_to_struct.rs +++ b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs @@ -17,6 +17,8 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; + use arrow::array::{AsArray, PrimitiveArray}; use arrow::datatypes::{Float64Type, Int32Type}; use datafusion::common::assert_batches_eq; @@ -34,13 +36,18 @@ use futures::StreamExt; pub async fn deserialize_to_struct() -> Result<()> { // Run a query that returns two columns of data let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); + ctx.register_parquet( "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), + path.to_str().unwrap(), ParquetReadOptions::default(), ) .await?; + let df = ctx .sql("SELECT int_col, double_col FROM alltypes_plain") .await?; diff --git a/datafusion-examples/examples/execution_monitoring/tracing.rs b/datafusion-examples/examples/execution_monitoring/tracing.rs index f0e5a037151c3..f34330064d2a9 100644 --- a/datafusion-examples/examples/execution_monitoring/tracing.rs +++ b/datafusion-examples/examples/execution_monitoring/tracing.rs @@ -56,10 +56,10 @@ use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::error::Result; use datafusion::prelude::*; -use datafusion::test_util::parquet_test_data; use futures::future::BoxFuture; use futures::FutureExt; use std::any::Any; +use std::path::PathBuf; use std::sync::Arc; use tracing::{info, instrument, Instrument, Level, Span}; @@ -122,16 +122,28 @@ async fn run_instrumented_query() -> Result<()> { info!("Starting query execution"); let ctx = SessionContext::new(); - let test_data = parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); + let file_format = ParquetFormat::default().with_enable_pruning(true); - let listing_options = ListingOptions::new(Arc::new(file_format)) - .with_file_extension("alltypes_tiny_pages_plain.parquet"); + let listing_options = + ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); - let table_path = format!("file://{test_data}/"); - info!("Registering table 'alltypes' from {}", table_path); - ctx.register_listing_table("alltypes", &table_path, listing_options, None, None) - .await - .expect("Failed to register table"); + info!( + "Registering table 'alltypes' from {}", + path.to_str().unwrap() + ); + ctx.register_listing_table( + "alltypes", + path.to_str().unwrap(), + listing_options, + None, + None, + ) + .await + .expect("Failed to register table"); let sql = "SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col"; info!(sql, "Executing SQL query"); diff --git a/datafusion-examples/examples/flight/client.rs b/datafusion-examples/examples/flight/client.rs index 484576975a6f2..0aa1a2f6f4549 100644 --- a/datafusion-examples/examples/flight/client.rs +++ b/datafusion-examples/examples/flight/client.rs @@ -18,6 +18,7 @@ //! See `main.rs` for how to run it. use std::collections::HashMap; +use std::path::PathBuf; use std::sync::Arc; use tonic::transport::Endpoint; @@ -33,7 +34,10 @@ use datafusion::arrow::util::pretty; /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_server`. pub async fn client() -> Result<(), Box> { - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // Create Flight client let endpoint = Endpoint::new("http://localhost:50051")?; @@ -44,7 +48,7 @@ pub async fn client() -> Result<(), Box> { let request = tonic::Request::new(FlightDescriptor { r#type: flight_descriptor::DescriptorType::Path as i32, cmd: Default::default(), - path: vec![format!("{testdata}/alltypes_plain.parquet")], + path: vec![format!("{}", path.to_str().unwrap())], }); let schema_result = client.get_schema(request).await?.into_inner(); diff --git a/datafusion-examples/examples/flight/server.rs b/datafusion-examples/examples/flight/server.rs index 7f348e7d59411..dfb687eecc3e5 100644 --- a/datafusion-examples/examples/flight/server.rs +++ b/datafusion-examples/examples/flight/server.rs @@ -18,6 +18,7 @@ //! See `main.rs` for how to run it. use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; +use std::path::PathBuf; use std::sync::Arc; use arrow_flight::{PollInfo, SchemaAsIpc}; @@ -85,12 +86,15 @@ impl FlightService for FlightServiceImpl { // create local execution context let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // register parquet file with the execution context ctx.register_parquet( "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), + path.to_str().unwrap(), ParquetReadOptions::default(), ) .await diff --git a/datafusion-examples/examples/flight/sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs index 529296fac8690..c92f2635b2b44 100644 --- a/datafusion-examples/examples/flight/sql_server.rs +++ b/datafusion-examples/examples/flight/sql_server.rs @@ -42,6 +42,7 @@ use futures::{Stream, StreamExt, TryStreamExt}; use log::info; use mimalloc::MiMalloc; use prost::Message; +use std::path::PathBuf; use std::pin::Pin; use std::sync::Arc; use tonic::metadata::MetadataValue; @@ -100,12 +101,15 @@ impl FlightSqlServiceImpl { .with_information_schema(true); let ctx = Arc::new(SessionContext::new_with_config(session_config)); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // register parquet file with the execution context ctx.register_parquet( "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), + path.to_str().unwrap(), ParquetReadOptions::default(), ) .await diff --git a/datafusion-examples/examples/query_planning/parse_sql_expr.rs b/datafusion-examples/examples/query_planning/parse_sql_expr.rs index 376120de9d492..7ef4c035761d2 100644 --- a/datafusion-examples/examples/query_planning/parse_sql_expr.rs +++ b/datafusion-examples/examples/query_planning/parse_sql_expr.rs @@ -17,6 +17,8 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; + use arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::DFSchema; use datafusion::logical_expr::{col, lit}; @@ -76,12 +78,12 @@ async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> { .or(col("double_col").eq(lit(8.0_f64))); let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await?; let parsed_expr = df.parse_sql_expr(sql)?; @@ -93,12 +95,12 @@ async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> { async fn query_parquet_demo() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await?; let df = df @@ -138,12 +140,12 @@ async fn round_trip_parse_sql_expr_demo() -> Result<()> { let sql = "((int_col < 5) OR (double_col = 8))"; let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await?; let parsed_expr = df.parse_sql_expr(sql)?; diff --git a/datafusion-examples/examples/query_planning/plan_to_sql.rs b/datafusion-examples/examples/query_planning/plan_to_sql.rs index 95e04907d9be3..e3480a0f6377a 100644 --- a/datafusion-examples/examples/query_planning/plan_to_sql.rs +++ b/datafusion-examples/examples/query_planning/plan_to_sql.rs @@ -36,6 +36,7 @@ use datafusion::sql::unparser::extension_unparser::{ }; use datafusion::sql::unparser::{plan_to_sql, Unparser}; use std::fmt; +use std::path::PathBuf; use std::sync::Arc; /// This example demonstrates the programmatic construction of SQL strings using @@ -114,12 +115,13 @@ fn simple_expr_to_sql_demo_escape_mysql_style() -> Result<()> { async fn simple_plan_to_sql_demo() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await? .select_columns(&["id", "int_col", "double_col", "date_string_col"])?; @@ -139,12 +141,15 @@ async fn simple_plan_to_sql_demo() -> Result<()> { async fn round_trip_plan_to_sql_demo() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // register parquet file with the execution context ctx.register_parquet( "alltypes_plain", - &format!("{testdata}/alltypes_plain.parquet"), + path.to_str().unwrap(), ParquetReadOptions::default(), ) .await?; @@ -231,12 +236,12 @@ impl UserDefinedLogicalNodeUnparser for PlanToStatement { /// It can be unparse as a statement that reads from the same parquet file. async fn unparse_my_logical_plan_as_statement() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); let inner_plan = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await? .select_columns(&["id", "int_col", "double_col", "date_string_col"])? .into_unoptimized_plan(); @@ -284,12 +289,12 @@ impl UserDefinedLogicalNodeUnparser for PlanToSubquery { /// It can be unparse as a subquery that reads from the same parquet file, with some columns projected. async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); let inner_plan = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await? .select_columns(&["id", "int_col", "double_col", "date_string_col"])? .into_unoptimized_plan(); diff --git a/datafusion-examples/examples/query_planning/planner_api.rs b/datafusion-examples/examples/query_planning/planner_api.rs index 9b8aa1c2fe649..cf70b04be8edf 100644 --- a/datafusion-examples/examples/query_planning/planner_api.rs +++ b/datafusion-examples/examples/query_planning/planner_api.rs @@ -17,6 +17,8 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; + use datafusion::error::Result; use datafusion::logical_expr::LogicalPlan; use datafusion::physical_plan::displayable; @@ -37,12 +39,13 @@ use datafusion::prelude::*; pub async fn planner_api() -> Result<()> { // Set up a DataFusion context and load a Parquet file let ctx = SessionContext::new(); - let testdata = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); + let df = ctx - .read_parquet( - &format!("{testdata}/alltypes_plain.parquet"), - ParquetReadOptions::default(), - ) + .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) .await?; // Construct the input logical plan using DataFrame API diff --git a/datafusion-examples/examples/query_planning/thread_pools.rs b/datafusion-examples/examples/query_planning/thread_pools.rs index 6fc7d51e91c1f..8b1ec4f2e676f 100644 --- a/datafusion-examples/examples/query_planning/thread_pools.rs +++ b/datafusion-examples/examples/query_planning/thread_pools.rs @@ -45,6 +45,7 @@ use datafusion::prelude::*; use futures::stream::StreamExt; use object_store::client::SpawnedReqwestConnector; use object_store::http::HttpBuilder; +use std::path::PathBuf; use std::sync::Arc; use tokio::runtime::Handle; use tokio::sync::Notify; @@ -70,10 +71,11 @@ pub async fn thread_pools() -> Result<()> { // The first two examples read local files. Enabling the URL table feature // lets us treat filenames as tables in SQL. let ctx = SessionContext::new().enable_url_table(); - let sql = format!( - "SELECT * FROM '{}/alltypes_plain.parquet'", - datafusion::test_util::parquet_test_data() - ); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); + let sql = format!("SELECT * FROM '{}'", path.to_str().unwrap()); // Run a query on the current runtime. Calling `await` means the future // (in this case the `async` function and all spawned work in DataFusion diff --git a/datafusion-examples/examples/sql_ops/query.rs b/datafusion-examples/examples/sql_ops/query.rs index 3e052f0823b97..28f2bbfa7f5fa 100644 --- a/datafusion-examples/examples/sql_ops/query.rs +++ b/datafusion-examples/examples/sql_ops/query.rs @@ -27,7 +27,7 @@ use datafusion::datasource::MemTable; use datafusion::error::{DataFusionError, Result}; use datafusion::prelude::*; use object_store::local::LocalFileSystem; -use std::path::Path; +use std::path::PathBuf; use std::sync::Arc; /// Examples of various ways to execute queries using SQL @@ -113,20 +113,20 @@ async fn query_parquet() -> Result<()> { // create local execution context let ctx = SessionContext::new(); - let test_data = datafusion::test_util::parquet_test_data(); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("parquet") + .join("alltypes_plain.parquet"); // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(true); - let listing_options = ListingOptions::new(Arc::new(file_format)) - // This is a workaround for this example since `test_data` contains - // many different parquet different files, - // in practice use FileType::PARQUET.get_ext(). - .with_file_extension("alltypes_plain.parquet"); + let listing_options = + ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); // First example were we use an absolute path, which requires no additional setup. ctx.register_listing_table( "my_table", - &format!("file://{test_data}/"), + path.to_str().unwrap(), listing_options.clone(), None, None, @@ -159,8 +159,7 @@ async fn query_parquet() -> Result<()> { // simulate a relative path, this requires registering an ObjectStore. let cur_dir = std::env::current_dir()?; - let test_data_path = Path::new(&test_data); - let test_data_path_parent = test_data_path + let test_data_path_parent = path .parent() .ok_or(exec_datafusion_err!("test_data path needs a parent"))?; @@ -176,7 +175,7 @@ async fn query_parquet() -> Result<()> { // for the query ctx.register_listing_table( "relative_table", - "./data", + path.to_str().unwrap(), listing_options.clone(), None, None, diff --git a/datafusion-examples/examples/udf/advanced_udwf.rs b/datafusion-examples/examples/udf/advanced_udwf.rs index e8d3a75b29dec..15eaf4e4521c3 100644 --- a/datafusion-examples/examples/udf/advanced_udwf.rs +++ b/datafusion-examples/examples/udf/advanced_udwf.rs @@ -17,7 +17,8 @@ //! See `main.rs` for how to run it. -use std::{any::Any, fs::File, io::Write, sync::Arc}; +use std::path::PathBuf; +use std::{any::Any, sync::Arc}; use arrow::datatypes::Field; use arrow::{ @@ -40,7 +41,6 @@ use datafusion::logical_expr::{ use datafusion::physical_expr::PhysicalExpr; use datafusion::prelude::*; use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility}; -use tempfile::tempdir; /// This example shows how to use the full WindowUDFImpl API to implement a user /// defined window function. As in the `simple_udwf.rs` example, this struct implements @@ -230,44 +230,12 @@ async fn create_context() -> Result { // declare a new context. In spark API, this corresponds to a new spark SQL session let ctx = SessionContext::new(); - // content from file 'datafusion/core/tests/data/cars.csv' - let csv_data = r#"car,speed,time -red,20.0,1996-04-12T12:05:03.000000000 -red,20.3,1996-04-12T12:05:04.000000000 -red,21.4,1996-04-12T12:05:05.000000000 -red,21.5,1996-04-12T12:05:06.000000000 -red,19.0,1996-04-12T12:05:07.000000000 -red,18.0,1996-04-12T12:05:08.000000000 -red,17.0,1996-04-12T12:05:09.000000000 -red,7.0,1996-04-12T12:05:10.000000000 -red,7.1,1996-04-12T12:05:11.000000000 -red,7.2,1996-04-12T12:05:12.000000000 -red,3.0,1996-04-12T12:05:13.000000000 -red,1.0,1996-04-12T12:05:14.000000000 -red,0.0,1996-04-12T12:05:15.000000000 -green,10.0,1996-04-12T12:05:03.000000000 -green,10.3,1996-04-12T12:05:04.000000000 -green,10.4,1996-04-12T12:05:05.000000000 -green,10.5,1996-04-12T12:05:06.000000000 -green,11.0,1996-04-12T12:05:07.000000000 -green,12.0,1996-04-12T12:05:08.000000000 -green,14.0,1996-04-12T12:05:09.000000000 -green,15.0,1996-04-12T12:05:10.000000000 -green,15.1,1996-04-12T12:05:11.000000000 -green,15.2,1996-04-12T12:05:12.000000000 -green,8.0,1996-04-12T12:05:13.000000000 -green,2.0,1996-04-12T12:05:14.000000000 -"#; - let dir = tempdir()?; - let file_path = dir.path().join("cars.csv"); - { - let mut file = File::create(&file_path)?; - // write CSV data - file.write_all(csv_data.as_bytes())?; - } // scope closes the file - let file_path = file_path.to_str().unwrap(); - - ctx.register_csv("cars", file_path, CsvReadOptions::new()) + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("cars.csv"); + + ctx.register_csv("cars", path.to_str().unwrap(), CsvReadOptions::new()) .await?; Ok(ctx) diff --git a/datafusion-examples/examples/udf/simple_udtf.rs b/datafusion-examples/examples/udf/simple_udtf.rs index 12ee74fc52ee5..c44a323f35cc4 100644 --- a/datafusion-examples/examples/udf/simple_udtf.rs +++ b/datafusion-examples/examples/udf/simple_udtf.rs @@ -37,6 +37,7 @@ use datafusion::prelude::*; use std::fs::File; use std::io::Seek; use std::path::Path; +use std::path::PathBuf; use std::sync::Arc; // To define your own table function, you only need to do the following 3 things: // 1. Implement your own [`TableProvider`] @@ -51,18 +52,26 @@ pub async fn simple_udtf() -> Result<()> { // register the table function that will be called in SQL statements by `read_csv` ctx.register_udtf("read_csv", Arc::new(LocalCsvTableFunc {})); - let testdata = datafusion::test_util::arrow_test_data(); - let csv_file = format!("{testdata}/csv/aggregate_test_100.csv"); + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("aggregate_test_100.csv"); // Pass 2 arguments, read csv with at most 2 rows (simplify logic makes 1+1 --> 2) let df = ctx - .sql(format!("SELECT * FROM read_csv('{csv_file}', 1 + 1);").as_str()) + .sql( + format!( + "SELECT * FROM read_csv('{}', 1 + 1);", + path.to_str().unwrap() + ) + .as_str(), + ) .await?; df.show().await?; // just run, return all rows let df = ctx - .sql(format!("SELECT * FROM read_csv('{csv_file}');").as_str()) + .sql(format!("SELECT * FROM read_csv('{}');", path.to_str().unwrap()).as_str()) .await?; df.show().await?; From 6daedb6617de28b440fbd04684bf6c380be7182f Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Mon, 15 Dec 2025 13:35:53 +0300 Subject: [PATCH 02/13] run prettier --- datafusion-examples/data/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion-examples/data/README.md b/datafusion-examples/data/README.md index 38ce020a4a1de..ee20e11a55adc 100644 --- a/datafusion-examples/data/README.md +++ b/datafusion-examples/data/README.md @@ -18,6 +18,7 @@ --> ## Example datasets + | Filename | Path | Description | | ------------------------ | ------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `aggregate_test_100.csv` | [`data/csv/aggregate_test_100.csv`](./csv/aggregate_test_100.csv) | Multi-column CSV dataset with mixed data types (strings, signed/unsigned integers, floats, doubles). Used for aggregation, projection, filtering, and query planning examples. Derived from Arrow-style aggregate test data. | From 6603e3e951200d537bdce2c35894db7130100c2e Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Tue, 16 Dec 2025 08:53:40 +0300 Subject: [PATCH 03/13] preserve file:// & fix comments --- datafusion-examples/examples/builtin_functions/regexp.rs | 1 - .../examples/data_io/parquet_exec_visitor.rs | 4 +++- datafusion-examples/examples/sql_ops/query.rs | 9 ++++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datafusion-examples/examples/builtin_functions/regexp.rs b/datafusion-examples/examples/builtin_functions/regexp.rs index 708924f224fee..549f4dd4f7c18 100644 --- a/datafusion-examples/examples/builtin_functions/regexp.rs +++ b/datafusion-examples/examples/builtin_functions/regexp.rs @@ -1,5 +1,4 @@ // Licensed to the Apache Software Foundation (ASF) under one -// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file diff --git a/datafusion-examples/examples/data_io/parquet_exec_visitor.rs b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs index ffc68d69fe35e..ecae69c1d2006 100644 --- a/datafusion-examples/examples/data_io/parquet_exec_visitor.rs +++ b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs @@ -45,11 +45,13 @@ pub async fn parquet_exec_visitor() -> datafusion::common::Result<()> { let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)); + let table_path = format!("file://{}", path.to_str().unwrap()); + // First example were we use an absolute path, which requires no additional setup. let _ = ctx .register_listing_table( "my_table", - path.to_str().unwrap(), + &table_path, listing_options.clone(), None, None, diff --git a/datafusion-examples/examples/sql_ops/query.rs b/datafusion-examples/examples/sql_ops/query.rs index 28f2bbfa7f5fa..4f1b34769472f 100644 --- a/datafusion-examples/examples/sql_ops/query.rs +++ b/datafusion-examples/examples/sql_ops/query.rs @@ -123,10 +123,12 @@ async fn query_parquet() -> Result<()> { let listing_options = ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); + let table_path = format!("file://{}", path.to_str().unwrap()); + // First example were we use an absolute path, which requires no additional setup. ctx.register_listing_table( "my_table", - path.to_str().unwrap(), + &table_path, listing_options.clone(), None, None, @@ -155,8 +157,9 @@ async fn query_parquet() -> Result<()> { ], &results); - // Second example were we temporarily move into the test data's parent directory and - // simulate a relative path, this requires registering an ObjectStore. + // Second example where we change the current working directory and explicitly + // register a local filesystem object store. This demonstrates how listing tables + // resolve paths via an ObjectStore, even when using filesystem-backed data. let cur_dir = std::env::current_dir()?; let test_data_path_parent = path From 591f61c9c16da12c9d159c815ff12343bb8a1cfe Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Fri, 19 Dec 2025 17:51:43 +0300 Subject: [PATCH 04/13] replace aggregate_test_100.csv with cars.csv --- datafusion-examples/data/README.md | 13 ++- .../data/csv/aggregate_test_100.csv | 101 ------------------ .../custom_data_source/csv_json_opener.rs | 35 +++--- .../examples/dataframe/dataframe.rs | 42 ++++---- .../examples/udf/simple_udtf.rs | 2 +- 5 files changed, 50 insertions(+), 143 deletions(-) delete mode 100644 datafusion-examples/data/csv/aggregate_test_100.csv diff --git a/datafusion-examples/data/README.md b/datafusion-examples/data/README.md index ee20e11a55adc..df48c831c8d50 100644 --- a/datafusion-examples/data/README.md +++ b/datafusion-examples/data/README.md @@ -19,10 +19,9 @@ ## Example datasets -| Filename | Path | Description | -| ------------------------ | ------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `aggregate_test_100.csv` | [`data/csv/aggregate_test_100.csv`](./csv/aggregate_test_100.csv) | Multi-column CSV dataset with mixed data types (strings, signed/unsigned integers, floats, doubles). Used for aggregation, projection, filtering, and query planning examples. Derived from Arrow-style aggregate test data. | -| `cars.csv` | [`data/csv/cars.csv`](./csv/cars.csv) | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames). | -| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. | -| `window_1.csv` | [`data/csv/window_1.csv`](./csv/window_1.csv) | Numeric dataset designed for window function demonstrations. Includes ordering keys and incremental values suitable for running totals, ranking, and frame-based calculations. | -| `alltypes_plain.parquet` | [`data/parquet/alltypes_plain.parquet`](./parquet/alltypes_plain.parquet) | Parquet file containing columns of many Arrow/DataFusion-supported types (boolean, integers, floating-point, strings, timestamps). Used to demonstrate Parquet scanning, schema inference, and typed execution. | +| Filename | Path | Description | +| ------------------------ | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `cars.csv` | [`data/csv/cars.csv`](./csv/cars.csv) | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames). | +| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. | +| `window_1.csv` | [`data/csv/window_1.csv`](./csv/window_1.csv) | Numeric dataset designed for window function demonstrations. Includes ordering keys and incremental values suitable for running totals, ranking, and frame-based calculations. | +| `alltypes_plain.parquet` | [`data/parquet/alltypes_plain.parquet`](./parquet/alltypes_plain.parquet) | Parquet file containing columns of many Arrow/DataFusion-supported types (boolean, integers, floating-point, strings, timestamps). Used to demonstrate Parquet scanning, schema inference, and typed execution. | diff --git a/datafusion-examples/data/csv/aggregate_test_100.csv b/datafusion-examples/data/csv/aggregate_test_100.csv deleted file mode 100644 index e548b758bf54e..0000000000000 --- a/datafusion-examples/data/csv/aggregate_test_100.csv +++ /dev/null @@ -1,101 +0,0 @@ -c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13 -c,2,1,18109,2033001162,-6513304855495910254,25,43062,1491205016,5863949479783605708,0.110830784,0.9294097332465232,6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW -d,5,-40,22614,706441268,-7542719935673075327,155,14337,3373581039,11720144131976083864,0.69632107,0.3114712539863804,C2GT5KVyOPZpgKVl110TyZO0NcJ434 -b,1,29,-18218,994303988,5983957848665088916,204,9489,3275293996,14857091259186476033,0.53840446,0.17909035118828576,AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz -a,1,-85,-15154,1171968280,1919439543497968449,77,52286,774637006,12101411955859039553,0.12285209,0.6864391962767343,0keZ5G8BffGwgF2RwQD59TFzMStxCB -b,5,-82,22080,1824882165,7373730676428214987,208,34331,3342719438,3330177516592499461,0.82634634,0.40975383525297016,Ig1QcuKsjHXkproePdERo2w0mYzIqd -b,4,-111,-1967,-4229382,1892872227362838079,67,9832,1243785310,8382489916947120498,0.06563997,0.152498292971736,Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH -e,3,104,-25136,1738331255,300633854973581194,139,20807,3577318119,13079037564113702254,0.40154034,0.7764360990307122,DuJNG8tufSqW0ZstHqWj3aGvFLMg4A -a,3,13,12613,1299719633,2020498574254265315,191,17835,3998790955,14881411008939145569,0.041445434,0.8813167497816289,Amn2K87Db5Es3dFQO9cw9cvpAM6h35 -d,1,38,18384,-335410409,-1632237090406591229,26,57510,2712615025,1842662804748246269,0.6064476,0.6404495093354053,4HX6feIvmNXBN7XGqgO4YVBkhu8GDI -a,4,-38,20744,762932956,308913475857409919,7,45465,1787652631,878137512938218976,0.7459874,0.02182578039211991,ydkwycaISlYSlEq3TlkS2m15I2pcp8 -d,1,57,28781,-1143802338,2662536767954229885,202,62167,879082834,4338034436871150616,0.7618384,0.42950521730777025,VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4 -a,4,-54,-2376,434021400,5502271306323260832,113,15777,2502326480,7966148640299601101,0.5720931,0.30585375151301186,KJFcmTVjdkCMv94wYCtfHMFhzyRsmH -e,3,112,-6823,-421042466,8535335158538929274,129,32712,3759340273,9916295859593918600,0.6424343,0.6316565296547284,BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE -d,2,113,3917,-108973366,-7220140168410319165,197,24380,63044568,4225581724448081782,0.11867094,0.2944158618048994,90gAtmGEeIqUTbo1ZrxCvWtsseukXC -b,1,54,-18410,1413111008,-7145106120930085900,249,5382,1842680163,17818611040257178339,0.8881188,0.24899794314659673,6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ -c,1,103,-22186,431378678,1346564663822463162,146,12393,3766999078,10901819591635583995,0.064453244,0.7784918983501654,2T3wSlHdEmASmO0xcXHnndkKEt6bz8 -e,2,49,24495,-587831330,9178511478067509438,129,12757,1289293657,10948666249269100825,0.5610077,0.5991138115095911,bgK1r6v3BCTh0aejJUhkA1Hn6idXGp -d,1,-98,13630,-1991133944,1184110014998006843,220,2986,225513085,9634106610243643486,0.89651865,0.1640882545084913,y7C453hRWd4E7ImjNDWlpexB8nUqjh -d,3,77,15091,-1302295658,8795481303066536947,154,35477,2093538928,17419098323248948387,0.11952883,0.7035635283169166,O66j6PaYuZhEUtqV6fuU7TyjM2WxC5 -e,2,97,18167,1593800404,-9112448817105133638,163,45185,3188005828,2792105417953811674,0.38175434,0.4094218353587008,ukOiFGGFnQJDHFgZxHMpvhD3zybF0M -e,4,-56,-31500,1544188174,3096047390018154410,220,417,557517119,2774306934041974261,0.15459597,0.19113293583306745,IZTkHMLvIKuiLjhDjYMmIHxh166we4 -d,1,-99,5613,1213926989,-8863698443222021480,19,18736,4216440507,14933742247195536130,0.6067944,0.33639590659276175,aDxBtor7Icd9C5hnTvvw5NrIre740e -a,5,36,-16974,623103518,6834444206535996609,71,29458,141047417,17448660630302620693,0.17100024,0.04429073092078406,OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh -e,4,-53,13788,2064155045,-691093532952651300,243,35106,2778168728,9463973906560740422,0.34515214,0.27159190516490006,0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm -c,2,-29,25305,-537142430,-7683452043175617798,150,31648,598822671,11759014161799384683,0.8315913,0.946325164889271,9UbObCsVkmYpJGcGrgfK90qOnwb2Lj -a,1,-25,15295,383352709,4980135132406487265,231,102,3276123488,12763583666216333412,0.53796273,0.17592486905979987,XemNcT1xp61xcM1Qz3wZ1VECCnq06O -c,4,123,16620,852509237,-3087630526856906991,196,33715,3566741189,4546434653720168472,0.07606989,0.819715865079681,8LIh0b6jmDGm87BmIyjdxNIpX4ugjD -a,5,-31,-12907,586844478,-4862189775214031241,170,28086,1013876852,11005002152861474932,0.35319167,0.05573662213439634,MeSTAXq8gVxVjbEjgkvU9YLte0X9uE -a,2,45,15673,-1899175111,398282800995316041,99,2555,145294611,8554426087132697832,0.17333257,0.6405262429561641,b3b9esRhTzFEawbs6XhpKnD9ojutHB -b,3,17,14457,670497898,-2390782464845307388,255,24770,1538863055,12662506238151717757,0.34077626,0.7614304100703713,6x93sxYioWuq5c9Kkk8oTAAORM7cH0 -e,4,97,-13181,2047637360,6176835796788944083,158,53000,2042457019,9726016502640071617,0.7085086,0.12357539988406441,oHJMNvWuunsIMIWFnYG31RCfkOo2V7 -c,2,-60,-16312,-1808210365,-3368300253197863813,71,39635,2844041986,7045482583778080653,0.805363,0.6425694115212065,BJqx5WokrmrrezZA0dUbleMYkG5U2O -e,1,36,-21481,-928766616,-3471238138418013024,150,52569,2610290479,7788847578701297242,0.2578469,0.7670021786149205,gpo8K5qtYePve6jyPt6xgJx4YOVjms -b,5,-5,24896,1955646088,2430204191283109071,118,43655,2424630722,11429640193932435507,0.87989986,0.7328050041291218,JafwVLSVk5AVoXFuzclesQ000EE2k1 -a,3,13,32064,912707948,3826618523497875379,42,21463,2214035726,10771380284714693539,0.6133468,0.7325106678655877,i6RQVXKUh7MzuGMDaNclUYnFUAireU -c,1,41,-4667,-644225469,7049620391314639084,196,48099,2125812933,15419512479294091215,0.5780736,0.9255031346434324,mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS -d,2,93,-12642,2053379412,6468763445799074329,147,50842,1000948272,5536487915963301239,0.4279275,0.28534428578703896,lqhzgLsXZ8JhtpeeUWWNbMz8PHI705 -c,3,73,-9565,-382483011,1765659477910680019,186,1535,1088543984,2906943497598597237,0.680652,0.6009475544728957,Ow5PGpfTm4dXCfTDsXAOTatXRoAydR -c,3,-2,-18655,-2141999138,-3154042970870838072,251,34970,3862393166,13062025193350212516,0.034291923,0.7697753383420857,IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr -c,3,22,13741,-2098805236,8604102724776612452,45,2516,1362369177,196777795886465166,0.94669616,0.0494924465469434,6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE -b,2,63,21456,-2138770630,-2380041687053733364,181,57594,2705709344,13144161537396946288,0.09683716,0.3051364088814128,nYVJnVicpGRqKZibHyBAmtmzBXAFfT -d,4,102,-24558,1991172974,-7823479531661596016,14,36599,1534194097,2240998421986827216,0.028003037,0.8824879447595726,0og6hSkhbX8AC1ktFS4kounvTzy8Vo -d,1,-8,27138,-1383162419,7682021027078563072,36,64517,2861376515,9904216782086286050,0.80954456,0.9463098243875633,AFGCj7OWlEB5QfniEFgonMq90Tq5uH -a,3,17,-22796,1337043149,-1282905594104562444,167,2809,754775609,732272194388185106,0.3884129,0.658671129040488,VDhtJkYjAYPykCgOU9x3v7v3t4SO1a -e,2,52,23388,715235348,605432070100399212,165,56980,3314983189,7386391799827871203,0.46076488,0.980809631269599,jQimhdepw3GKmioWUlVSWeBVRKFkY3 -b,5,68,21576,1188285940,5717755781990389024,224,27600,974297360,9865419128970328044,0.80895734,0.7973920072996036,ioEncce3mPOXD2hWhpZpCPWGATG6GU -b,2,31,23127,-800561771,-8706387435232961848,153,27034,1098639440,3343692635488765507,0.35692692,0.5590205548347534,okOkcWflkNXIy4R8LzmySyY1EC3sYd -c,1,-24,-24085,-1882293856,7385529783747709716,41,48048,520189543,2402288956117186783,0.39761502,0.3600766362333053,Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u -a,4,65,-28462,-1813935549,7602389238442209730,18,363,1865307672,11378396836996498283,0.09130204,0.5593249815276734,WHmjWk2AY4c6m7DA4GitUx6nmb1yYS -d,1,125,31106,-1176490478,-4306856842351827308,90,17910,3625286410,17869394731126786457,0.8882508,0.7631239070049998,dVdvo6nUD5FgCgsbOZLds28RyGTpnx -b,4,17,-28070,-673237643,1904316899655860234,188,27744,933879086,3732692885824435932,0.41860116,0.40342283197779727,JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ -c,2,-106,-1114,-1927628110,1080308211931669384,177,20421,141680161,7464432081248293405,0.56749094,0.565352842229935,Vp3gmWunM5A7wOC9YW2JroFqTWjvTi -d,5,-59,2045,-2117946883,1170799768349713170,189,63353,1365198901,2501626630745849169,0.75173044,0.18628859265874176,F7NSTjWvQJyBburN7CXRUlbgp2dIrA -d,4,55,-1471,1902023838,1252101628560265705,157,3691,811650497,1524771507450695976,0.2968701,0.5437595540422571,f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX -b,2,-60,-21739,-1908480893,-8897292622858103761,59,50009,2525744318,1719090662556698549,0.52930677,0.560333188635217,l7uwDoTepWwnAP0ufqtHJS3CRi7RfP -d,3,-76,8809,141218956,-9110406195556445909,58,5494,1824517658,12046662515387914426,0.8557294,0.6668423897406515,Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK -e,4,73,-22501,1282464673,2541794052864382235,67,21119,538589788,9575476605699527641,0.48515016,0.296036538664718,4JznSdBajNWhu4hRQwjV1FjTTxY68i -b,4,-117,19316,2051224722,-5534418579506232438,133,52046,3023531799,13684453606722360110,0.62608826,0.8506721053047003,mhjME0zBHbrK6NMkytMTQzOssOa1gF -a,4,-101,11640,1993193190,2992662416070659899,230,40566,466439833,16778113360088370541,0.3991115,0.574210838214554,NEhyk8uIx4kEULJGa8qIyFjjBcP2G6 -b,5,62,16337,41423756,-2274773899098124524,121,34206,2307004493,10575647935385523483,0.23794776,0.1754261586710173,qnPOOmslCJaT45buUisMRnM0rc77EK -c,4,-79,5281,-237425046,373011991904079451,121,55620,2818832252,2464584078983135763,0.49774808,0.9237877978193884,t6fQUjJejPcjc04wHvHTPe55S65B4V -b,2,68,15874,49866617,1179733259727844435,121,23948,3455216719,3898128009708892708,0.6306253,0.9185813970744787,802bgTGl6Bk5TlkPYYTxp5JkKyaYUA -c,1,70,27752,1325868318,1241882478563331892,63,61637,473294098,4976799313755010034,0.13801557,0.5081765563442366,Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn -e,2,-61,-2888,-1660426473,2553892468492435401,126,35429,4144173353,939909697866979632,0.4405142,0.9231889896940375,BPtQMxnuSPpxMExYV9YkDa6cAN7GP3 -e,4,74,-12612,-1885422396,1702850374057819332,130,3583,3198969145,10767179755613315144,0.5518061,0.5614503754617461,QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv -d,2,122,10130,-168758331,-3179091803916845592,30,794,4061635107,15695681119022625322,0.69592506,0.9748360509016578,OPwBqCEK5PWTjWaiOyL45u2NLTaDWv -e,3,71,194,1436496767,-5639533800082367925,158,44507,3105312559,3998472996619161534,0.930117,0.6108938307533,pTeu0WMjBRTaNRT15rLCuEh3tBJVc5 -c,5,-94,-15880,2025611582,-3348824099853919681,5,40622,4268716378,12849419495718510869,0.34163946,0.4830878559436823,RilTlL1tKkPOUFuzmLydHAVZwv1OGl -d,1,-72,25590,1188089983,3090286296481837049,241,832,3542840110,5885937420286765261,0.41980565,0.21535402343780985,wwXqSGKLyBQyPkonlzBNYUJTCo4LRS -e,1,71,-5479,-1339586153,-3920238763788954243,123,53012,4229654142,10297218950720052365,0.73473036,0.5773498217058918,cBGc0kSm32ylBDnxogG727C0uhZEYZ -e,4,96,-30336,427197269,7506304308750926996,95,48483,3521368277,5437030162957481122,0.58104324,0.42073125331890115,3BEOHQsMEFZ58VcNTOJYShTBpAPzbt -a,2,-48,-18025,439738328,-313657814587041987,222,13763,3717551163,9135746610908713318,0.055064857,0.9800193410444061,ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 -a,1,-56,8692,2106705285,-7811675384226570375,231,15573,1454057357,677091006469429514,0.42794758,0.2739938529235548,JN0VclewmjwYlSl8386MlWv5rEhWCz -e,2,52,-12056,-1090239422,9011500141803970147,238,4168,2013662838,12565360638488684051,0.6694766,0.39144436569161134,xipQ93429ksjNcXPX5326VSg1xJZcW -a,1,-5,12636,794623392,2909750622865366631,15,24022,2669374863,4776679784701509574,0.29877836,0.2537253407987472,waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs -b,1,12,7652,-1448995523,-5332734971209541785,136,49283,4076864659,15449267433866484283,0.6214579,0.05636955101974106,akiiY5N0I44CMwEnBL6RTBk7BRkxEj -e,5,64,-26526,1689098844,8950618259486183091,224,45253,662099130,16127995415060805595,0.2897315,0.5759450483859969,56MZa5O1hVtX4c5sbnCfxuX5kDChqI -c,4,-90,-2935,1579876740,6733733506744649678,254,12876,3593959807,4094315663314091142,0.5708688,0.5603062368164834,Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV -e,5,-86,32514,-467659022,-8012578250188146150,254,2684,2861911482,2126626171973341689,0.12559289,0.01479305307777301,gxfHWUF8XgY2KdFxigxvNEXe2V2XMl -c,2,-117,-30187,-1222533990,-191957437217035800,136,47061,2293105904,12659011877190539078,0.2047385,0.9706712283358269,pLk3i59bZwd5KBZrI1FiweYTd5hteG -a,3,14,28162,397430452,-452851601758273256,57,14722,431948861,8164671015278284913,0.40199697,0.07260475960924484,TtDKUZxzVxsq758G6AWPSYuZgVgbcl -c,2,29,-3855,1354539333,4742062657200940467,81,53815,3398507249,562977550464243101,0.7124534,0.991517828651004,Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0 -b,4,-59,25286,1423957796,2646602445954944051,0,61069,3570297463,15100310750150419896,0.49619365,0.04893135681998029,fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG -a,1,83,-14704,2143473091,-4387559599038777245,37,829,4015442341,4602675983996931623,0.89542526,0.9567595541247681,ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU -a,3,-12,-9168,1489733240,-1569376002217735076,206,33821,3959216334,16060348691054629425,0.9488028,0.9293883502480845,oLZ21P2JEDooxV1pU31cIxQHEeeoLu -c,4,3,-30508,659422734,-6455460736227846736,133,59663,2306130875,8622584762448622224,0.16999894,0.4273123318932347,EcCuckwsF3gV1Ecgmh5v4KM8g1ozif -a,3,-72,-11122,-2141451704,-2578916903971263854,83,30296,1995343206,17452974532402389080,0.94209343,0.3231750610081745,e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG -c,2,-107,-2904,-1011669561,782342092880993439,18,29527,1157161427,4403623840168496677,0.31988364,0.36936304600612724,QYlaIAnJA6r8rlAb6f59wcxvcPcWFf -c,5,118,19208,-134213907,-2120241105523909127,86,57751,1229567292,16493024289408725403,0.5536642,0.9723580396501548,TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX -c,3,97,29106,-903316089,2874859437662206732,207,42171,3473924576,8188072741116415408,0.32792538,0.2667177795079635,HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g -b,3,-101,-13217,-346989627,5456800329302529236,26,54276,243203849,17929716297117857676,0.05422181,0.09465635123783445,MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ -a,2,-43,13080,370975815,5881039805148485053,2,20120,2939920218,906367167997372130,0.42733806,0.16301110515739792,m6jD0LBIQWaMfenwRCTANI9eOdyyto -a,5,-101,-12484,-842693467,-6140627905445351305,57,57885,2496054700,2243924747182709810,0.59520596,0.9491397432856566,QJYm7YRA3YetcBHI5wkMZeLXVmfuNy -b,5,-44,15788,-629486480,5822642169425315613,13,11872,3457053821,2413406423648025909,0.44318348,0.32869374687050157,ALuRhobVWbnQTTWZdSOk0iVe8oYFhW -d,4,5,-7688,702611616,6239356364381313700,4,39363,3126475872,35363005357834672,0.3766935,0.061029375346466685,H5j5ZHy1FGesOAHjkQEDYCucbpKWRu -e,1,120,10837,-1331533190,6342019705133850847,245,3975,2830981072,16439861276703750332,0.6623719,0.9965400387585364,LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW -e,3,-95,13611,2030965207,927403809957470678,119,59134,559847112,10966649192992996919,0.5301289,0.047343434291126085,gTpyQnEODMcpsPnJMZC66gh33i3m0b -d,3,123,29533,240273900,1176001466590906949,117,30972,2592330556,12883447461717956514,0.39075065,0.38870280983958583,1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO -b,4,47,20690,-1009656194,-2027442591571700798,200,7781,326151275,2881913079548128905,0.57360977,0.2145232647388039,52mKlRE3aHCBZtjECq6sY9OqVf8Dze -e,4,30,-16110,61035129,-3356533792537910152,159,299,28774375,13526465947516666293,0.6999775,0.03968347085780355,cq4WSAIFwx3wwTUS5bp1wCe71R6U5I \ No newline at end of file diff --git a/datafusion-examples/examples/custom_data_source/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs index fcc7434a2a1f8..a2d6436b52b0a 100644 --- a/datafusion-examples/examples/custom_data_source/csv_json_opener.rs +++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs @@ -20,7 +20,7 @@ use std::path::PathBuf; use std::sync::Arc; -use arrow::datatypes::{DataType, Field, Schema}; +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::common::config::CsvOptions; use datafusion::{ assert_batches_eq, @@ -32,7 +32,6 @@ use datafusion::{ }, error::Result, physical_plan::metrics::ExecutionPlanMetricsSet, - test_util::aggr_test_schema, }; use datafusion::datasource::physical_plan::FileScanConfigBuilder; @@ -51,12 +50,20 @@ pub async fn csv_json_opener() -> Result<()> { async fn csv_opener() -> Result<()> { let object_store = Arc::new(LocalFileSystem::new()); - let schema = aggr_test_schema(); + let schema = Arc::new(Schema::new(vec![ + Field::new("car", DataType::Utf8, false), + Field::new("speed", DataType::Float64, false), + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + ])); let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") .join("csv") - .join("aggregate_test_100.csv"); + .join("cars.csv"); let options = CsvOptions { has_header: Some(true), @@ -72,7 +79,7 @@ async fn csv_opener() -> Result<()> { let scan_config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) - .with_projection_indices(Some(vec![12, 0]))? + .with_projection_indices(Some(vec![0, 1]))? .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) .build(); @@ -90,15 +97,15 @@ async fn csv_opener() -> Result<()> { } assert_batches_eq!( &[ - "+--------------------------------+----+", - "| c13 | c1 |", - "+--------------------------------+----+", - "| 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW | c |", - "| C2GT5KVyOPZpgKVl110TyZO0NcJ434 | d |", - "| AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz | b |", - "| 0keZ5G8BffGwgF2RwQD59TFzMStxCB | a |", - "| Ig1QcuKsjHXkproePdERo2w0mYzIqd | b |", - "+--------------------------------+----+", + "+-----+-------+", + "| car | speed |", + "+-----+-------+", + "| red | 20.0 |", + "| red | 20.3 |", + "| red | 21.4 |", + "| red | 21.5 |", + "| red | 19.0 |", + "+-----+-------+", ], &result ); diff --git a/datafusion-examples/examples/dataframe/dataframe.rs b/datafusion-examples/examples/dataframe/dataframe.rs index bf2e19ca627fb..e4dc802c144b5 100644 --- a/datafusion-examples/examples/dataframe/dataframe.rs +++ b/datafusion-examples/examples/dataframe/dataframe.rs @@ -65,8 +65,8 @@ pub async fn dataframe_example() -> Result<()> { read_memory(&ctx).await?; read_memory_macro().await?; write_out(&ctx).await?; - register_aggregate_test_data("t1", &ctx).await?; - register_aggregate_test_data("t2", &ctx).await?; + register_cars_test_data("t1", &ctx).await?; + register_cars_test_data("t2", &ctx).await?; where_scalar_subquery(&ctx).await?; where_in_subquery(&ctx).await?; where_exist_subquery(&ctx).await?; @@ -268,7 +268,7 @@ async fn write_out(ctx: &SessionContext) -> Result<()> { } /// Use the DataFrame API to execute the following subquery: -/// select c1,c2 from t1 where (select avg(t2.c2) from t2 where t1.c1 = t2.c1)>0 limit 3; +/// select car, speed from t1 where (select avg(t2.speed) from t2 where t1.car = t2.car) > 0 limit 3; async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t1") .await? @@ -276,14 +276,14 @@ async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> { scalar_subquery(Arc::new( ctx.table("t2") .await? - .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))? - .aggregate(vec![], vec![avg(col("t2.c2"))])? - .select(vec![avg(col("t2.c2"))])? + .filter(out_ref_col(DataType::Utf8, "t1.car").eq(col("t2.car")))? + .aggregate(vec![], vec![avg(col("t2.speed"))])? + .select(vec![avg(col("t2.speed"))])? .into_unoptimized_plan(), )) - .gt(lit(0u8)), + .gt(lit(0.0)), )? - .select(vec![col("t1.c1"), col("t1.c2")])? + .select(vec![col("t1.car"), col("t1.speed")])? .limit(0, Some(3))? .show() .await?; @@ -291,22 +291,24 @@ async fn where_scalar_subquery(ctx: &SessionContext) -> Result<()> { } /// Use the DataFrame API to execute the following subquery: -/// select t1.c1, t1.c2 from t1 where t1.c2 in (select max(t2.c2) from t2 where t2.c1 > 0 ) limit 3; +/// select t1.car, t1.speed from t1 where t1.speed in (select max(t2.speed) from t2 where t2.car = 'red') limit 3; async fn where_in_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t1") .await? .filter(in_subquery( - col("t1.c2"), + col("t1.speed"), Arc::new( ctx.table("t2") .await? - .filter(col("t2.c1").gt(lit(ScalarValue::UInt8(Some(0)))))? - .aggregate(vec![], vec![max(col("t2.c2"))])? - .select(vec![max(col("t2.c2"))])? + .filter( + col("t2.car").gt(lit(ScalarValue::Utf8(Some("red".to_string())))), + )? + .aggregate(vec![], vec![max(col("t2.speed"))])? + .select(vec![max(col("t2.speed"))])? .into_unoptimized_plan(), ), ))? - .select(vec![col("t1.c1"), col("t1.c2")])? + .select(vec![col("t1.car"), col("t1.speed")])? .limit(0, Some(3))? .show() .await?; @@ -314,29 +316,29 @@ async fn where_in_subquery(ctx: &SessionContext) -> Result<()> { } /// Use the DataFrame API to execute the following subquery: -/// select t1.c1, t1.c2 from t1 where exists (select t2.c2 from t2 where t1.c1 = t2.c1) limit 3; +/// select t1.car, t1.speed from t1 where exists (select t2.speed from t2 where t1.car = t2.car) limit 3; async fn where_exist_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t1") .await? .filter(exists(Arc::new( ctx.table("t2") .await? - .filter(out_ref_col(DataType::Utf8, "t1.c1").eq(col("t2.c1")))? - .select(vec![col("t2.c2")])? + .filter(out_ref_col(DataType::Utf8, "t1.car").eq(col("t2.car")))? + .select(vec![col("t2.speed")])? .into_unoptimized_plan(), )))? - .select(vec![col("t1.c1"), col("t1.c2")])? + .select(vec![col("t1.car"), col("t1.speed")])? .limit(0, Some(3))? .show() .await?; Ok(()) } -async fn register_aggregate_test_data(name: &str, ctx: &SessionContext) -> Result<()> { +async fn register_cars_test_data(name: &str, ctx: &SessionContext) -> Result<()> { let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") .join("csv") - .join("aggregate_test_100.csv"); + .join("cars.csv"); ctx.register_csv(name, path.to_str().unwrap(), CsvReadOptions::default()) .await?; diff --git a/datafusion-examples/examples/udf/simple_udtf.rs b/datafusion-examples/examples/udf/simple_udtf.rs index c44a323f35cc4..f729a21fd2426 100644 --- a/datafusion-examples/examples/udf/simple_udtf.rs +++ b/datafusion-examples/examples/udf/simple_udtf.rs @@ -55,7 +55,7 @@ pub async fn simple_udtf() -> Result<()> { let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") .join("csv") - .join("aggregate_test_100.csv"); + .join("cars.csv"); // Pass 2 arguments, read csv with at most 2 rows (simplify logic makes 1+1 --> 2) let df = ctx From d77de0199d227bc0c70e150e8c4ccd33da38b233 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Tue, 23 Dec 2025 15:06:34 +0300 Subject: [PATCH 05/13] replace alltypes_plain.parquet with cars.csv --- .../data/parquet/alltypes_plain.parquet | Bin 1851 -> 0 bytes .../examples/data_io/parquet_encrypted.rs | 48 ++- .../examples/data_io/parquet_exec_visitor.rs | 26 +- .../examples/dataframe/dataframe.rs | 51 ++- .../dataframe/deserialize_to_struct.rs | 337 +++++++++++++++--- .../examples/execution_monitoring/tracing.rs | 42 ++- datafusion-examples/examples/flight/client.rs | 35 +- datafusion-examples/examples/flight/server.rs | 45 ++- .../examples/flight/sql_server.rs | 41 ++- .../examples/query_planning/parse_sql_expr.rs | 112 ++++-- .../examples/query_planning/plan_to_sql.rs | 137 +++++-- .../examples/query_planning/planner_api.rs | 37 +- .../examples/query_planning/thread_pools.rs | 32 +- datafusion-examples/examples/sql_ops/query.rs | 62 +++- 14 files changed, 783 insertions(+), 222 deletions(-) delete mode 100644 datafusion-examples/data/parquet/alltypes_plain.parquet diff --git a/datafusion-examples/data/parquet/alltypes_plain.parquet b/datafusion-examples/data/parquet/alltypes_plain.parquet deleted file mode 100644 index a63f5dca7c3821909748f34752966a0d7e08d47f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1851 zcmb7F&ui0g6#pje+AM2l7<*qVv_zM;YQD;X!1G_`Ye@W}TbqmweOL$NPTX=lg!8G{0y<66Rp8 z2pS|Aqlfj;PSH-&mT4zwizU$p1|0Y`VGJoyS_YbwNId;`jBg|z+DN8!b`S=|Sr$Ee51+|8u<)E>*X#arx$Xz24Q}8O`6X`}Xhr%F1Zjm_hG6In z7b&rg?-Cs*15K~?$g4Hmpi6uSk7fKqck31Rd$NO@X;dxW?*`sZ;$!02EAVEj1Dx*0 z{MLuNloZ0uLp~A&5eQYhXi-eh3&y9k4#_aQs_m^t;cL8xuhMu#`94GW^Wlpd7r_2h zbWlRre%G&Crz3oz;A@fee~~T(>&n~(=)0;8>IvyeeZ%&hb^-l_Dsj zFvuM<3gd=3Zp;SqWJI2b$YdaF$o()3pD7?YQ98!mj1HO5|D}r6be0>LFTr05hN163Gw zN`^s(6x}&&X(N%RnM7u%??nSFr{~`PZ?MG}V6i6>#vU;kXJ%l`=Er#5j4|7?$M(UP z-GIH6Am3BD#zq&s>YC+S`G?MW!>iZw=2&6OxPE8h?#;!8`C@+5-thcNe#V-dsZ?y! oaow58so4p;;FgVPyFBeqnNHc9FdWl$-SX^Jc0`|z5`8xR0vs|-V*mgE diff --git a/datafusion-examples/examples/data_io/parquet_encrypted.rs b/datafusion-examples/examples/data_io/parquet_encrypted.rs index 5eb58caa8b51d..137c6486df7b3 100644 --- a/datafusion-examples/examples/data_io/parquet_encrypted.rs +++ b/datafusion-examples/examples/data_io/parquet_encrypted.rs @@ -17,30 +17,49 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; +use std::sync::Arc; + use datafusion::common::DataFusionError; use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::logical_expr::{col, lit}; use datafusion::parquet::encryption::decrypt::FileDecryptionProperties; use datafusion::parquet::encryption::encrypt::FileEncryptionProperties; +use datafusion::prelude::CsvReadOptions; use datafusion::prelude::{ParquetReadOptions, SessionContext}; -use std::path::PathBuf; -use std::sync::Arc; use tempfile::TempDir; +use tokio::fs::create_dir_all; /// Read and write encrypted Parquet files using DataFusion pub async fn parquet_encrypted() -> datafusion::common::Result<()> { // The SessionContext is the main high level API for interacting with DataFusion let ctx = SessionContext::new(); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; // Read the sample parquet file let parquet_df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; // Show information from the dataframe @@ -54,27 +73,27 @@ pub async fn parquet_encrypted() -> datafusion::common::Result<()> { let (encrypt, decrypt) = setup_encryption(&parquet_df)?; // Create a temporary file location for the encrypted parquet file - let tmp_dir = TempDir::new()?; - let tempfile = tmp_dir.path().join("alltypes_plain-encrypted.parquet"); - let tempfile_str = tempfile.into_os_string().into_string().unwrap(); + let tempfile = tmp_source.path().join("cars_encrypted"); // Write encrypted parquet let mut options = TableParquetOptions::default(); options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt)); parquet_df .write_parquet( - tempfile_str.as_str(), + tempfile.to_str().unwrap(), DataFrameWriteOptions::new().with_single_file_output(true), Some(options), ) .await?; - // Read encrypted parquet + // Read encrypted parquet back as a DataFrame using matching decryption config let ctx: SessionContext = SessionContext::new(); let read_options = ParquetReadOptions::default().file_decryption_properties((&decrypt).into()); - let encrypted_parquet_df = ctx.read_parquet(tempfile_str, read_options).await?; + let encrypted_parquet_df = ctx + .read_parquet(tempfile.to_str().unwrap(), read_options) + .await?; // Show information from the dataframe println!("\n\n==============================================================================="); @@ -91,11 +110,12 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> { df.clone().describe().await?.show().await?; // Select three columns and filter the results - // so that only rows where id > 1 are returned + // so that only rows where speed > 5 are returned + // select car, speed, time from t where speed > 5 println!("\nSelected rows and columns:"); df.clone() - .select_columns(&["id", "bool_col", "timestamp_col"])? - .filter(col("id").gt(lit(5)))? + .select_columns(&["car", "speed", "time"])? + .filter(col("speed").gt(lit(5)))? .show() .await?; diff --git a/datafusion-examples/examples/data_io/parquet_exec_visitor.rs b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs index ecae69c1d2006..00e704ee47759 100644 --- a/datafusion-examples/examples/data_io/parquet_exec_visitor.rs +++ b/datafusion-examples/examples/data_io/parquet_exec_visitor.rs @@ -20,6 +20,7 @@ use std::path::PathBuf; use std::sync::Arc; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::datasource::physical_plan::{FileGroup, ParquetSource}; @@ -30,22 +31,41 @@ use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::{ execute_stream, visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor, }; +use datafusion::prelude::CsvReadOptions; use futures::StreamExt; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// Example of collecting metrics after execution by visiting the `ExecutionPlan` pub async fn parquet_exec_visitor() -> datafusion::common::Result<()> { let ctx = SessionContext::new(); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)); - let table_path = format!("file://{}", path.to_str().unwrap()); + let table_path = format!("file://{}", out_dir.to_str().unwrap()); // First example were we use an absolute path, which requires no additional setup. let _ = ctx diff --git a/datafusion-examples/examples/dataframe/dataframe.rs b/datafusion-examples/examples/dataframe/dataframe.rs index e4dc802c144b5..433b47a6819c7 100644 --- a/datafusion-examples/examples/dataframe/dataframe.rs +++ b/datafusion-examples/examples/dataframe/dataframe.rs @@ -17,6 +17,11 @@ //! See `main.rs` for how to run it. +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; +use std::sync::Arc; + use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray, StringViewArray}; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::catalog::MemTable; @@ -28,11 +33,8 @@ use datafusion::error::Result; use datafusion::functions_aggregate::average::avg; use datafusion::functions_aggregate::min_max::max; use datafusion::prelude::*; -use std::fs::{create_dir_all, File}; -use std::io::Write; -use std::path::PathBuf; -use std::sync::Arc; use tempfile::{tempdir, TempDir}; +use tokio::fs::create_dir_all; /// This example demonstrates using DataFusion's DataFrame API /// @@ -78,24 +80,41 @@ pub async fn dataframe_example() -> Result<()> { /// 2. Show the schema /// 3. Select columns and rows async fn read_parquet(ctx: &SessionContext) -> Result<()> { + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; // Read the parquet files and show its schema using 'describe' let parquet_df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; // show its schema using 'describe' parquet_df.clone().describe().await?.show().await?; // Select three columns and filter the results - // so that only rows where id > 1 are returned + // so that only rows where speed > 1 are returned + // select car, speed, time from t where speed > 1 parquet_df - .select_columns(&["id", "bool_col", "timestamp_col"])? - .filter(col("id").gt(lit(1)))? + .select_columns(&["car", "speed", "time"])? + .filter(col("speed").gt(lit(1)))? .show() .await?; @@ -213,15 +232,15 @@ async fn write_out(ctx: &SessionContext) -> Result<()> { // Create a single temp root with subdirectories let tmp_root = TempDir::new()?; let examples_root = tmp_root.path().join("datafusion-examples"); - create_dir_all(&examples_root)?; + create_dir_all(&examples_root).await?; let table_dir = examples_root.join("test_table"); let parquet_dir = examples_root.join("test_parquet"); let csv_dir = examples_root.join("test_csv"); let json_dir = examples_root.join("test_json"); - create_dir_all(&table_dir)?; - create_dir_all(&parquet_dir)?; - create_dir_all(&csv_dir)?; - create_dir_all(&json_dir)?; + create_dir_all(&table_dir).await?; + create_dir_all(&parquet_dir).await?; + create_dir_all(&csv_dir).await?; + create_dir_all(&json_dir).await?; let create_sql = format!( "CREATE EXTERNAL TABLE test(tablecol1 varchar) @@ -301,7 +320,7 @@ async fn where_in_subquery(ctx: &SessionContext) -> Result<()> { ctx.table("t2") .await? .filter( - col("t2.car").gt(lit(ScalarValue::Utf8(Some("red".to_string())))), + col("t2.car").eq(lit(ScalarValue::Utf8(Some("red".to_string())))), )? .aggregate(vec![], vec![max(col("t2.speed"))])? .select(vec![max(col("t2.speed"))])? diff --git a/datafusion-examples/examples/dataframe/deserialize_to_struct.rs b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs index af169a2e28f80..2ece7063619c1 100644 --- a/datafusion-examples/examples/dataframe/deserialize_to_struct.rs +++ b/datafusion-examples/examples/dataframe/deserialize_to_struct.rs @@ -19,12 +19,14 @@ use std::path::PathBuf; -use arrow::array::{AsArray, PrimitiveArray}; -use arrow::datatypes::{Float64Type, Int32Type}; +use arrow::array::{Array, Float64Array, StringViewArray}; use datafusion::common::assert_batches_eq; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::Result; use datafusion::prelude::*; use futures::StreamExt; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// This example shows how to convert query results into Rust structs by using /// the Arrow APIs to convert the results into Rust native types. @@ -36,68 +38,120 @@ use futures::StreamExt; pub async fn deserialize_to_struct() -> Result<()> { // Run a query that returns two columns of data let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; ctx.register_parquet( - "alltypes_plain", - path.to_str().unwrap(), + "cars", + out_dir.to_str().unwrap(), ParquetReadOptions::default(), ) .await?; let df = ctx - .sql("SELECT int_col, double_col FROM alltypes_plain") + .sql("SELECT car, speed FROM cars ORDER BY speed LIMIT 50") .await?; - // print out the results showing we have an int32 and a float64 column + // print out the results showing we have car and speed columns and a deterministic ordering let results = df.clone().collect().await?; assert_batches_eq!( [ - "+---------+------------+", - "| int_col | double_col |", - "+---------+------------+", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "| 0 | 0.0 |", - "| 1 | 10.1 |", - "+---------+------------+", + "+-------+-------+", + "| car | speed |", + "+-------+-------+", + "| red | 0.0 |", + "| red | 1.0 |", + "| green | 2.0 |", + "| red | 3.0 |", + "| red | 7.0 |", + "| red | 7.1 |", + "| red | 7.2 |", + "| green | 8.0 |", + "| green | 10.0 |", + "| green | 10.3 |", + "| green | 10.4 |", + "| green | 10.5 |", + "| green | 11.0 |", + "| green | 12.0 |", + "| green | 14.0 |", + "| green | 15.0 |", + "| green | 15.1 |", + "| green | 15.2 |", + "| red | 17.0 |", + "| red | 18.0 |", + "| red | 19.0 |", + "| red | 20.0 |", + "| red | 20.3 |", + "| red | 21.4 |", + "| red | 21.5 |", + "+-------+-------+", ], &results ); // We will now convert the query results into a Rust struct let mut stream = df.execute_stream().await?; - let mut list = vec![]; + let mut list: Vec = vec![]; // DataFusion produces data in chunks called `RecordBatch`es which are // typically 8000 rows each. This loop processes each `RecordBatch` as it is // produced by the query plan and adds it to the list - while let Some(b) = stream.next().await.transpose()? { + while let Some(batch) = stream.next().await.transpose()? { // Each `RecordBatch` has one or more columns. Each column is stored as // an `ArrayRef`. To interact with data using Rust native types we need to // convert these `ArrayRef`s into concrete array types using APIs from // the arrow crate. // In this case, we know that each batch has two columns of the Arrow - // types Int32 and Float64, so first we cast the two columns to the + // types StringView and Float64, so first we cast the two columns to the // appropriate Arrow PrimitiveArray (this is a fast / zero-copy cast).: - let int_col: &PrimitiveArray = b.column(0).as_primitive(); - let float_col: &PrimitiveArray = b.column(1).as_primitive(); + let car_col = batch + .column(0) + .as_any() + .downcast_ref::() + .expect("car column must be Utf8View"); + + let speed_col = batch + .column(1) + .as_any() + .downcast_ref::() + .expect("speed column must be Float64"); // With PrimitiveArrays, we can access to the values as native Rust - // types i32 and f64, and forming the desired `Data` structs - for (i, f) in int_col.values().iter().zip(float_col.values()) { - list.push(Data { - int_col: *i, - double_col: *f, - }) + // types String and f64, and forming the desired `Data` structs + for i in 0..batch.num_rows() { + let car = if car_col.is_null(i) { + None + } else { + Some(car_col.value(i).to_string()) + }; + + let speed = if speed_col.is_null(i) { + None + } else { + Some(speed_col.value(i)) + }; + + list.push(Data { car, speed }); } } @@ -107,45 +161,220 @@ pub async fn deserialize_to_struct() -> Result<()> { res, r#"[ Data { - int_col: 0, - double_col: 0.0, + car: Some( + "red", + ), + speed: Some( + 0.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 1.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 2.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 3.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 7.0, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 7.1, + ), + }, + Data { + car: Some( + "red", + ), + speed: Some( + 7.2, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 8.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.3, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.4, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 10.5, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 11.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 12.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 14.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 15.0, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 15.1, + ), + }, + Data { + car: Some( + "green", + ), + speed: Some( + 15.2, + ), }, Data { - int_col: 1, - double_col: 10.1, + car: Some( + "red", + ), + speed: Some( + 17.0, + ), }, Data { - int_col: 0, - double_col: 0.0, + car: Some( + "red", + ), + speed: Some( + 18.0, + ), }, Data { - int_col: 1, - double_col: 10.1, + car: Some( + "red", + ), + speed: Some( + 19.0, + ), }, Data { - int_col: 0, - double_col: 0.0, + car: Some( + "red", + ), + speed: Some( + 20.0, + ), }, Data { - int_col: 1, - double_col: 10.1, + car: Some( + "red", + ), + speed: Some( + 20.3, + ), }, Data { - int_col: 0, - double_col: 0.0, + car: Some( + "red", + ), + speed: Some( + 21.4, + ), }, Data { - int_col: 1, - double_col: 10.1, + car: Some( + "red", + ), + speed: Some( + 21.5, + ), }, ]"# ); - // Use the fields in the struct to avoid clippy complaints - let int_sum = list.iter().fold(0, |acc, x| acc + x.int_col); - let double_sum = list.iter().fold(0.0, |acc, x| acc + x.double_col); - assert_eq!(int_sum, 4); - assert_eq!(double_sum, 40.4); + let speed_green_sum: f64 = list + .iter() + .filter(|data| data.car.as_deref() == Some("green")) + .filter_map(|data| data.speed) + .sum(); + let speed_red_sum: f64 = list + .iter() + .filter(|data| data.car.as_deref() == Some("red")) + .filter_map(|data| data.speed) + .sum(); + assert_eq!(speed_green_sum, 133.5); + assert_eq!(speed_red_sum, 162.5); Ok(()) } @@ -153,6 +382,6 @@ pub async fn deserialize_to_struct() -> Result<()> { /// This is target struct where we want the query results. #[derive(Debug)] struct Data { - int_col: i32, - double_col: f64, + car: Option, + speed: Option, } diff --git a/datafusion-examples/examples/execution_monitoring/tracing.rs b/datafusion-examples/examples/execution_monitoring/tracing.rs index f34330064d2a9..c3b9c6fdd9c15 100644 --- a/datafusion-examples/examples/execution_monitoring/tracing.rs +++ b/datafusion-examples/examples/execution_monitoring/tracing.rs @@ -51,16 +51,20 @@ //! 10:29:40.809 INFO main ThreadId(01) tracing: ***** WITH tracer: Non-main tasks DID inherit the `run_instrumented_query` span ***** //! ``` +use std::any::Any; +use std::path::PathBuf; +use std::sync::Arc; + use datafusion::common::runtime::{set_join_set_tracer, JoinSetTracer}; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::error::Result; use datafusion::prelude::*; use futures::future::BoxFuture; use futures::FutureExt; -use std::any::Any; -use std::path::PathBuf; -use std::sync::Arc; +use tempfile::TempDir; +use tokio::fs::create_dir_all; use tracing::{info, instrument, Instrument, Level, Span}; /// Demonstrates the tracing injection feature for the DataFusion runtime @@ -122,22 +126,36 @@ async fn run_instrumented_query() -> Result<()> { info!("Starting query execution"); let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); - info!( - "Registering table 'alltypes' from {}", - path.to_str().unwrap() - ); + info!("Registering table 'cars' from {}", path.to_str().unwrap()); ctx.register_listing_table( - "alltypes", - path.to_str().unwrap(), + "cars", + out_dir.to_str().unwrap(), listing_options, None, None, @@ -145,7 +163,7 @@ async fn run_instrumented_query() -> Result<()> { .await .expect("Failed to register table"); - let sql = "SELECT COUNT(*), string_col FROM alltypes GROUP BY string_col"; + let sql = "SELECT COUNT(*), car, sum(speed) FROM cars GROUP BY car"; info!(sql, "Executing SQL query"); let result = ctx.sql(sql).await?.collect().await?; info!("Query complete: {} batches returned", result.len()); diff --git a/datafusion-examples/examples/flight/client.rs b/datafusion-examples/examples/flight/client.rs index 0aa1a2f6f4549..9f75666316010 100644 --- a/datafusion-examples/examples/flight/client.rs +++ b/datafusion-examples/examples/flight/client.rs @@ -20,24 +20,45 @@ use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; -use tonic::transport::Endpoint; - -use datafusion::arrow::datatypes::Schema; use arrow_flight::flight_descriptor; use arrow_flight::flight_service_client::FlightServiceClient; use arrow_flight::utils::flight_data_to_arrow_batch; use arrow_flight::{FlightDescriptor, Ticket}; +use datafusion::arrow::datatypes::Schema; use datafusion::arrow::util::pretty; +use datafusion::dataframe::DataFrameWriteOptions; +use datafusion::prelude::{CsvReadOptions, SessionContext}; +use tempfile::TempDir; +use tokio::fs::create_dir_all; +use tonic::transport::Endpoint; /// This example shows how to wrap DataFusion with `FlightService` to support looking up schema information for /// Parquet files and executing SQL queries against them on a remote server. /// This example is run along-side the example `flight_server`. pub async fn client() -> Result<(), Box> { + let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; // Create Flight client let endpoint = Endpoint::new("http://localhost:50051")?; @@ -48,7 +69,7 @@ pub async fn client() -> Result<(), Box> { let request = tonic::Request::new(FlightDescriptor { r#type: flight_descriptor::DescriptorType::Path as i32, cmd: Default::default(), - path: vec![format!("{}", path.to_str().unwrap())], + path: vec![format!("{}", out_dir.to_str().unwrap())], }); let schema_result = client.get_schema(request).await?.into_inner(); @@ -57,7 +78,7 @@ pub async fn client() -> Result<(), Box> { // Call do_get to execute a SQL query and receive results let request = tonic::Request::new(Ticket { - ticket: "SELECT id FROM alltypes_plain".into(), + ticket: "SELECT car FROM cars".into(), }); let mut stream = client.do_get(request).await?.into_inner(); diff --git a/datafusion-examples/examples/flight/server.rs b/datafusion-examples/examples/flight/server.rs index dfb687eecc3e5..a82d7c77d54ff 100644 --- a/datafusion-examples/examples/flight/server.rs +++ b/datafusion-examples/examples/flight/server.rs @@ -17,26 +17,27 @@ //! See `main.rs` for how to run it. -use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; use std::path::PathBuf; use std::sync::Arc; +use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator}; +use arrow_flight::{ + flight_service_server::FlightService, flight_service_server::FlightServiceServer, + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, +}; use arrow_flight::{PollInfo, SchemaAsIpc}; use datafusion::arrow::error::ArrowError; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ListingOptions, ListingTableUrl}; +use datafusion::prelude::*; use futures::stream::BoxStream; +use tempfile::TempDir; +use tokio::fs::create_dir_all; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; -use datafusion::prelude::*; - -use arrow_flight::{ - flight_service_server::FlightService, flight_service_server::FlightServiceServer, - Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, - HandshakeRequest, HandshakeResponse, PutResult, SchemaResult, Ticket, -}; - #[derive(Clone)] pub struct FlightServiceImpl {} @@ -86,15 +87,33 @@ impl FlightService for FlightServiceImpl { // create local execution context let ctx = SessionContext::new(); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await + .map_err(|_| Status::internal("Error reading cars.csv"))?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await + .map_err(|_| Status::internal("Error writing to parquet file"))?; // register parquet file with the execution context ctx.register_parquet( - "alltypes_plain", - path.to_str().unwrap(), + "cars", + out_dir.to_str().unwrap(), ParquetReadOptions::default(), ) .await diff --git a/datafusion-examples/examples/flight/sql_server.rs b/datafusion-examples/examples/flight/sql_server.rs index c92f2635b2b44..a23c9af196ece 100644 --- a/datafusion-examples/examples/flight/sql_server.rs +++ b/datafusion-examples/examples/flight/sql_server.rs @@ -17,6 +17,10 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; +use std::pin::Pin; +use std::sync::Arc; + use arrow::array::{ArrayRef, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use arrow::ipc::writer::IpcWriteOptions; @@ -36,15 +40,17 @@ use arrow_flight::{ HandshakeResponse, IpcMessage, SchemaAsIpc, Ticket, }; use dashmap::DashMap; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::logical_expr::LogicalPlan; -use datafusion::prelude::{DataFrame, ParquetReadOptions, SessionConfig, SessionContext}; +use datafusion::prelude::{ + CsvReadOptions, DataFrame, ParquetReadOptions, SessionConfig, SessionContext, +}; use futures::{Stream, StreamExt, TryStreamExt}; use log::info; use mimalloc::MiMalloc; use prost::Message; -use std::path::PathBuf; -use std::pin::Pin; -use std::sync::Arc; +use tempfile::TempDir; +use tokio::fs::create_dir_all; use tonic::metadata::MetadataValue; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; @@ -101,15 +107,34 @@ impl FlightSqlServiceImpl { .with_information_schema(true); let ctx = Arc::new(SessionContext::new_with_config(session_config)); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await + .map_err(|e| status!("Error reading cars.csv", e))?; + let tmp_source = + TempDir::new().map_err(|e| status!("Error creating temp dir", e))?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await + .map_err(|e| status!("Error writing to parquet", e))?; // register parquet file with the execution context ctx.register_parquet( - "alltypes_plain", - path.to_str().unwrap(), + "cars", + out_dir.to_str().unwrap(), ParquetReadOptions::default(), ) .await diff --git a/datafusion-examples/examples/query_planning/parse_sql_expr.rs b/datafusion-examples/examples/query_planning/parse_sql_expr.rs index 7ef4c035761d2..18b5d9432d35c 100644 --- a/datafusion-examples/examples/query_planning/parse_sql_expr.rs +++ b/datafusion-examples/examples/query_planning/parse_sql_expr.rs @@ -21,13 +21,18 @@ use std::path::PathBuf; use arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::DFSchema; +use datafusion::common::ScalarValue; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::logical_expr::{col, lit}; +use datafusion::prelude::CsvReadOptions; use datafusion::sql::unparser::Unparser; use datafusion::{ assert_batches_eq, error::Result, prelude::{ParquetReadOptions, SessionContext}, }; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// This example demonstrates the programmatic parsing of SQL expressions using /// the DataFusion [`SessionContext::parse_sql_expr`] API or the [`DataFrame::parse_sql_expr`] API. @@ -72,18 +77,36 @@ fn simple_session_context_parse_sql_expr_demo() -> Result<()> { /// DataFusion can parse a SQL text to an logical expression using schema at [`DataFrame`]. async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> { - let sql = "int_col < 5 OR double_col = 8.0"; - let expr = col("int_col") - .lt(lit(5_i64)) - .or(col("double_col").eq(lit(8.0_f64))); + let sql = "car = 'red' OR speed > 1.0"; + let expr = col("car") + .eq(lit(ScalarValue::Utf8(Some("red".to_string())))) + .or(col("speed").gt(lit(1.0_f64))); let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + let df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; let parsed_expr = df.parse_sql_expr(sql)?; @@ -95,39 +118,54 @@ async fn simple_dataframe_parse_sql_expr_demo() -> Result<()> { async fn query_parquet_demo() -> Result<()> { let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + let df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; let df = df .clone() - .select(vec![ - df.parse_sql_expr("int_col")?, - df.parse_sql_expr("double_col")?, - ])? - .filter(df.parse_sql_expr("int_col < 5 OR double_col = 8.0")?)? + .select(vec![df.parse_sql_expr("car")?, df.parse_sql_expr("speed")?])? + .filter(df.parse_sql_expr("car = 'red' OR speed > 1.0")?)? .aggregate( - vec![df.parse_sql_expr("double_col")?], - vec![df.parse_sql_expr("SUM(int_col) as sum_int_col")?], + vec![df.parse_sql_expr("car")?], + vec![df.parse_sql_expr("SUM(speed) as sum_speed")?], )? // Directly parsing the SQL text into a sort expression is not supported yet, so // construct it programmatically - .sort(vec![col("double_col").sort(false, false)])? + .sort(vec![col("car").sort(false, false)])? .limit(0, Some(1))?; let result = df.collect().await?; assert_batches_eq!( &[ - "+------------+-------------+", - "| double_col | sum_int_col |", - "+------------+-------------+", - "| 10.1 | 4 |", - "+------------+-------------+", + "+-----+--------------------+", + "| car | sum_speed |", + "+-----+--------------------+", + "| red | 162.49999999999997 |", + "+-----+--------------------+" ], &result ); @@ -137,15 +175,33 @@ async fn query_parquet_demo() -> Result<()> { /// DataFusion can parse a SQL text and convert it back to SQL using [`Unparser`]. async fn round_trip_parse_sql_expr_demo() -> Result<()> { - let sql = "((int_col < 5) OR (double_col = 8))"; + let sql = "((car = 'red') OR (speed > 1.0))"; let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + let df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; let parsed_expr = df.parse_sql_expr(sql)?; @@ -160,7 +216,7 @@ async fn round_trip_parse_sql_expr_demo() -> Result<()> { // difference in precedence rules between DataFusion and target engines. let unparser = Unparser::default().with_pretty(true); - let pretty = "int_col < 5 OR double_col = 8"; + let pretty = "car = 'red' OR speed > 1.0"; let pretty_round_trip_sql = unparser.expr_to_sql(&parsed_expr)?.to_string(); assert_eq!(pretty, pretty_round_trip_sql); diff --git a/datafusion-examples/examples/query_planning/plan_to_sql.rs b/datafusion-examples/examples/query_planning/plan_to_sql.rs index e3480a0f6377a..ceb5e026315b8 100644 --- a/datafusion-examples/examples/query_planning/plan_to_sql.rs +++ b/datafusion-examples/examples/query_planning/plan_to_sql.rs @@ -17,7 +17,13 @@ //! See `main.rs` for how to run it. +use std::fmt; +use std::path::PathBuf; +use std::sync::Arc; + use datafusion::common::DFSchemaRef; +use datafusion::common::ScalarValue; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::Result; use datafusion::logical_expr::sqlparser::ast::Statement; use datafusion::logical_expr::{ @@ -35,9 +41,8 @@ use datafusion::sql::unparser::extension_unparser::{ UnparseToStatementResult, UnparseWithinStatementResult, }; use datafusion::sql::unparser::{plan_to_sql, Unparser}; -use std::fmt; -use std::path::PathBuf; -use std::sync::Arc; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// This example demonstrates the programmatic construction of SQL strings using /// the DataFusion Expr [`Expr`] and LogicalPlan [`LogicalPlan`] API. @@ -115,22 +120,38 @@ fn simple_expr_to_sql_demo_escape_mysql_style() -> Result<()> { async fn simple_plan_to_sql_demo() -> Result<()> { let ctx = SessionContext::new(); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; let df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await? - .select_columns(&["id", "int_col", "double_col", "date_string_col"])?; + .select_columns(&["car", "speed", "time"])?; // Convert the data frame to a SQL string let sql = plan_to_sql(df.logical_plan())?.to_string(); assert_eq!( sql, - r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""# + r#"SELECT "?table?".car, "?table?".speed, "?table?"."time" FROM "?table?""# ); Ok(()) @@ -141,38 +162,52 @@ async fn simple_plan_to_sql_demo() -> Result<()> { async fn round_trip_plan_to_sql_demo() -> Result<()> { let ctx = SessionContext::new(); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; // register parquet file with the execution context ctx.register_parquet( - "alltypes_plain", - path.to_str().unwrap(), + "cars", + out_dir.to_str().unwrap(), ParquetReadOptions::default(), ) .await?; // create a logical plan from a SQL string and then programmatically add new filters + // select car, speed, time from cars where speed > 1 and car = 'red' let df = ctx // Use SQL to read some data from the parquet file - .sql( - "SELECT int_col, double_col, CAST(date_string_col as VARCHAR) \ - FROM alltypes_plain", - ) + .sql("SELECT car, speed, time FROM cars") .await? - // Add id > 1 and tinyint_col < double_col filter + // Add speed > 1 and car = 'red' filter .filter( - col("id") + col("speed") .gt(lit(1)) - .and(col("tinyint_col").lt(col("double_col"))), + .and(col("car").eq(lit(ScalarValue::Utf8(Some("red".to_string()))))), )?; let sql = plan_to_sql(df.logical_plan())?.to_string(); assert_eq!( sql, - r#"SELECT alltypes_plain.int_col, alltypes_plain.double_col, CAST(alltypes_plain.date_string_col AS VARCHAR) FROM alltypes_plain WHERE ((alltypes_plain.id > 1) AND (alltypes_plain.tinyint_col < alltypes_plain.double_col))"# + r#"SELECT cars.car, cars.speed, cars."time" FROM cars WHERE ((cars.speed > 1) AND (cars.car = 'red'))"# ); Ok(()) @@ -236,14 +271,32 @@ impl UserDefinedLogicalNodeUnparser for PlanToStatement { /// It can be unparse as a statement that reads from the same parquet file. async fn unparse_my_logical_plan_as_statement() -> Result<()> { let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + let inner_plan = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await? - .select_columns(&["id", "int_col", "double_col", "date_string_col"])? + .select_columns(&["car", "speed", "time"])? .into_unoptimized_plan(); let node = Arc::new(MyLogicalPlan { input: inner_plan }); @@ -254,7 +307,7 @@ async fn unparse_my_logical_plan_as_statement() -> Result<()> { let sql = unparser.plan_to_sql(&my_plan)?.to_string(); assert_eq!( sql, - r#"SELECT "?table?".id, "?table?".int_col, "?table?".double_col, "?table?".date_string_col FROM "?table?""# + r#"SELECT "?table?".car, "?table?".speed, "?table?"."time" FROM "?table?""# ); Ok(()) } @@ -289,14 +342,32 @@ impl UserDefinedLogicalNodeUnparser for PlanToSubquery { /// It can be unparse as a subquery that reads from the same parquet file, with some columns projected. async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + let inner_plan = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await? - .select_columns(&["id", "int_col", "double_col", "date_string_col"])? + .select_columns(&["car", "speed", "time"])? .into_unoptimized_plan(); let node = Arc::new(MyLogicalPlan { input: inner_plan }); @@ -304,8 +375,8 @@ async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let my_plan = LogicalPlan::Extension(Extension { node }); let plan = LogicalPlanBuilder::from(my_plan) .project(vec![ - col("id").alias("my_id"), - col("int_col").alias("my_int"), + col("car").alias("my_car"), + col("speed").alias("my_speed"), ])? .build()?; let unparser = @@ -313,8 +384,8 @@ async fn unparse_my_logical_plan_as_subquery() -> Result<()> { let sql = unparser.plan_to_sql(&plan)?.to_string(); assert_eq!( sql, - "SELECT \"?table?\".id AS my_id, \"?table?\".int_col AS my_int FROM \ - (SELECT \"?table?\".id, \"?table?\".int_col, \"?table?\".double_col, \"?table?\".date_string_col FROM \"?table?\")", + "SELECT \"?table?\".car AS my_car, \"?table?\".speed AS my_speed FROM \ + (SELECT \"?table?\".car, \"?table?\".speed, \"?table?\".\"time\" FROM \"?table?\")", ); Ok(()) } diff --git a/datafusion-examples/examples/query_planning/planner_api.rs b/datafusion-examples/examples/query_planning/planner_api.rs index cf70b04be8edf..46174709f64f5 100644 --- a/datafusion-examples/examples/query_planning/planner_api.rs +++ b/datafusion-examples/examples/query_planning/planner_api.rs @@ -19,11 +19,14 @@ use std::path::PathBuf; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::Result; use datafusion::logical_expr::LogicalPlan; use datafusion::physical_plan::displayable; use datafusion::physical_planner::DefaultPhysicalPlanner; use datafusion::prelude::*; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// This example demonstrates the process of converting logical plan /// into physical execution plans using DataFusion. @@ -39,26 +42,40 @@ use datafusion::prelude::*; pub async fn planner_api() -> Result<()> { // Set up a DataFusion context and load a Parquet file let ctx = SessionContext::new(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; let df = ctx - .read_parquet(path.to_str().unwrap(), ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; // Construct the input logical plan using DataFrame API let df = df .clone() - .select(vec![ - df.parse_sql_expr("int_col")?, - df.parse_sql_expr("double_col")?, - ])? - .filter(df.parse_sql_expr("int_col < 5 OR double_col = 8.0")?)? + .select(vec![df.parse_sql_expr("car")?, df.parse_sql_expr("speed")?])? + .filter(df.parse_sql_expr("car = 'red' OR speed > 1.0")?)? .aggregate( - vec![df.parse_sql_expr("double_col")?], - vec![df.parse_sql_expr("SUM(int_col) as sum_int_col")?], + vec![df.parse_sql_expr("car")?], + vec![df.parse_sql_expr("SUM(speed) as sum_speed")?], )? .limit(0, Some(1))?; let logical_plan = df.logical_plan().clone(); diff --git a/datafusion-examples/examples/query_planning/thread_pools.rs b/datafusion-examples/examples/query_planning/thread_pools.rs index 8b1ec4f2e676f..292e838c8e4c6 100644 --- a/datafusion-examples/examples/query_planning/thread_pools.rs +++ b/datafusion-examples/examples/query_planning/thread_pools.rs @@ -37,16 +37,20 @@ //! //! [Architecture section]: https://docs.rs/datafusion/latest/datafusion/index.html#thread-scheduling-cpu--io-thread-pools-and-tokio-runtimes +use std::path::PathBuf; +use std::sync::Arc; + use arrow::util::pretty::pretty_format_batches; use datafusion::common::runtime::JoinSet; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::Result; use datafusion::execution::SendableRecordBatchStream; use datafusion::prelude::*; use futures::stream::StreamExt; use object_store::client::SpawnedReqwestConnector; use object_store::http::HttpBuilder; -use std::path::PathBuf; -use std::sync::Arc; +use tempfile::TempDir; +use tokio::fs::create_dir_all; use tokio::runtime::Handle; use tokio::sync::Notify; use url::Url; @@ -71,11 +75,29 @@ pub async fn thread_pools() -> Result<()> { // The first two examples read local files. Enabling the URL table feature // lets us treat filenames as tables in SQL. let ctx = SessionContext::new().enable_url_table(); + + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); - let sql = format!("SELECT * FROM '{}'", path.to_str().unwrap()); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + + let sql = format!("SELECT * FROM '{}'", out_dir.to_str().unwrap()); // Run a query on the current runtime. Calling `await` means the future // (in this case the `async` function and all spawned work in DataFusion diff --git a/datafusion-examples/examples/sql_ops/query.rs b/datafusion-examples/examples/sql_ops/query.rs index 4f1b34769472f..b57feea3409cd 100644 --- a/datafusion-examples/examples/sql_ops/query.rs +++ b/datafusion-examples/examples/sql_ops/query.rs @@ -17,18 +17,22 @@ //! See `main.rs` for how to run it. +use std::path::PathBuf; +use std::sync::Arc; + use datafusion::arrow::array::{UInt64Array, UInt8Array}; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::{assert_batches_eq, exec_datafusion_err}; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::datasource::MemTable; use datafusion::error::{DataFusionError, Result}; use datafusion::prelude::*; use object_store::local::LocalFileSystem; -use std::path::PathBuf; -use std::sync::Arc; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// Examples of various ways to execute queries using SQL /// @@ -113,17 +117,33 @@ async fn query_parquet() -> Result<()> { // create local execution context let ctx = SessionContext::new(); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") - .join("parquet") - .join("alltypes_plain.parquet"); + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; // Configure listing options let file_format = ParquetFormat::default().with_enable_pruning(true); let listing_options = ListingOptions::new(Arc::new(file_format)).with_file_extension(".parquet"); - let table_path = format!("file://{}", path.to_str().unwrap()); + let table_path = format!("file://{}", out_dir.to_str().unwrap()); // First example were we use an absolute path, which requires no additional setup. ctx.register_listing_table( @@ -141,6 +161,7 @@ async fn query_parquet() -> Result<()> { .sql( "SELECT * \ FROM my_table \ + ORDER BY speed \ LIMIT 1", ) .await?; @@ -149,20 +170,21 @@ async fn query_parquet() -> Result<()> { let results = df.collect().await?; assert_batches_eq!( [ - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", + "+-----+-------+---------------------+", + "| car | speed | time |", + "+-----+-------+---------------------+", + "| red | 0.0 | 1996-04-12T12:05:15 |", + "+-----+-------+---------------------+", ], - &results); + &results + ); // Second example where we change the current working directory and explicitly // register a local filesystem object store. This demonstrates how listing tables // resolve paths via an ObjectStore, even when using filesystem-backed data. let cur_dir = std::env::current_dir()?; - let test_data_path_parent = path + let test_data_path_parent = out_dir .parent() .ok_or(exec_datafusion_err!("test_data path needs a parent"))?; @@ -178,7 +200,7 @@ async fn query_parquet() -> Result<()> { // for the query ctx.register_listing_table( "relative_table", - path.to_str().unwrap(), + out_dir.to_str().unwrap(), listing_options.clone(), None, None, @@ -190,6 +212,7 @@ async fn query_parquet() -> Result<()> { .sql( "SELECT * \ FROM relative_table \ + ORDER BY speed \ LIMIT 1", ) .await?; @@ -198,13 +221,14 @@ async fn query_parquet() -> Result<()> { let results = df.collect().await?; assert_batches_eq!( [ - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col | string_col | timestamp_col |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", - "| 4 | true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 30332f30312f3039 | 30 | 2009-03-01T00:00:00 |", - "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+", + "+-----+-------+---------------------+", + "| car | speed | time |", + "+-----+-------+---------------------+", + "| red | 0.0 | 1996-04-12T12:05:15 |", + "+-----+-------+---------------------+", ], - &results); + &results + ); // Reset the current directory std::env::set_current_dir(cur_dir)?; From 7d35fccfc97bac21ca1da118614a98a1a9888514 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Wed, 24 Dec 2025 09:19:48 +0300 Subject: [PATCH 06/13] fix fmt issues --- datafusion-examples/examples/dataframe/dataframe.rs | 2 +- datafusion-examples/examples/execution_monitoring/tracing.rs | 4 ++-- datafusion-examples/examples/query_planning/plan_to_sql.rs | 2 +- datafusion-examples/examples/sql_ops/query.rs | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion-examples/examples/dataframe/dataframe.rs b/datafusion-examples/examples/dataframe/dataframe.rs index 85cb7519e8b25..b6cc6540f5ce0 100644 --- a/datafusion-examples/examples/dataframe/dataframe.rs +++ b/datafusion-examples/examples/dataframe/dataframe.rs @@ -33,7 +33,7 @@ use datafusion::error::Result; use datafusion::functions_aggregate::average::avg; use datafusion::functions_aggregate::min_max::max; use datafusion::prelude::*; -use tempfile::{tempdir, TempDir}; +use tempfile::{TempDir, tempdir}; use tokio::fs::create_dir_all; /// This example demonstrates using DataFusion's DataFrame API diff --git a/datafusion-examples/examples/execution_monitoring/tracing.rs b/datafusion-examples/examples/execution_monitoring/tracing.rs index df011b6174f80..3d9445597019c 100644 --- a/datafusion-examples/examples/execution_monitoring/tracing.rs +++ b/datafusion-examples/examples/execution_monitoring/tracing.rs @@ -61,11 +61,11 @@ use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; use datafusion::error::Result; use datafusion::prelude::*; -use futures::future::BoxFuture; use futures::FutureExt; +use futures::future::BoxFuture; use tempfile::TempDir; use tokio::fs::create_dir_all; -use tracing::{info, instrument, Instrument, Level, Span}; +use tracing::{Instrument, Level, Span, info, instrument}; /// Demonstrates the tracing injection feature for the DataFusion runtime pub async fn tracing() -> Result<()> { diff --git a/datafusion-examples/examples/query_planning/plan_to_sql.rs b/datafusion-examples/examples/query_planning/plan_to_sql.rs index ceb5e026315b8..ba2db08780c23 100644 --- a/datafusion-examples/examples/query_planning/plan_to_sql.rs +++ b/datafusion-examples/examples/query_planning/plan_to_sql.rs @@ -40,7 +40,7 @@ use datafusion::sql::unparser::extension_unparser::UserDefinedLogicalNodeUnparse use datafusion::sql::unparser::extension_unparser::{ UnparseToStatementResult, UnparseWithinStatementResult, }; -use datafusion::sql::unparser::{plan_to_sql, Unparser}; +use datafusion::sql::unparser::{Unparser, plan_to_sql}; use tempfile::TempDir; use tokio::fs::create_dir_all; diff --git a/datafusion-examples/examples/sql_ops/query.rs b/datafusion-examples/examples/sql_ops/query.rs index 2f4d61446e5b0..c2a37bee6b068 100644 --- a/datafusion-examples/examples/sql_ops/query.rs +++ b/datafusion-examples/examples/sql_ops/query.rs @@ -20,7 +20,7 @@ use std::path::PathBuf; use std::sync::Arc; -use datafusion::arrow::array::{UInt64Array, UInt8Array}; +use datafusion::arrow::array::{UInt8Array, UInt64Array}; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::{assert_batches_eq, exec_datafusion_err}; From e6b30ac05d6b9e861b1cb11171d8a9d49b1c3f3d Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Wed, 24 Dec 2025 11:32:02 +0300 Subject: [PATCH 07/13] fix fmt issues --- datafusion-examples/examples/execution_monitoring/tracing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/examples/execution_monitoring/tracing.rs b/datafusion-examples/examples/execution_monitoring/tracing.rs index 3d9445597019c..bf68c6b5346fb 100644 --- a/datafusion-examples/examples/execution_monitoring/tracing.rs +++ b/datafusion-examples/examples/execution_monitoring/tracing.rs @@ -55,7 +55,7 @@ use std::any::Any; use std::path::PathBuf; use std::sync::Arc; -use datafusion::common::runtime::{set_join_set_tracer, JoinSetTracer}; +use datafusion::common::runtime::{JoinSetTracer, set_join_set_tracer}; use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; From 449f3aa787d5e1a88b63eb66b46e02c25a107a75 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Wed, 24 Dec 2025 12:59:19 +0300 Subject: [PATCH 08/13] Fix issues causing GitHub checks to fail --- datafusion-examples/examples/sql_ops/query.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion-examples/examples/sql_ops/query.rs b/datafusion-examples/examples/sql_ops/query.rs index c2a37bee6b068..d9c9a1c0d1274 100644 --- a/datafusion-examples/examples/sql_ops/query.rs +++ b/datafusion-examples/examples/sql_ops/query.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use datafusion::arrow::array::{UInt8Array, UInt64Array}; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::catalog::MemTable; use datafusion::common::{assert_batches_eq, exec_datafusion_err}; use datafusion::dataframe::DataFrameWriteOptions; use datafusion::datasource::file_format::parquet::ParquetFormat; From 7dc8a50e6b7e1edd308d41f6799b7c1d811bbb03 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Thu, 25 Dec 2025 16:30:06 +0300 Subject: [PATCH 09/13] replace window_1.csv with cars.csv --- .../custom_data_source/csv_sql_streaming.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs index 45150199c9430..ca4cac0d2c786 100644 --- a/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs +++ b/datafusion-examples/examples/custom_data_source/csv_sql_streaming.rs @@ -31,14 +31,14 @@ pub async fn csv_sql_streaming() -> Result<()> { let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("data") .join("csv") - .join("window_1.csv"); + .join("cars.csv"); - // Register a table source and tell DataFusion the file is ordered by `ts ASC`. + // Register a table source and tell DataFusion the file is ordered by `car ASC`. // Note it is the responsibility of the user to make sure // that file indeed satisfies this condition or else incorrect answers may be produced. let asc = true; let nulls_first = true; - let sort_expr = vec![col("ts").sort(asc, nulls_first)]; + let sort_expr = vec![col("car").sort(asc, nulls_first)]; // register csv file with the execution context ctx.register_csv( "ordered_table", @@ -48,16 +48,16 @@ pub async fn csv_sql_streaming() -> Result<()> { .await?; // execute the query - // Following query can be executed with unbounded sources because group by expressions (e.g ts) is + // Following query can be executed with unbounded sources because group by expressions (e.g car) is // already ordered at the source. // // Unbounded sources means that if the input came from a "never ending" source (such as a FIFO // file on unix) the query could produce results incrementally as data was read. let df = ctx .sql( - "SELECT ts, MIN(inc_col), MAX(inc_col) \ + "SELECT car, MIN(speed), MAX(speed) \ FROM ordered_table \ - GROUP BY ts", + GROUP BY car", ) .await?; @@ -68,7 +68,7 @@ pub async fn csv_sql_streaming() -> Result<()> { // its result in streaming fashion, because its required ordering is already satisfied at the source. let df = ctx .sql( - "SELECT ts, SUM(inc_col) OVER(ORDER BY ts ASC) \ + "SELECT car, SUM(speed) OVER(ORDER BY car ASC) \ FROM ordered_table", ) .await?; From fb1af66a5e5e10b729cda2ac877dd896e1ab0198 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Thu, 25 Dec 2025 17:11:26 +0300 Subject: [PATCH 10/13] remove window_1.csv --- datafusion-examples/data/csv/window_1.csv | 101 ---------------------- 1 file changed, 101 deletions(-) delete mode 100644 datafusion-examples/data/csv/window_1.csv diff --git a/datafusion-examples/data/csv/window_1.csv b/datafusion-examples/data/csv/window_1.csv deleted file mode 100644 index 588af16d06863..0000000000000 --- a/datafusion-examples/data/csv/window_1.csv +++ /dev/null @@ -1,101 +0,0 @@ -ts,inc_col,desc_col -1,1,100 -1,5,98 -5,10,93 -9,15,91 -10,20,86 -11,21,84 -16,26,81 -21,29,77 -22,30,75 -26,33,71 -26,37,70 -28,40,69 -31,43,64 -33,44,62 -38,45,59 -42,49,55 -47,51,50 -51,53,45 -53,58,41 -53,61,40 -58,65,39 -63,70,36 -67,75,31 -68,78,28 -70,83,23 -72,88,22 -72,90,17 -76,91,13 -81,95,10 -85,97,6 -86,100,5 -88,105,2 -91,109,1 -96,111,-1 -97,115,-4 -98,119,-5 -100,120,-6 -101,124,-8 -102,126,-12 -104,129,-16 -104,131,-17 -108,135,-19 -112,140,-24 -113,143,-25 -113,144,-29 -114,147,-34 -114,148,-37 -117,149,-42 -122,151,-47 -126,155,-48 -131,156,-49 -131,159,-53 -136,160,-57 -136,163,-58 -136,165,-61 -139,170,-65 -141,172,-67 -146,177,-68 -147,181,-71 -147,182,-73 -152,186,-75 -154,187,-76 -159,192,-78 -161,196,-83 -163,197,-87 -164,199,-91 -167,203,-95 -172,207,-98 -173,209,-101 -177,213,-105 -180,214,-106 -185,216,-111 -186,219,-114 -191,221,-116 -195,222,-120 -195,225,-125 -199,226,-128 -203,231,-129 -207,236,-134 -210,237,-139 -213,242,-142 -218,245,-143 -221,247,-146 -224,248,-150 -226,253,-154 -230,254,-158 -232,259,-163 -235,261,-168 -238,266,-172 -238,269,-176 -239,272,-181 -244,275,-184 -245,278,-189 -247,283,-193 -250,286,-196 -254,289,-201 -258,291,-203 -262,296,-208 -264,301,-210 -264,305,-213 From 396193bf10761b11838580baf27d1845b6364c3a Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Thu, 25 Dec 2025 17:31:15 +0300 Subject: [PATCH 11/13] update README with files examples --- datafusion-examples/data/README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion-examples/data/README.md b/datafusion-examples/data/README.md index df48c831c8d50..e8296a8856e60 100644 --- a/datafusion-examples/data/README.md +++ b/datafusion-examples/data/README.md @@ -19,9 +19,7 @@ ## Example datasets -| Filename | Path | Description | -| ------------------------ | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `cars.csv` | [`data/csv/cars.csv`](./csv/cars.csv) | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames). | -| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. | -| `window_1.csv` | [`data/csv/window_1.csv`](./csv/window_1.csv) | Numeric dataset designed for window function demonstrations. Includes ordering keys and incremental values suitable for running totals, ranking, and frame-based calculations. | -| `alltypes_plain.parquet` | [`data/parquet/alltypes_plain.parquet`](./parquet/alltypes_plain.parquet) | Parquet file containing columns of many Arrow/DataFusion-supported types (boolean, integers, floating-point, strings, timestamps). Used to demonstrate Parquet scanning, schema inference, and typed execution. | +| Filename | Path | Description | +| ----------- | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `cars.csv` | [`data/csv/cars.csv`](./csv/cars.csv) | Time-series–like dataset containing car identifiers, speed values, and timestamps. Used in window function and time-based query examples (e.g. ordering, window frames). | +| `regex.csv` | [`data/csv/regex.csv`](./csv/regex.csv) | Dataset for regular expression examples. Contains input values, regex patterns, replacement strings, and optional flags. Covers ASCII, Unicode, and locale-specific text processing. | From 40b1e60f719a5c2bd3e0a167aa764a112a669bdc Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Thu, 25 Dec 2025 18:15:54 +0300 Subject: [PATCH 12/13] update cache_factory example to use cars.csv instead of alltypes_plain.parquet --- datafusion-examples/README.md | 9 +++-- .../examples/dataframe/cache_factory.rs | 38 +++++++++++++++---- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 1469aad5417b8..88322fe09abe8 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -106,10 +106,11 @@ cargo run --example dataframe -- dataframe #### Category: Single Process -| Subcommand | File Path | Description | -| --------------------- | ----------------------------------------------------------------------------------- | ------------------------------------------------------ | -| dataframe | [`dataframe/dataframe.rs`](examples/dataframe/dataframe.rs) | Query DataFrames from various sources and write output | -| deserialize_to_struct | [`dataframe/deserialize_to_struct.rs`](examples/dataframe/deserialize_to_struct.rs) | Convert Arrow arrays into Rust structs | +| Subcommand | File Path | Description | +| --------------------- | ----------------------------------------------------------------------------------- | ------------------------------------------------------- | +| cache_factory | [`dataframe/cache_factory.rs`](examples/dataframe/cache_factory.rs) | Custom lazy caching for DataFrames using `CacheFactory` | +| dataframe | [`dataframe/dataframe.rs`](examples/dataframe/dataframe.rs) | Query DataFrames from various sources and write output | +| deserialize_to_struct | [`dataframe/deserialize_to_struct.rs`](examples/dataframe/deserialize_to_struct.rs) | Convert Arrow arrays into Rust structs | ## Execution Monitoring Examples diff --git a/datafusion-examples/examples/dataframe/cache_factory.rs b/datafusion-examples/examples/dataframe/cache_factory.rs index a6c465720c626..c76e3796b9165 100644 --- a/datafusion-examples/examples/dataframe/cache_factory.rs +++ b/datafusion-examples/examples/dataframe/cache_factory.rs @@ -19,6 +19,7 @@ use std::fmt::Debug; use std::hash::Hash; +use std::path::PathBuf; use std::sync::Arc; use std::sync::RwLock; @@ -26,6 +27,7 @@ use arrow::array::RecordBatch; use async_trait::async_trait; use datafusion::catalog::memory::MemorySourceConfig; use datafusion::common::DFSchemaRef; +use datafusion::dataframe::DataFrameWriteOptions; use datafusion::error::Result; use datafusion::execution::SessionState; use datafusion::execution::SessionStateBuilder; @@ -44,6 +46,8 @@ use datafusion::prelude::ParquetReadOptions; use datafusion::prelude::SessionContext; use datafusion::prelude::*; use datafusion_common::HashMap; +use tempfile::TempDir; +use tokio::fs::create_dir_all; /// This example demonstrates how to leverage [CacheFactory] to implement custom caching strategies for dataframes in DataFusion. /// By default, [DataFrame::cache] in Datafusion is eager and creates an in-memory table. This example shows a basic alternative implementation for lazy caching. @@ -53,28 +57,46 @@ use datafusion_common::HashMap; /// - A [CacheNodeQueryPlanner] that installs [CacheNodePlanner]. /// - A simple in-memory [CacheManager] that stores cached [RecordBatch]es. Note that the implementation for this example is very naive and only implements put, but for real production use cases cache eviction and drop should also be implemented. pub async fn cache_dataframe_with_custom_logic() -> Result<()> { - let testdata = datafusion::test_util::parquet_test_data(); - let filename = &format!("{testdata}/alltypes_plain.parquet"); - let session_state = SessionStateBuilder::new() .with_cache_factory(Some(Arc::new(CustomCacheFactory {}))) .with_query_planner(Arc::new(CacheNodeQueryPlanner::default())) .build(); let ctx = SessionContext::new_with_state(session_state); + // Load CSV into an in-memory DataFrame, then materialize it to Parquet. + // This replaces a static parquet fixture and makes the example self-contained + // without requiring DataFusion test files. + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("csv") + .join("cars.csv"); + let csv_df = ctx + .read_csv(path.to_str().unwrap(), CsvReadOptions::default()) + .await?; + let tmp_source = TempDir::new()?; + let out_dir = tmp_source.path().join("parquet_source"); + create_dir_all(&out_dir).await?; + csv_df + .write_parquet( + out_dir.to_str().unwrap(), + DataFrameWriteOptions::default(), + None, + ) + .await?; + // Read the parquet files and show its schema using 'describe' let parquet_df = ctx - .read_parquet(filename, ParquetReadOptions::default()) + .read_parquet(out_dir.to_str().unwrap(), ParquetReadOptions::default()) .await?; let df_cached = parquet_df - .select_columns(&["id", "bool_col", "timestamp_col"])? - .filter(col("id").gt(lit(1)))? + .select_columns(&["car", "speed", "time"])? + .filter(col("speed").gt(lit(1.0)))? .cache() .await?; - let df1 = df_cached.clone().filter(col("bool_col").is_true())?; - let df2 = df1.clone().sort(vec![col("id").sort(true, false)])?; + let df1 = df_cached.clone().filter(col("car").eq(lit("red")))?; + let df2 = df1.clone().sort(vec![col("car").sort(true, false)])?; // should see log for caching only once df_cached.show().await?; From 445dbff308dcbbc0ec57bdc71b3effc201c4c458 Mon Sep 17 00:00:00 2001 From: Sergey Zhukov Date: Fri, 26 Dec 2025 12:15:16 +0300 Subject: [PATCH 13/13] add cache_factory example in usage doc in main --- datafusion-examples/examples/dataframe/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/examples/dataframe/main.rs b/datafusion-examples/examples/dataframe/main.rs index 9a2604e97136d..cff18436a6a4e 100644 --- a/datafusion-examples/examples/dataframe/main.rs +++ b/datafusion-examples/examples/dataframe/main.rs @@ -21,7 +21,7 @@ //! //! ## Usage //! ```bash -//! cargo run --example dataframe -- [all|dataframe|deserialize_to_struct] +//! cargo run --example dataframe -- [all|dataframe|deserialize_to_struct|cache_factory] //! ``` //! //! Each subcommand runs a corresponding example: