|
| 1 | +# frozen_string_literal: true |
| 2 | + |
| 3 | +require 'duckdb' |
| 4 | +require 'csv' |
| 5 | + |
| 6 | +require 'stringio' |
| 7 | + |
| 8 | +module DuckDB |
| 9 | + class Connection |
| 10 | + def register_as_table_with_table_function(name, io, csv_options: {}) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength |
| 11 | + csv = CSV.new(io, **csv_options) |
| 12 | + headers = csv.first.headers |
| 13 | + csv.rewind |
| 14 | + columns = headers.each_with_object({}) { |header, hash| hash[header] = LogicalType::VARCHAR } |
| 15 | + tf = DuckDB::TableFunction.create( |
| 16 | + name:, |
| 17 | + columns: |
| 18 | + ) do |_func_info, output| |
| 19 | + line = csv.readline |
| 20 | + if line |
| 21 | + line.each_with_index do |cell, index| |
| 22 | + output.set_value(index, 0, cell[1]) |
| 23 | + end |
| 24 | + 1 |
| 25 | + else |
| 26 | + csv.rewind |
| 27 | + 0 |
| 28 | + end |
| 29 | + end |
| 30 | + register_table_function(tf) |
| 31 | + end |
| 32 | + |
| 33 | + def register_as_table_with_create_table(name, io, csv_options: {}) |
| 34 | + csv = CSV.new(io, **csv_options) |
| 35 | + headers = csv.first.headers |
| 36 | + csv.rewind |
| 37 | + execute("CREATE OR REPLACE TABLE #{name} (#{headers.map { |h| "#{h} VARCHAR" }.join(', ')})") |
| 38 | + csv.each do |row| |
| 39 | + values = row.map { |cell| "'#{cell[1]}'" }.join(', ') |
| 40 | + execute("INSERT INTO #{name} VALUES (#{values})") |
| 41 | + end |
| 42 | + end |
| 43 | + end |
| 44 | +end |
| 45 | + |
| 46 | +csv_data = 'id,name,age' |
| 47 | +csv_data += 100_000.times.map { |i| "\n#{i + 1},Name#{i + 1},#{rand(0..100)}" }.join |
| 48 | +csv_io = StringIO.new(csv_data) |
| 49 | + |
| 50 | +db = DuckDB::Database.open |
| 51 | +con = db.connect |
| 52 | +con.query('SET threads=1') |
| 53 | + |
| 54 | +start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) |
| 55 | +con.register_as_table_with_table_function('csv_tf', csv_io, csv_options: { headers: true }) |
| 56 | +con.query('SELECT * FROM csv_tf()').to_a |
| 57 | +end_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) |
| 58 | + |
| 59 | +start_time2 = Process.clock_gettime(Process::CLOCK_MONOTONIC) |
| 60 | +con.register_as_table_with_create_table('csv_ct', csv_io, csv_options: { headers: true }) |
| 61 | +con.query('SELECT * FROM csv_ct').to_a |
| 62 | +end_time2 = Process.clock_gettime(Process::CLOCK_MONOTONIC) |
| 63 | + |
| 64 | +con.close |
| 65 | +db.close |
| 66 | + |
| 67 | +puts "Time taken for table function approach: #{end_time - start_time} seconds" |
| 68 | +puts "Time taken for CREATE TABLE approach: #{end_time2 - start_time2} seconds" |
0 commit comments