Dec 11, 2024
export PYICEBERG_CATALOG__DEFAULT__URI=https://<account-id>.<region>.snowflakecomputing.com/polaris/api/catalog
export PYICEBERG_CATALOG__DEFAULT__CREDENTIAL=<client_id>:<client_secret>
export PYICEBERG_CATALOG__DEFAULT__SCOPE=PRINCIPAL_ROLE:python_app_role
export PYICEBERG_CATALOG__DEFAULT__WAREHOUSE=<warehouse_name>
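If you'd rather not rely on environment variables, PyIceberg also accepts the same settings as keyword arguments to load_catalog. Here is a minimal sketch of the equivalent setup, assuming the same Polaris endpoint, credential, scope, and warehouse placeholders as above:

from pyiceberg.catalog import load_catalog

# Equivalent to the env vars above, passed directly as catalog properties
catalog = load_catalog(
    "default",
    **{
        "uri": "https://<account-id>.<region>.snowflakecomputing.com/polaris/api/catalog",
        "credential": "<client_id>:<client_secret>",
        "scope": "PRINCIPAL_ROLE:python_app_role",
        "warehouse": "<warehouse_name>",
    },
)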
from pyiceberg.catalog import load_catalog
from pyarrow import csv
import fsspec
import os
# The catalog name "default" must match the name used in the config file or in the env variable names
catalog = load_catalog("default")
# get source and destination paths
csv_file_path = os.getenv("csv_file_path")
iceberg_table_name = os.getenv("iceberg_table")
# Download a large CSV file with Japan Trade Statistics (imports and exports of goods by month) from S3
of = fsspec.open(f"filecache://{csv_file_path}", mode='rb',
                 cache_storage='/tmp/cache1',
                 target_protocol='s3', target_options={'anon': True})
# Read the CSV file into a PyArrow table so that we can take its schema and pass it to Iceberg
with of as f:
    artable = csv.read_csv(f)
# create the "default" namespace in our open lakehouse
catalog.create_namespace_if_not_exists("default")
# create or retrieve the table to hold our Japan trade stats
icetable = catalog.create_table_if_not_exists(iceberg_table_name, schema=artable.schema)
# add the contents of the csv file to our iceberg table
icetable.append(artable)
# Scan the table back from S3 to verify the write
row_count = len(icetable.scan().to_arrow())
# print the number of records in the table
print(row_count)
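Once the append commits, the table is readable through the catalog by any Iceberg-aware engine. A scan can also push filters and row limits down before materializing to Arrow. A quick sketch of a filtered read; the "Year" column name is hypothetical, so substitute a column from your CSV's actual schema:

# Hypothetical filtered read; "Year" stands in for a real column in your CSV
preview = icetable.scan(row_filter="Year == 2020", limit=10).to_arrow()
print(preview)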