Dec 11, 2024
export PYICEBERG_CATALOG__DEFAULT__URI=https://<account-id>.<region>.snowflakecomputing.com/polaris/api/catalog
export PYICEBERG_CATALOG__DEFAULT__CREDENTIAL=<client_id>:<client_secret>
export PYICEBERG_CATALOG__DEFAULT__SCOPE=PRINCIPAL_ROLE:python_app_role
export PYICEBERG_CATALOG__DEFAULT__WAREHOUSE=<warehouse_name>
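If you'd rather not rely on environment variables, PyIceberg also accepts the same settings as keyword arguments to load_catalog. Here is a minimal sketch of the equivalent setup, assuming the same Polaris endpoint, credential, scope, and warehouse placeholders as above:

from pyiceberg.catalog import load_catalog

# Equivalent to the env vars above, passed directly as catalog properties
catalog = load_catalog(
    "default",
    **{
        "uri": "https://<account-id>.<region>.snowflakecomputing.com/polaris/api/catalog",
        "credential": "<client_id>:<client_secret>",
        "scope": "PRINCIPAL_ROLE:python_app_role",
        "warehouse": "<warehouse_name>",
    },
)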
from pyiceberg.catalog import load_catalog
from pyarrow import csv
import fsspec
import os
# The catalog name "default" must match the name used in the config file or in the env variable names
catalog = load_catalog("default")
# get source and destination paths
csv_file_path = os.getenv("csv_file_path")
iceberg_table_name = os.getenv("iceberg_table")
# Download a large CSV file with Japan Trade Statistics (imports and exports of goods by month) from S3
of = fsspec.open(f"filecache://{csv_file_path}", mode='rb',
                 cache_storage='/tmp/cache1',
                 target_protocol='s3', target_options={'anon': True})
# Read the CSV file into a PyArrow table so that we can take its schema and pass it to Iceberg
with of as f:
    artable = csv.read_csv(f)
# create the "default" namespace in our open lakehouse
catalog.create_namespace_if_not_exists("default")
# create or retrieve the table to hold our Japan trade stats
icetable = catalog.create_table_if_not_exists(iceberg_table_name, schema=artable.schema)
# add the contents of the csv file to our iceberg table
icetable.append(artable)
# Scan the table back from S3 to verify the write
row_count = len(icetable.scan().to_arrow())
# print the number of records in the table
print(row_count)
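Once the append commits, the table is readable through the catalog by any Iceberg-aware engine. A scan can also push filters and row limits down before materializing to Arrow. A quick sketch of a filtered read; the "Year" column name is hypothetical, so substitute a column from your CSV's actual schema:

# Hypothetical filtered read; "Year" stands in for a real column in your CSV
preview = icetable.scan(row_filter="Year == 2020", limit=10).to_arrow()
print(preview)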