Reading Apache Parquet Files#
This tutorial provides an example of how to read data from an Apache Parquet file, and also covers how to process and transform the data with Grain.
Generate a test Parquet file locally#
import pyarrow as pa
import pyarrow.parquet as pq
# Generate a sample PyArrow table containing email subjects and bodies.
table = pa.table({
    'email_subject': [
        "Meeting Reminder: Project X Update",
        "Important Announcement Regarding Company Policy",
        "FWD: Quick Question",
        "Your Order Confirmation #12345",
        "Invitation to Team Building Activity"
    ],
    'email_body': [
        "Hi team,\n\nJust a reminder about our Project X update meeting tomorrow at 10 AM PST. Please come prepared to discuss your progress and any roadblocks.\n\nSee you there,\n[Your Name]",
        "Dear employees,\n\nPlease be advised of a new company policy regarding remote work, effective May 1st, 2025. You can find the full details on the company intranet.\n\nRegards,\nManagement",
        "Hi [Name],\n\nForwarding you this email as you might have the answer to this quick question:\n\n[Original Email Content]",
        "Dear [Customer Name],\n\nThank you for your recent order! This email confirms your order #12345. You can view the details and track its shipment here: [Link]\n\nSincerely,\nThe [Company Name] Team",
        "Hello everyone,\n\nYou're invited to participate in our upcoming team building activity on Friday, April 28th. It will be a fun afternoon of [Activity]. Please RSVP by Wednesday.\n\nBest,\n[Organizer Name]"
    ]
})
# Write this table to a parquet file.
writer = pq.ParquetWriter('emails.parquet', table.schema)
writer.write_table(table)
writer.close()
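As an optional sanity check (not part of the Grain pipeline itself), you can read the file back with PyArrow to confirm the schema and row count before moving on:

# Optional: read the file back to verify it was written as expected.
check = pq.read_table('emails.parquet')
print(check.schema)
print(check.num_rows)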
Load Dataset#
# Install Grain
!pip install grain
import grain
import pprint
ds = grain.experimental.ParquetIterDataset('./emails.parquet')
list(ds)[0]
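Note that list(ds) materializes every record in memory, which is fine for this small example file. For larger Parquet files you would typically iterate the dataset lazily instead. A minimal sketch (assuming each element is a dict keyed by column name, which the transform below also relies on):

# Iterate lazily rather than loading the whole file into memory.
for record in ds:
    print(record['email_subject'])
    break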
Transform Dataset#
# Load a pre-trained tokenizer.
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_pretrained("bert-base-cased")
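Before wrapping the tokenizer in a Grain transform, it helps to see what it produces: encode() returns an encoding whose .tokens attribute holds the token strings used below. For example:

# Quick check of the tokenizer output; `.tokens` is the list of token strings.
print(tokenizer.encode("Your Order Confirmation #12345").tokens)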
class TokenizeText(grain.transforms.Map):
    """Tokenizes the text values within each element using a provided tokenizer."""

    def __init__(self, tokenizer):
        self._tokenizer = tokenizer

    def map(self, element):
        return [self._tokenizer.encode(item).tokens for item in element.values()]
# Tokenize the data using the provided tokenizer.
ds = ds.map(TokenizeText(tokenizer))
# Create an iterator object of the dataset.
ds_iter = iter(ds)
# Print the first element in the dataset.
pprint.pprint(next(ds_iter))
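The same pattern extends to further transformations by chaining additional map calls. As a minimal sketch (the TruncateTokens class and the max_len value are illustrative, not part of the original tutorial or Grain's API), a second Map transform could cap the length of each token list:

class TruncateTokens(grain.transforms.Map):
    """Truncates each token list in an element to at most `max_len` tokens."""

    def __init__(self, max_len):
        self._max_len = max_len

    def map(self, element):
        # `element` is the list of token lists produced by TokenizeText above.
        return [tokens[:self._max_len] for tokens in element]

# Chain the extra transform and inspect the first (now truncated) element.
ds = ds.map(TruncateTokens(max_len=16))
pprint.pprint(next(iter(ds)))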