If you need more information about the objects and buckets in your S3 data lake, the quickest way to get it is usually the AWS SDK or the AWS CLI.
In this example, we’ll be using the boto3 Python SDK to achieve two goals:
- Find all bucket names and prefixes
- Find all bucket names and keys
Pre-requisites:
- Configure the AWS CLI on your machine
- Install Python
- Install boto3 (run: pip install boto3)
Find All Bucket Names and Prefixes
# List every S3 bucket together with each of its top-level prefixes.
import boto3

s3_client = boto3.client("s3")
s3_resource = boto3.resource("s3")
paginator = s3_client.get_paginator("list_objects_v2")


def get_matching_s3_objects(bucket):
    """
    Generate all CommonPrefixes entries in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :return: Yields each entry of every page's "CommonPrefixes" list.
    """
    kwargs = {"Bucket": bucket, "Delimiter": "/"}
    for page in paginator.paginate(**kwargs):
        # A page that contains only root-level objects has no
        # "CommonPrefixes" key. Skip it instead of stopping: later pages
        # may still carry prefixes.
        yield from page.get("CommonPrefixes", [])


def get_matching_s3_prefixes(bucket):
    """
    Retrieve just the Prefix from CommonPrefixes.

    :param bucket: Name of the S3 bucket.
    :return: Yields the "Prefix" value of each CommonPrefixes entry.
    """
    for obj in get_matching_s3_objects(bucket):
        yield obj["Prefix"]


def main():
    """Print one "<bucket>,<prefix>" line per top-level prefix of each bucket."""
    for bucket in s3_resource.buckets.all():
        try:
            for prefix in get_matching_s3_prefixes(bucket.name):
                # Strip the trailing delimiter so only the prefix name is shown.
                prefix = prefix.rstrip("/")
                print(f"{bucket.name},{prefix}")
        except s3_client.exceptions.ClientError:
            # Only service-side failures (e.g. access denied) are reported
            # here; Ctrl-C and programming errors still propagate.
            print(f"Cannot access bucket: {bucket.name}")


if __name__ == "__main__":
    main()
Find All Bucket Names and Keys
# List every object key in every S3 bucket visible to the caller.
import boto3

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")


def get_matching_s3_objects(bucket):
    """
    Generate objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :return: Yields each entry of every page's "Contents" list.
    """
    for page in paginator.paginate(Bucket=bucket):
        # An empty listing page has no "Contents" key; treat it as no objects.
        yield from page.get("Contents", [])


def get_matching_s3_keys(bucket):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :return: Yields the "Key" value of each object entry.
    """
    for obj in get_matching_s3_objects(bucket):
        yield obj["Key"]


def main():
    """Print one "<bucket>/<key>" line per object, across all buckets."""
    for entry in s3.list_buckets()["Buckets"]:
        bucket = entry["Name"]
        try:
            for key in get_matching_s3_keys(bucket):
                print(f"{bucket}/{key}")
        except s3.exceptions.ClientError:
            # Keep going when a single bucket cannot be listed
            # (e.g. access denied) instead of aborting the whole run.
            print(f"Cannot access bucket: {bucket}")


if __name__ == "__main__":
    main()