Skip to content

Textract Handler API

auris_tools.textractHandler

TextractHandler

Handler for Amazon Textract operations to extract text from documents.

This class provides methods to interact with AWS Textract service for text extraction from documents stored in S3.

Source code in auris_tools/textractHandler.py
class TextractHandler:
    """
    Handler for Amazon Textract operations to extract text from documents.

    Wraps a boto3 Textract client and exposes helpers to start an
    asynchronous text detection job for a document stored in S3, query its
    status, fetch the paginated results and flatten them into plain text.
    """

    def __init__(self, config=None):
        """
        Initialize the Textract handler with AWS configuration.

        Args:
            config: An AWSConfiguration object, or None to create a default
                AWSConfiguration()
        """
        if config is None:
            config = AWSConfiguration()

        # Session-level and client-level arguments both come from the config
        session = boto3.session.Session(**config.get_boto3_session_args())
        self.client = session.client('textract', **config.get_client_args())
        logging.info(f'Initialized Textract client in region {config.region}')

    def start_job(self, s3_bucket_name, object_name):
        """
        Start an asynchronous text detection job for a document in S3.

        Args:
            s3_bucket_name: Name of the S3 bucket containing the document
            object_name: Object key of the document in the S3 bucket

        Returns:
            str: The JobId of the started Textract job

        Raises:
            Exception: Any error raised while starting the job is logged
                and re-raised unchanged
        """
        try:
            response = self.client.start_document_text_detection(
                DocumentLocation={
                    'S3Object': {'Bucket': s3_bucket_name, 'Name': object_name}
                }
            )
            job_id = response['JobId']
            logging.info(
                f'Started Textract job {job_id} for {s3_bucket_name}/{object_name}'
            )
            return job_id
        except Exception as e:
            logging.error(
                f'Error starting Textract job for {s3_bucket_name}/{object_name}: {str(e)}'
            )
            raise

    def get_job_status(self, job_id):
        """
        Get the status of a Textract job.

        Args:
            job_id: ID of the Textract job

        Returns:
            str: The job status (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')

        Raises:
            Exception: Any error raised while querying the job is logged
                and re-raised unchanged
        """
        try:
            response = self.client.get_document_text_detection(JobId=job_id)
            status = response['JobStatus']
            logging.info(f'Textract job {job_id} status: {status}')
            return status
        except Exception as e:
            logging.error(
                f'Error getting status for Textract job {job_id}: {str(e)}'
            )
            raise

    def is_job_complete(self, job_id, poll_delay=1):
        """
        Pause briefly, then return the current status of a Textract job.

        Despite its name, this method does not return a boolean: it returns
        the status string from get_job_status(), so callers must compare it
        against the status they are waiting for.

        Args:
            job_id: ID of the Textract job
            poll_delay: Seconds to sleep before querying, to avoid rate
                limiting. Defaults to 1; a falsy or non-positive value
                skips the sleep.

        Returns:
            str: The job status
        """
        if poll_delay and poll_delay > 0:
            time.sleep(poll_delay)  # Avoid rate limiting
        return self.get_job_status(job_id)

    def get_job_results(self, job_id, page_delay=1):
        """
        Get the results of a Textract job.

        This method handles pagination of results automatically: it keeps
        requesting pages for as long as the previous response carries a
        non-empty 'NextToken'. It does not inspect 'JobStatus' itself.

        Args:
            job_id: ID of the Textract job
            page_delay: Seconds to sleep between consecutive page requests,
                to avoid rate limiting. Defaults to 1; a falsy or
                non-positive value skips the sleep.

        Returns:
            list: List of response pages from Textract, in request order

        Raises:
            Exception: Any error raised while fetching a page is logged
                and re-raised unchanged
        """
        pages = []
        request_args = {'JobId': job_id}

        try:
            while True:
                response = self.client.get_document_text_detection(
                    **request_args
                )
                pages.append(response)
                logging.info(
                    f'Received page {len(pages)} of results for job {job_id}'
                )

                next_token = response.get('NextToken')
                if not next_token:
                    return pages

                # Carry the continuation token into the next request
                request_args['NextToken'] = next_token
                if page_delay and page_delay > 0:
                    time.sleep(page_delay)  # Avoid rate limiting
        except Exception as e:
            logging.error(
                f'Error getting results for Textract job {job_id}: {str(e)}'
            )
            raise

    def get_full_text(self, response):
        """
        Extract the full text from Textract response pages.

        Only blocks whose 'BlockType' is 'LINE' contribute; their 'Text'
        values are joined with single spaces in the order encountered.

        Args:
            response: List of response pages from Textract. A single
                response page (a dict) is also accepted and treated as a
                one-page list.

        Returns:
            str: The full extracted text as a string, or '' if the input
                could not be processed (the error is logged, not raised)
        """
        try:
            # Iterating a bare dict would yield its keys, so wrap it first
            pages = [response] if isinstance(response, dict) else response

            text_lines = []
            for result_page in pages:
                # 'Blocks' may be absent or None on a page without content
                blocks = result_page.get('Blocks') or []
                text_lines.extend(
                    item.get('Text', '')
                    for item in blocks
                    if item.get('BlockType') == 'LINE'
                )

            return ' '.join(text_lines)
        except Exception as e:
            logging.error(
                f'Error extracting full text from Textract response: {str(e)}'
            )
            return ''
__init__
__init__(config = None)

Initialize the Textract handler with AWS configuration.

Parameters:

Name Type Description Default
config

An AWSConfiguration object, or None to use environment variables

None
Source code in auris_tools/textractHandler.py
def __init__(self, config=None):
    """
    Set up the Textract client from an AWS configuration.

    Args:
        config: AWSConfiguration instance to use; when None, a default
            AWSConfiguration() is created
    """
    config = AWSConfiguration() if config is None else config

    # Session-level arguments come from the configuration object
    session_args = config.get_boto3_session_args()
    boto_session = boto3.session.Session(**session_args)

    # Client-level arguments are supplied by the same configuration
    client_args = config.get_client_args()
    self.client = boto_session.client('textract', **client_args)

    logging.info(f'Initialized Textract client in region {config.region}')
get_full_text
get_full_text(response)

Extract the full text from Textract response pages.

Parameters:

Name Type Description Default
response

List of response pages from Textract

required

Returns:

Name Type Description
str

The full extracted text as a string

Source code in auris_tools/textractHandler.py
def get_full_text(self, response):
    """
    Extract the full text from Textract response pages.

    Only blocks whose 'BlockType' is 'LINE' contribute; their 'Text'
    values are joined with single spaces in the order encountered.

    Args:
        response: List of response pages from Textract. A single response
            page (a dict) is also accepted and treated as a one-page list.

    Returns:
        str: The full extracted text as a string, or '' if the input
            could not be processed (the error is logged, not raised)
    """
    try:
        # Iterating a bare dict would yield its keys, so wrap it first
        pages = [response] if isinstance(response, dict) else response

        text_lines = []
        for result_page in pages:
            # 'Blocks' may be absent or None on a page without content
            blocks = result_page.get('Blocks') or []
            text_lines.extend(
                item.get('Text', '')
                for item in blocks
                if item.get('BlockType') == 'LINE'
            )

        return ' '.join(text_lines)
    except Exception as e:
        logging.error(
            f'Error extracting full text from Textract response: {str(e)}'
        )
        return ''
get_job_results
get_job_results(job_id)

Get the results of a completed Textract job.

This method handles pagination of results automatically.

Parameters:

Name Type Description Default
job_id

ID of the Textract job

required

Returns:

Name Type Description
list

List of response pages from Textract

Source code in auris_tools/textractHandler.py
def get_job_results(self, job_id, page_delay=1):
    """
    Get the results of a Textract job.

    This method handles pagination of results automatically: it keeps
    requesting pages for as long as the previous response carries a
    non-empty 'NextToken'. It does not inspect 'JobStatus' itself.

    Args:
        job_id: ID of the Textract job
        page_delay: Seconds to sleep between consecutive page requests, to
            avoid rate limiting. Defaults to 1; a falsy or non-positive
            value skips the sleep.

    Returns:
        list: List of response pages from Textract, in request order

    Raises:
        Exception: Any error raised while fetching a page is logged and
            re-raised unchanged
    """
    pages = []
    request_args = {'JobId': job_id}

    try:
        while True:
            response = self.client.get_document_text_detection(**request_args)
            pages.append(response)
            logging.info(
                f'Received page {len(pages)} of results for job {job_id}'
            )

            next_token = response.get('NextToken')
            if not next_token:
                return pages

            # Carry the continuation token into the next request
            request_args['NextToken'] = next_token
            if page_delay and page_delay > 0:
                time.sleep(page_delay)  # Avoid rate limiting
    except Exception as e:
        logging.error(
            f'Error getting results for Textract job {job_id}: {str(e)}'
        )
        raise
get_job_status
get_job_status(job_id)

Get the status of a Textract job.

Parameters:

Name Type Description Default
job_id

ID of the Textract job

required

Returns:

Name Type Description
str

The job status (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')

Source code in auris_tools/textractHandler.py
def get_job_status(self, job_id):
    """
    Look up the current status of a Textract job.

    Args:
        job_id: ID of the Textract job

    Returns:
        str: The job status (e.g., 'IN_PROGRESS', 'SUCCEEDED', 'FAILED')

    Raises:
        Exception: Any error raised while querying the job is logged and
            re-raised unchanged
    """
    try:
        # The status is read from the 'JobStatus' field of the response
        status = self.client.get_document_text_detection(JobId=job_id)[
            'JobStatus'
        ]
        logging.info(f'Textract job {job_id} status: {status}')
    except Exception as e:
        logging.error(
            f'Error getting status for Textract job {job_id}: {str(e)}'
        )
        raise
    else:
        return status
is_job_complete
is_job_complete(job_id)

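Pause briefly to avoid rate limiting, then return the current status of a Textract job. Note that the return value is the status string, not a boolean.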

Parameters:

Name Type Description Default
job_id

ID of the Textract job

required

Returns:

Name Type Description
str

The job status

Source code in auris_tools/textractHandler.py
def is_job_complete(self, job_id, poll_delay=1):
    """
    Pause briefly, then return the current status of a Textract job.

    Despite its name, this method does not return a boolean: it returns the
    status string from get_job_status(), so callers must compare it against
    the status they are waiting for.

    Args:
        job_id: ID of the Textract job
        poll_delay: Seconds to sleep before querying, to avoid rate
            limiting. Defaults to 1; a falsy or non-positive value skips
            the sleep.

    Returns:
        str: The job status
    """
    if poll_delay and poll_delay > 0:
        time.sleep(poll_delay)  # Avoid rate limiting
    return self.get_job_status(job_id)
start_job
start_job(s3_bucket_name, object_name)

Start an asynchronous text detection job for a document in S3.

Parameters:

Name Type Description Default
s3_bucket_name

Name of the S3 bucket containing the document

required
object_name

Object key of the document in the S3 bucket

required

Returns:

Name Type Description
str

The JobId of the started Textract job

Raises:

Type Description
Exception

If there is an error starting the job

Source code in auris_tools/textractHandler.py
def start_job(self, s3_bucket_name, object_name):
    """
    Kick off an asynchronous text detection job for a document in S3.

    Args:
        s3_bucket_name: Name of the S3 bucket containing the document
        object_name: Object key of the document in the S3 bucket

    Returns:
        str: The JobId of the started Textract job

    Raises:
        Exception: Any error raised while starting the job is logged and
            re-raised unchanged
    """
    # Describe where the document lives before talking to the service
    s3_object = {'Bucket': s3_bucket_name, 'Name': object_name}
    document_location = {'S3Object': s3_object}

    try:
        job_id = self.client.start_document_text_detection(
            DocumentLocation=document_location
        )['JobId']
        logging.info(
            f'Started Textract job {job_id} for {s3_bucket_name}/{object_name}'
        )
    except Exception as e:
        logging.error(
            f'Error starting Textract job for {s3_bucket_name}/{object_name}: {str(e)}'
        )
        raise
    else:
        return job_id