Как получить строки и абзацы, а не отдельные символы, из результатов OCR Google Vision API для PDF

Я пытаюсь использовать недавно появившуюся поддержку распознавания текста в документах PDF/TIFF (Document Text Detection) в API Google Cloud Vision. Используя их пример кода, я могу отправить PDF-файл и получить обратно объект JSON с извлечённым текстом. Моя проблема в том, что файл JSON, который сохраняется в GCS, содержит ограничивающие рамки и текст только для «символов» (symbols), т. е. для каждого символа в каждом слове. Из-за этого объект JSON получается довольно громоздким и очень неудобным в использовании. Я хотел бы получить текст и ограничивающие рамки для «LINES», «PARAGRAPHS» и «BLOCKS», но не могу найти способ сделать это с помощью AsyncAnnotateFileRequest().

Код примера выглядит следующим образом:

def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """Run asynchronous document OCR on a PDF/TIFF stored in GCS.

    Submits the file at ``gcs_source_uri`` for DOCUMENT_TEXT_DETECTION,
    waits for the operation to finish, lists the JSON output files written
    under ``gcs_destination_uri`` and prints the full text of the first
    page found in the first output file.

    Assumes ``re``, ``vision``, ``storage`` and ``json_format`` are already
    imported by the enclosing module.

    Args:
        gcs_source_uri: ``gs://`` URI of the input PDF/TIFF file.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the
            JSON results are written.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://bucket/prefix``.
    """
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=180)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.group.
        raise ValueError(
            'Expected a destination of the form gs://bucket/prefix, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Passed positionally so the call does not depend on the parameter name.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))

Ответы

Ответ 1

К сожалению, при использовании типа DOCUMENT_TEXT_DETECTION можно получить только полный текст страницы либо отдельные символы. Однако собрать абзацы и строки из символов несложно — примерно такой код должен работать (в продолжение вашего примера):

# Rebuild line- and paragraph-level text from the per-symbol OCR output.
# Expects `vision` and `annotation` (a full_text_annotation) to be in scope.
# Each symbol may carry a detected break saying what follows it: a space,
# an end of line, or an end-of-line hyphenation.
breaks = vision.enums.TextAnnotation.DetectedBreak.BreakType
paragraphs = []
lines = []

for page in annotation.pages:
    for block in page.blocks:
        for paragraph in block.paragraphs:
            para_parts = []  # finished lines belonging to this paragraph
            line = ""
            for word in paragraph.words:
                for symbol in word.symbols:
                    line += symbol.text
                    break_type = symbol.property.detected_break.type
                    if break_type in (breaks.SPACE, breaks.SURE_SPACE):
                        # Regular or very wide space inside a line.
                        line += ' '
                    elif break_type == breaks.EOL_SURE_SPACE:
                        # Line wraps here; keep a space so that words do not
                        # run together when lines are joined into a paragraph.
                        line += ' '
                        lines.append(line)
                        para_parts.append(line)
                        line = ''
                    elif break_type in (breaks.LINE_BREAK, breaks.HYPHEN):
                        # End of line. For HYPHEN the word continues on the
                        # next line, so no separator is added.
                        lines.append(line)
                        para_parts.append(line)
                        line = ''
            if line:
                # Flush text left over when the paragraph's last symbol
                # carries no end-of-line break, so it is not dropped.
                lines.append(line)
                para_parts.append(line)
            paragraphs.append(''.join(para_parts))

print(paragraphs)
print(lines)