Organize documents in folders

This page gives a brief overview of folders and explains how to manage documents using folders.

What are Document AI Warehouse folders

A folder is a special type of document. It can't include inline contents or have any associated contents, but users can still add properties to the folder. A folder serves as a container to group and label documents. Users can attach a document to multiple folders and a folder can contain multiple documents. Folders can also be used in documents.search API to filter child documents under a specific folder.

Before you begin

Before you begin, make sure you have completed the Quickstart page.

Create a Document AI Warehouse folder with a schema

To create a schema for instantiating folders, do the following:

Request:

curl -X POST \
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
-H "Content-Type: application/json" -d '{
"display_name": "abc",
"property_definitions": [
  {
    "name": "Name",
    "display_name": "Name",
    "is_repeatable": false,
    "is_filterable": true,
    "is_searchable": true,
    "is_metadata": true,
    "is_required": true,
    "text_type_options": {},
    "schema_sources": []
  }
],
"document_is_folder": true
}' \
"https://contentwarehouse.googleapis.com/v1/projects/406397197218/locations/us/documentSchemas"

Response:

{
  "name": "SCHEMA_NAME",
  "displayName": "abc",
  "documentIsFolder": true,
  "updateTime": "2022-08-31T16:10:43.111978Z",
  "createTime": "2022-08-31T16:10:43.111978Z"
}
from google.cloud import contentwarehouse_v1

def create_folder_schema():
    # Create a client
    client = contentwarehouse_v1.DocumentSchemaServiceClient()

    # Initialize request argument(s)
    document_schema = contentwarehouse_v1.DocumentSchema()
    document_schema.display_name = "display_name_value"
    document_schema.document_is_folder = True

    request = contentwarehouse_v1.CreateDocumentSchemaRequest(
        parent="projects/533503808294/locations/us",
        document_schema=document_schema,
    )

    # Make the request
    return client.create_document_schema(request=request)

Add a document to a Document AI Warehouse folder

To add a document to a folder, do the following:

Request:

curl -X POST \
-H "Authorization: Bearer $(gcloud auth print-access-token)" \
-H "Content-Type: application/json; charset=utf-8" -d '{
"document_link": {
  "source_document_reference": {
    "document_name": "projects/PROJECT_NUMBER/locations/LOCATION/documents/{document_id}"
  },
  "target_document_reference": {
    "document_name": "projects/PROJECT_NUMBER/locations/LOCATION/documents/{document_id}"
  }
},
"requestMetadata": {
  "userInfo": {
    "id": "user:USER_EMAIL_ID"
  }
}
}' \
"https://contentwarehouse.googleapis.com/v1/projects/PROJECT_NUMBER/locations/LOCATION/documents/{document_id}/documentLinks"

Response:

{
  "name": "LINK_NAME",
  "source_document_reference": {
   "document_name": "FOLDER_NAME"
  },
  "target_document_reference": {
   "document_name": "DOCUMENT_NAME"
  }
}
from google.cloud import contentwarehouse_v1

def add_to_folder(folder:str, doc:str):
    # Create a client
    client = contentwarehouse_v1.DocumentLinkServiceClient()

    # Initialize request argument(s)
    link = contentwarehouse_v1.DocumentLink()
    link.source_document_reference = contentwarehouse_v1.DocumentReference()
    link.source_document_reference.document_name = folder
    link.target_document_reference = contentwarehouse_v1.DocumentReference()
    link.target_document_reference.document_name = doc

    request = contentwarehouse_v1.CreateDocumentLinkRequest(
        parent=folder,
        document_link=link,
    )

    # Make the request
    return client.create_document_link(request=request)
Python

For more information, see the Document AI Warehouse Python API reference documentation.

To authenticate to Document AI Warehouse, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


from google.cloud import contentwarehouse

# TODO(developer): Uncomment these variables before running the sample.
# project_number = "YOUR_PROJECT_NUMBER"
# location = "us" # Format is 'us' or 'eu'
# user_id = "user:xxxx@example.com" # Format is "user:xxxx@example.com"


def create_folder(
    project_number: str, location: str, user_id: str
) -> contentwarehouse.Document:
    # Create a Schema Service client
    document_schema_client = contentwarehouse.DocumentSchemaServiceClient()

    # The full resource name of the location, e.g.:
    # projects/{project_number}/locations/{location}
    parent = document_schema_client.common_location_path(
        project=project_number, location=location
    )

    # Define Folder Schema Request
    create_folder_schema_request = contentwarehouse.CreateDocumentSchemaRequest(
        parent=parent,
        document_schema=contentwarehouse.DocumentSchema(
            display_name="Test Folder Schema ",
            document_is_folder=True,
        ),
    )

    # Create a Folder Schema
    folder_schema = document_schema_client.create_document_schema(
        request=create_folder_schema_request
    )

    # Create a Document(Folder) Service client
    folder_client = contentwarehouse.DocumentServiceClient()

    # Define Folder
    folder = contentwarehouse.Document(
        display_name="My Test Folder",
        document_schema_name=folder_schema.name,
    )

    # Define Request to create Folder
    create_folder_request = contentwarehouse.CreateDocumentRequest(
        parent=parent,
        document=folder,
        request_metadata=contentwarehouse.RequestMetadata(
            user_info=contentwarehouse.UserInfo(id=user_id)
        ),
    )

    # Create a Folder for the given schema
    folder_response = folder_client.create_document(request=create_folder_request)

    # Read the output
    print(f"Rule Engine Output: {folder_response.rule_engine_output}")
    print(f"Folder Created: {folder_response.document}")

    return folder_response


def create_document(
    project_number: str, location: str, user_id: str
) -> contentwarehouse.Document:
    # Create a Schema Service client
    document_schema_client = contentwarehouse.DocumentSchemaServiceClient()

    # The full resource name of the location, e.g.:
    # projects/{project_number}/locations/{location}
    parent = document_schema_client.common_location_path(
        project=project_number, location=location
    )

    # Define Schema Property of Text Type
    property_definition = contentwarehouse.PropertyDefinition(
        name="stock_symbol",  # Must be unique within a document schema (case insensitive)
        display_name="Searchable text",
        is_searchable=True,
        text_type_options=contentwarehouse.TextTypeOptions(),
    )

    # Define Document Schema Request
    create_document_schema_request = contentwarehouse.CreateDocumentSchemaRequest(
        parent=parent,
        document_schema=contentwarehouse.DocumentSchema(
            display_name="My Test Schema",
            property_definitions=[property_definition],
        ),
    )

    # Create a Document schema
    document_schema = document_schema_client.create_document_schema(
        request=create_document_schema_request
    )

    # Create a Document Service client
    document_client = contentwarehouse.DocumentServiceClient()

    # The full resource name of the location, e.g.:
    # projects/{project_number}/locations/{location}
    parent = document_client.common_location_path(
        project=project_number, location=location
    )

    # Define Document Property Value
    document_property = contentwarehouse.Property(
        name=document_schema.property_definitions[0].name,
        text_values=contentwarehouse.TextArray(values=["GOOG"]),
    )

    # Define Document
    document = contentwarehouse.Document(
        display_name="My Test Document",
        document_schema_name=document_schema.name,
        plain_text="This is a sample of a document's text.",
        properties=[document_property],
    )

    # Define Request
    create_document_request = contentwarehouse.CreateDocumentRequest(
        parent=parent,
        document=document,
        request_metadata=contentwarehouse.RequestMetadata(
            user_info=contentwarehouse.UserInfo(id=user_id)
        ),
    )

    # Create a Document for the given schema
    document_response = document_client.create_document(request=create_document_request)

    # Read the output
    print(f"Rule Engine Output: {document_response.rule_engine_output}")
    print(f"Document Created: {document_response.document}")

    return document_response


def create_folder_link_document(
    project_number: str, location: str, user_id: str
) -> None:
    # Function call to create a folder
    folder = create_folder(project_number, location, user_id)

    # Function call to create a Document
    document = create_document(project_number, location, user_id)

    # Create a Link Service client
    link_client = contentwarehouse.DocumentLinkServiceClient()

    # Initialize request argument(s)
    link = contentwarehouse.DocumentLink(
        source_document_reference=contentwarehouse.DocumentReference(
            document_name=folder.document.name,
        ),
        target_document_reference=contentwarehouse.DocumentReference(
            document_name=document.document.name,
        ),
    )

    # Initialize document link request
    create_document_link_request = contentwarehouse.CreateDocumentLinkRequest(
        parent=folder.document.name,
        document_link=link,
        request_metadata=contentwarehouse.RequestMetadata(
            user_info=contentwarehouse.UserInfo(id=user_id)
        ),
    )

    # Make the Document Link request
    create_link_response = link_client.create_document_link(
        request=create_document_link_request
    )

    print(f"Link Created: {create_link_response}")

    # Initialize list linked targets request
    linked_targets_request = contentwarehouse.ListLinkedTargetsRequest(
        parent=folder.document.name,
        request_metadata=contentwarehouse.RequestMetadata(
            user_info=contentwarehouse.UserInfo(id=user_id)
        ),
    )

    # Make the request
    linked_targets_response = link_client.list_linked_targets(
        request=linked_targets_request
    )

    print(f"Validate Link Created: {linked_targets_response}")

List child documents under a Document AI Warehouse folder

To list immediate child documents under a folder, do the following:

Request:

curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" https://contentwarehouse.googleapis.com/v1/projects/PROJECT_NUMBER/locations/LOCATION/documents/FOLDER/linkedTargets

Response:

{
  "documentLinks": [
    {
      "name": "LINK_NAME1"
      "source_document_reference": {
       "document_name": "FOLDER_NAME"
      },
      "target_document_reference": {
       "document_name": "DOCUMENT_NAME1"
      }
    },
    {
      "name": "LINK_NAME2"
      "source_document_reference": {
       "document_name": "FOLDER_NAME"
      },
      "target_document_reference": {
       "document_name": "DOCUMENT_NAME2"
      }
    }
    ...
  ]
}
from google.cloud import contentwarehouse_v1

def list_sub_docs(folder:str):
    # Create a client
    client = contentwarehouse_v1.DocumentLinkServiceClient()

    # Initialize request argument(s)
    request = contentwarehouse_v1.ListLinkedTargetsRequest(
        parent=folder,
    )

    # Make the request
    return client.list_linked_targets(request=request)

Remove a document from a Document AI Warehouse folder

To remove a document from a folder, you need the link name. You can retrieve the link name using the documents.linkedTargets method in the preceding step.

Request:

curl -X POST -H "Authorization: Bearer $(gcloud auth print-access-token)" https://contentwarehouse.googleapis.com/v1/projects/PROJECT_NUMBER/locations/LOCATION/documents/FOLDER/documentLinks/LINK:delete
from google.cloud import contentwarehouse_v1

def remove_doc_from_folder(link:str):
    # Create a client
    client = contentwarehouse_v1.DocumentLinkServiceClient()

    # Initialize request argument(s)
    request = contentwarehouse_v1.DeleteDocumentLinkRequest(
        name=link,
    )

    # Make the request
    return client.delete_document_link(request=request)