docarray.document package#

Subpackages#

Submodules#

Module contents#

class docarray.document.Document[source]#
class docarray.document.Document(_obj: Optional[Document] = None, copy: bool = False)
class docarray.document.Document(_obj: Optional[Any] = None)
class docarray.document.Document(_obj: Optional[Dict], copy: bool = False, field_resolver: Optional[Dict[str, str]] = None, unknown_fields_handler: str = 'catch')
class docarray.document.Document(blob: Optional[bytes] = None, **kwargs)
class docarray.document.Document(tensor: Optional[ArrayType] = None, **kwargs)
class docarray.document.Document(text: Optional[str] = None, **kwargs)
class docarray.document.Document(uri: Optional[str] = None, **kwargs)
class docarray.document.Document(parent_id: Optional[str] = None, granularity: Optional[int] = None, adjacency: Optional[int] = None, blob: Optional[bytes] = None, tensor: Optional[ArrayType] = None, mime_type: Optional[str] = None, text: Optional[str] = None, content: Optional[DocumentContentType] = None, weight: Optional[float] = None, uri: Optional[str] = None, tags: Optional[Dict[str, StructValueType]] = None, offset: Optional[float] = None, location: Optional[List[float]] = None, embedding: Optional[ArrayType] = None, modality: Optional[str] = None, evaluations: Optional[Dict[str, Dict[str, StructValueType]]] = None, scores: Optional[Dict[str, Dict[str, StructValueType]]] = None, chunks: Optional[Sequence[Document]] = None, matches: Optional[Sequence[Document]] = None)

Bases: AllMixins, BaseDCType

Document is the basic data type in DocArray. A Document is a container for any kind of data, be it text, image, audio, video, or 3D meshes.

You can initialize a Document object with given attributes:

from docarray import Document
import numpy

d1 = Document(text='hello')
d3 = Document(tensor=numpy.array([1, 2, 3]))
d4 = Document(
    uri='https://jina.ai',
    mime_type='text/plain',
    granularity=1,
    adjacency=3,
    tags={'foo': 'bar'},
)

Documents support a nested structure, which can also be specified during construction:

d = Document(
    id='d0',
    chunks=[Document(id='d1', chunks=Document(id='d2'))],
    matches=[Document(id='d3')],
)

A Document can embed its contents using the embed() method and a provided embedding model:

import torchvision

q = (
    Document(uri='/Users/usr/path/to/image.jpg')
    .load_uri_to_image_tensor()
    .set_image_tensor_normalization()
    .set_image_tensor_channel_axis(-1, 0)
)
model = torchvision.models.resnet50(pretrained=True)
q.embed(model)

Multiple Documents can be organized into a DocumentArray.

See also

For further details, see our user guide.

property adjacency: Optional[int]#
Return type:

Optional[int]

property blob: Optional[bytes]#
Return type:

Optional[bytes]

property chunks: Optional[ChunkArray]#
Return type:

Optional[ChunkArray]

clear()#

Clear all fields from this Document to their default values.

Return type:

None

property content: Optional[DocumentContentType]#
Return type:

Optional[DocumentContentType]

property content_hash: int#

Get the document hash according to its content.

Return type:

int

Returns:

the unique hash code to represent this Document

property content_type: Optional[str]#
Return type:

Optional[str]

convert_blob_to_datauri(charset='utf-8', base64=False)#

Convert blob to data uri in place. Internally it first reads into blob and then converts it to data URI.

Parameters:
  • charset (str) – charset may be any character set registered with IANA

  • base64 (bool) – used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data. Sometimes used for text data that frequently uses non-US-ASCII characters.

Return type:

T

Returns:

itself after processed

convert_blob_to_image_tensor(width=None, height=None, channel_axis=-1)#

Convert an image blob to a ndarray tensor.

Parameters:
  • width (Optional[int]) – the width of the image tensor.

  • height (Optional[int]) – the height of the tensor.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

Return type:

T

Returns:

itself after processed

convert_blob_to_tensor(dtype=None, count=-1, offset=0)#

Assuming the blob is a _valid_ buffer of Numpy ndarray, set tensor accordingly.

Parameters:
  • dtype (Optional[str]) – Data-type of the returned array; default: float.

  • count (int) – Number of items to read. -1 means all data in the buffer.

  • offset (int) – Start reading the buffer from this offset (in bytes); default: 0.

Return type:

T

Returns:

itself after processed

convert_content_to_datauri()#

Convert content to a data URI and store it in uri in place, on a best-effort basis.

Return type:

T

Returns:

itself after processed

convert_image_tensor_to_blob(channel_axis=-1, image_format='png')#

Assuming tensor is a _valid_ image, set blob accordingly

Parameters:
  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • image_format (str) – either png or jpeg

Return type:

T

Returns:

itself after processed

convert_image_tensor_to_sliding_windows(window_shape=(64, 64), strides=None, padding=False, channel_axis=-1, as_chunks=False)#

Convert tensor into a sliding window view with the given window shape tensor inplace.

Parameters:
  • window_shape (Tuple[int, int]) – desired output size. If window_shape is a sequence like (h, w), the output size will be matched to this. If window_shape is an int, the output will have the same height and width as window_shape.

  • strides (Optional[Tuple[int, int]]) – the strides between two neighboring sliding windows. strides is a sequence like (h, w), in which h and w denote the strides on the vertical and the horizontal axis, respectively. When not given, window_shape is used.

  • padding (bool) – If False, only patches which are fully contained in the input image are included. If True, all patches whose starting point is inside the input are included, and areas outside the input default to zero. The padding argument has no effect on the size of each patch, it determines how many patches are extracted. Default is False.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis.

  • as_chunks (bool) – If set, each sliding window will be stored in the chunk of the current Document

Return type:

T

Returns:

Document itself after processed

convert_image_tensor_to_uri(channel_axis=-1, image_format='png')#

Assuming tensor is a _valid_ image, set uri accordingly

Parameters:
  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • image_format (str) – either png or jpeg

Return type:

T

Returns:

itself after processed

convert_tensor_to_blob()#

Convert tensor to blob inplace.

Return type:

T

Returns:

itself after processed

convert_tensor_to_text(vocab, delimiter=' ')#

Convert tensor to text inplace.

Parameters:
  • vocab (Union[Dict[str, int], Dict[int, str]]) – a dictionary that maps a word to an integer index, 0 is reserved for padding, 1 is reserved for unknown words in text

  • delimiter (str) – the delimiter used to join all words into text

Return type:

T

Returns:

Document itself after processed

convert_text_to_datauri(charset='utf-8', base64=False)#

Convert text to data uri.

Parameters:
  • charset (str) – charset may be any character set registered with IANA

  • base64 (bool) – used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data. Sometimes used for text data that frequently uses non-US-ASCII characters.

Return type:

T

Returns:

itself after processed

convert_text_to_tensor(vocab, max_length=None, dtype='int64')#

Convert text to tensor inplace.

In the end tensor will be a 1D array where D is max_length.

To get the vocab of a DocumentArray, you can use jina.types.document.converters.build_vocab.

Parameters:
  • vocab (Dict[str, int]) – a dictionary that maps a word to an integer index, 0 is reserved for padding, 1 is reserved for unknown words in text. So you should not include these two entries in vocab.

  • max_length (Optional[int]) – the maximum length of the sequence. Sequences longer than this are cut off from the beginning. Sequences shorter than this will be padded with 0 from the right-hand side.

  • dtype (str) – the dtype of the generated tensor

Return type:

T

Returns:

Document itself after processed

convert_uri_to_datauri(charset='utf-8', base64=False)#

Convert uri to dataURI and store it in uri inplace.

Parameters:
  • charset (str) – charset may be any character set registered with IANA

  • base64 (bool) – used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data. Sometimes used for text data that frequently uses non-US-ASCII characters.

Return type:

T

Returns:

itself after processed

copy_from(other)#

Overwrite self by copying from another Document.

Parameters:

other (T) – the other Document to copy from

Return type:

None

display(from_=None)#

Plot image data from uri, or from tensor if uri is empty.

Parameters:

from_ – an optional string to decide if a document should display using either the uri or the tensor field.

display_tensor()#

Plot image data from tensor

display_uri()#

Plot image data from uri

embed(*args, **kwargs)#
Return type:

T

embed_feature_hashing(n_dim=256, sparse=False, fields=('text', 'tags'), max_value=1000000)#

Convert an arbitrary set of attributes into a fixed-dimensional matrix using the hashing trick.

Parameters:
  • n_dim (int) – the dimensionality of each document in the output embedding. Small numbers of features are likely to cause hash collisions, but large numbers will cause larger overall parameter dimensions.

  • sparse (bool) – whether the resulting feature matrix should be a sparse csr_matrix or dense ndarray. Note that this feature requires scipy

  • fields (Tuple[str, ...]) – which attributes are to be considered for feature hashing.

Return type:

T

property embedding: Optional[ArrayType]#
Return type:

Optional[ArrayType]

property evaluations: Optional[Dict[str, NamedScore]]#
Return type:

Optional[Dict[str, NamedScore]]

classmethod from_base64(data, protocol='pickle', compress=None)#

Build Document object from a base64 encoded string

Parameters:
  • data (str) – a base64 encoded string

  • protocol (str) – protocol to use

  • compress (Optional[str]) – compress method to use

Return type:

T

Returns:

a Document object

classmethod from_bytes(data, protocol='pickle', compress=None)#

Build Document object from binary bytes

Parameters:
  • data (bytes) – binary bytes

  • protocol (str) – protocol to use

  • compress (Optional[str]) – compress method to use

Return type:

T

Returns:

a Document object

classmethod from_dict(obj, protocol='jsonschema', **kwargs)#

Convert a dict object into a Document.

Parameters:
  • obj (Dict) – a Python dict object

  • protocol (str) – jsonschema or protobuf

  • kwargs – extra key-value args passed to the pydantic and protobuf parsers.

Return type:

T

Returns:

the parsed Document object

classmethod from_json(obj, protocol='jsonschema', **kwargs)#

Convert a JSON string into a Document.

Parameters:
  • obj (Union[str, bytes, bytearray]) – a valid JSON string

  • protocol (str) – jsonschema or protobuf

  • kwargs – extra key-value args passed to the pydantic and protobuf parsers.

Return type:

T

Returns:

the parsed Document object

classmethod from_protobuf(pb_msg)#
Return type:

T

classmethod from_pydantic_model(model)#

Build a Document object from a Pydantic model

Parameters:

model (BaseModel) – the pydantic data model object that represents a Document

Return type:

T

Returns:

a Document object

classmethod from_strawberry_type(model)#

Build a Document object from a Strawberry model

Parameters:

model – the Strawberry data model object that represents a Document

Return type:

T

Returns:

a Document object

classmethod generator_from_webcam(height_width=None, show_window=True, window_title='webcam', fps=30, exit_key=27, exit_event=None, tags=None)#

Create a generator that yields a Document object from the webcam.

This feature requires the opencv-python package.

Parameters:
  • height_width (Optional[Tuple[int, int]]) – the shape of the video frame, if not provided, the shape will be determined from the first frame. Note that this is restricted by the hardware of the camera.

  • show_window (bool) – whether to show a preview window of the webcam video

  • window_title (str) – the window title of the preview window

  • fps (int) – expected frames per second, note that this is not guaranteed, as the actual fps depends on the hardware limit

  • exit_key (int) – the key to press to exit the preview window

  • exit_event – a multiprocessing/threading/asyncio event that, once set, exits the preview window

  • tags (Optional[Dict]) – the tags to attach to the document

Return type:

Generator[T, None, None]

Returns:

a generator that yields a Document object from a webcam

classmethod get_json_schema(indent=2)#

Return a JSON Schema of Document class.

Return type:

str

get_multi_modal_attribute(attribute)#
Return type:

DocumentArray

get_vocabulary(text_attrs=('text',))#

Get the text vocabulary in a counter dict that maps from the word to its frequency from all text_attrs.

Parameters:

text_attrs (Tuple[str, ...]) – the textual attributes where vocabulary will be derived from

Return type:

Dict[str, int]

Returns:

a vocabulary in dictionary where key is the word, value is the frequency of that word in all text fields.

property granularity: Optional[int]#
Return type:

Optional[int]

property id: str#
Return type:

str

property is_multimodal: bool#

Return true if this Document can be represented by a class wrapped by docarray.dataclasses.types.dataclass().

Return type:

bool

load_pil_image_to_datauri(image)#

Convert a pillow image into a datauri with header data:image/png.

Parameters:

image (PILImage) – a pillow image

Returns:

itself after processed

load_uri_to_audio_tensor()#

Convert an audio uri into tensor inplace

Return type:

T

Returns:

Document itself after processed

load_uri_to_blob(**kwargs)#

Convert uri to blob inplace. Internally it downloads from the URI and set blob.

Parameters:

kwargs – keyword arguments to pass to _uri_to_blob, such as timeout

Return type:

T

Returns:

itself after processed

load_uri_to_image_tensor(width=None, height=None, channel_axis=-1, **kwargs)#

Convert the image-like uri into tensor

Parameters:
  • width (Optional[int]) – the width of the image tensor.

  • height (Optional[int]) – the height of the tensor.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • kwargs – keyword arguments to pass to _uri_to_blob, such as timeout

Return type:

T

Returns:

itself after processed

load_uri_to_point_cloud_tensor(samples, as_chunks=False)#

Convert a 3d mesh-like uri into tensor

Parameters:
  • samples (int) – number of points to sample from the mesh

  • as_chunks (bool) – when multiple geometries are stored in one mesh file, store each geometry in a separate chunk

Return type:

T

Returns:

itself after processed

load_uri_to_text(charset='utf-8', **kwargs)#

Convert uri to text inplace.

Parameters:
  • charset (str) – charset may be any character set registered with IANA

  • kwargs – keyword arguments to pass to _uri_to_blob, such as timeout

Return type:

T

Returns:

itself after processed

load_uri_to_video_tensor(only_keyframes=False)#

Convert a uri to a video ndarray tensor.

Parameters:

only_keyframes (bool) – only keep the keyframes in the video

Return type:

T

Returns:

Document itself after processed

property location: Optional[List[float]]#
Return type:

Optional[List[float]]

match(*args, **kwargs)#
Return type:

T

property matches: Optional[MatchArray]#
Return type:

Optional[MatchArray]

property mime_type: Optional[str]#
Return type:

Optional[str]

property modality: Optional[str]#
Return type:

Optional[str]

property nbytes: int#

Return total bytes consumed by protobuf.

Return type:

int

Returns:

number of bytes

property non_empty_fields: Tuple[str]#

Get all non-empty fields of this Document.

Non-empty fields are the fields with not-None and not-default values.

Return type:

Tuple[str]

Returns:

field names in a tuple.

property offset: Optional[float]#
Return type:

Optional[float]

property parent_id: Optional[str]#
Return type:

Optional[str]

plot_matches_sprites(top_k=10, channel_axis=-1, inv_normalize=False, skip_empty=False, canvas_size=1920, min_size=100, output=None)#

Generate a sprite image for the query and its matching images in this Document object.

An image sprite is a collection of images put into a single image. Query image is on the left followed by matching images. The Document object should contain matches.

Parameters:
  • top_k (int) – the number of top matching documents to show in the sprite.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • inv_normalize (bool) – If set to True, inverse the normalization of a float32 image tensor into a uint8 image tensor inplace.

  • skip_empty (bool) – skip matches which have no .uri or .tensor.

  • canvas_size (int) – the width of the canvas

  • min_size (int) – the minimum size of the image

  • output (Optional[str]) – Optional path to store the visualization. If not given, show in UI

pop(*fields)#

Clear some fields from this Document to their default values.

Parameters:

fields – field names to clear.

Return type:

None

post(*args, **kwargs)#
Return type:

T

save_audio_tensor_to_file(file, sample_rate=44100, sample_width=2)#

Save tensor into a wav file. Mono/stereo is preserved.

Parameters:
  • file (Union[str, BinaryIO]) – if file is a string, open the file by that name, otherwise treat it as a file-like object.

  • sample_rate (int) – sampling frequency

  • sample_width (int) – sample width in bytes

Return type:

T

Returns:

Document itself after processed

save_blob_to_file(file)#

Save blob into a file

Parameters:

file (Union[str, BinaryIO]) – File or filename to which the data is saved.

Return type:

T

Returns:

itself after processed

save_image_tensor_to_file(file, channel_axis=-1, image_format='png')#

Save tensor into a file

Parameters:
  • file (Union[str, BinaryIO]) – File or filename to which the data is saved.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • image_format (str) – either png or jpeg

Return type:

T

Returns:

itself after processed

save_uri_to_file(file)#

Save uri into a file

Parameters:

file (Union[str, BinaryIO]) – File or filename to which the data is saved.

Return type:

T

Returns:

itself after processed

save_video_tensor_to_file(file, frame_rate=30, codec='h264')#

Save tensor as a video mp4/h264 file.

Parameters:
  • file (Union[str, BinaryIO]) – The file to open, which can be either a string or a file-like object.

  • frame_rate (int) – frames per second

  • codec (str) – the name of a decoder/encoder

Return type:

T

Returns:

itself after processed

property scores: Optional[Dict[str, NamedScore]]#
Return type:

Optional[Dict[str, NamedScore]]

set_image_tensor_channel_axis(original_channel_axis, new_channel_axis)#

Move the channel axis of the image tensor inplace.

Parameters:
  • original_channel_axis (int) – the original axis of the channel

  • new_channel_axis (int) – the new axis of the channel

Return type:

T

Returns:

itself after processed

set_image_tensor_inv_normalization(channel_axis=-1, img_mean=(0.485, 0.456, 0.406), img_std=(0.229, 0.224, 0.225))#

Inverse the normalization of a float32 image tensor into a uint8 image tensor inplace.

Parameters:
  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • img_mean (Tuple[float]) – the mean of all images

  • img_std (Tuple[float]) – the standard deviation of all images

Return type:

T

Returns:

itself after processed

set_image_tensor_normalization(channel_axis=-1, img_mean=(0.485, 0.456, 0.406), img_std=(0.229, 0.224, 0.225))#

Normalize a uint8 image tensor into a float32 image tensor inplace.

Following the PyTorch standard, the image must be in the shape of (3 x H x W) and will be normalized into a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]. These two arrays are computed based on millions of images. If you want to train from scratch on your own dataset, you can calculate the new mean and std. Otherwise, using the ImageNet pretrained model with its own mean and std is recommended.

Parameters:
  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

  • img_mean (Tuple[float]) – the mean of all images

  • img_std (Tuple[float]) – the standard deviation of all images

Return type:

T

Returns:

itself after processed

Warning

Please do NOT generalize this function to grayscale or black/white images; it does not make any sense for non-RGB images. If you look at the PyTorch MNIST examples, the mean and stddev are 1-dimensional (since the inputs are greyscale – no RGB channels).

set_image_tensor_resample(ratio, channel_axis=-1)#

Resample the image tensor into different size inplace.

Parameters:
  • ratio (float) – scale ratio of the resampled image tensor.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

Return type:

T

Returns:

itself after processed

set_image_tensor_shape(shape, channel_axis=-1)#

Resample the image tensor into different size inplace.

If your current image tensor has shape [H,W,C], then the new tensor will be [*shape, C]

Parameters:
  • shape (Tuple[int, int]) – the new shape of the image tensor.

  • channel_axis (int) – the axis id of the color channel, -1 indicates the color channel info at the last axis

Return type:

T

Returns:

itself after processed

set_multi_modal_attribute(attribute, value)#
summary()#

Print non-empty fields and nested structure of this Document object.

Return type:

None

property tags: Optional[Dict[str, StructValueType]]#
Return type:

Optional[Dict[str, StructValueType]]

property tensor: Optional[ArrayType]#
Return type:

Optional[ArrayType]

property text: Optional[str]#
Return type:

Optional[str]

to_base64(protocol='pickle', compress=None)#

Serialize a Document object into a base64 string

Parameters:
  • protocol (str) – protocol to use

  • compress (Optional[str]) – compress method to use

Return type:

str

Returns:

a base64 encoded string

to_bytes(protocol='pickle', compress=None)#
Return type:

bytes

to_dict(protocol='jsonschema', **kwargs)#

Convert itself into a Python dict object.

Parameters:
  • protocol (str) – jsonschema or protobuf

  • kwargs – extra key-value args passed to the pydantic and protobuf dumpers.

Return type:

Dict[str, Any]

Returns:

the dumped Document as a dict object

to_json(protocol='jsonschema', **kwargs)#

Convert itself into a JSON string.

Parameters:
  • protocol (str) – jsonschema or protobuf

  • kwargs – extra key-value args passed to the pydantic and protobuf dumpers.

Return type:

str

Returns:

the dumped JSON string

to_protobuf(ndarray_type=None)#

Convert Document into a Protobuf message.

Parameters:

ndarray_type (Optional[str]) – can be list or numpy; if set, it will force all ndarray-like objects to be List or numpy.ndarray.

Return type:

DocumentProto

Returns:

the protobuf message

to_pydantic_model()#

Convert a Document object into a Pydantic model.

Return type:

PydanticDocument

to_strawberry_type()#

Convert a Document object into a Strawberry type.

Return type:

StrawberryDocument

property uri: Optional[str]#
Return type:

Optional[str]

property weight: Optional[float]#
Return type:

Optional[float]