UML PDF model

.. uml::

    @startuml

        ' font size of attribute text
        ' NOTE(review): this diagram only declares "class" nodes - confirm that the
        ' "object" skinparam scope has the intended effect here
        skinparam object {
            AttributeFontSize 11
        }

        ' increase distance between nodes
        skinparam nodesep 50

        ' default arrow color and arrow font color
        ' (COL_ARROW_DEFAULT is a preprocessor constant that is substituted textually)
        !define COL_ARROW_DEFAULT darkblue
        skinparam arrowcolor COL_ARROW_DEFAULT
        skinparam arrowfontcolor COL_ARROW_DEFAULT
        skinparam shadowing false

        ' packages tagged with the Layout stereotype are used for grouping only:
        ' border, background, title and stereotype label are all made transparent
        skinparam package<<Layout>> {
            borderColor Transparent
            backgroundColor Transparent
            fontColor Transparent
            stereotypeFontColor Transparent
        }
        ' color of arrows showing inheritance
        ' (used as -[#COL_DERIVE] in the generalization arrows)
        !define COL_DERIVE red

        
        ' Root: top-level node of the model. It composes File, Page and Element
        ' instances (roles "file", "pages" and "content" in the relationships).
        ' The attribute name is "content" (singular), matching the role names used
        ' on the Root/Chapter/Page compositions.
        class Root {
        --
        **content**: container for elements of type
        ""Chapter/Table/Figure/Paragraph""
        --
        **library/API**: All ""Root"" elements are exposed as given in this diagram.

        **CLI/YAML/JSON**: ""Root.pages"" contains a list of ""Page"" instances without
        the ""Page.content"" attribute. Instead, all content goes to ""Root.content""
        and the nested ""Chapter.content"".
        }

        ' Element: common base of Table, Figure, Chapter, Paragraph and Rect
        ' (see the COL_DERIVE generalization arrows). Elements are held in the
        ' ordered "content" of Root, Chapter and Page and own one Position.
        class Element {
            + [str] id
            + [str] type
            --
            **id**: unique in given scope (root or chapter)
            **type**: parent class name

            **b_root**: set, if the element
            is directly located under root
            **b_chapter**: set, if the element
            is located under a chapter
        }

        ' Position: x0/y0/x1/y1 coordinates plus a reference to the Page they
        ' belong to. Each Position is owned by exactly one Cell or one Element;
        ' b_cell / b_element are the corresponding back-references.
        class Position {
            + [float] x0
            + [float] y0
            + [float] x1
            + [float] y1
            --
            **page**: reference to a ""Page"" instance;
            For YAML/JSON output it is serialized to
            an attribute holding ""Page.id""

            ""Position"" is either referenced by a ""Cell""
            or by an ""Element"" (never both).

            **b_cell**: set, if the Position is referenced
            by a ""Cell""
            **b_element**: set, if the Position is
            referenced by an ""Element""
        }

        ' File: file-level attributes (name, path, page count and the four crop
        ' values given in points). Composed into Root as "file"; owns one FileMeta.
        class File {
            + [str] id
            + [str] name
            + [str] path
            + [int] page_count
            + [float] crop_top [points]
            + [float] crop_bottom [points]
            + [float] crop_left [points]
            + [float] crop_right [points]
            --
            **id**: ""file.<name-as-identifier>""
        }

        ' FileMeta: metadata fields of a File (author, title, dates, ...).
        ' Composed into File as "file_meta"; "b_file" is the back-reference.
        class FileMeta {
            + [str] author
            + [str] title
            + [str] subject
            + [str] creator
            + [str] producer
            + [str] keywords
            + [utc_date] creation_date
            + [utc_date] modified_date
            + [bool] trapped
        }

        ' Page: a single page with its 1-based number and its size in points.
        ' Composed into Root as "pages"; referenced by Position via "page" and
        ' holding an ordered "content" list of Elements.
        class Page {
            + [str] id
            + [int] number
            + [float] width [points]
            + [float] height [points]
            --
            **id**: ""page.<1,2,3,n>""
            **number**: 1-based
            **content**: container for elements of type
            ""Chapter/Table/Figure/Paragraph""
        }

        ' Link: a link found inside the text of a Paragraph, Figure or Cell
        ' (its "b_source"). idx_start/idx_end delimit the link, pos_target holds
        ' the raw target position and libpdf_target the resolved target path.
        class Link {
            + [int] idx_start
            + [int] idx_end
            + [dict] pos_target
            + [str] libpdf_target
            --
            **pos_target**: a dictionary with the position info,
            e.g.
                      page: 3,
                      x: 300.454,
                      y: 300.454

            **libpdf_target**: points either to an ""Element"" or
            to a ""Page"". The link is built by concatenating
            nested elements separated by '/', e.g.
            ""  chapter.3/chapter.3.2/table.2""
            For cases where the pos_target cannot be resolved,
            the target is set to the target coordinates given as page.<id>/<X>:<Y>
            ""  page.4/56:789""
        }

        ' Paragraph: Element without attributes of its own. It holds exactly one
        ' "textbox" (HorizontalBox) and may own Links (see relationships).
        class Paragraph {
            --
            **id**: ""paragraph.<1,2,3,n>""

            A paragraph gets detected by
            layout analysis. Characters are
            merged into words, words into
            lines and lines into paragraphs.

            A new word, line or paragraph
            is started if the gap changes
            significantly.
        }

        ' Chapter: Element with a title and a number. It holds an ordered
        ' "content" list of nested Elements and an optional "textbox"
        ' (HorizontalBox, multiplicity 0..1).
        class Chapter {
            + [str] title
            + [str] number
            --
            **id**: ""chapter.<number>""
            **number**: globally unique (e.g. 3.2.4)
            **content**: container for elements of type
            ""Chapter/Table/Figure/Paragraph""

            **textbox**: the ""HorizontalBox"" is None when the
            search algorithm did not find a suitable text box
            on the page (a.k.a. ghost chapter).
        }

        ' Cell: one table cell addressed by 1-based row and column. It belongs to
        ' exactly one Table ("b_table"), owns one Position, an optional "textbox"
        ' and any number of Links.
        class Cell {
            + [int] row
            + [int] col
            --
            **row**: 1-based
            **col**: 1-based
        }

        ' Table: Element without attributes of its own; composed of one or more
        ' Cells (role "cells"). The leading "--" separator keeps the id note out
        ' of the attribute compartment, as in the other attribute-less classes.
        class Table {
            --
            **id**: ""table.<1,2,3,n>""
        }

        ' Figure: Element with a relative image path and a caption. It may hold
        ' any number of "textboxes" (HorizontalBox) and Links.
        class Figure {
            + [str] rel_path
            + [str] caption
            --
            **id**: ""figure.<1,2,3,n>""
            **rel_path**: figures/<image>
        }

        ' Rect: Element carrying a non-stroking color; it may hold an optional
        ' "textbox" (HorizontalBox, multiplicity 0..1).
        class Rect {
            + [tuple] non_stroking_color
            --
            **id**: ""rect.<1,2,3,n>""
            **non_stroking_color**: RGB color space,
            given as floats between 0 and 1
        }

        ' Layout hierarchy: HorizontalBox > HorizontalLine > Word > Char.
        ' All four classes carry the same x0/y0/x1/y1 coordinates and a text field.
        ' The stereotype is spelled <<Layout>> exactly as in the
        ' "skinparam package<<Layout>>" block, which makes the package frame invisible.
        package x <<Layout>> {

            class HorizontalBox {
                + [float] x0
                + [float] y0
                + [float] x1
                + [float] y1
                + [str] text
                --
                **text**: plain text in the box
            }

            class HorizontalLine {
                + [float] x0
                + [float] y0
                + [float] x1
                + [float] y1
                + [str] text
                --
                **text**: plain text in the line
            }

            class Word {
                + [float] x0
                + [float] y0
                + [float] x1
                + [float] y1
                + [str] text
                --
                **text**: plain text in the word
            }

            class Char {
                + [float] x0
                + [float] y0
                + [float] x1
                + [float] y1
                + [str] text
                --
                **text**: the character itself
            }
        }
        ' layout hierarchy: chars make up words, words make up lines,
        ' lines make up a box
        Char "+chars  1..*" ---left*  Word
        Word "+words  1..*" ---left*  HorizontalLine
        HorizontalLine "+lines  1..*" --*  HorizontalBox
        
        ' owners of text boxes; only Paragraph requires exactly one
        HorizontalBox "+textbox  1" --* Paragraph    
        HorizontalBox "+textbox  0..1" --* Cell
        HorizontalBox "+textboxes  0..*" --* Figure
        HorizontalBox "+textbox  0..1" --* Chapter
        HorizontalBox "+textbox  0..1" --* Rect

        ' links are owned by the object whose text contains them (b_source)
        Paragraph "+b_source  1" *-- "+links  *" Link
        Figure "+b_source  1" *-- "+links  *" Link
        Cell "+b_source  1" *-- "+links  *" Link

        ' a table consists of cells; every cell has its own position
        Cell "+cells  1..*" --* "+b_table  1" Table
        Cell "+b_cell  1" *-down- "+position  1" Position

        ' inheritance: all of these derive from Element (drawn in COL_DERIVE)
        ' NOTE(review): the arrows mix the one-dash and two-dash forms; presumably
        ' this is deliberate to steer the layout - confirm before unifying
        Table -[#COL_DERIVE]|> Element
        Figure -[#COL_DERIVE]-|> Element
        Chapter -[#COL_DERIVE]-|> Element
        Paragraph -[#COL_DERIVE]|> Element
        Rect -[#COL_DERIVE]|> Element
        
        ' ordered content of chapters and pages
        Chapter "1" *-- "+content  *" Element: ordered
        Page "1" *-- "+content  *" Element: ordered

        ' composition of the root structure: Root owns content, file and pages;
        ' positions belong to an Element and reference their Page
        Position "+position  1" --* "+b_element  1" Element
        Element "+content *" --* "+b_root  1" Root: ordered
        File "+file  1" --* "+b_root  1" Root
        Page "+pages  1..*" --* "+b_root  1" Root
        Position "+b_positions  *" --* "+page  1" Page
        FileMeta "+file_meta  1" --* "+b_file  1" File

    @enduml