pdf

package
v0.0.0-...-71ffc55 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 23, 2025 License: MIT Imports: 15 Imported by: 0

Documentation

Overview

Package pdf provides low-level PDF parsing and manipulation functionality.

Index

Constants

This section is empty.

Variables

View Source
var (
	WinAnsiEncoding = []string{}/* 256 elements not displayed */

	MacRomanEncoding = []string{}/* 256 elements not displayed */

	StandardEncoding = []string{}/* 256 elements not displayed */

)

Standard encodings

View Source
var GlyphToUnicode = map[string]string{}/* 564 elements not displayed */

GlyphToUnicode maps glyph names to Unicode strings

Functions

func DecodeStream

func DecodeStream(data []byte, filter Name, params Dictionary) ([]byte, error)

DecodeStream decodes a stream based on its filter and optional parameters

func DecodeStreamFromDict

func DecodeStreamFromDict(data []byte, dict Dictionary) ([]byte, error)

DecodeStreamFromDict decodes stream data using filter(s) specified in the dictionary. Handles both single filters and arrays of filters.

Types

type Array

type Array []Object

Array represents an array object [...]

func (Array) String

func (a Array) String() string

type Boolean

type Boolean bool

Boolean represents a boolean object

func (Boolean) String

func (b Boolean) String() string

type ColorSpaceInfo

type ColorSpaceInfo struct {
	Components int
	IsIndexed  bool
	Palette    []color.RGBA // RGB palette for indexed colors
}

ColorSpaceInfo holds color space information including palette

type Dictionary

type Dictionary map[Name]Object

Dictionary represents a dictionary object << ... >>

func (Dictionary) String

func (d Dictionary) String() string

type Font

type Font struct {
	BaseFont     string
	Subtype      string
	ToUnicode    map[int]string
	Encoding     map[int]string // Custom encoding map (code -> glyph name)
	Widths       []float64
	FirstChar    int
	LastChar     int
	CIDWidths    map[int]float64
	DefaultWidth float64
}

Font represents a PDF font

func (*Font) CalculateWidth

func (f *Font) CalculateWidth(s string) float64

CalculateWidth calculates the width of the string in text space (1000 units)

func (*Font) DecodeString

func (f *Font) DecodeString(s string) string

DecodeString decodes a PDF string using the font's encoding/CMap

func (*Font) GetWidth

func (f *Font) GetWidth(code int) float64

GetWidth returns the width of the character code

type FontManager

type FontManager struct {
	Fonts map[Name]*Font
	// contains filtered or unexported fields
}

FontManager manages fonts for a page

func NewFontManager

func NewFontManager(r *Reader) *FontManager

NewFontManager creates a new font manager

func (*FontManager) CalculateWidth

func (fm *FontManager) CalculateWidth(fontName Name, s string) float64

CalculateWidth calculates the width of the string using the specified font

func (*FontManager) DecodeString

func (fm *FontManager) DecodeString(fontName Name, s string) string

DecodeString decodes a string using the specified font

func (*FontManager) LoadFonts

func (fm *FontManager) LoadFonts(resources Dictionary) error

LoadFonts loads fonts from the page resources

type GraphicsState

type GraphicsState struct {
	CTM Matrix

	// Text State
	Tc    float64 // Character spacing
	Tw    float64 // Word spacing
	Th    float64 // Horizontal scaling
	Tl    float64 // Leading
	Tf    Name    // Font name
	Tfs   float64 // Font size
	Tmode int     // Text rendering mode
	Tr    float64 // Text rise
	Ts    float64 // Text knockout (unused mostly)

	// Path Construction
	CurrentPath []types.PathOperation
	LineWidth   float64
}

GraphicsState holds the current graphics state parameters

type HexString

type HexString string

HexString represents a hex string object <...>

func (HexString) String

func (s HexString) String() string

type IndirectObject

type IndirectObject struct {
	ObjectNumber     int
	GenerationNumber int
	Object           Object
}

IndirectObject represents an indirect object definition (ObjNum GenNum obj ... endobj)

type IndirectRef

type IndirectRef struct {
	ObjectNumber     int
	GenerationNumber int
}

IndirectRef represents an indirect reference (ObjNum GenNum R)

func (IndirectRef) String

func (r IndirectRef) String() string

type Integer

type Integer int64

Integer represents an integer object

func (Integer) String

func (i Integer) String() string

type Interpreter

type Interpreter struct {
	FontManager *FontManager
	Resources   Dictionary
	Stack       []Object

	// Graphics State Stack
	State      GraphicsState
	StateStack []GraphicsState

	// Text Object State (reset at BT)
	Tm  Matrix
	Tlm Matrix

	TextBlocks []types.TextBlock
	Images     []types.Image
	Graphics   []types.VectorGraphic
}

Interpreter interprets content streams

func NewInterpreter

func NewInterpreter(fm *FontManager, resources Dictionary) *Interpreter

NewInterpreter creates a new interpreter

func (*Interpreter) Process

func (in *Interpreter) Process(content []byte) ([]types.TextBlock, []types.Image, []types.VectorGraphic, error)

Process interprets the content stream

type Matrix

type Matrix [6]float64

Matrix represents a 3x3 affine transformation matrix stored as its six variable entries [a b c d e f], which correspond to the rows:

	a b 0
	c d 0
	e f 1

func IdentityMatrix

func IdentityMatrix() Matrix

IdentityMatrix returns [1 0 0 1 0 0]

func (Matrix) Multiply

func (m Matrix) Multiply(other Matrix) Matrix

Multiply multiplies two matrices: m x other

func (Matrix) Transform

func (m Matrix) Transform(x, y float64) (float64, float64)

Transform transforms a point (x, y)

type Name

type Name string

Name represents a name object (e.g. /Type)

func (Name) String

func (n Name) String() string

type Null

type Null struct{}

Null represents a null object

func (Null) String

func (n Null) String() string

type Object

type Object interface {
	String() string
}

Object represents any PDF object

type Parser

type Parser struct {
	// contains filtered or unexported fields
}

Parser parses PDF objects from a tokenizer

func NewParser

func NewParser(t *Tokenizer) *Parser

NewParser creates a new parser

func (*Parser) ParseObject

func (p *Parser) ParseObject() (Object, error)

ParseObject parses the next object from the stream

type Reader

type Reader struct {
	Trailer   Dictionary
	XrefTable map[int]XrefEntry
	Root      Dictionary
	// contains filtered or unexported fields
}

Reader reads a PDF file

func NewReader

func NewReader(path string) (*Reader, error)

NewReader creates a new PDF reader

func (*Reader) Close

func (r *Reader) Close() error

Close closes the file

func (*Reader) ExtractContent

func (r *Reader) ExtractContent(page Dictionary) ([]byte, error)

ExtractContent extracts the raw content stream from a page

func (*Reader) ExtractLinks

func (r *Reader) ExtractLinks(pageDict Dictionary) ([]types.Link, error)

ExtractLinks extracts links from a page dictionary

func (*Reader) GetPage

func (r *Reader) GetPage(pageIndex int) (Dictionary, error)

GetPage returns the dictionary for the specified page (1-based index)

func (*Reader) GetPageCount

func (r *Reader) GetPageCount() (int, error)

GetPageCount returns the total number of pages

func (*Reader) ReadObject

func (r *Reader) ReadObject(objNum int) (Object, error)

ReadObject reads an indirect object by number

func (*Reader) Resolve

func (r *Reader) Resolve(obj Object) (Object, error)

Resolve resolves an indirect reference to its actual object. If the object is not a reference, it is returned as is.

type Real

type Real float64

Real represents a real number object

func (Real) String

func (r Real) String() string

type Stream

type Stream struct {
	Dictionary Dictionary
	Data       []byte // Raw data (possibly compressed)
}

Stream represents a stream object

func (Stream) String

func (s Stream) String() string

type StringLiteral

type StringLiteral string

StringLiteral represents a string object (...)

func (StringLiteral) String

func (s StringLiteral) String() string

type Token

type Token struct {
	Type  TokenType
	Value string
}

Token represents a lexical token

type TokenType

type TokenType int

TokenType represents the type of a token

const (
	TokenError TokenType = iota
	TokenEOF
	TokenKeyword    // obj, endobj, stream, endstream, xref, trailer, startxref, true, false, null, R
	TokenNumeric    // 123, -12.34
	TokenInteger    // 123 (integer only - for inline image parsing)
	TokenReal       // -12.34 (real only - for inline image parsing)
	TokenName       // /Name
	TokenString     // (String)
	TokenHexString  // <Hex>
	TokenArrayStart // [
	TokenArrayEnd   // ]
	TokenDictStart  // <<
	TokenDictEnd    // >>
)

Token types for PDF parsing.

type Tokenizer

type Tokenizer struct {
	// contains filtered or unexported fields
}

Tokenizer reads tokens from a reader

func NewTokenizer

func NewTokenizer(r io.Reader) *Tokenizer

NewTokenizer creates a new tokenizer

func (*Tokenizer) NextToken

func (t *Tokenizer) NextToken() (Token, error)

NextToken returns the next token from the stream

func (*Tokenizer) ReadStream

func (t *Tokenizer) ReadStream(length int64) ([]byte, error)

ReadStream reads stream content from the PDF.

func (*Tokenizer) ReadUntilEI

func (t *Tokenizer) ReadUntilEI() ([]byte, error)

ReadUntilEI reads raw bytes until the EI (end inline image) marker is found. EI must be preceded by whitespace and followed by whitespace/delimiter/EOF. Returns the image data without the trailing whitespace before EI.

func (*Tokenizer) UnreadToken

func (t *Tokenizer) UnreadToken(tok Token)

UnreadToken pushes a token back to the stream

type XrefEntry

type XrefEntry struct {
	Type         int   // 0=free, 1=in-use, 2=compressed
	Offset       int64 // For Type 1
	Gen          int   // For Type 1
	StreamObjNum int   // For Type 2
	StreamIndex  int   // For Type 2
}

XrefEntry represents an entry in the cross-reference table

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL