Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cdfa1ac5b2 | ||
|
|
ffcdf5ee10 |
@@ -3,7 +3,23 @@ module github.com/libnovel/backend
|
||||
go 1.26.1
|
||||
|
||||
require (
|
||||
github.com/getsentry/sentry-go v0.43.0
|
||||
github.com/hibiken/asynq v0.26.0
|
||||
github.com/hibiken/asynq/x v0.0.0-20260203063626-d704b68a426d
|
||||
github.com/meilisearch/meilisearch-go v0.36.1
|
||||
github.com/minio/minio-go/v7 v7.0.98
|
||||
github.com/pdfcpu/pdfcpu v0.11.1
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/redis/go-redis/v9 v9.18.0
|
||||
github.com/yuin/goldmark v1.8.2
|
||||
go.opentelemetry.io/contrib/bridges/otelslog v0.17.0
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0
|
||||
go.opentelemetry.io/otel v1.42.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.18.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.42.0
|
||||
go.opentelemetry.io/otel/log v0.18.0
|
||||
go.opentelemetry.io/otel/sdk v1.42.0
|
||||
go.opentelemetry.io/otel/sdk/log v0.18.0
|
||||
golang.org/x/net v0.51.0
|
||||
)
|
||||
|
||||
@@ -13,12 +29,9 @@ require (
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/clipperhouse/uax29/v2 v2.2.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/dslipak/pdf v0.0.2 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/getsentry/sentry-go v0.43.0 // indirect
|
||||
github.com/go-ini/ini v1.67.0 // indirect
|
||||
github.com/go-logr/logr v1.4.3 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
@@ -28,41 +41,25 @@ require (
|
||||
github.com/hhrutter/lzw v1.0.0 // indirect
|
||||
github.com/hhrutter/pkcs7 v0.2.0 // indirect
|
||||
github.com/hhrutter/tiff v1.0.2 // indirect
|
||||
github.com/hibiken/asynq v0.26.0 // indirect
|
||||
github.com/hibiken/asynq/x v0.0.0-20260203063626-d704b68a426d // indirect
|
||||
github.com/klauspost/compress v1.18.2 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.11 // indirect
|
||||
github.com/klauspost/crc32 v1.3.0 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.19 // indirect
|
||||
github.com/meilisearch/meilisearch-go v0.36.1 // indirect
|
||||
github.com/minio/crc64nvme v1.1.1 // indirect
|
||||
github.com/minio/md5-simd v1.1.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/pdfcpu/pdfcpu v0.11.1 // indirect
|
||||
github.com/philhofer/fwd v1.2.0 // indirect
|
||||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/prometheus/client_golang v1.23.2 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.66.1 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/redis/go-redis/v9 v9.18.0 // indirect
|
||||
github.com/robfig/cron/v3 v3.0.1 // indirect
|
||||
github.com/rs/xid v1.6.0 // indirect
|
||||
github.com/spf13/cast v1.10.0 // indirect
|
||||
github.com/tinylib/msgp v1.6.1 // indirect
|
||||
github.com/yuin/goldmark v1.8.2 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||
go.opentelemetry.io/contrib/bridges/otelslog v0.17.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect
|
||||
go.opentelemetry.io/otel v1.42.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.18.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.42.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.42.0 // indirect
|
||||
go.opentelemetry.io/otel/log v0.18.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.42.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk v1.42.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk/log v0.18.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.42.0 // indirect
|
||||
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
|
||||
go.uber.org/atomic v1.11.0 // indirect
|
||||
@@ -78,5 +75,4 @@ require (
|
||||
google.golang.org/grpc v1.79.2 // indirect
|
||||
google.golang.org/protobuf v1.36.11 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
||||
@@ -2,6 +2,10 @@ github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7X
|
||||
github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
|
||||
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
||||
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
@@ -12,14 +16,16 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/dslipak/pdf v0.0.2 h1:djAvcM5neg9Ush+zR6QXB+VMJzR6TdnX766HPIg1JmI=
|
||||
github.com/dslipak/pdf v0.0.2/go.mod h1:2L3SnkI9cQwnAS9gfPz2iUoLC0rUZwbucpbKi5R1mUo=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
|
||||
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
|
||||
github.com/getsentry/sentry-go v0.43.0 h1:XbXLpFicpo8HmBDaInk7dum18G9KSLcjZiyUKS+hLW4=
|
||||
github.com/getsentry/sentry-go v0.43.0/go.mod h1:XDotiNZbgf5U8bPDUAfvcFmOnMQQceESxyKaObSssW0=
|
||||
github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA=
|
||||
github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og=
|
||||
github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A=
|
||||
github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8=
|
||||
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
|
||||
@@ -29,6 +35,10 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
|
||||
@@ -50,6 +60,12 @@ github.com/klauspost/cpuid/v2 v2.2.11 h1:0OwqZRYI2rFrjS4kvkDnqJkKHdHaRnCm68/DY4O
|
||||
github.com/klauspost/cpuid/v2 v2.2.11/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
|
||||
github.com/klauspost/crc32 v1.3.0 h1:sSmTt3gUt81RP655XGZPElI0PelVTZ6YwCRnPSupoFM=
|
||||
github.com/klauspost/crc32 v1.3.0/go.mod h1:D7kQaZhnkX/Y0tstFGf8VUzv2UofNGqCjnC3zdHB0Hw=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
|
||||
github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
|
||||
github.com/meilisearch/meilisearch-go v0.36.1 h1:mJTCJE5g7tRvaqKco6DfqOuJEjX+rRltDEnkEC02Y0M=
|
||||
@@ -66,42 +82,40 @@ github.com/pdfcpu/pdfcpu v0.11.1 h1:htHBSkGH5jMKWC6e0sihBFbcKZ8vG1M67c8/dJxhjas=
|
||||
github.com/pdfcpu/pdfcpu v0.11.1/go.mod h1:pP3aGga7pRvwFWAm9WwFvo+V68DfANi9kxSQYioNYcw=
|
||||
github.com/philhofer/fwd v1.2.0 h1:e6DnBTl7vGY+Gz322/ASL4Gyp1FspeMvx1RNDoToZuM=
|
||||
github.com/philhofer/fwd v1.2.0/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
|
||||
github.com/pingcap/errors v0.11.4 h1:lFuQV/oaUMGcD2tqt+01ROSmJs75VG1ToEOkZIZ4nE4=
|
||||
github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y=
|
||||
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
|
||||
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
|
||||
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
|
||||
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
|
||||
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
|
||||
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
|
||||
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
|
||||
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
|
||||
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
|
||||
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
|
||||
github.com/redis/go-redis/v9 v9.18.0 h1:pMkxYPkEbMPwRdenAzUNyFNrDgHx9U+DrBabWNfSRQs=
|
||||
github.com/redis/go-redis/v9 v9.18.0/go.mod h1:k3ufPphLU5YXwNTUcCRXGxUoF1fqxnhFQmscfkCoDA0=
|
||||
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
|
||||
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
|
||||
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
|
||||
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
|
||||
github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU=
|
||||
github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
|
||||
github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY=
|
||||
github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/tinylib/msgp v1.6.1 h1:ESRv8eL3u+DNHUoSAAQRE50Hm162zqAnBoGv9PzScPY=
|
||||
github.com/tinylib/msgp v1.6.1/go.mod h1:RSp0LW9oSxFut3KzESt5Voq4GVWyS+PSulT77roAqEA=
|
||||
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
|
||||
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
|
||||
github.com/yuin/goldmark v1.8.2 h1:kEGpgqJXdgbkhcOgBxkC0X0PmoPG1ZyoZ117rDVp4zE=
|
||||
github.com/yuin/goldmark v1.8.2/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
|
||||
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
|
||||
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
||||
go.opentelemetry.io/contrib/bridges/otelslog v0.17.0 h1:NFIS6x7wyObQ7cR84x7bt1sr8nYBx89s3x3GwRjw40k=
|
||||
@@ -124,12 +138,18 @@ go.opentelemetry.io/otel/sdk v1.42.0 h1:LyC8+jqk6UJwdrI/8VydAq/hvkFKNHZVIWuslJXY
|
||||
go.opentelemetry.io/otel/sdk v1.42.0/go.mod h1:rGHCAxd9DAph0joO4W6OPwxjNTYWghRWmkHuGbayMts=
|
||||
go.opentelemetry.io/otel/sdk/log v0.18.0 h1:n8OyZr7t7otkeTnPTbDNom6rW16TBYGtvyy2Gk6buQw=
|
||||
go.opentelemetry.io/otel/sdk/log v0.18.0/go.mod h1:C0+wxkTwKpOCZLrlJ3pewPiiQwpzycPI/u6W0Z9fuYk=
|
||||
go.opentelemetry.io/otel/sdk/log/logtest v0.18.0 h1:l3mYuPsuBx6UKE47BVcPrZoZ0q/KER57vbj2qkgDLXA=
|
||||
go.opentelemetry.io/otel/sdk/log/logtest v0.18.0/go.mod h1:7cHtiVJpZebB3wybTa4NG+FUo5NPe3PROz1FqB0+qdw=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc=
|
||||
go.opentelemetry.io/otel/trace v1.42.0 h1:OUCgIPt+mzOnaUTpOQcBiM/PLQ/Op7oq6g4LenLmOYY=
|
||||
go.opentelemetry.io/otel/trace v1.42.0/go.mod h1:f3K9S+IFqnumBkKhRJMeaZeNk9epyhnCmQh/EysQCdc=
|
||||
go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
|
||||
go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
|
||||
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
|
||||
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
|
||||
@@ -146,6 +166,8 @@ golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
|
||||
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
|
||||
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
|
||||
golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
|
||||
@@ -154,9 +176,9 @@ google.golang.org/grpc v1.79.2 h1:fRMD94s2tITpyJGtBBn7MkMseNpOZU8ZxgC3MMBaXRU=
|
||||
google.golang.org/grpc v1.79.2/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
|
||||
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
|
||||
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
|
||||
@@ -6,16 +6,17 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/dslipak/pdf"
|
||||
"github.com/libnovel/backend/internal/bookstore"
|
||||
"github.com/libnovel/backend/internal/domain"
|
||||
minio "github.com/minio/minio-go/v7"
|
||||
"github.com/pdfcpu/pdfcpu/pkg/api"
|
||||
pdfcpu "github.com/pdfcpu/pdfcpu/pkg/pdfcpu"
|
||||
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
@@ -60,6 +61,7 @@ func (i *importer) Import(ctx context.Context, objectKey, fileType string) ([]bo
|
||||
// chapter count and up to 3 preview lines (first non-empty line of each of
|
||||
// the first 3 chapters). It is used by the analyze-only endpoint so users
|
||||
// can preview chapter count before committing the import.
|
||||
// Note: uses parsePDF which is backed by pdfcpu ExtractContent — fast, no hang risk.
|
||||
func AnalyzeFile(data []byte, fileType string) (chapterCount int, firstLines []string, err error) {
|
||||
var chapters []bookstore.Chapter
|
||||
switch fileType {
|
||||
@@ -139,48 +141,469 @@ func ParseImportFile(ctx context.Context, data []byte, fileType string) ([]books
|
||||
}
|
||||
}
|
||||
|
||||
// pdfSkipBookmarks is the set of PDF bookmark titles that denote front or back
// matter rather than story chapters. Bookmarks whose title matches an entry are
// left out of the chapter list.
//
// chaptersFromBookmarks lower-cases and trims a title before looking it up, so
// every key here must be lower-case with no surrounding whitespace.
var pdfSkipBookmarks = map[string]bool{
	"cover":               true,
	"insert":              true,
	"title page":          true,
	"copyright":           true,
	"appendix":            true,
	"color insert":        true,
	"color illustrations": true,
}
|
||||
|
||||
// parsePDF extracts chapters from PDF bytes.
|
||||
//
|
||||
// Strategy:
|
||||
// 1. Decrypt owner-protected PDFs (empty user password).
|
||||
// 2. Read the PDF outline (bookmarks) — these give chapter titles and page ranges.
|
||||
// 3. Extract raw content streams for every page using pdfcpu ExtractContent.
|
||||
// 4. For each story bookmark, concatenate the extracted text of its pages.
|
||||
//
|
||||
// Falls back to paragraph-splitting when no bookmarks are found.
|
||||
// This is fast (~100ms for a 250-page PDF) because it avoids font-glyph
|
||||
// resolution which causes older PDF libraries to hang on publisher PDFs.
|
||||
func parsePDF(data []byte) ([]bookstore.Chapter, error) {
|
||||
// If the PDF is encrypted, try to decrypt it with an empty password.
|
||||
// Many publisher PDFs use owner-only encryption (copy/print restrictions)
|
||||
// with an empty user password, so they open normally but confuse parsers.
|
||||
// Decrypt owner-protected PDFs (empty user password).
|
||||
decrypted, err := decryptPDF(data)
|
||||
if err == nil {
|
||||
data = decrypted
|
||||
}
|
||||
// (if decryption fails we still attempt to parse — maybe it works anyway)
|
||||
|
||||
r, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
|
||||
conf := model.NewDefaultConfiguration()
|
||||
conf.UserPW = ""
|
||||
conf.OwnerPW = ""
|
||||
|
||||
// Extract all page content streams to a temp directory.
|
||||
tmpDir, err := os.MkdirTemp("", "pdf-extract-*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open PDF: %w", err)
|
||||
return nil, fmt.Errorf("create temp dir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
if err := api.ExtractContent(bytes.NewReader(data), tmpDir, "out", nil, conf); err != nil {
|
||||
return nil, fmt.Errorf("extract PDF content: %w", err)
|
||||
}
|
||||
|
||||
// Extract per-page text so we can detect chapter boundaries.
|
||||
numPages := r.NumPage()
|
||||
if numPages == 0 {
|
||||
return nil, fmt.Errorf("PDF has no pages")
|
||||
entries, err := os.ReadDir(tmpDir)
|
||||
if err != nil || len(entries) == 0 {
|
||||
return nil, fmt.Errorf("PDF has no content pages")
|
||||
}
|
||||
|
||||
// Collect full text first with page markers so we can split by chapter.
|
||||
// pdfcpu names files "out_Content_page_N.txt" — parse the page number
|
||||
// from the filename so the map is correct regardless of lexicographic order.
|
||||
pageTexts := make(map[int]string, len(entries))
|
||||
for _, e := range entries {
|
||||
pageNum := pageNumFromFilename(e.Name())
|
||||
if pageNum <= 0 {
|
||||
continue
|
||||
}
|
||||
raw, readErr := os.ReadFile(tmpDir + "/" + e.Name())
|
||||
if readErr != nil {
|
||||
continue
|
||||
}
|
||||
pageTexts[pageNum] = fixWin1252(extractTextFromContentStream(raw))
|
||||
}
|
||||
|
||||
// Try to use bookmarks (outline) for chapter structure.
|
||||
bookmarks, bmErr := api.Bookmarks(bytes.NewReader(data), conf)
|
||||
if bmErr == nil && len(bookmarks) > 0 {
|
||||
chapters := chaptersFromBookmarks(bookmarks, pageTexts)
|
||||
if len(chapters) > 0 {
|
||||
return chapters, nil
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: concatenate all page texts in page order and split by heading patterns.
|
||||
var sb strings.Builder
|
||||
fonts := make(map[string]*pdf.Font)
|
||||
for i := 1; i <= numPages; i++ {
|
||||
page := r.Page(i)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
maxPage := 0
|
||||
for p := range pageTexts {
|
||||
if p > maxPage {
|
||||
maxPage = p
|
||||
}
|
||||
text, err := page.GetPlainText(fonts)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sb.WriteString(text)
|
||||
}
|
||||
for p := 1; p <= maxPage; p++ {
|
||||
sb.WriteString(pageTexts[p])
|
||||
sb.WriteByte('\n')
|
||||
}
|
||||
chapters := extractChaptersFromText(sb.String())
|
||||
if len(chapters) == 0 {
|
||||
return nil, fmt.Errorf("could not extract any chapters from PDF")
|
||||
}
|
||||
return chapters, nil
|
||||
}
|
||||
|
||||
return extractChaptersFromText(sb.String()), nil
|
||||
// chaptersFromBookmarks builds a chapter list from PDF bookmarks + per-page text.
|
||||
// It flattens the bookmark tree, skips front/back matter entries, and assigns
|
||||
// page ranges so each chapter spans from its own start page to the next
|
||||
// bookmark's start page minus one.
|
||||
func chaptersFromBookmarks(bookmarks []pdfcpu.Bookmark, pageTexts map[int]string) []bookstore.Chapter {
|
||||
// Flatten bookmark tree.
|
||||
var flat []pdfcpu.Bookmark
|
||||
var flatten func([]pdfcpu.Bookmark)
|
||||
flatten = func(bms []pdfcpu.Bookmark) {
|
||||
for _, bm := range bms {
|
||||
flat = append(flat, bm)
|
||||
flatten(bm.Kids)
|
||||
}
|
||||
}
|
||||
flatten(bookmarks)
|
||||
|
||||
// Sort by page number.
|
||||
sort.Slice(flat, func(i, j int) bool { return flat[i].PageFrom < flat[j].PageFrom })
|
||||
|
||||
// Assign PageThru for entries where it's 0 (last bookmark or missing).
|
||||
maxPage := 0
|
||||
for p := range pageTexts {
|
||||
if p > maxPage {
|
||||
maxPage = p
|
||||
}
|
||||
}
|
||||
for i := range flat {
|
||||
if flat[i].PageThru == 0 {
|
||||
if i+1 < len(flat) {
|
||||
flat[i].PageThru = flat[i+1].PageFrom - 1
|
||||
} else {
|
||||
flat[i].PageThru = maxPage
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var chapters []bookstore.Chapter
|
||||
chNum := 0
|
||||
for _, bm := range flat {
|
||||
if pdfSkipBookmarks[strings.ToLower(strings.TrimSpace(bm.Title))] {
|
||||
continue
|
||||
}
|
||||
// Gather text for all pages in this bookmark's range.
|
||||
// The first page of each chapter is typically a decorative title page
|
||||
// (chapter number, subtitle art, series title) — skip it and start
|
||||
// from PageFrom+1 so the content begins with actual story text.
|
||||
bodyStart := bm.PageFrom + 1
|
||||
if bodyStart > bm.PageThru {
|
||||
bodyStart = bm.PageFrom // single-page section, use it
|
||||
}
|
||||
var sb strings.Builder
|
||||
for p := bodyStart; p <= bm.PageThru; p++ {
|
||||
if t, ok := pageTexts[p]; ok {
|
||||
sb.WriteString(t)
|
||||
sb.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
text := cleanChapterText(strings.TrimSpace(sb.String()))
|
||||
if len(text) < 50 {
|
||||
continue // skip nearly-empty sections
|
||||
}
|
||||
chNum++
|
||||
chapters = append(chapters, bookstore.Chapter{
|
||||
Number: chNum,
|
||||
Title: bm.Title,
|
||||
Content: text,
|
||||
})
|
||||
}
|
||||
return chapters
|
||||
}
|
||||
|
||||
// cleanChapterText removes decorative header fragments that sometimes appear
// at the start of the first body page when the chapter subtitle is printed at
// the top of that page (e.g. "for New Journeys!I stood atop the roof...").
//
// It works in two passes:
//
//  1. Leading lines are dropped until the first line that is at least 40
//     bytes long or that starts with an opening double quote (U+201C or '"').
//     Blank and short lines before it are treated as title/header text.
//  2. On the first remaining line, the text up to and including the first '!'
//     or '?' that is immediately followed by an ASCII capital letter or an
//     opening double quote is stripped, since that is a title fragment run
//     together with the story text.
//
// If cleaning would leave nothing, the original text is returned unchanged.
func cleanChapterText(text string) string {
	lines := strings.Split(text, "\n")

	// Pass 1: find the first line that looks like story content. Every blank
	// or short line seen before it moves the start past itself.
	start := 0
	for i, raw := range lines {
		line := strings.TrimSpace(raw)
		if line != "" && (len(line) >= 40 || strings.HasPrefix(line, "\u201C") || strings.HasPrefix(line, "\"")) {
			start = i
			break
		}
		start = i + 1
	}

	result := strings.TrimSpace(strings.Join(lines[start:], "\n"))

	// Pass 2: strip a run-on title fragment from the first line,
	// e.g. "for New Journeys!I stood..." → "I stood...".
	firstLine, _, _ := strings.Cut(result, "\n")
	for i := 0; i+1 < len(firstLine); i++ {
		if c := firstLine[i]; c != '!' && c != '?' {
			continue
		}
		// firstLine is a prefix of result, so byte offsets are shared. The
		// curly quote is multi-byte in UTF-8 and must be matched as a string
		// prefix; comparing a single byte against it can never succeed.
		rest := firstLine[i+1:]
		if (rest[0] >= 'A' && rest[0] <= 'Z') || rest[0] == '"' || strings.HasPrefix(rest, "\u201C") {
			result = strings.TrimSpace(result[i+1:])
			break
		}
	}

	if result == "" {
		return text
	}
	return result
}
|
||||
|
||||
// pageNumFromFilename returns the page number encoded at the end of a pdfcpu
// content-stream filename such as "out_Content_page_42.txt".
//
// Any "/"-separated directory prefix and the final extension are ignored; the
// number is whatever follows the last "_" in the remaining stem. The result is
// 0 when there is no "_", the suffix is not an integer, or it is not positive.
func pageNumFromFilename(name string) int {
	// LastIndex yields -1 when there is no slash, so +1 keeps the whole name.
	stem := name[strings.LastIndex(name, "/")+1:]
	if dot := strings.LastIndex(stem, "."); dot >= 0 {
		stem = stem[:dot]
	}

	sep := strings.LastIndex(stem, "_")
	if sep < 0 {
		return 0
	}
	num, err := strconv.Atoi(stem[sep+1:])
	if err != nil || num <= 0 {
		return 0
	}
	return num
}
|
||||
|
||||
// win1252ToUnicode maps bytes in the range 0x80–0x9F to the Unicode characters
// Windows-1252 assigns to them. Latin-1 treats these bytes as control
// characters; Windows-1252 uses them for typographic symbols (curly quotes,
// dashes, ellipsis, ...). The bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no
// Windows-1252 assignment and are deliberately absent.
var win1252ToUnicode = map[byte]rune{
	0x80: '\u20AC', // €
	0x82: '\u201A', // ‚
	0x83: '\u0192', // ƒ
	0x84: '\u201E', // „
	0x85: '\u2026', // …
	0x86: '\u2020', // †
	0x87: '\u2021', // ‡
	0x88: '\u02C6', // ˆ
	0x89: '\u2030', // ‰
	0x8A: '\u0160', // Š
	0x8B: '\u2039', // ‹
	0x8C: '\u0152', // Œ
	0x8E: '\u017D', // Ž
	0x91: '\u2018', // ‘ (left single quotation mark)
	0x92: '\u2019', // ’ (right single quotation mark / apostrophe)
	0x93: '\u201C', // “ (left double quotation mark)
	0x94: '\u201D', // ” (right double quotation mark)
	0x95: '\u2022', // • (bullet)
	0x96: '\u2013', // – (en dash)
	0x97: '\u2014', // — (em dash)
	0x98: '\u02DC', // ˜
	0x99: '\u2122', // ™
	0x9A: '\u0161', // š
	0x9B: '\u203A', // ›
	0x9C: '\u0153', // œ
	0x9E: '\u017E', // ž
	0x9F: '\u0178', // Ÿ
}

// fixWin1252 converts a string holding raw single-byte Windows-1252 text (one
// byte per character, as taken from a PDF content stream) into valid UTF-8.
//
// Bytes below 0x80 are copied as-is. Bytes 0x80–0x9F are translated through
// win1252ToUnicode. Every other high byte — 0xA0–0xFF, plus the few 0x80–0x9F
// values Windows-1252 leaves undefined — is emitted as the Unicode code point
// with the same number, which is its Latin-1 meaning. Copying such bytes
// through unchanged would leave invalid UTF-8 in the result.
//
// The input must not already be UTF-8: multi-byte sequences would be
// transcoded byte by byte.
func fixWin1252(s string) string {
	// Fast path: pure ASCII is already valid UTF-8.
	first := -1
	for i := 0; i < len(s); i++ {
		if s[i] >= 0x80 {
			first = i
			break
		}
	}
	if first < 0 {
		return s
	}

	var sb strings.Builder
	// Each converted byte grows to 2–3 bytes; leave some headroom up front.
	sb.Grow(len(s) + len(s)/8 + 4)
	sb.WriteString(s[:first])
	for i := first; i < len(s); i++ {
		b := s[i]
		if b < 0x80 {
			sb.WriteByte(b)
			continue
		}
		if r, ok := win1252ToUnicode[b]; ok {
			sb.WriteRune(r)
			continue
		}
		sb.WriteRune(rune(b))
	}
	return sb.String()
}
|
||||
|
||||
// extractTextFromContentStream parses a raw PDF content stream and extracts
|
||||
// readable text from Tj and TJ operators.
|
||||
//
|
||||
// TJ arrays may contain a mix of literal strings (parenthesised) and hex glyph
|
||||
// arrays. Only the literal strings are decoded — hex arrays require per-font
|
||||
// ToUnicode CMaps and are skipped. Kerning adjustment numbers inside TJ arrays
|
||||
// are also ignored (they're just spacing hints).
|
||||
//
|
||||
// Line breaks are inserted on ET / Td / TD / T* operators.
|
||||
func extractTextFromContentStream(stream []byte) string {
|
||||
s := string(stream)
|
||||
var sb strings.Builder
|
||||
i := 0
|
||||
n := len(s)
|
||||
for i < n {
|
||||
// TJ array: [ ... ]TJ — collect all literal strings, skip hex & numbers.
|
||||
if s[i] == '[' {
|
||||
j := i + 1
|
||||
for j < n && s[j] != ']' {
|
||||
if s[j] == '(' {
|
||||
// Literal string inside TJ array.
|
||||
k := j + 1
|
||||
depth := 1
|
||||
for k < n && depth > 0 {
|
||||
if s[k] == '\\' {
|
||||
k += 2
|
||||
continue
|
||||
}
|
||||
if s[k] == '(' {
|
||||
depth++
|
||||
} else if s[k] == ')' {
|
||||
depth--
|
||||
}
|
||||
k++
|
||||
}
|
||||
lit := pdfUnescapeString(s[j+1 : k-1])
|
||||
if hasPrintableASCII(lit) {
|
||||
sb.WriteString(lit)
|
||||
}
|
||||
j = k
|
||||
continue
|
||||
}
|
||||
j++
|
||||
}
|
||||
// Check if this is a TJ operator (skip whitespace after ']').
|
||||
end := j + 1
|
||||
for end < n && (s[end] == ' ' || s[end] == '\t' || s[end] == '\r' || s[end] == '\n') {
|
||||
end++
|
||||
}
|
||||
if end+2 <= n && s[end:end+2] == "TJ" && (end+2 == n || !isAlphaNum(s[end+2])) {
|
||||
i = end + 2
|
||||
continue
|
||||
}
|
||||
i = j + 1
|
||||
continue
|
||||
}
|
||||
// Single string: (string) Tj
|
||||
if s[i] == '(' {
|
||||
j := i + 1
|
||||
depth := 1
|
||||
for j < n && depth > 0 {
|
||||
if s[j] == '\\' {
|
||||
j += 2
|
||||
continue
|
||||
}
|
||||
if s[j] == '(' {
|
||||
depth++
|
||||
} else if s[j] == ')' {
|
||||
depth--
|
||||
}
|
||||
j++
|
||||
}
|
||||
lit := pdfUnescapeString(s[i+1 : j-1])
|
||||
if hasPrintableASCII(lit) {
|
||||
// Check for Tj operator.
|
||||
end := j
|
||||
for end < n && (s[end] == ' ' || s[end] == '\t') {
|
||||
end++
|
||||
}
|
||||
if end+2 <= n && s[end:end+2] == "Tj" && (end+2 == n || !isAlphaNum(s[end+2])) {
|
||||
sb.WriteString(lit)
|
||||
i = end + 2
|
||||
continue
|
||||
}
|
||||
}
|
||||
i = j
|
||||
continue
|
||||
}
|
||||
// Detect end of text object (ET) — add a newline.
|
||||
if i+2 <= n && s[i:i+2] == "ET" && (i+2 == n || !isAlphaNum(s[i+2])) {
|
||||
sb.WriteByte('\n')
|
||||
i += 2
|
||||
continue
|
||||
}
|
||||
// Detect Td / TD / T* — newline within text block.
|
||||
if i+2 <= n && (s[i:i+2] == "Td" || s[i:i+2] == "TD" || s[i:i+2] == "T*") &&
|
||||
(i+2 == n || !isAlphaNum(s[i+2])) {
|
||||
sb.WriteByte('\n')
|
||||
i += 2
|
||||
continue
|
||||
}
|
||||
i++
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// isAlphaNum reports whether b is an ASCII letter, an ASCII digit, or an
// underscore.
func isAlphaNum(b byte) bool {
	switch {
	case 'a' <= b && b <= 'z',
		'A' <= b && b <= 'Z',
		'0' <= b && b <= '9':
		return true
	default:
		return b == '_'
	}
}
|
||||
|
||||
// hasPrintableASCII reports whether s contains at least one printable ASCII
// character, i.e. one in the range 0x20 (space) through 0x7E (~).
func hasPrintableASCII(s string) bool {
	// Bytes below 0x80 only ever occur in UTF-8 as stand-alone ASCII
	// characters, so a byte scan finds exactly the same characters as a
	// rune scan would.
	for i := 0; i < len(s); i++ {
		if b := s[i]; b >= 0x20 && b < 0x7F {
			return true
		}
	}
	return false
}
|
||||
|
||||
// pdfUnescapeString decodes the backslash escape sequences of a PDF literal
// string (the text between the outer parentheses) and returns the raw bytes
// as a string.
//
// Recognised escapes:
//
//	\n \r \t \b \f   line feed, carriage return, tab, backspace, form feed
//	\( \) \\         the literal character
//	\ddd             one to three octal digits; the value is truncated to a byte
//	\<EOL>           line continuation (CR, LF or CRLF) — produces nothing
//
// A backslash before any other character is dropped and the character is
// kept. A lone backslash at the very end of s is copied through unchanged.
// Strings without any backslash are returned as is, without allocating.
func pdfUnescapeString(s string) string {
	if !strings.ContainsRune(s, '\\') {
		return s
	}

	var sb strings.Builder
	sb.Grow(len(s)) // the decoded form is never longer than the input
	for i := 0; i < len(s); {
		c := s[i]
		if c != '\\' || i+1 >= len(s) {
			sb.WriteByte(c)
			i++
			continue
		}

		esc := s[i+1]
		i += 2
		switch esc {
		case 'n':
			sb.WriteByte('\n')
		case 'r':
			sb.WriteByte('\r')
		case 't':
			sb.WriteByte('\t')
		case 'b':
			sb.WriteByte('\b')
		case 'f':
			sb.WriteByte('\f')
		case '\n':
			// Line continuation: the backslash and the newline vanish.
		case '\r':
			// Line continuation; a CRLF pair counts as a single end-of-line.
			if i < len(s) && s[i] == '\n' {
				i++
			}
		case '0', '1', '2', '3', '4', '5', '6', '7':
			// Octal escape: at most three digits in total. Any following
			// digit belongs to the string text, not to the escape.
			val := int(esc - '0')
			for extra := 0; extra < 2 && i < len(s) && s[i] >= '0' && s[i] <= '7'; extra++ {
				val = val*8 + int(s[i]-'0')
				i++
			}
			sb.WriteByte(byte(val)) // high-order overflow is ignored
		default:
			// Covers \( \) \\ and any unrecognised escape: keep the
			// character, drop the backslash.
			sb.WriteByte(esc)
		}
	}
	return sb.String()
}
|
||||
|
||||
// ── EPUB parsing ──────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user