scantools  1.0.8
Graphics manipulation with a view towards scanned documents
PDFAWriter.h
1 /*
2  * Copyright © 2016 - 2020 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3  *
4  * This program is free software: you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation, either version 3 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 
19 #ifndef PDFDOCUMENT
20 #define PDFDOCUMENT 1
21 
22 #include <QFuture>
23 #include <QList>
24 #include <QReadWriteLock>
25 #include <QString>
26 
27 #include "HOCRDocument.h"
28 #include "JBIG2Document.h"
29 #include "paperSize.h"
30 #include "resolution.h"
31 
32 
127 class PDFAWriter : public QObject
128 {
129  Q_OBJECT
130  Q_PROPERTY(QString author READ author WRITE setAuthor NOTIFY authorChanged)
131  Q_PROPERTY(QString keywords READ keywords WRITE setKeywords NOTIFY keywordsChanged)
132  Q_PROPERTY(QString subject READ subject WRITE setSubject NOTIFY subjectChanged)
133  Q_PROPERTY(QString title READ title WRITE setTitle NOTIFY titleChanged)
134  Q_PROPERTY(paperSize pageSize READ pageSize WRITE setPageSize NOTIFY pageSizeChanged)
135  Q_PROPERTY(resolution resolutionOverrideHorizontal READ resolutionOverrideHorizontal WRITE setResolutionOverrideHorizontal NOTIFY resolutionOverrideHorizontalChanged)
136  Q_PROPERTY(resolution resolutionOverrideVertical READ resolutionOverrideVertical WRITE setResolutionOverrideVertical NOTIFY resolutionOverrideVerticalChanged)
137  Q_PROPERTY(bool autoOCR READ autoOCR WRITE setAutoOCR NOTIFY autoOCRChanged)
138  Q_PROPERTY(QStringList autoOCRLanguages READ autoOCRLanguages WRITE setAutoOCRLanguages NOTIFY autoOCRLanguagesChanged)
139 
140  public:
147 
170  explicit PDFAWriter(bool bestCompression=false, QObject* parent=nullptr);
171 
175  QString author();
176 
181  void setAuthor(const QString &author);
182 
186  QString keywords();
187 
192  void setKeywords(const QString &keywords);
193 
198  QString subject();
199 
204  void setSubject(const QString &subject);
205 
209  QString title();
210 
215  void setTitle(const QString &title);
216 
222 
227  void setPageSize(const paperSize size);
228 
234 
240 
248 
254 
262 
276  void setResolutionOverride(resolution horizontal, resolution vertical);
277 
283  {
284  setResolutionOverride(res, res);
285  }
286 
289  {
291  }
292 
296  bool autoOCR();
297 
307  void setAutoOCR(bool autoOCR);
308 
313  QStringList autoOCRLanguages();
314 
332  QString setAutoOCRLanguages(const QStringList& nOCRLanguages);
333 
349  void appendToOCRData(const HOCRDocument &doc);
350 
358 
363  void clearOCRData();
364 
398  QString addPages(const QImage &image, QStringList *warnings=0);
399 
419  QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0);
420 
489  QString addPages(const QString &imageFileName, QStringList *warnings=0);
490 
502  operator QByteArray();
503 
504  public slots:
517 
518  signals:
521 
524 
527 
529  void titleChanged();
530 
533 
536 
539 
542 
545 
553  void finished();
554 
567  void progress(qreal percentage);
568 
569  private:
570  // Meta data
571  QString _author, _keywords, _subject, _title;
572 
573  // Paper size
574  paperSize _pageSize;
575 
576  // HOCR Document
577  HOCRDocument userSpecifiedOCRData;
578  QStringList OCRLanguages;
579  bool _autoOCR;
580 
581  // Override resolutions
582  resolution horizontalResolutionOverride;
583  resolution verticalResolutionOverride;
584 
585  // This private method adds a JBIG2 image to the PDF document. It differs from
586  // the generic method addPages() only in the arguments: it expects the name of
587  // a JBIG2 file instead of an arbitrary graphics file.
588  //
589  // The image will be embedded in the PDF without re-encoding. The method does
590  // not check in detail if the file complies with the JBIG2 standard. If
591  // invalid input data is fed into this method, then the resulting PDF file
592  // might possibly not comply to the PDF/A standard.
593  QString addJBIG2(const QString &fileName, QStringList *warnings=0);
594 
595  // This private method adds a JPEG image to the PDF document. It differs from
596  // the generic method addPages() only in the arguments: it expects the name of
597  // a JPEG file instead of an arbitrary graphics file.
598  //
599  // The image will be embedded in the PDF without re-encoding. The method does
600  // not check in detail if the file complies with the JPEG standard. If
601  // invalid input data is fed into this method, then the resulting PDF file
602  // might possibly not comply to the PDF/A standard.
603  QString addJPEG(const QString &fileName);
604 
605  // This private method adds a JPEG2000 (ISO/IEC 15444-2) image to the PDF
606  // document. The method expects a JPX or JPF file, and NOT a JP2 file. It
607  // differs from the generic method addPages() only in the arguments: it
608  // expects the name of a JPEG2000 file instead of an arbitrary graphics file.
609  //
610  // The image will be embedded in the PDF without re-encoding. The method does
611  // not check in detail if the file complies with the JPEG standard. If
612  // invalid input data is fed into this method, then the resulting PDF file
613  // might possibly not comply to the PDF/A standard.
614  QString addJPX(const QString &fileName);
615 
616  // This private method adds a TIFF image to the PDF document. The method
617  // exists because QImageReader cannot handle multi-page TIFF files. The method
618  // reads all images contained in the file, and calls addImage() to add them to
619  // the PDF
620  QString addTIFF(const QString &fileName);
621 
622  // This private method is used internally to generate a page containing a
623  // given graphicObject, and optionally a text overlay. This method assumes
624  // that the arguments have been checked and are correct. It also assumes that
625  // the PDFAWriter has been locked for writing.
626  void addGFXPage(quint32 graphicObjectIndex, const imageInfo& bInfo, const QImage& imageForOCR = QImage());
627 
628  // Lock used to provide thread-safety
629  QReadWriteLock lock;
630 
631  // PDF protoObject. Holds the bytes of one PDF object, supplied either directly as a QByteArray or as a QFuture<QByteArray> whose result is fetched when the object is converted to QByteArray.
632  class protoObject {
633  public:
634  // cppcheck-suppress noExplicitConstructor
635  protoObject(QByteArray _data) : data(_data) { // Wraps bytes that are already available; 'future' is left default-constructed
636  ;
637  };
638 
639  // cppcheck-suppress noExplicitConstructor
640  protoObject(QFuture<QByteArray> _future) : future(_future) { // Wraps a future; 'data' is left default-constructed until conversion
641  ;
642  };
643 
644  inline operator QByteArray() { // Conversion to the object's bytes; non-const because it may update 'data' and 'future'
645  if (!future.isCanceled()) { // NOTE(review): presumably a default-constructed QFuture reports isCanceled() == true, so this branch is skipped when no future is held; confirm for the Qt version in use
646  data = future.result(); // Store the future's result in 'data' (QFuture::result() waits until the result is available)
647  future = QFuture<QByteArray>(); // Replace the future with a default-constructed one; later conversions are meant to return the stored 'data'
648  }
649  return data;
650  };
651 
652  QString description; // Not set by either constructor
653  QByteArray data; // Bytes of the object; filled by the first constructor or by the conversion operator
654  QFuture<QByteArray> future; // Source of the bytes when constructed from a QFuture
655  };
656 
657  // List of PDF objects
658  QList<protoObject> objects;
659 
660  // Index of the PDF object in the 'objects' list that contains …
661  quint32 catalogObjectIndex; // … the catalog of the PDF file
662  quint32 metaDataObjectIndex; // … the meta data
663  quint32 infoObjectIndex; // … the info object
664  quint32 pageDirectoryObjectIndex; // … the page directory
665  quint32 colorProfileObjectIndex; // … the color profile
666  quint32 fontObjectIndex; // … the font object itself
667 
668  // Use zopfli compression for bitmap graphics
669  bool bestCompression;
670 
671  // Indices of the PDF page objects in the 'objects' list
672  QList<quint32> pageIndices;
673 
674  // Reads file content into QByteArray
675  static QByteArray readFile(const QString& fileName);
676 
677  // Constructs a page directory object
678  QByteArray generatePageDirectoryObject() const;
679 
680  // Takes data from input, checks if zlib compression actually shrinks the
681  // data, and then generates a stream object, either unencoded or zlib encoded.
682  static QByteArray generateStreamObject(const QByteArray &input);
683 
684  // Returns the index of a font object for Times-Roman. Creates the object, if necessary
685  quint32 getFontObjectIndex();
686 
687  // Assumes that the image is black-and-white, as returned by
688  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
689  // PDF object containing the FAX G4 compressed image.
690  static QByteArray createImageObject_bw_G4(const QImage &image);
691 
692  // Assumes that the image is bitonal, as returned by
693  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
694  // PDF object containing the FAX G4 compressed image.
695  static QByteArray createImageObject_bitonal_G4(const QImage &image);
696 
697  // Assumes that the image is grayscale, as returned by
698  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
699  // PDF object containing the zlib/zopfli compressed image.
700  static QByteArray createImageObject_gray_zlib(const QImage &image, bool bestCompression);
701 
702  // Assumes that the image has an indexed palette, as returned by
703  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
704  // PDF object containing the zlib/zopfli compressed image.
705  static QByteArray createImageObject_indexed_zlib(const QImage &image, bool bestCompression);
706 
707  // Assumes that the image is full color, as returned by
708  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
709  // PDF object containing the zlib/zopfli compressed image.
710  static QByteArray createImageObject_rgb_zlib(const QImage &image, bool bestCompression);
711 
712  // Internal method. The method takes a page content stream and generates a
713  // well-compressed pageContent object, using the textBox to create a text
714  // overlay.
715  static QByteArray completePageContentObject_a(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const HOCRTextBox& textBox);
716 
717  // Internal method. The method runs the tesseract OCR engine to create an
718  // HOCRTextBox and then calls completePageContentObject_a
719  static QByteArray completePageContentObject_b(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const QImage& image, const QStringList& OCRLanguages);
720 };
721 
722 #endif
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition sy...
Definition: HOCRDocument.h:42
Text box, as defined in an HOCR file.
Definition: HOCRTextBox.h:45
Reads, writes and renders JBIG2 files, and chops them into pieces for inclusion into a PDF document.
Definition: JBIG2Document.h:40
Simple generator for PDF/A-2b compliant documents.
Definition: PDFAWriter.h:128
void setResolutionOverride(resolution horizontal, resolution vertical)
Sets graphic resolution for future calls of the methods addPages()
void setSubject(const QString &subject)
Set the subject string in the PDF/A meta data.
void setResolutionOverrideVertical(resolution vertical)
Set vertical resolution.
void setPageSize(const paperSize size)
Sets page size, effective for future calls of the methods addPages()
void subjectChanged()
Emitted when subject changes.
QStringList autoOCRLanguages()
List of languages used for OCR.
void waitForWorkerThreads()
Waits for all worker threads to finish.
void setResolutionOverrideHorizontal(resolution horizontal)
Set horizontal resolution.
void pageSizeChanged()
Emitted when pageSize changes.
void resolutionOverrideVerticalChanged()
Emitted when resolutionOverrideVertical changes.
void progress(qreal percentage)
Progress indicator.
void setAutoOCR(bool autoOCR)
Specify if the tesseract OCR engine should be run automatically.
void clearOCRData()
Delete all pages from the internal HOCRDocument.
void resolutionOverrideHorizontalChanged()
Emitted when resolutionOverrideHorizontal changes.
~PDFAWriter()
Destructor.
void autoOCRLanguagesChanged()
Emitted when autoOCRLanguages change.
void setResolutionOverride(resolution res)
Overloaded method that sets horizontal and vertical resolution to the same value.
Definition: PDFAWriter.h:282
paperSize pageSize()
Page Size.
QString title()
Metadata: Title String.
PDFAWriter(bool bestCompression=false, QObject *parent=nullptr)
Constructor.
void titleChanged()
Emitted when title changes.
void setPageSize(paperSize::format size=paperSize::empty)
Sets page size, effective for future calls of the methods addPages()
resolution resolutionOverrideHorizontal()
Horizontal resolution.
QString addPages(const QString &imageFileName, QStringList *warnings=0)
Add images to the PDF document.
void setTitle(const QString &title)
Set the title string in the PDF/A meta data.
void finished()
Emitted just before waitForWorkerThreads() returns.
QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0)
Add JBIG2 images to the PDF document.
QString keywords()
Metadata: Keywords.
bool autoOCR()
AutoOCR.
void authorChanged()
Emitted when author changes.
void setAuthor(const QString &author)
Set the author string in the PDF/A meta data.
HOCRDocument OCRData()
Return a copy of the internal HOCRDocument.
QString addPages(const QImage &image, QStringList *warnings=0)
Add an image to the PDF document.
void autoOCRChanged()
Emitted when autoOCR changes.
QString author()
Metadata: Author.
QString setAutoOCRLanguages(const QStringList &nOCRLanguages)
Specify languages used by the tesseract OCR engine.
void keywordsChanged()
Emitted when keywords change.
QString subject()
Metadata: Subject string.
void clearResolutionOverride()
Set horizontal and vertical override resolution to zero.
Definition: PDFAWriter.h:288
resolution resolutionOverrideVertical()
Vertical resolution.
void setKeywords(const QString &keywords)
Set the keywords string in the PDF/A meta data.
void appendToOCRData(const HOCRDocument &doc)
Specify pre-processed OCR data.
Trivial class to store elementary info about bitmap graphics.
Definition: imageInfo.h:31
The length stores a length and converts between units.
Definition: length.h:38
The paperSize class identifies and stores paper sizes.
Definition: paperSize.h:32
format
List of supported standard sizes.
Definition: paperSize.h:35
@ empty
0x0mm
Definition: paperSize.h:38
The resolution class stores a resolution and converts between units.
Definition: resolution.h:40