Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Iterating PDF file #8

Open
MariSelvanDev opened this issue Mar 20, 2018 · 3 comments
Open

Iterating PDF file #8

MariSelvanDev opened this issue Mar 20, 2018 · 3 comments

Comments

@MariSelvanDev
Copy link

MariSelvanDev commented Mar 20, 2018

How can I convert the nth page of a PDF file to a PNG image?
What I mean is:
I converted a PDF file that contains 8 pages, but only the first page was converted to a PNG image; I need all of the pages.

Thanks in advance.

@MariSelvanDev MariSelvanDev changed the title Iterating in PDF file Iterating PDF file Mar 20, 2018
@thnew
Copy link
Owner

thnew commented Mar 21, 2018

Hi, this is not possible with my project yet, but you can fork and modify it to make it possible. When I have time I will do it as well. I described how to modify it here:
#6

@dipendra210
Copy link

I also need to convert multiple PDF pages into PNGs.
Can somebody help me?
Thanks in advance.

@dipendra210
Copy link

I implemented pdf2png.js by following thnew's suggestion.
It works well (but only when you pass the option { returnFilePath: true }).
Here is my code.
pdf2png.js

var exec = require('child_process').exec;
var tmp = require('tmp');
var fs = require('fs');
var filesource = require('filesource');

// Set to true by convert() once its one-time Ghostscript PATH setup has run.
var initialized = false;

// Locate the bundled Ghostscript executables:
//   <parent directory of this module>/executables/ghostScript
// path.dirname/path.join are used instead of splitting __dirname on "\" so the
// parent directory is also found when __dirname uses "/" separators.
var path = require('path');
var projectPath = path.dirname(__dirname);

exports.ghostscriptPath = path.join(projectPath, "executables", "ghostScript");

// for linux compatibility: always expose the path with forward slashes
exports.ghostscriptPath = exports.ghostscriptPath.split("\\").join("/");

exports.convert = function() {
var filepathOrData = arguments[0];
var callback = arguments[1];
var options = {};
var pageCount = 1;

var tmpFileCreated = false;

if(arguments[3] != null)
{
	options = arguments[1];
	pageCount = arguments[2]
	callback = arguments[3];
}

if(!initialized)
{
	if(!options.useLocalGhostscript)
	{
		process.env.Path += ";" + exports.ghostscriptPath;
	}
	
	initialized = true;
}

options.quality = options.quality || 100;

filesource.getDataPath(filepathOrData, function(resp){
	if(!resp.success)
	{
		callback(resp);
		return;
	}
	
	// get temporary filepath
	tmp.file({ postfix: ".png" }, function(err, imageFilepath, fd) {
		if(err)
		{
			callback({ success: false, error: "Error getting second temporary filepath: " + err });
			return;
		}

		const fileName = imageFilepath.substring(0, imageFilepath.length - 4);
	
		//exec("gs -dQUIET -dPARANOIDSAFER -dBATCH -dNOPAUSE -dNOPROMPT -sDEVICE=png16m -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r" + options.quality + " -dFirstPage=1 -dLastPage=2 -sOutputFile=" + imageFilepath + " " + resp.data, function (error, stdout, stderr) {
		
		exec("gs -dQUIET -dPARANOIDSAFER -dBATCH -dNOPAUSE -dNOPROMPT -sDEVICE=png16m -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r" + options.quality + " -dFirstPage=1 -dLastPage=" + pageCount + " -sOutputFile=" + fileName + "-%d.png " + resp.data, function (error, stdout, stderr) {
			// Remove temp files
			resp.clean();
			
			if(error !== null)
			{
				callback({ success: false, error: "Error converting pdf to png: " + error });
				return;
			}

			let imgPaths = [];
			for (let i = 0; i < pageCount; i++) {
				imgPaths.push(fileName + "-" + (i + 1) + ".png");
			}

			if(options.returnFilePath)
			{
				//callback({ success: true, data: imageFilepath });
				callback({ success: true, data: imgPaths });
				return;
			}
			
			var img = fs.readFileSync(imageFilepath);
			
			// Remove temp file
			fs.unlinkSync(imageFilepath);
			
			callback({ success: true, data: img });
		});
	});
});
};

api.js ( calling pdf2png.convert() function )
// Count the pages, convert every page to a PNG, OCR each PNG, and respond with
// the concatenated text in page order. `res`, `targetFile`, `pdfPageCount`,
// `pdf2png` and `Tesseract` come from the surrounding code.
pdfPageCount.count(targetFile, function(resp){
	if(!resp.success)
	{
		console.log("Something went wrong: " + resp.error);

		// Answer the request so the client is not left waiting.
		res.send( { result: "Can't handle the PDF file." } );
		return;
	}

	// pdf page count
	const pageCount = resp.data;

	pdf2png.convert(targetFile, { returnFilePath: true }, pageCount, function(converted) {
		if(!converted.success) {
			console.log("Something went wrong: " + converted.error);

			res.send( { result: "Can't handle the PDF file." } );
			// Stop here: there are no image paths to OCR and the response has been sent.
			return;
		}

		// Start one OCR job per page; the array keeps the jobs in page order.
		const ocrJobs = [];
		for (let i = 0; i < pageCount; i++) {
			console.log("Yayy the pdf got converted, now I'm gonna ocr it!");

			ocrJobs.push(
				Tesseract.recognize(converted.data[i])
					.progress((p) => {
						// progress updates are intentionally ignored
					})
			);
		}

		// Wait for every page before responding. Jobs can finish in any order,
		// so the text is assembled from the ordered results rather than
		// appended as each job completes.
		Promise.all(ocrJobs)
			.then((results) => {
				const ocrResult = results.map((result) => result.text).join('');
				res.send({result: ocrResult});
				console.log(ocrResult);
			})
			.catch((err) => {
				console.log("Something went wrong: " + err);
				res.send( { result: "Can't handle the PDF file." } );
			});
	});
});

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants