Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for pages2txt.bat

(view source code of pages2txt.bat as plain text)

  1. @ECHO OFF
  2. :: Check command line
  3. IF "%~1"=="" GOTO Syntax
  4. IF NOT "%~3"=="" GOTO Syntax
  5. ECHO.%* | FIND "?" >NUL && GOTO Syntax
  6. IF /I NOT "%~x1"==".pages" GOTO Syntax
  7. IF NOT EXIST "%~1" (
  8. 	ECHO ←[1;31mFile not found: "%~1"←[0m
  9. 	GOTO Syntax
  10. )
  11. :: Make %TEMP% the working directory
  12. PUSHD "%TEMP%"
  13. :: Check if files already exist
  14. IF EXIST preview.jpg (
  15. 	ECHO ←[1;33mFile preview.jpg already exists.
  16. 	CHOICE.EXE /D N /T 10 /M "Do you want to delete it?←[0;30m"
  17. 	IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
  18. 		ECHO ←[0m
  19. 		DEL preview.jpg
  20. 	) ELSE (
  21. 		ECHO ←[1;33mPlease move or rename preview.jpg and try again.←[0m
  22. 		POPD
  23. 		EXIT /B 1
  24. 	)
  25. )
  26. IF EXIST "%~dpn1.txt" (
  27. 	ECHO ←[1;33mFile "%~n1.txt" already exists.
  28. 	CHOICE.EXE /D N /T 10 /M "Do you want to delete it?←[0;30m"
  29. 	IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
  30. 		ECHO ←[0m
  31. 		DEL "%~dpn1.txt"
  32. 	) ELSE (
  33. 		ECHO ←[1;33mPlease move or rename "%~n1.txt" and try again.←[0m
  34. 		POPD
  35. 		EXIT /B 1
  36. 	)
  37. )
  38. :: Extract preview.jpg from .pages file
  39. FOR /F "tokens=*" %%A IN ('DIR /AD /B "%ProgramFiles%\7*"') DO (
  40. 	FOR /F "tokens=*" %%B IN ('DIR /B /S "%ProgramFiles%\%%~A\7z.exe"') DO (
  41. 		"%%~B" e "%~f1" preview.jpg
  42. 	)
  43. )
  44. IF NOT EXIST preview.jpg (
  45. 	ECHO ←[1;33mThis batch file requires 7zip, available at←[0m
  46. 	ECHO ←[1mhttps://7-zip.org/←[1;33m
  47. 	CHOICE /D N /T 10 /M "Do you want to download it?←[0;30m"
  48. 	IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
  49. 		START "" https://7-zip.org/
  50. 	)
  51. 	ECHO ←[0m
  52. 	POPD
  53. 	EXIT /B 1
  54. )
  55. :: Perform OCR on extracted preview.jpg and save it with same name as specified input file and .txt extension
  56. FOR /F "tokens=*" %%A IN ('DIR /AD /B "%ProgramFiles%\tesseract*"') DO (
  57. 	REM Check if language code is specified, and if it is valid
  58. 	IF NOT "%~2"=="" (
  59. 		IF NOT EXIST "%ProgramFiles%\%%~A\tessdata\%~2.*data*" (
  60. 			ECHO ←[1;31mUnsupported Tesseract language code: "%~2"
  61. 			FOR /F %%B IN ('DIR /B "%ProgramFiles%\%%~A\tessdata\???.*data*" ^| FIND.EXE /C "data"') DO (
  62. 				IF %%B GTR 1 (
  63. 					ECHO ←[0mUse one of the following language codes:
  64. 					FOR %%C IN ("%ProgramFiles%\%%~A\tessdata\???.*data*") DO (
  65. 						IF /I NOT "%%~nC"=="osd" (
  66. 							SET /P "=←[1;32m%%~nC←[0m, " < NUL
  67. 						)
  68. 					)
  69. 					SET /P "=or omit the language code to use the default (←[1;32meng←[0m)" < NUL
  70. 				)
  71. 			)
  72. 			ECHO ←[0m
  73. 			POPD
  74. 			EXIT /B 1
  75. 		)
  76. 	)
  77. 	FOR /F "tokens=*" %%B IN ('DIR /B /S "%ProgramFiles%\%%~A\tesseract.exe"') DO (
  78. 		IF "%~2"=="" (
  79. 			"%%~B" preview.jpg "%~dpn1" -l eng
  80. 		) ELSE (
  81. 			"%%~B" preview.jpg "%~dpn1" -l %~2
  82. 		)
  83. 	)
  84. )
  85. IF NOT EXIST "%~dpn1.txt" (
  86. 	ECHO ←[1;33mThis batch file requires Tesseract OCR, available at←[0m
  87. 	ECHO ←[1mhttps://github.com/UB-Mannheim/tesseract/wiki←[1;33m
  88. 	CHOICE /D N /T 10 /M "Do you want to download it?←[0;30m"
  89. 	IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
  90. 		START "" https://github.com/UB-Mannheim/tesseract/wiki
  91. 	)
  92. 	ECHO ←[0m
  93. 	POPD
  94. 	EXIT /B 1
  95. )
  96. ECHO ←[1;32mExtracted text successfully saved as "%~dpn1.txt"←[0m
  97. :: Delete temporary file
  98. DEL preview.jpg
  99. :: Open extracted text in Word, if available
  100. IF EXIST "%ProgramFiles%\Microsoft Office\" (
  101. 	FOR /F "tokens=*" %%A IN ('DIR /B /S "%ProgramFiles%\Microsoft Office\winword.exe"') DO (
  102. 		START "" "%%~A" /t "%~dpn1.txt"
  103. 	)
  104. )
  105. :: Restore working directory
  106. POPD
  107. :: Done
  108. EXIT /B 0
  109.  
  110.  
  111. :Syntax
  112. ECHO.
  113. ECHO %~nx0,  Version 1.00
  114. ECHO Use OCR to extract text from a *.pages document.
  115. ECHO.
  116. ECHO Usage:  ←[1;33m%~nx0  file.pages  [ languagecode ]←[0m
  117. ECHO.
  118. ECHO Where:  ←[1;33mfile.pages←[0m    *.pages file from which text is to be extracted
  119. ECHO         ←[1;33mlanguagecode←[0m  optional Tesseract 3 letter language code (default: eng)
  120. ECHO.
  121. ECHO Notes:  This program requires 7-zip as well as Tesseract OCR.
  122. ECHO         The extracted text will be saved as plain text in the .pages file's
  123. ECHO         parent folder, using the specified file's name, and .txt extension.
  124. ECHO         If the specified file name contains multiple dots, the output file
  125. ECHO         name will be truncated at the first dot. If the output file already
  126. ECHO         exists, you will be prompted to delete it or abort.
  127. ECHO         If an invalid language code is specified, the batch file will abort
  128. ECHO         after showing a list of available language codes.
  129. ECHO         A temporary file preview.jpg will be created. If it already exists,
  130. ECHO         you will be prompted to delete it or abort.
  131. ECHO         If MS Word is available, the extracted text will be opened in Word.
  132. ECHO         The batch file's return code ("Errorlevel") will equal 0 if the
  133. ECHO         specified file was successfully converted, otherwise it will equal 1.
  134. ECHO.
  135. ECHO Written by Rob van der Woude
  136. ECHO https://www.robvanderwoude.com
  137. EXIT /B 1
  138.  

page last modified: 2024-04-16; loaded in 0.0079 seconds