Rob van der Woude's Scripting Pages
Powered by GeSHi

Source code for epub2txt.cs

(view source code of epub2txt.cs as plain text)

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
  8.  
  9.  
  10. namespace RobvanderWoude
  11. {
  12. 	internal class Epub2Txt
  13. 	{
  14. 		static string progver = "1.00";
  15.  
  16.  
  17. 		static int Main( string[] args )
  18. 		{
  19. 			string epub = string.Empty;
  20. 			Encoding encoding = null;
  21.  
  22.  
  23. 			#region Parse Command Line
  24.  
  25. 			if ( args.Length == 0 || args.Length > 2 )
  26. 			{
  27. 				return ShowHelp( );
  28. 			}
  29.  
  30. 			foreach ( string arg in args )
  31. 			{
  32. 				if ( arg[0] == '/' )
  33. 				{
  34. 					if ( arg == "/?" )
  35. 					{
  36. 						return ShowHelp( );
  37. 					}
  38. 					else if ( arg.ToUpper( ) == "/E" )
  39. 					{
  40. 						return ListEncodings( );
  41. 					}
  42. 					else
  43. 					{
  44. 						return ShowHelp( "Invalid command line switch {0}", arg );
  45. 					}
  46. 				}
  47. 				else
  48. 				{
  49. 					if ( string.IsNullOrWhiteSpace( epub ) )
  50. 					{
  51. 						epub = arg;
  52. 						if ( !File.Exists( epub ) )
  53. 						{
  54. 							return ShowHelp( "File \"{0}\" not found", epub );
  55. 						}
  56. 						if ( Path.GetExtension( epub ).ToLower( ) != ".epub" )
  57. 						{
  58. 							return ShowHelp( "This program can handle .EPUB files only" );
  59. 						}
  60. 					}
  61. 					else if ( encoding == null )
  62. 					{
  63. 						encoding = GetEncoding( arg );
  64. 						if ( encoding == null )
  65. 						{
  66. 							return ShowHelp( "Invalid encoding \"{0}\"", args[1] );
  67. 						}
  68. 					}
  69. 					else
  70. 					{
  71. 						return ShowHelp( "Too many command line arguments" );
  72. 					}
  73. 				}
  74. 			}
  75.  
  76. 			if ( string.IsNullOrWhiteSpace( epub ) )
  77. 			{
  78. 				return ShowHelp( );
  79. 			}
  80.  
  81. 			#endregion Parse Command Line
  82.  
  83.  
  84. 			#region Extract Text
  85.  
  86. 			string content = string.Empty;
  87. 			string doctitle = string.Empty;
  88. 			SortedDictionary<int, string> textcontent = new SortedDictionary<int, string>( );
  89.  
  90. 			// Open document as ZIP file and extract the XML file containing the text content
  91. 			using ( ZipArchive archive = ZipFile.OpenRead( epub ) )
  92. 			{
  93. 				foreach ( ZipArchiveEntry entry in archive.Entries )
  94. 				{
  95. 					if ( entry.Name.ToLower( ) == "toc.ncx" )
  96. 					{
  97. 						string toc = Path.GetTempFileName( );
  98. 						entry.ExtractToFile( toc, true );
  99.  
  100. 						XDocument xml = XDocument.Load( toc );
  101. 						if ( encoding == null )
  102. 						{
  103. 							encoding = GetEncoding( xml.Declaration.Encoding );
  104. 						}
  105. 						foreach ( XElement el in xml.Elements( ).Elements( ) )
  106. 						{
  107. 							if ( el.Name.LocalName == "docTitle" )
  108. 							{
  109. 								doctitle = el.Value;
  110. 							}
  111. 						}
  112. 						foreach ( XElement el in xml.Elements( ).Elements( ).Elements( ) )
  113. 						{
  114. 							if ( el.Name.LocalName == "navPoint" )
  115. 							{
  116. 								string header = el.Value;
  117. 								int playorder = int.Parse( el.Attribute( "playOrder" ).Value );
  118. 								string chaptersrc = string.Empty;
  119. 								foreach ( XElement el2 in el.Elements( ) )
  120. 								{
  121. 									if ( el2.Name.LocalName == "content" )
  122. 									{
  123. 										chaptersrc = el2.Attribute( "src" ).Value;
  124. 										if ( chaptersrc.Contains( "#" ) )
  125. 										{
  126. 											chaptersrc = chaptersrc.Split( "#".ToCharArray( ) )[0];
  127. 										}
  128. 									}
  129. 								}
  130. 								string chapter = Path.GetTempFileName( );
  131. 								string text = string.Empty;
  132. 								foreach ( ZipArchiveEntry entry2 in archive.Entries )
  133. 								{
  134. 									if ( entry2.Name == chaptersrc )
  135. 									{
  136. 										entry2.ExtractToFile( chapter, true );
  137. 										text = File.ReadAllText( chapter );
  138. 										File.Delete( chapter );
  139. 										text = Regex.Replace( text, "<title>[^<]*</title>", string.Empty, RegexOptions.IgnoreCase );
  140. 										text = Regex.Replace( text, "<br( /)?>", Environment.NewLine, RegexOptions.IgnoreCase );
  141. 										text = Regex.Replace( text, "(</p>|</h\\d+>)", Environment.NewLine + Environment.NewLine, RegexOptions.IgnoreCase );
  142. 										text = Regex.Replace( text, "<[^>]+>", string.Empty );
  143. 									}
  144. 								}
  145. 								textcontent.Add( playorder, text );
  146. 							}
  147. 						}
  148. 						File.Delete( toc );
  149. 					}
  150. 				}
  151. 			}
  152.  
  153. 			#endregion Extract Text
  154.  
  155.  
  156. 			int textlength = 0;
  157. 			Encoding oldencoding = Console.OutputEncoding;
  158. 			Console.OutputEncoding = encoding;
  159. 			foreach ( int key in textcontent.Keys )
  160. 			{
  161. 				Console.WriteLine( textcontent[key] );
  162. 				textlength += textcontent[key].Length;
  163. 			}
  164. 			Console.OutputEncoding = oldencoding;
  165.  
  166. 			if ( textlength > 1000 )
  167. 			{
  168. 				return 0;
  169. 			}
  170. 			else
  171. 			{
  172. 				return 1;
  173. 			}
  174. 		}
  175.  
  176.  
  177. 		static Encoding GetEncoding( string myencoding )
  178. 		{
  179. 			if ( string.IsNullOrEmpty( myencoding ) )
  180. 			{
  181. 				return null;
  182. 			}
  183. 			// Get a list of available encodings
  184. 			EncodingInfo[] encodings = Encoding.GetEncodings( );
  185. 			// Try correctly spelled encodings first
  186. 			foreach ( EncodingInfo encoding in encodings )
  187. 			{
  188. 				if ( encoding.Name.ToLower( ) == myencoding.ToLower( ) )
  189. 				{
  190. 					return Encoding.GetEncoding( encoding.CodePage );
  191. 				}
  192. 			}
  193. 			// No direct match found, try again, ignoring dashes
  194. 			foreach ( EncodingInfo encoding in encodings )
  195. 			{
  196. 				if ( encoding.Name.Replace( "-", "" ).ToLower( ) == myencoding.Replace( "-", "" ).ToLower( ) )
  197. 				{
  198. 					return Encoding.GetEncoding( encoding.CodePage );
  199. 				}
  200. 			}
  201. 			// Still no match, try codepages
  202. 			foreach ( EncodingInfo encoding in encodings )
  203. 			{
  204. 				if ( encoding.CodePage.ToString( ) == myencoding )
  205. 				{
  206. 					return Encoding.GetEncoding( encoding.CodePage );
  207. 				}
  208. 			}
  209. 			// Still no match, giving up
  210. 			return null;
  211. 		}
  212.  
  213.  
  214. 		static int ListEncodings( )
  215. 		{
  216. 			try
  217. 			{
  218. 				Console.Clear( );
  219. 			}
  220. 			catch
  221. 			{
  222. 				// Console.Clear( ) throws an IO exception if the output is redirected
  223. 			}
  224. 			int columnwidth = 8;
  225. 			EncodingInfo[] allencodings = Encoding.GetEncodings( );
  226. 			List<string> allencodingnames = new List<string>( );
  227. 			foreach ( EncodingInfo enc in allencodings )
  228. 			{
  229. 				allencodingnames.Add( enc.Name );
  230. 			}
  231. 			allencodingnames.Sort( );
  232. 			foreach ( string enc in allencodingnames )
  233. 			{
  234. 				columnwidth = Math.Max( columnwidth, enc.Length );
  235. 			}
  236. 			Console.WriteLine( "{0,-" + columnwidth + "}   {1}", "Encoding", "CodePage" );
  237. 			Console.WriteLine( "{0,-" + columnwidth + "}   {1}", "========", "========" );
  238. 			foreach ( string enc in allencodingnames )
  239. 			{
  240. 				Console.WriteLine( "{0,-" + columnwidth + "}   {1}", enc, GetEncoding( enc ).CodePage );
  241. 			}
  242. 			return 0;
  243. 		}
  244.  
  245.  
  246. 		static int ShowHelp( params string[] errmsg )
  247. 		{
  248. 			#region Help Text
  249.  
  250. 			/*
  251. 			Epub2Txt,  Version 1.00
  252. 			Extract plain text from an EPUB file and send it to the screen
  253.  
  254. 			Usage:   Epub2Txt    "epubfile"  [ encoding ]
  255.  
  256. 			or:      Word2Txt    /E
  257.  
  258. 			Where:   epubfile    is the path of the EPUB file to be read
  259. 			                     (no wildcards allowed, only .epub extension)
  260. 			         encoding    force use of alternative encoding for plain
  261. 			                     text, e.g. UTF-8 to preserve accented characters
  262. 			                     or IBM437 to convert unicode quotes to ASCII
  263. 			                     (default: encoding of EPUB file)
  264. 			         /E          list all available encodings
  265.  
  266. 			Notes:   If the specified encoding does not match any available encoding
  267. 			         name, the program will try again, ignoring dashes; if that does
  268. 			         not provide a match, the program will try matching the specified
  269. 			         encoding with the available encodings' codepages.
  270. 			         This program requires .NET 4.5.
  271. 			         Return code ("errorlevel") 0 means no errors were encountered and
  272. 			         extracted text exceeds 1KB; otherwise the return code will be 1.
  273.  
  274. 			Written by Rob van der Woude
  275. 			https://www.robvanderwoude.com
  276. 			*/
  277.  
  278. 			#endregion Help Text
  279.  
  280.  
  281. 			#region Error Message
  282.  
  283. 			if ( errmsg.Length > 0 )
  284. 			{
  285. 				List<string> errargs = new List<string>( errmsg );
  286. 				errargs.RemoveAt( 0 );
  287. 				Console.Error.WriteLine( );
  288. 				Console.ForegroundColor = ConsoleColor.Red;
  289. 				Console.Error.Write( "ERROR:\t" );
  290. 				Console.ForegroundColor = ConsoleColor.White;
  291. 				Console.Error.WriteLine( errmsg[0], errargs.ToArray( ) );
  292. 				Console.ResetColor( );
  293. 			}
  294.  
  295. 			#endregion Error Message
  296.  
  297.  
  298. 			#region Display Help Text
  299.  
  300. 			Console.Error.WriteLine( );
  301.  
  302. 			Console.Error.WriteLine( "Epub2Txt,  Version {0}", progver );
  303.  
  304. 			Console.Error.WriteLine( "Extract plain text from an EPUB file and send it to the screen" );
  305.  
  306. 			Console.Error.WriteLine( );
  307.  
  308. 			Console.Error.Write( "Usage:   " );
  309. 			Console.ForegroundColor = ConsoleColor.White;
  310. 			Console.Error.WriteLine( "Epub2Txt    \"epubfile\"  [ encoding ]" );
  311. 			Console.ResetColor( );
  312.  
  313. 			Console.Error.WriteLine( );
  314.  
  315. 			Console.Error.Write( "or:      " );
  316. 			Console.ForegroundColor = ConsoleColor.White;
  317. 			Console.Error.WriteLine( "Epub2Txt    /E" );
  318. 			Console.ResetColor( );
  319.  
  320. 			Console.Error.WriteLine( );
  321.  
  322. 			Console.Error.Write( "Where:   " );
  323. 			Console.ForegroundColor = ConsoleColor.White;
  324. 			Console.Error.Write( "epubfile" );
  325. 			Console.ResetColor( );
  326. 			Console.Error.WriteLine( "    is the path of the EPUB file to be read" );
  327.  
  328. 			Console.Error.WriteLine( "                     (no wildcards allowed, only .epub extension)" );
  329.  
  330. 			Console.ForegroundColor = ConsoleColor.White;
  331. 			Console.Error.Write( "         encoding" );
  332. 			Console.ResetColor( );
  333. 			Console.Error.WriteLine( "    force use of alternative encoding for plain" );
  334.  
  335. 			Console.Error.Write( "                     text, e.g. " );
  336. 			Console.ForegroundColor = ConsoleColor.White;
  337. 			Console.Error.Write( "UTF-8" );
  338. 			Console.ResetColor( );
  339. 			Console.Error.WriteLine( " to preserve accented characters" );
  340.  
  341. 			Console.Error.Write( "                     or " );
  342. 			Console.ForegroundColor = ConsoleColor.White;
  343. 			Console.Error.Write( "IBM437" );
  344. 			Console.ResetColor( );
  345. 			Console.Error.WriteLine( " to convert unicode quotes to ASCII" );
  346.  
  347. 			Console.Error.WriteLine( "                     (default: encoding of EPUB file)" );
  348.  
  349. 			Console.ForegroundColor = ConsoleColor.White;
  350. 			Console.Error.Write( "         /E" );
  351. 			Console.ResetColor( );
  352. 			Console.Error.WriteLine( "          list all available encodings" );
  353.  
  354. 			Console.Error.WriteLine( );
  355.  
  356. 			Console.Error.WriteLine( "Notes:   If the specified encoding does not match any available encoding" );
  357.  
  358. 			Console.Error.WriteLine( "         name, the program will try again, ignoring dashes; if that does" );
  359.  
  360. 			Console.Error.WriteLine( "         not provide a match, the program will try matching the specified" );
  361.  
  362. 			Console.Error.WriteLine( "         encoding with the available encodings' codepages." );
  363.  
  364. 			Console.Error.WriteLine( "         Return code (\"errorlevel\") 0 means no errors were encounterd" );
  365.  
  366. 			Console.Error.WriteLine( "         and some text was extracted from the file; otherwise the" );
  367.  
  368. 			Console.Error.WriteLine( "         return code will be 1." );
  369.  
  370. 			Console.Error.WriteLine( );
  371.  
  372. 			Console.Error.WriteLine( "Written by Rob van der Woude" );
  373.  
  374. 			Console.Error.WriteLine( "https://www.robvanderwoude.com" );
  375.  
  376. 			#endregion Display Help Text
  377.  
  378.  
  379. 			return 1;
  380. 		}
  381. 	}
  382. }
  383.  

page last uploaded: 2021-01-27