using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;


namespace RobvanderWoude
{
    /// <summary>
    /// Command line tool that extracts the plain text of an EPUB file and
    /// writes it to standard output, chapter by chapter, in the order given
    /// by the playOrder attributes in the EPUB's toc.ncx.
    /// </summary>
    internal class Epub2Txt
    {
        static readonly string progver = "1.01";

        // Removes the document title element so it does not show up in the chapter text.
        static readonly Regex TitlePattern = new Regex( "<title[^>]*>[^<]*</title>", RegexOptions.IgnoreCase );

        // Explicit line breaks become a single newline.
        static readonly Regex LineBreakPattern = new Regex( @"<br\s*/?>", RegexOptions.IgnoreCase );

        // The end of a paragraph or heading becomes an empty line.
        static readonly Regex BlockEndPattern = new Regex( "(</p>|</h[1-6]>)", RegexOptions.IgnoreCase );

        // Any remaining markup is dropped.
        static readonly Regex TagPattern = new Regex( "<[^>]+>" );


        /// <summary>
        /// Program entry point: parses the command line, extracts the text and prints it.
        /// </summary>
        /// <param name="args">EPUB file path, optionally followed by an encoding; or /E; or /?</param>
        /// <returns>0 if more than 1000 characters of text were written, otherwise 1.</returns>
        static int Main( string[] args )
        {
            string epub = string.Empty;
            Encoding encoding = null;

            #region Parse Command Line

            if ( args.Length == 0 || args.Length > 2 )
            {
                return ShowHelp( );
            }

            foreach ( string arg in args )
            {
                // StartsWith instead of arg[0], because an empty argument ("") is legal on the command line
                if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
                {
                    if ( arg == "/?" )
                    {
                        return ShowHelp( );
                    }
                    if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
                    {
                        return ListEncodings( );
                    }
                    return ShowHelp( "Invalid command line switch {0}", arg );
                }

                if ( string.IsNullOrWhiteSpace( epub ) )
                {
                    epub = arg;
                    if ( !File.Exists( epub ) )
                    {
                        return ShowHelp( "File \"{0}\" not found", epub );
                    }
                    if ( !string.Equals( Path.GetExtension( epub ), ".epub", StringComparison.OrdinalIgnoreCase ) )
                    {
                        return ShowHelp( "This program can handle .EPUB files only" );
                    }
                }
                else if ( encoding == null )
                {
                    encoding = GetEncoding( arg );
                    if ( encoding == null )
                    {
                        return ShowHelp( "Invalid encoding \"{0}\"", arg );
                    }
                }
                else
                {
                    return ShowHelp( "Too many command line arguments" );
                }
            }

            if ( string.IsNullOrWhiteSpace( epub ) )
            {
                return ShowHelp( );
            }

            #endregion Parse Command Line


            #region Extract Text

            List<string> chapters;
            try
            {
                chapters = ReadChapters( epub, ref encoding );
            }
            catch ( InvalidDataException )
            {
                // Thrown by the ZIP reader when the file is not a readable ZIP archive
                return ShowHelp( "File \"{0}\" is not a valid EPUB (ZIP) archive", epub );
            }
            catch ( System.Xml.XmlException e )
            {
                return ShowHelp( "Invalid table of contents in \"{0}\": {1}", epub, e.Message );
            }

            #endregion Extract Text


            #region Write Text

            int textlength = 0;
            Encoding oldencoding = Console.OutputEncoding;
            try
            {
                // Without a requested or detected encoding the console keeps its current one
                if ( encoding != null )
                {
                    Console.OutputEncoding = encoding;
                }
                foreach ( string chapter in chapters )
                {
                    Console.WriteLine( chapter );
                    textlength += chapter.Length;
                }
            }
            finally
            {
                // Always hand the console back the way we found it
                Console.OutputEncoding = oldencoding;
            }

            #endregion Write Text

            return ( textlength > 1000 ) ? 0 : 1;
        }


        /// <summary>
        /// Reads the text of every top-level navPoint listed in the toc.ncx of the EPUB file.
        /// </summary>
        /// <param name="epub">Path of the EPUB file.</param>
        /// <param name="encoding">
        /// Requested output encoding; when null on entry it is set to the encoding named in
        /// the XML declaration of toc.ncx, if that names a known encoding.
        /// </param>
        /// <returns>
        /// One string per navPoint (empty if its chapter file was not found), sorted by
        /// playOrder; navPoints with equal or unusable playOrder keep their document order.
        /// </returns>
        static List<string> ReadChapters( string epub, ref Encoding encoding )
        {
            List<KeyValuePair<int, string>> chapters = new List<KeyValuePair<int, string>>( );

            using ( ZipArchive archive = ZipFile.OpenRead( epub ) )
            {
                foreach ( ZipArchiveEntry entry in archive.Entries )
                {
                    if ( !string.Equals( entry.Name, "toc.ncx", StringComparison.OrdinalIgnoreCase ) )
                    {
                        continue;
                    }

                    // Parse straight from the archive; no temporary files to clean up
                    XDocument xml;
                    using ( Stream tocstream = entry.Open( ) )
                    {
                        xml = XDocument.Load( tocstream );
                    }

                    // The XML declaration is optional, so it may be absent
                    if ( encoding == null && xml.Declaration != null )
                    {
                        encoding = GetEncoding( xml.Declaration.Encoding );
                    }

                    // root element -> its children -> their children (where the top-level navPoints are looked for)
                    foreach ( XElement navpoint in xml.Elements( ).Elements( ).Elements( ) )
                    {
                        if ( navpoint.Name.LocalName != "navPoint" )
                        {
                            continue;
                        }

                        int playorder;
                        XAttribute orderattr = navpoint.Attribute( "playOrder" );
                        if ( orderattr == null || !int.TryParse( orderattr.Value, NumberStyles.Integer, CultureInfo.InvariantCulture, out playorder ) )
                        {
                            // No usable play order: emit after the ordered chapters
                            playorder = int.MaxValue;
                        }

                        string chaptersrc = string.Empty;
                        foreach ( XElement child in navpoint.Elements( ) )
                        {
                            if ( child.Name.LocalName == "content" )
                            {
                                XAttribute srcattr = child.Attribute( "src" );
                                if ( srcattr != null )
                                {
                                    // Strip the fragment ("chapter.xhtml#anchor")
                                    chaptersrc = srcattr.Value.Split( '#' )[0];
                                }
                            }
                        }

                        string text = string.Empty;
                        ZipArchiveEntry chapterentry = FindChapterEntry( archive, entry.FullName, chaptersrc );
                        if ( chapterentry != null )
                        {
                            text = HtmlToText( ReadEntryText( chapterentry ) );
                        }

                        chapters.Add( new KeyValuePair<int, string>( playorder, text ) );
                    }
                }
            }

            // OrderBy is a stable sort, so duplicate play orders cannot throw or get lost
            return chapters.OrderBy( c => c.Key ).Select( c => c.Value ).ToList( );
        }


        /// <summary>
        /// Locates the archive entry a toc.ncx content src refers to.
        /// </summary>
        /// <param name="archive">The opened EPUB archive.</param>
        /// <param name="tocpath">Full name of the toc.ncx entry inside the archive.</param>
        /// <param name="chaptersrc">The src value without fragment; may be percent-escaped.</param>
        /// <returns>The matching entry, or null if none was found.</returns>
        internal static ZipArchiveEntry FindChapterEntry( ZipArchive archive, string tocpath, string chaptersrc )
        {
            if ( string.IsNullOrEmpty( chaptersrc ) )
            {
                return null;
            }

            string relative = Uri.UnescapeDataString( chaptersrc );

            // First choice: src resolved against the folder that holds toc.ncx
            int slash = tocpath.LastIndexOf( '/' );
            string folder = ( slash < 0 ) ? string.Empty : tocpath.Substring( 0, slash + 1 );
            ZipArchiveEntry match = archive.GetEntry( folder + relative );
            if ( match != null )
            {
                return match;
            }

            // Fallback: compare file names only; the last entry with that name wins
            string filename = relative.Substring( relative.LastIndexOf( '/' ) + 1 );
            if ( filename.Length == 0 )
            {
                return null;
            }
            foreach ( ZipArchiveEntry candidate in archive.Entries )
            {
                if ( candidate.Name == filename )
                {
                    match = candidate;
                }
            }
            return match;
        }


        /// <summary>
        /// Reads an archive entry as text: UTF-8 unless a byte order mark says otherwise.
        /// </summary>
        static string ReadEntryText( ZipArchiveEntry entry )
        {
            using ( StreamReader reader = new StreamReader( entry.Open( ), Encoding.UTF8, true ) )
            {
                return reader.ReadToEnd( );
            }
        }


        /// <summary>
        /// Converts (X)HTML markup to plain text: the title element is removed, line breaks
        /// become newlines, paragraph and heading ends become empty lines, and all other
        /// tags are stripped. Character entities are left untouched.
        /// </summary>
        /// <param name="html">Markup to convert; null is treated as empty.</param>
        /// <returns>The plain text.</returns>
        internal static string HtmlToText( string html )
        {
            if ( string.IsNullOrEmpty( html ) )
            {
                return string.Empty;
            }

            string text = TitlePattern.Replace( html, string.Empty );
            text = LineBreakPattern.Replace( text, Environment.NewLine );
            text = BlockEndPattern.Replace( text, Environment.NewLine + Environment.NewLine );
            text = TagPattern.Replace( text, string.Empty );
            return text;
        }


        /// <summary>
        /// Looks up an encoding by name, then by name ignoring dashes, then by code page number.
        /// </summary>
        /// <param name="myencoding">Encoding name (e.g. "UTF-8", "utf8") or code page (e.g. "65001").</param>
        /// <returns>The encoding, or null if the input is empty or matches nothing.</returns>
        internal static Encoding GetEncoding( string myencoding )
        {
            if ( string.IsNullOrEmpty( myencoding ) )
            {
                return null;
            }

            // Get a list of available encodings
            EncodingInfo[] encodings = Encoding.GetEncodings( );

            // Try correctly spelled encodings first
            foreach ( EncodingInfo candidate in encodings )
            {
                if ( string.Equals( candidate.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( candidate.CodePage );
                }
            }

            // No direct match found, try again, ignoring dashes
            string dashless = myencoding.Replace( "-", "" );
            foreach ( EncodingInfo candidate in encodings )
            {
                if ( string.Equals( candidate.Name.Replace( "-", "" ), dashless, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( candidate.CodePage );
                }
            }

            // Still no match, try codepages
            foreach ( EncodingInfo candidate in encodings )
            {
                if ( candidate.CodePage.ToString( CultureInfo.InvariantCulture ) == myencoding )
                {
                    return Encoding.GetEncoding( candidate.CodePage );
                }
            }

            // Still no match, giving up
            return null;
        }


        /// <summary>
        /// Prints a table of all available encoding names and their code pages, sorted by name.
        /// </summary>
        /// <returns>Always 0.</returns>
        static int ListEncodings( )
        {
            try
            {
                Console.Clear( );
            }
            catch ( IOException )
            {
                // Console.Clear( ) throws an IO exception if the output is redirected
            }

            List<EncodingInfo> allencodings = Encoding.GetEncodings( ).OrderBy( e => e.Name ).ToList( );

            int columnwidth = 8;
            foreach ( EncodingInfo enc in allencodings )
            {
                columnwidth = Math.Max( columnwidth, enc.Name.Length );
            }

            string rowformat = "{0,-" + columnwidth + "} {1}";
            Console.WriteLine( rowformat, "Encoding", "CodePage" );
            Console.WriteLine( rowformat, "========", "========" );
            foreach ( EncodingInfo enc in allencodings )
            {
                Console.WriteLine( rowformat, enc.Name, enc.CodePage );
            }

            return 0;
        }


        /// <summary>
        /// Writes an optional error message followed by the help text to standard error.
        /// </summary>
        /// <param name="errmsg">
        /// Optional: a message, or a composite format string followed by its arguments.
        /// </param>
        /// <returns>Always 1.</returns>
        static int ShowHelp( params string[] errmsg )
        {
            #region Help Text

            /*
            Epub2Txt,  Version 1.01
            Extract plain text from an EPUB file and send it to the screen

            Usage:  Epub2Txt  "epubfile"  [ encoding ]

            or:     Epub2Txt  /E

            Where:  epubfile  is the path of the EPUB file to be read
                              (no wildcards allowed, only .epub extension)
                    encoding  force use of alternative encoding for plain
                              text, e.g. UTF-8 to preserve accented characters
                              or IBM437 to convert unicode quotes to ASCII
                              (default: encoding of EPUB file)
                    /E        list all available encodings

            Notes:  If the specified encoding does not match any available encoding
                    name, the program will try again, ignoring dashes; if that does
                    not provide a match, the program will try matching the specified
                    encoding with the available encodings' codepages.
                    This program requires .NET 4.5.
                    Return code ("errorlevel") 0 means no errors were encountered
                    and more than 1000 characters of text were extracted; otherwise
                    the return code will be 1.

            Written by Rob van der Woude
            https://www.robvanderwoude.com
            */

            #endregion Help Text


            #region Error Message

            if ( errmsg.Length > 0 )
            {
                // Only run the message through the formatter when there are arguments,
                // so a plain message containing braces cannot cause a FormatException
                string message = errmsg[0];
                if ( errmsg.Length > 1 )
                {
                    object[] errargs = errmsg.Skip( 1 ).Cast<object>( ).ToArray( );
                    message = string.Format( message, errargs );
                }

                Console.Error.WriteLine( );
                Console.ForegroundColor = ConsoleColor.Red;
                Console.Error.Write( "ERROR:\t" );
                Console.ForegroundColor = ConsoleColor.White;
                Console.Error.WriteLine( message );
                Console.ResetColor( );
            }

            #endregion Error Message


            #region Display Help Text

            Console.Error.WriteLine( );
            Console.Error.WriteLine( "Epub2Txt, Version {0}", progver );
            Console.Error.WriteLine( "Extract plain text from an EPUB file and send it to the screen" );
            Console.Error.WriteLine( );

            Console.Error.Write( "Usage: " );
            WriteBright( "Epub2Txt \"epubfile\" [ encoding ]" );
            Console.Error.WriteLine( );
            Console.Error.WriteLine( );

            Console.Error.Write( "or: " );
            WriteBright( "Epub2Txt /E" );
            Console.Error.WriteLine( );
            Console.Error.WriteLine( );

            Console.Error.Write( "Where: " );
            WriteBright( "epubfile" );
            Console.Error.WriteLine( " is the path of the EPUB file to be read" );
            Console.Error.WriteLine( " (no wildcards allowed, only .epub extension)" );

            WriteBright( " encoding" );
            Console.Error.WriteLine( " force use of alternative encoding for plain" );
            Console.Error.Write( " text, e.g. " );
            WriteBright( "UTF-8" );
            Console.Error.WriteLine( " to preserve accented characters" );
            Console.Error.Write( " or " );
            WriteBright( "IBM437" );
            Console.Error.WriteLine( " to convert unicode quotes to ASCII" );
            Console.Error.WriteLine( " (default: encoding of EPUB file)" );

            WriteBright( " /E" );
            Console.Error.WriteLine( " list all available encodings" );
            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Notes: If the specified encoding does not match any available encoding" );
            Console.Error.WriteLine( " name, the program will try again, ignoring dashes; if that does" );
            Console.Error.WriteLine( " not provide a match, the program will try matching the specified" );
            Console.Error.WriteLine( " encoding with the available encodings' codepages." );
            Console.Error.WriteLine( " Return code (\"errorlevel\") 0 means no errors were encountered" );
            Console.Error.WriteLine( " and more than 1000 characters of text were extracted; otherwise the" );
            Console.Error.WriteLine( " return code will be 1." );
            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Written by Rob van der Woude" );
            Console.Error.WriteLine( "https://www.robvanderwoude.com" );

            #endregion Display Help Text

            return 1;
        }


        /// <summary>
        /// Writes text to standard error in white, then resets the console colors.
        /// </summary>
        static void WriteBright( string text )
        {
            Console.ForegroundColor = ConsoleColor.White;
            Console.Error.Write( text );
            Console.ResetColor( );
        }
    }
}