// Source code of epub2txt.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
namespace RobvanderWoude
{
	/// <summary>
	/// Console program that extracts the plain text of an EPUB file and writes it to
	/// standard output, optionally using an output encoding specified on the command line.
	/// </summary>
	internal class Epub2Txt
	{
		static readonly string progver = "1.01";

		// Return code 0 is used only if more than this number of characters was extracted
		const int mintextlength = 1000;


		/// <summary>
		/// Program entry point: parses the command line, extracts the text and writes it to the console.
		/// </summary>
		/// <param name="args">Path of the EPUB file, optionally followed by an encoding; or /E; or /?</param>
		/// <returns>0 if more than 1000 characters of text were extracted, otherwise 1</returns>
		static int Main( string[] args )
		{
			string epub = string.Empty;
			Encoding encoding = null;

			#region Parse Command Line

			if ( args.Length == 0 || args.Length > 2 )
			{
				return ShowHelp( );
			}

			foreach ( string arg in args )
			{
				// StartsWith instead of arg[0], because an empty argument ("") has no first character
				if ( arg.StartsWith( "/", StringComparison.Ordinal ) )
				{
					if ( arg == "/?" )
					{
						return ShowHelp( );
					}
					else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
					{
						return ListEncodings( );
					}
					else
					{
						return ShowHelp( "Invalid command line switch {0}", arg );
					}
				}
				else if ( string.IsNullOrWhiteSpace( epub ) )
				{
					epub = arg;
					if ( !File.Exists( epub ) )
					{
						return ShowHelp( "File \"{0}\" not found", epub );
					}
					if ( !string.Equals( Path.GetExtension( epub ), ".epub", StringComparison.OrdinalIgnoreCase ) )
					{
						return ShowHelp( "This program can handle .EPUB files only" );
					}
				}
				else if ( encoding == null )
				{
					encoding = GetEncoding( arg );
					if ( encoding == null )
					{
						return ShowHelp( "Invalid encoding \"{0}\"", arg );
					}
				}
				else
				{
					return ShowHelp( "Too many command line arguments" );
				}
			}

			if ( string.IsNullOrWhiteSpace( epub ) )
			{
				return ShowHelp( );
			}

			#endregion Parse Command Line


			#region Extract Text

			SortedDictionary<int, string> textcontent;
			try
			{
				textcontent = ExtractText( epub, ref encoding );
			}
			catch ( Exception e )
			{
				// A damaged archive or malformed toc.ncx is reported as an error instead of an unhandled exception
				if ( e is InvalidDataException || e is IOException || e is UnauthorizedAccessException || e is System.Xml.XmlException )
				{
					return ShowHelp( "Unable to read \"{0}\": {1}", epub, e.Message );
				}
				throw;
			}

			#endregion Extract Text


			#region Write Text

			int textlength = 0;
			Encoding oldencoding = Console.OutputEncoding;
			// encoding is still null if no toc.ncx was found or if it declared no known
			// encoding; in that case the console's current encoding is left untouched
			if ( encoding != null )
			{
				Console.OutputEncoding = encoding;
			}
			try
			{
				// SortedDictionary enumerates its values in ascending key (play order) sequence
				foreach ( string chaptertext in textcontent.Values )
				{
					Console.WriteLine( chaptertext );
					textlength += chaptertext.Length;
				}
			}
			finally
			{
				Console.OutputEncoding = oldencoding;
			}

			#endregion Write Text


			if ( textlength > mintextlength )
			{
				return 0;
			}
			else
			{
				return 1;
			}
		}


		/// <summary>
		/// Opens the EPUB file as a ZIP archive and collects the plain text of every chapter
		/// referenced by the top level navPoint elements of each toc.ncx entry.
		/// </summary>
		/// <param name="epub">Path of the EPUB file</param>
		/// <param name="encoding">Output encoding; if null on entry, it is set to the encoding
		/// declared by toc.ncx when that matches an available encoding</param>
		/// <returns>Chapter texts keyed by play order</returns>
		static SortedDictionary<int, string> ExtractText( string epub, ref Encoding encoding )
		{
			SortedDictionary<int, string> textcontent = new SortedDictionary<int, string>( );
			// Key used for navPoints without a usable playOrder attribute
			int nextorder = 1;

			using ( ZipArchive archive = ZipFile.OpenRead( epub ) )
			{
				foreach ( ZipArchiveEntry entry in archive.Entries )
				{
					if ( !string.Equals( entry.Name, "toc.ncx", StringComparison.OrdinalIgnoreCase ) )
					{
						continue;
					}

					// Read the table of contents straight from the archive, no temporary file required
					XDocument xml;
					using ( Stream tocstream = entry.Open( ) )
					{
						xml = XDocument.Load( tocstream );
					}

					if ( encoding == null && xml.Declaration != null )
					{
						encoding = GetEncoding( xml.Declaration.Encoding );
					}

					// Chapter sources are resolved relative to the folder containing toc.ncx
					int slash = entry.FullName.LastIndexOf( '/' );
					string tocdir = ( slash < 0 ? string.Empty : entry.FullName.Substring( 0, slash + 1 ) );

					foreach ( XElement el in xml.Elements( ).Elements( ).Elements( ) )
					{
						if ( el.Name.LocalName != "navPoint" )
						{
							continue;
						}

						int playorder;
						XAttribute orderattr = el.Attribute( "playOrder" );
						if ( orderattr == null || !int.TryParse( orderattr.Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out playorder ) )
						{
							playorder = nextorder;
						}
						// A repeated playOrder value must not abort the extraction
						while ( textcontent.ContainsKey( playorder ) )
						{
							playorder++;
						}

						string chaptersrc = string.Empty;
						foreach ( XElement el2 in el.Elements( ) )
						{
							if ( el2.Name.LocalName == "content" )
							{
								XAttribute srcattr = el2.Attribute( "src" );
								if ( srcattr != null )
								{
									// Strip the fragment ("#anchor") from the source
									chaptersrc = srcattr.Value.Split( '#' )[0];
								}
							}
						}

						string text = string.Empty;
						ZipArchiveEntry chapterentry = FindChapterEntry( archive, tocdir, chaptersrc );
						if ( chapterentry != null )
						{
							using ( StreamReader reader = new StreamReader( chapterentry.Open( ) ) )
							{
								text = HtmlToText( reader.ReadToEnd( ) );
							}
						}

						textcontent.Add( playorder, text );
						nextorder = Math.Max( nextorder, playorder + 1 );
					}
				}
			}

			return textcontent;
		}


		/// <summary>
		/// Finds the archive entry for a chapter source as specified in toc.ncx.
		/// </summary>
		/// <param name="archive">The opened EPUB archive</param>
		/// <param name="tocdir">Folder of toc.ncx inside the archive, including trailing slash, or an empty string</param>
		/// <param name="chaptersrc">Value of the content element's src attribute, without fragment</param>
		/// <returns>The matching entry, or null if there is none</returns>
		static ZipArchiveEntry FindChapterEntry( ZipArchive archive, string tocdir, string chaptersrc )
		{
			if ( string.IsNullOrEmpty( chaptersrc ) )
			{
				return null;
			}

			// The src attribute is a URI, so it may contain escapes like %20; try it both as is and unescaped
			string[] candidates = { chaptersrc, Uri.UnescapeDataString( chaptersrc ) };

			// First choice: the path relative to the folder of toc.ncx
			foreach ( string candidate in candidates )
			{
				ZipArchiveEntry exact = archive.GetEntry( tocdir + candidate );
				if ( exact != null )
				{
					return exact;
				}
			}

			// Fallback: match on file name only
			foreach ( string candidate in candidates )
			{
				string filename = candidate.Substring( candidate.LastIndexOf( '/' ) + 1 );
				if ( filename.Length == 0 )
				{
					continue;
				}
				ZipArchiveEntry found = null;
				foreach ( ZipArchiveEntry archiveentry in archive.Entries )
				{
					if ( archiveentry.Name == filename )
					{
						found = archiveentry;
					}
				}
				if ( found != null )
				{
					return found;
				}
			}

			return null;
		}


		/// <summary>
		/// Converts (X)HTML markup to plain text: removes the title element, converts line breaks
		/// and the ends of paragraphs and headers to newlines, strips all remaining tags and
		/// decodes HTML entities.
		/// </summary>
		/// <param name="html">The (X)HTML markup</param>
		/// <returns>The plain text</returns>
		static string HtmlToText( string html )
		{
			string text = Regex.Replace( html, "<title>[^<]*</title>", string.Empty, RegexOptions.IgnoreCase );
			// Matches <br>, <br/> and <br />
			text = Regex.Replace( text, "<br\\s*/?>", Environment.NewLine, RegexOptions.IgnoreCase );
			text = Regex.Replace( text, "(</p>|</h\\d+>)", Environment.NewLine + Environment.NewLine, RegexOptions.IgnoreCase );
			text = Regex.Replace( text, "<[^>]+>", string.Empty );
			// Decode entities last, so an escaped "&lt;b&gt;" is not mistaken for a tag
			return System.Net.WebUtility.HtmlDecode( text );
		}


		/// <summary>
		/// Finds an available encoding by name, by name ignoring dashes, or by codepage number.
		/// </summary>
		/// <param name="myencoding">Encoding name (e.g. "UTF-8" or "UTF8") or codepage number (e.g. "65001")</param>
		/// <returns>The matching encoding, or null if myencoding is null or empty or if no match was found</returns>
		static Encoding GetEncoding( string myencoding )
		{
			if ( string.IsNullOrEmpty( myencoding ) )
			{
				return null;
			}

			// Get a list of available encodings
			EncodingInfo[] encodings = Encoding.GetEncodings( );

			// Try correctly spelled encodings first
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// No direct match found, try again, ignoring dashes
			string nodashes = myencoding.Replace( "-", "" );
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// Still no match, try codepages
			foreach ( EncodingInfo encoding in encodings )
			{
				if ( encoding.CodePage.ToString( System.Globalization.CultureInfo.InvariantCulture ) == myencoding )
				{
					return Encoding.GetEncoding( encoding.CodePage );
				}
			}

			// Still no match, giving up
			return null;
		}


		/// <summary>
		/// Lists the names and codepages of all available encodings, sorted by name.
		/// </summary>
		/// <returns>Always 0</returns>
		static int ListEncodings( )
		{
			try
			{
				Console.Clear( );
			}
			catch ( IOException )
			{
				// Console.Clear( ) throws an IO exception if the output is redirected
			}

			int columnwidth = 8;
			EncodingInfo[] allencodings = Encoding.GetEncodings( );
			Array.Sort( allencodings, ( a, b ) => string.Compare( a.Name, b.Name, StringComparison.CurrentCulture ) );
			foreach ( EncodingInfo enc in allencodings )
			{
				columnwidth = Math.Max( columnwidth, enc.Name.Length );
			}

			string rowformat = "{0,-" + columnwidth + "} {1}";
			Console.WriteLine( rowformat, "Encoding", "CodePage" );
			Console.WriteLine( rowformat, "========", "========" );
			foreach ( EncodingInfo enc in allencodings )
			{
				Console.WriteLine( rowformat, enc.Name, enc.CodePage );
			}

			return 0;
		}


		/// <summary>
		/// Writes an optional error message, followed by the help text, to standard error.
		/// </summary>
		/// <param name="errmsg">Optional: a composite format string, followed by its arguments</param>
		/// <returns>Always 1</returns>
		static int ShowHelp( params string[] errmsg )
		{
			#region Help Text

			/*
			Epub2Txt,  Version 1.01
			Extract plain text from an EPUB file and send it to the screen

			Usage:   Epub2Txt  "epubfile"  [ encoding ]

			or:      Epub2Txt  /E

			Where:   epubfile   is the path of the EPUB file to be read
			                    (no wildcards allowed, only .epub extension)
			         encoding   force use of alternative encoding for plain
			                    text, e.g. UTF-8 to preserve accented characters
			                    or IBM437 to convert unicode quotes to ASCII
			                    (default: encoding of EPUB file)
			         /E         list all available encodings

			Notes:   If the specified encoding does not match any available encoding
			         name, the program will try again, ignoring dashes; if that does
			         not provide a match, the program will try matching the specified
			         encoding with the available encodings' codepages.
			         This program requires .NET 4.5.
			         Return code ("errorlevel") 0 means no errors were encountered
			         and extracted text exceeds 1KB; otherwise the return code will be 1.

			Written by Rob van der Woude
			https://www.robvanderwoude.com
			*/

			#endregion Help Text


			#region Error Message

			if ( errmsg != null && errmsg.Length > 0 )
			{
				// The first string is the format, any remaining strings are its arguments
				object[] errargs = new object[errmsg.Length - 1];
				Array.Copy( errmsg, 1, errargs, 0, errargs.Length );

				Console.Error.WriteLine( );
				Console.ForegroundColor = ConsoleColor.Red;
				Console.Error.Write( "ERROR:\t" );
				Console.ForegroundColor = ConsoleColor.White;
				Console.Error.WriteLine( errmsg[0], errargs );
				Console.ResetColor( );
			}

			#endregion Error Message


			#region Display Help Text

			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Epub2Txt, Version {0}", progver );
			Console.Error.WriteLine( "Extract plain text from an EPUB file and send it to the screen" );
			Console.Error.WriteLine( );
			Console.Error.Write( "Usage: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "Epub2Txt \"epubfile\" [ encoding ]" );
			Console.ResetColor( );
			Console.Error.WriteLine( );
			Console.Error.Write( "or: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.WriteLine( "Epub2Txt /E" );
			Console.ResetColor( );
			Console.Error.WriteLine( );
			Console.Error.Write( "Where: " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "epubfile" );
			Console.ResetColor( );
			Console.Error.WriteLine( " is the path of the EPUB file to be read" );
			Console.Error.WriteLine( " (no wildcards allowed, only .epub extension)" );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( " encoding" );
			Console.ResetColor( );
			Console.Error.WriteLine( " force use of alternative encoding for plain" );
			Console.Error.Write( " text, e.g. " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "UTF-8" );
			Console.ResetColor( );
			Console.Error.WriteLine( " to preserve accented characters" );
			Console.Error.Write( " or " );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( "IBM437" );
			Console.ResetColor( );
			Console.Error.WriteLine( " to convert unicode quotes to ASCII" );
			Console.Error.WriteLine( " (default: encoding of EPUB file)" );
			Console.ForegroundColor = ConsoleColor.White;
			Console.Error.Write( " /E" );
			Console.ResetColor( );
			Console.Error.WriteLine( " list all available encodings" );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Notes: If the specified encoding does not match any available encoding" );
			Console.Error.WriteLine( " name, the program will try again, ignoring dashes; if that does" );
			Console.Error.WriteLine( " not provide a match, the program will try matching the specified" );
			Console.Error.WriteLine( " encoding with the available encodings' codepages." );
			Console.Error.WriteLine( " Return code (\"errorlevel\") 0 means no errors were encountered" );
			Console.Error.WriteLine( " and the extracted text exceeds 1000 characters; otherwise the" );
			Console.Error.WriteLine( " return code will be 1." );
			Console.Error.WriteLine( );
			Console.Error.WriteLine( "Written by Rob van der Woude" );
			Console.Error.WriteLine( "https://www.robvanderwoude.com" );

			#endregion Display Help Text


			return 1;
		}
	}
}