using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Text;
using System.Text.RegularExpressions;


namespace RobvanderWoude
{
    /// <summary>
    /// Command line tool that writes the plain text content of an OpenDocument
    /// text file (.odt) to the console, without requiring OpenOffice.
    /// </summary>
    internal class ODT2Txt
    {
        static readonly string progver = "1.00";

        // Finds the encoding name in the XML declaration, e.g. encoding="UTF-8"
        static readonly Regex xmlEncodingRegex = new Regex( " encoding=\"([^\"]+)\"" );

        // Closing tags of headers, paragraphs and list items; each one ends a block of text
        static readonly Regex blockEndRegex = new Regex( "</text:(h|p|list-item)>" );

        // Any XML tag
        static readonly Regex xmlTagRegex = new Regex( "<[^>]+>" );

        // Three or more consecutive linefeeds
        static readonly Regex excessNewlinesRegex = new Regex( "\n{3,}" );

        // Carriage returns (\r) that are NOT followed by a linefeed (\n)
        static readonly Regex strayCarriageReturnRegex = new Regex( "\r(?!\n)" );


        /// <summary>
        /// Program entry point: parses the command line, extracts content.xml from
        /// the specified .odt file and writes its text to standard output.
        /// </summary>
        /// <param name="args">odtfile [ encoding | /D ], or /E, or /?</param>
        /// <returns>0 on success, 1 in case of errors or when help is shown.</returns>
        static int Main( string[] args )
        {
            string odtfile = string.Empty;
            Encoding encoding = null;
            bool usexmlencoding = false;

            #region Parse Command Line

            if ( args.Length == 0 || args.Length > 2 )
            {
                return ShowHelp( );
            }

            foreach ( string arg in args )
            {
                // the length check prevents an IndexOutOfRangeException on an empty argument ("")
                if ( arg.Length > 0 && arg[0] == '/' )
                {
                    if ( arg == "/?" )
                    {
                        return ShowHelp( );
                    }
                    else if ( arg.StartsWith( "/D", StringComparison.OrdinalIgnoreCase ) )
                    {
                        usexmlencoding = true;
                    }
                    else if ( string.Equals( arg, "/E", StringComparison.OrdinalIgnoreCase ) )
                    {
                        return ListEncodings( );
                    }
                    else
                    {
                        return ShowHelp( "Invalid command line switch {0}", arg );
                    }
                }
                else if ( string.IsNullOrWhiteSpace( odtfile ) )
                {
                    odtfile = arg;
                    if ( !File.Exists( odtfile ) )
                    {
                        return ShowHelp( "File \"{0}\" not found", odtfile );
                    }
                    if ( !string.Equals( Path.GetExtension( odtfile ), ".odt", StringComparison.OrdinalIgnoreCase ) )
                    {
                        return ShowHelp( "This program can extract text from .ODT files only" );
                    }
                }
                else if ( encoding == null )
                {
                    encoding = GetEncoding( arg );
                    if ( encoding == null )
                    {
                        return ShowHelp( "Invalid encoding \"{0}\"", arg );
                    }
                }
                else
                {
                    return ShowHelp( "Too many command line arguments" );
                }
            }

            if ( string.IsNullOrWhiteSpace( odtfile ) )
            {
                return ShowHelp( );
            }

            #endregion Parse Command Line


            #region Extract Text

            string content;
            try
            {
                content = ReadContentXml( odtfile );
            }
            catch ( InvalidDataException )
            {
                // not a valid ZIP archive
                content = null;
            }
            catch ( IOException )
            {
                content = null;
            }
            catch ( UnauthorizedAccessException )
            {
                content = null;
            }

            if ( content == null )
            {
                return ShowHelp( "An error occurred while trying to read \"{0}\"", odtfile );
            }

            #endregion Extract Text


            // The encoding named in the XML declaration is used only if the /D switch was specified
            string xmlencoding = GetXmlEncodingName( content );
            string plaintext = ExtractPlainText( content );


            #region Display Text

            if ( usexmlencoding )
            {
                encoding = GetEncoding( xmlencoding );
            }

            if ( encoding == null )
            {
                // send text to console using default output encoding
                Console.WriteLine( plaintext );
            }
            else
            {
                // temporarily change output encoding and send text to console;
                // the finally block restores the console even if writing fails
                Encoding oldencoding = Console.OutputEncoding;
                try
                {
                    Console.OutputEncoding = encoding;
                    Console.WriteLine( plaintext );
                }
                finally
                {
                    Console.OutputEncoding = oldencoding;
                }
            }

            #endregion Display Text


            return 0;
        }


        /// <summary>
        /// Reads the document's main content.xml straight from the .odt (ZIP) archive.
        /// </summary>
        /// <param name="odtfile">Path of the .odt file.</param>
        /// <returns>The XML text, or null if the archive has no root-level content.xml.</returns>
        /// <remarks>
        /// Only the root-level entry is accepted: entries named content.xml in
        /// subfolders (e.g. "Object 1/content.xml") are skipped so they cannot
        /// replace the main document. The entry is read from its stream, so no
        /// temporary file is created.
        /// </remarks>
        static string ReadContentXml( string odtfile )
        {
            using ( ZipArchive archive = ZipFile.OpenRead( odtfile ) )
            {
                foreach ( ZipArchiveEntry entry in archive.Entries )
                {
                    if ( string.Equals( entry.FullName, "content.xml", StringComparison.OrdinalIgnoreCase ) )
                    {
                        using ( Stream stream = entry.Open( ) )
                        using ( StreamReader reader = new StreamReader( stream ) )
                        {
                            return reader.ReadToEnd( );
                        }
                    }
                }
            }
            return null;
        }


        /// <summary>
        /// Returns the encoding name found in the first 100 characters of the XML,
        /// where the XML declaration usually sits.
        /// </summary>
        /// <param name="content">The XML text.</param>
        /// <returns>The encoding name, or an empty string if none was found.</returns>
        static string GetXmlEncodingName( string content )
        {
            if ( string.IsNullOrEmpty( content ) )
            {
                return string.Empty;
            }
            // Regex.Match throws if the requested length exceeds the input, so clamp it
            int searchlength = Math.Min( 100, content.Length );
            return xmlEncodingRegex.Match( content, 0, searchlength ).Groups[1].Value;
        }


        /// <summary>
        /// Converts the XML of content.xml to plain text.
        /// </summary>
        /// <param name="content">The XML text.</param>
        /// <returns>The text without tags, with blocks separated by an empty line.</returns>
        static string ExtractPlainText( string content )
        {
            // insert newlines after header, paragraph or list-item
            string plaintext = blockEndRegex.Replace( content, "\n\n" );
            // remove all XML tags
            plaintext = xmlTagRegex.Replace( plaintext, "" );
            // reduce maximum number of consecutive newlines to two
            plaintext = excessNewlinesRegex.Replace( plaintext, "\n\n" );
            // replace stray carriage returns and strip leading and trailing whitespace
            return ConvertStrayCarriageReturns( plaintext ).Trim( "\n\r\t ".ToCharArray( ) );
        }


        /// <summary>
        /// Replaces every carriage return that is not followed by a linefeed
        /// with <see cref="Environment.NewLine"/>.
        /// </summary>
        /// <param name="text">The text to convert.</param>
        /// <returns>The converted text.</returns>
        static string ConvertStrayCarriageReturns( string text )
        {
            return strayCarriageReturnRegex.Replace( text, Environment.NewLine );
        }


        /// <summary>
        /// Looks up an encoding by name, then by name ignoring dashes, then by codepage number.
        /// </summary>
        /// <param name="myencoding">Encoding name (e.g. "UTF-8" or "utf8") or codepage (e.g. "65001").</param>
        /// <returns>The matching encoding, or null if nothing matches.</returns>
        static Encoding GetEncoding( string myencoding )
        {
            if ( string.IsNullOrEmpty( myencoding ) )
            {
                return null;
            }

            // Get a list of available encodings
            EncodingInfo[] encodings = Encoding.GetEncodings( );

            // Try correctly spelled encodings first; the ordinal comparison keeps the
            // match independent of the current culture (e.g. Turkish dotted/dotless i)
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( string.Equals( encoding.Name, myencoding, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // No direct match found, try again, ignoring dashes
            string nodashes = myencoding.Replace( "-", "" );
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( string.Equals( encoding.Name.Replace( "-", "" ), nodashes, StringComparison.OrdinalIgnoreCase ) )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // Still no match, try codepages
            foreach ( EncodingInfo encoding in encodings )
            {
                if ( encoding.CodePage.ToString( ) == myencoding )
                {
                    return Encoding.GetEncoding( encoding.CodePage );
                }
            }

            // Still no match, giving up
            return null;
        }


        /// <summary>
        /// Writes a table of all available encoding names and their codepages,
        /// sorted by name, to standard output.
        /// </summary>
        /// <returns>Always 0.</returns>
        static int ListEncodings( )
        {
            try
            {
                Console.Clear( );
            }
            catch ( IOException )
            {
                // Console.Clear( ) throws an IO exception if the output is redirected
            }

            List<EncodingInfo> allencodings = new List<EncodingInfo>( Encoding.GetEncodings( ) );
            allencodings.Sort( ( a, b ) => string.Compare( a.Name, b.Name, StringComparison.CurrentCulture ) );

            // the name column is at least as wide as its header
            int columnwidth = 8;
            foreach ( EncodingInfo enc in allencodings )
            {
                columnwidth = Math.Max( columnwidth, enc.Name.Length );
            }

            string rowformat = "{0,-" + columnwidth + "} {1}";
            Console.WriteLine( rowformat, "Encoding", "CodePage" );
            Console.WriteLine( rowformat, "========", "========" );
            foreach ( EncodingInfo enc in allencodings )
            {
                Console.WriteLine( rowformat, enc.Name, enc.CodePage );
            }
            return 0;
        }


        /// <summary>
        /// Writes text to standard error in white, then resets the console colors.
        /// </summary>
        /// <param name="text">The text to write.</param>
        /// <param name="newline">True to end the line after the text.</param>
        static void WriteHighlighted( string text, bool newline )
        {
            Console.ForegroundColor = ConsoleColor.White;
            if ( newline )
            {
                Console.Error.WriteLine( text );
            }
            else
            {
                Console.Error.Write( text );
            }
            Console.ResetColor( );
        }


        /// <summary>
        /// Writes an optional error message followed by the help text to standard error.
        /// </summary>
        /// <param name="errmsg">
        /// Optional: a composite format string, followed by the values for its placeholders.
        /// </param>
        /// <returns>Always 1, so callers can return the result as the program's exit code.</returns>
        static int ShowHelp( params string[] errmsg )
        {
            #region Error Message

            if ( errmsg != null && errmsg.Length > 0 )
            {
                // everything after the first element is an argument for the format string
                object[] errargs = new object[errmsg.Length - 1];
                Array.Copy( errmsg, 1, errargs, 0, errargs.Length );

                Console.Error.WriteLine( );
                Console.ForegroundColor = ConsoleColor.Red;
                Console.Error.Write( "ERROR:\t" );
                Console.ForegroundColor = ConsoleColor.White;
                Console.Error.WriteLine( errmsg[0], errargs );
                Console.ResetColor( );
            }

            #endregion Error Message


            #region Help Text

            /*
            ODT2Txt.exe, Version 1.00
            Return plain text content of an OpenOffice file without requiring OpenOffice

            Usage:  ODT2Txt.exe  odtfile  [ encoding | /D ]

            or:     ODT2Txt.exe  /E

            Where:  odtfile   is the path of the OpenOffice file to be read
                              (no wildcards, only .odt extension allowed)
                    encoding  is the output encoding, e.g. UTF-8 to preserve
                              Unicode characters, or IBM437 to convert Unicode
                              doublequotes to ASCII
                    /D        use the encoding specified in the document file
                    /E        list all available encodings

            Notes:  If the specified encoding does not match any available encoding
                    name, the program will try again, ignoring dashes; if that does
                    not provide a match, the program will try matching the specified
                    encoding with the available encodings' codepages.
                    This program requires .NET 4.5.
                    Return code ("errorlevel") 1 in case of errors, 0 on success.

            Written by Rob van der Woude
            https://www.robvanderwoude.com
            */

            #endregion Help Text


            #region Display Help Text

            Console.Error.WriteLine( );
            Console.Error.WriteLine( "ODT2Txt.exe, Version {0}", progver );
            Console.Error.WriteLine( "Return plain text content of an OpenOffice file without requiring OpenOffice" );
            Console.Error.WriteLine( );

            Console.Error.Write( "Usage: " );
            WriteHighlighted( "ODT2Txt.exe odtfile [ encoding | /D ]", true );
            Console.Error.WriteLine( );

            Console.Error.Write( "or: " );
            WriteHighlighted( "ODT2Txt.exe /E", true );
            Console.Error.WriteLine( );

            Console.Error.Write( "Where: " );
            WriteHighlighted( "odtfile", false );
            Console.Error.WriteLine( " is the path of the OpenOffice file to be read" );
            Console.Error.WriteLine( " (no wildcards, only .odt extension allowed)" );

            WriteHighlighted( " encoding", false );
            Console.Error.Write( " is the output encoding, e.g. " );
            WriteHighlighted( "UTF-8", false );
            Console.Error.WriteLine( " to preserve" );
            Console.Error.Write( " Unicode characters, or " );
            WriteHighlighted( "IBM437", false );
            Console.Error.WriteLine( " to convert Unicode" );
            Console.Error.WriteLine( " doublequotes to ASCII" );

            WriteHighlighted( " /D", false );
            Console.Error.WriteLine( " use the encoding specified in the document file" );

            WriteHighlighted( " /E", false );
            Console.Error.WriteLine( " list all available encodings" );
            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Notes: If the specified encoding does not match any available encoding" );
            Console.Error.WriteLine( " name, the program will try again, ignoring dashes; if that does" );
            Console.Error.WriteLine( " not provide a match, the program will try matching the specified" );
            Console.Error.WriteLine( " encoding with the available encodings' codepages." );
            Console.Error.WriteLine( " This program requires .NET 4.5." );
            Console.Error.WriteLine( " Return code (\"errorlevel\") 1 in case of errors, 0 on success." );
            Console.Error.WriteLine( );

            Console.Error.WriteLine( "Written by Rob van der Woude" );
            Console.Error.WriteLine( "https://www.robvanderwoude.com" );

            #endregion Display Help Text


            return 1;
        }
    }
}