src/lib/wpxml2md.js

import Fs from 'fs'
import Path  from 'path'
import NodeUtil from 'util'
import XmlParser  from 'xml2js'
import Util from './util.js'
import Logger from './logger.js'
import Convert from './converter.js'
import ImageLinkReplace from './image-link-replacer.js'
import Comment from './comment.js'

// Promise-returning wrapper around the callback-style `parseString` of xml2js.
const ParseXML = NodeUtil.promisify(XmlParser.parseString)

/**
 * Create the `root/year/month` directory in which a Markdown file is saved.
 * The levels are created one at a time through `Util.mkdirSync`, parent first.
 * @param {String} root Path of the root directory.
 * @param {String} year Year.
 * @param {String} month Month.
 * @return {String} Path of the month directory. Failure is null.
 */
const createSaveDir = (root, year, month) => {
  let current = root

  for (const segment of [year, month]) {
    current = Path.join(current, segment)

    // Stop at the first level that cannot be created.
    if (!Util.mkdirSync(current)) {
      return null
    }
  }

  return current
}

/**
 * Array to string for metadata.
 * The values are rendered as a list of double-quoted strings, e.g. `["a", "b"]`.
 * Backslashes and double quotes inside a value are escaped so that the quoted
 * string stays well-formed when the header is parsed as YAML.
 * @param {Array} arr Array.
 * @return {String} String. It is `[]` when the array is missing or empty.
 */
const arrayToString = (arr) => {
  if (!(arr && arr.length)) {
    return '[]'
  }

  const quoted = Array.from(arr, (value) => {
    // Backslash first, otherwise the backslash added for a quote would be doubled.
    const escaped = String(value).replace(/\\/g, '\\\\').replace(/"/g, '\\"')
    return `"${escaped}"`
  })

  return `[${quoted.join(', ')}]`
}

/**
 * Create an excerpt string from Markdown text.
 * The specification of the excerpt statement is below.
 * - No line break
 * - No header, list (ul/ol), table, blockquote
 * - No Markdown decoration
 * - Markdown links and images extracted only text
 * - Text of 100 characters or more is cut to its first 99 characters and "..." is added
 * - Backslashes and double quotes are escaped for a YAML double-quoted string
 *
 * The length limit is applied to the plain text before escaping, so the cut
 * can never fall in the middle of an escape sequence.
 * @param {String} markdown Markdown text of content body.
 * @return {String} Excerpted string. It is empty when there is no Markdown text.
 */
const createExcerpt = (markdown) => {
  if (!markdown) {
    return ''
  }

  const plain = markdown
    .replace(/\n\n/mg, '\n')
    // Drop whole lines that are headers, list items, table rows or blockquotes.
    .replace(/^(#|\*|\d{1,5}\. |\||> ).*?\n/mg, '')
    .replace(/\n/mg, '')
    .replace(/\*\*(.*?)\*\*/g, (m, $1) => $1)
    .replace(/__(.*?)__/g, (m, $1) => $1)
    // Images before links: an image is a link pattern prefixed with "!".
    .replace(/!\[(.*?)\]\(.*?\)/g, (m, $1) => $1)
    .replace(/\[(.*?)\]\(.*?\)/g, (m, $1) => $1)

  const isLong = plain.length >= 100
  const visible = isLong ? plain.substring(0, 99) : plain

  // Escape after truncating; backslash first so added backslashes are not doubled.
  const escaped = visible.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  return isLong ? `${escaped}...` : escaped
}

/**
 * Create a header of article metadata.
 * The header is a front matter block delimited by `---` lines. It contains the
 * path, date, title, categories, tags and excerpt of the article, and
 * `single: true` is added when the article type is `page`.
 * @param {Object} metadata Metadata of article.
 * @param {String} markdown Markdown text of content body, used for the excerpt.
 * @return {String} Header text.
 */
const createMetadataHeader = (metadata, markdown) => {
  // The title is emitted inside double quotes, so backslashes and double
  // quotes are escaped (backslash first) to keep the quoted string well-formed.
  const title = String(metadata.title).replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  const last = metadata.type === 'page' ? 'single: true\n---\n\n' : '---\n\n'
  return `---
path: "/${metadata.type}s/${metadata.year}/${metadata.month}/${metadata.permanentName}/"
date: "${metadata.year}-${metadata.month}-${metadata.day}T${metadata.time}Z"
title: "${title}"
categories: ${arrayToString(metadata.categories)}
tags: ${arrayToString(metadata.tags)}
excerpt: "${createExcerpt(markdown)}"
${last}`
}

/**
 * Read an article metadata from xml object.
 * The `category` entries of the post are split by their `domain` attribute:
 * `category` entries become categories, `post_tag` entries become tags and
 * entries of any other domain are ignored.
 * @param {Object} post XML object.
 * @return {Object} Metadata.
 */
const readMetadata = (post) => {
  const terms = post.category || []

  // Text of every term that belongs to the given domain, in document order.
  const namesOf = (domain) => terms
    .filter((term) => term.$.domain === domain)
    .map((term) => term._)

  const categories = namesOf('category')
  const tags = namesOf('post_tag')
  const { year, month, day, time } = Util.datetimeFromWpGMT(post['wp:post_date_gmt'][0])

  return {
    year,
    month,
    day,
    time,
    permanentName: post['wp:post_name'][0],
    title: post.title[0],
    categories,
    tags,
    type: post['wp:post_type'][0]
  }
}

/**
 * Replace the link URL included in Markdown.
 * Only the URL part of `[text](url)` is rewritten (this also covers images,
 * `![alt](url)`); text outside of links is left untouched. Every occurrence
 * of `oldPrefix` inside a URL is replaced, and `newPrefix` is inserted
 * literally, so sequences such as `$&` or `$1` in it have no special meaning.
 * @param {String} markdown Markdown text.
 * @param {String} oldPrefix Target.
 * @param {String} newPrefix String to replace.
 * @return {String} Replaced string. The Markdown text is returned as is when
 * it is empty or when either prefix is not a non-empty string.
 */
const replaceLinkURL = (markdown, oldPrefix, newPrefix) => {
  const isText = (value) => Boolean(value) && typeof value === 'string'
  if (!(markdown && isText(oldPrefix) && isText(newPrefix))) {
    return markdown
  }

  // Built once: the pattern does not depend on the individual link.
  const target = new RegExp(Util.escapeRegExp(oldPrefix), 'g')

  return markdown.replace(/\[(.*?)\]\((.*?)\)/g, (match, text, url) => {
    // A replacer function keeps "$" sequences in newPrefix from being
    // interpreted as replacement patterns.
    const replaced = url.replace(target, () => newPrefix)
    return `[${text}](${replaced})`
  })
}

/**
 * Convert the post data to markdown file.
 * The file is saved as `<day>.md` (made unique by `Util.uniquePathWithSequentialNumber`)
 * in the directory returned by `createSaveDir`. It starts with the metadata header
 * when `options.withMetadata` is set, otherwise with a `# title` heading.
 * @param {Object} post Post data.
 * @param {Object} metadata Metadata.
 * @param {String} rootDir Path of Markdown file output directory.
 * @param {Logger} logger Logger.
 * @param {CLIOptions} options Options.
 * @return {Promise} Promise task. It is resolved after the whole file has been
 * written and rejected when the directory cannot be created or the write fails.
 */
const convertPost = async (post, metadata, rootDir, logger, options) => {
  logger.log(`${metadata.year}/${metadata.month}/${metadata.day} ['${metadata.type}']: ${metadata.title}`)

  const dir = createSaveDir(rootDir, metadata.year, metadata.month)
  if (!(dir)) {
    throw new Error('Failed to create a save directory.')
  }

  // If there are multiple articles on the same day, their names will be duplicated and made unique.
  const filePath = Util.uniquePathWithSequentialNumber(Path.join(dir, `${metadata.day}.md`))

  let markdown = Convert(post['content:encoded'][0], options)

  // The header (and its excerpt) is built from the body as converted,
  // before image links and link prefixes are rewritten.
  const header = options.withMetadata
    ? createMetadataHeader(metadata, markdown)
    : `# ${metadata.title}\n\n`

  if (options.withImageDownload) {
    const basename = Path.basename(filePath, '.md')
    markdown = await ImageLinkReplace(markdown, dir, basename, logger)
  }

  if (options.replaceLinkPrefix) {
    markdown = replaceLinkURL(markdown, options.replaceLinkPrefix.old, options.replaceLinkPrefix.new)
  }

  if (options.withComment) {
    markdown += Comment(post['wp:comment'])
  }

  // Write everything in one go and wait for the stream to finish, so that the
  // stream is always ended, write errors reject the promise instead of being
  // emitted without a listener, and the file exists before the next post
  // looks for a unique name.
  await new Promise((resolve, reject) => {
    const stream = Fs.createWriteStream(filePath)
    stream.once('error', reject)
    stream.once('finish', resolve)
    stream.end(header + markdown, 'utf8')
  })
}

/**
 * Create a directory with a unique name.
 * The name is the current date formatted as `YYYYMMDD-hhmmss`. When that name
 * cannot be used, the suffixes `-1` to `-256` are tried in order.
 * @param {String} dir Base directory path.
 * @return {String} The path of the created directory. Failure is null.
 */
const createUniqueDestDir = (dir) => {
  const parent = Path.resolve(dir)
  const stamp = Util.formatDate(new Date(), 'YYYYMMDD-hhmmss')

  // A candidate is usable only if it does not exist yet and can be created.
  const tryCreate = (candidate) => !Util.existsSync(candidate) && Util.mkdirSync(candidate)

  // Sequence 0 is the bare name, 1..256 add a sequential number.
  for (let seq = 0; seq <= 256; ++seq) {
    const candidate = Path.resolve(parent, seq === 0 ? stamp : `${stamp}-${seq}`)
    if (tryCreate(candidate)) {
      return candidate
    }
  }

  return null
}

/**
 * Gets the posts data from XML.
 * @param {String} src Path of XML file.
 * @return {Promise} Promise task. It is resolved with the array of `item`
 * entries of the first channel, which is empty when the channel has no items.
 * It is rejected when the file is not found, cannot be read or cannot be parsed.
 */
const postsFromXML = async (src) => {
  let data
  try {
    data = Fs.readFileSync(Path.resolve(src))
  } catch (err) {
    // readFileSync reports a missing file by throwing, not by a falsy result.
    if (err && err.code === 'ENOENT') {
      throw new Error(`"${src}" is not found.`)
    }

    throw err
  }

  const xml = await ParseXML(data.toString())

  // There is no `item` property when the channel contains no entries.
  return xml.rss.channel[0].item || []
}

/**
 * Convert WordPress XML file to Markdown files.
 * A uniquely named directory is created in `dest` with the `posts` and `pages`
 * sub directories. Items of type `post` are saved in `posts`, items of any
 * other type are saved in `pages`. The items are converted one at a time.
 * @param {String} src Path of the WordPress XML file.
 * @param {String} dest Path of Markdown files output directory.
 * @param {CLIOptions} options Options.
 * @return {Promise} Promise object. It is rejected when an output directory
 * cannot be created or when reading or converting the XML fails.
 */
const WordPressXmlToMarkdown = async (src, dest, options = { report: false }) => {
  const logger = new Logger(options.report)
  logger.log(`Input:  ${src}`)
  logger.log(`Output: ${dest}`)

  const dir = createUniqueDestDir(dest)
  if (!(dir)) {
    throw new Error('Failed to create the root directory.')
  }

  const postsDir = Path.join(dir, 'posts')
  if (!(Util.mkdirSync(postsDir))) {
    throw new Error('Failed to create the posts directory.')
  }

  const pagesDir = Path.join(dir, 'pages')
  if (!(Util.mkdirSync(pagesDir))) {
    throw new Error('Failed to create the pages directory.')
  }

  // An export without any item yields no array; treat it as nothing to convert.
  const posts = (await postsFromXML(src)) || []
  for (const post of posts) {
    const metadata = readMetadata(post)
    const outDir = metadata.type === 'post' ? postsDir : pagesDir

    // Sequential on purpose: a post is finished before the next one starts.
    await convertPost(post, metadata, outDir, logger, options)
  }
}

export default WordPressXmlToMarkdown