package de.xam.texthtml.text;

import static org.xydra.core.util.RegExUtil.DOT;
import static org.xydra.core.util.RegExUtil.HEXDIGIT;
import static org.xydra.core.util.RegExUtil.LOWER;
import static org.xydra.core.util.RegExUtil.UNICODE_ALPHANUM;
import static org.xydra.core.util.RegExUtil.UNICODE_RANGE_ALPHANUM;
import static org.xydra.core.util.RegExUtil.UPPER;

import org.xydra.annotations.RunsInGWT;

import de.xam.texthtml.html.SharedHtmlUtils;

/**
 * Renders plain text as HTML and turns URLs found in it into a-href links.
 *
 * See also {@link SharedHtmlUtils}
 *
 * GWT RegEx documentation: There are a few small incompatibilities between the two implementations. Java-specific
 * constructs in the regular expression syntax (e.g. [a-z&&[^bc]], (?<=foo), \A, \Q) work only in the pure Java
 * implementation, not the GWT implementation, and are not rejected by either. Also, the Javascript-specific
 * constructs $` and $' in the replacement expression work only in the GWT implementation, not the pure Java
 * implementation, which rejects them.
 *
 * ECMA script: Don't nest character classes -- legal in Java, but forbidden in ECMA.
 *
 * TODO find out how to use these in GWT with unicode flag set. Set Unicode flag in Java:
 * Pattern.UNICODE_CHARACTER_CLASS in Pattern.compile(..)
 *
 * @author xamde
 */
@RunsInGWT(true)
public class TextRenderer {

	/** The zero-width space character, U+200B. */
	public static final char ZERO_WIDTH_SPACE = '\u200B';

	/**
	 * Alternation of the supported scheme names, wrapped in a CAPTURING group. Every regex that embeds this constant
	 * (e.g. {@link #SCHEME}, {@link #FULL_HTTP_URL}) therefore carries one extra capturing group.
	 *
	 * TODO Using secure, popular schemes from https://en.wikipedia.org/wiki/URI_scheme
	 */
	public static final String SCHEMES = "("
			// core
			+ "http|https"

	// Candidates that are currently not enabled:
	// // stable
	// + "|doi|data|dav|file|ftp|sip|sips|tag|tel|urn"
	// // provisional
	// + "|bitcoin|callto|git|svn|sftp|ssh|xri"

	+ ")";

	/**
	 * Scheme is fixed as "http" or "https", followed by "://". Contains the capturing group of {@link #SCHEMES}.
	 */
	public static final String SCHEME = "(?:" + SCHEMES + ":\\/\\/)";

	/** One or more 2nd or deeper level sub-domain labels, each followed by a dot. */
	public static final String SUBDOMAIN = "(?:" + UNICODE_ALPHANUM + "(?:" + "[" + UNICODE_RANGE_ALPHANUM + "-]*"
			+ UNICODE_ALPHANUM + ")?" + DOT + ")+";

	/**
	 * Top-level domain list is growing, so allow every word with at least two ASCII-characters. All characters should
	 * be same-case to minimise false matches. I.e. domain extension can be 'com', 'COM' but not 'Com' or 'CoM'
	 */
	public static final String TOPLEVELDOMAIN = "(?:" + LOWER + "{2,}|" + UPPER + "{2,})";

	/**
	 * Domain name with at least one sub-domain label in front of the top-level domain, or the literal 'localhost'. No
	 * port number!
	 */
	public static final String DOMAIN = "(?:" + SUBDOMAIN + TOPLEVELDOMAIN + "|localhost)";

	/** If this one doesn't match, it's not a URI for sure. */
	public static final String REGEX_MINIMAL_URL_CONTAINS = "[:.]|localhost";

	/** Optional port: a colon followed by one or more digits. */
	public static final String PORT = "(?::[0-9]+)?";

	/**
	 * Characters allowed literally in a path, query or fragment. Ready for embedding in a character class.
	 *
	 * Does not include '#' or '?', also no '%'
	 */
	public static final String LEGAL_URL_CHARS_IN_PATH_OR_FRAGMENT =

	UNICODE_RANGE_ALPHANUM + "\\/._~!$&'()*+,;=:@\\-";

	/**
	 * {@link #LEGAL_URL_CHARS_IN_PATH_OR_FRAGMENT} plus '%', '#' and '?'. There are no additional chars allowed in
	 * email addresses. Ready for embedding in a character class.
	 */
	public static final String LEGAL_URL_CHARS = LEGAL_URL_CHARS_IN_PATH_OR_FRAGMENT + "%#?";

	/**
	 * A single unit of a path, query or fragment: either one legal character or a '%' followed by two HEXDIGIT
	 * matches (percent-encoding).
	 */
	public static final String URL_PATH_OR_FRAGMENT_PART = "(?:["

	+ LEGAL_URL_CHARS_IN_PATH_OR_FRAGMENT

	+ "]|(?:%" + HEXDIGIT + HEXDIGIT + "))";

	/** Optional fragment: '#' followed by any number of path/fragment units. */
	public static final String FRAGMENT = "(?:#" + URL_PATH_OR_FRAGMENT_PART + "*)?";

	/**
	 * Optional path (starting with '/'), optional query (starting with '?') and optional fragment (starting with
	 * '#'), in this order.
	 *
	 * According to RFC 3986 the valid characters for the URL path component are:
	 *
	 * a-z A-Z 0-9 . - _ ~ ! $ & ' ( ) * + , ; = : @
	 *
	 * + '%' HEXDIGIT HEXDIGIT
	 *
	 */
	public static final String PATH =

	// path segment
	"(?:\\/" + URL_PATH_OR_FRAGMENT_PART + "*)?"

	// query part
			+ "(?:\\?" + URL_PATH_OR_FRAGMENT_PART + "*)?"

	+ FRAGMENT;

	/**
	 * scheme://domain:port/path
	 *
	 * Contains the capturing group of {@link #SCHEMES}; file URLs are handled separately by {@link #FILE_URL}.
	 */
	public static final String FULL_HTTP_URL = "(?:" + SCHEME + DOMAIN + PORT + PATH + ")";

	/** file:///path */
	public static final String FILE_URL = "(?:" + "file:\\/\\/" + PATH + ")";

	/** domain:port/path */
	public static final String PARTIAL_URL = "(?:" + DOMAIN + PORT + PATH + ")";

	/**
	 * Allow a number of weird characters in front of @. Avoid allowing '@' itself and ',' to prevent confusion with
	 * multiple mail addresses. Avoid '&lt;' and '&gt;' to strip them off.
	 */
	private static final String EMAIL_BEFORE_AT = "(?:["

	+ UNICODE_RANGE_ALPHANUM + "._~!$&()*+;=:-"

	+ "]+)";

	/**
	 * Recognise the address-part of a mail address. I.e. the 'jon@example.com' in '"John Doe" <jon@example.com>'
	 */
	public static final String EMAIL = "(?:" + EMAIL_BEFORE_AT + "@" + DOMAIN + ")";

	/** A single space, tab, line feed or carriage return. */
	public static final String WHITESPACE = "[ \\t\\n\\r]";

	/* ============= find ... versions ================== */

	/**
	 * Escapes a character so that it stands for itself when placed inside a regex character class.
	 *
	 * Only backslash, '[', ']', '^' and '-' are escaped; all other characters (including letters and digits, for
	 * which a leading backslash would change the meaning) are returned unchanged.
	 *
	 * @param c the literal character
	 * @return the character, prefixed with a backslash if required
	 */
	private static String escapeInCharClass(final char c) {
		switch (c) {
		case '\\':
		case '[':
		case ']':
		case '^':
		case '-':
			return "\\" + c;
		default:
			return String.valueOf(c);
		}
	}

	/**
	 * Constructs e.g. regexes for finding urls, allowing a braced and unbraced variant.
	 *
	 * The unbraced variant requires whitespace or the start of the content in front of the match and whitespace or the
	 * end of the content behind it; both delimiters are part of the match. The braced variant requires the match to be
	 * directly enclosed by the given brace characters.
	 *
	 * If baseRegex itself contains no capturing groups, the result has these groups: 1 = leading delimiter, 2 = BASE,
	 * 3 = trailing delimiter and, if allowBraced, 4 = open brace, 5 = BASE, 6 = closing brace. Every capturing group
	 * inside baseRegex appears once per variant and shifts all following group numbers accordingly.
	 *
	 * @param baseRegex the regex to look for; embedded as-is
	 * @param allowBraced if true, the braced variant is added as an alternative
	 * @param openBrace literal opening brace; ignored unless allowBraced
	 * @param closeBrace literal closing brace; ignored unless allowBraced
	 * @return a regex that has the matched baseRegex at groups 2 and 5 (given a baseRegex without capturing groups)
	 */
	public static final String findRegex(final String baseRegex, final boolean allowBraced, final char openBrace,
			final char closeBrace) {
		// unbraced: (leading whitespace or start)(BASE)(trailing whitespace or end)
		final String unbraced = "(?:" + "(^|" + WHITESPACE + ")" + "(" + baseRegex + ")" + "(" + WHITESPACE + "|$)"
				+ ")";
		if (!allowBraced) {
			return unbraced;
		}
		// braced: (open brace)(BASE)(closing brace). Each brace sits in a single-character class; it is escaped so
		// that characters with a special meaning inside a class (e.g. '[' or ']') are matched literally.
		final String braced = "(?:" + "([" + escapeInCharClass(openBrace) + "])" + "(" + baseRegex + ")" + "(["
				+ escapeInCharClass(closeBrace) + "])" + ")";
		return unbraced + "|" + braced;
	}

	/**
	 * Finds full http/https URLs, either delimited by whitespace or content boundaries, or enclosed in '(' and ')'.
	 *
	 * Because {@link #FULL_HTTP_URL} embeds the capturing group of {@link #SCHEMES}, the group layout is: unbraced = 1
	 * (leading delimiter), 2 (URL), 3 (scheme), 4 (trailing delimiter); braced = 5 (open brace), 6 (URL), 7 (scheme),
	 * 8 (closing brace). This assumes the fragments imported from RegExUtil add no capturing groups -- TODO confirm.
	 *
	 * replace with '"$1$5<a target=\"_blank\" class=\"full auto\" href=\"$2$6\">$2$6</a>$4$8"'
	 *
	 * == (leading whitespace and open braces)TAG(full url)TAG(trailing whitespace and closing braces)
	 */
	public static final String FIND_HTTP_URL = findRegex(FULL_HTTP_URL, true, '(', ')');

	/** Finds file: URLs; unbraced variant only, so the brace arguments are unused. URL at group 2. */
	public static final String FIND_FILE_URL = findRegex(FILE_URL, false, 'x','x');

	/** Finds mail addresses, unbraced or enclosed in '<' and '>'. */
	public static final String FIND_EMAIL = findRegex(EMAIL, true, '<', '>');

	/** URL without scheme at group 2 or 5 */
	public static final String FIND_PARTIAL_URL = findRegex(PARTIAL_URL, true, '(', ')');

	/** Replacement for {@link #FIND_HTTP_URL}; skips the scheme groups 3 and 7. */
	private static final String REPLACE_HTTP_URL = "$1$5<a target=\"_blank\" class=\"full auto\" href=\"$2$6\">$2$6</a>$4$8";

	/** Replacement for {@link #FIND_FILE_URL}, which has no braced variant and thus only groups 1 to 3. */
	private static final String REPLACE_FILE_URL = "$1<a target=\"_blank\" class=\"file auto\" href=\"$2\">$2</a>$3";

	/** Replacement for {@link #FIND_PARTIAL_URL}; prefixes the link target with 'http://'. */
	private static final String REPLACE_PARTIAL_URL = "$1$4<a target=\"_blank\" class=\"partial auto\" href=\"http://$2$5\">$2$5</a>$3$6";

	/* ================================= */

	/**
	 * Render text and turn detected (partial) URL strings into full fledged HTML links.
	 *
	 * Detects even those like 'see also foo (foo.com)'
	 *
	 * The text is first encoded via {@link SharedHtmlUtils#htmlEncode(String)}; then full http(s) URLs, file: URLs and
	 * finally partial URLs (without scheme) are wrapped in anchor tags, in this order.
	 *
	 * Known limitation: the unbraced patterns consume the trailing whitespace of a match, so a second URL that is
	 * separated from the previous one by just a single whitespace character has no leading delimiter left to match
	 * and stays unlinked.
	 *
	 * See also {@link SharedHtmlUtils#sanitize(String)}
	 *
	 * @param plainText the text to render; may be null
	 *
	 * @return safe HTML, or null if plainText was null
	 * @throws RuntimeException wrapping any failure of the regex replacement, with the input in the message
	 */
	public static String renderAsSafeHtml(final String plainText) {
		if (plainText == null) {
			return null;
		}

		/* first, encode dangerous stuff */
		final String safe = SharedHtmlUtils.htmlEncode(plainText);

		/* second: make the safe stuff nicer: look for urls and make them clickable */
		try {
			// Implementation note: the replacement strings reference the groups of both the unbraced and the braced
			// variant; the groups of the variant that did not match contribute nothing.
			String linked = safe.replaceAll(FIND_HTTP_URL, REPLACE_HTTP_URL);
			linked = linked.replaceAll(FIND_FILE_URL, REPLACE_FILE_URL);
			return linked.replaceAll(FIND_PARTIAL_URL, REPLACE_PARTIAL_URL);
		} catch (final RuntimeException e) {
			throw new RuntimeException("Error rendering '" + plainText + "'", e);
		}
	}

}
