// Linkifier.class.mjs

/**
 * @file The definition of the main Linkifier class which provides the link rendering functionality with the help of the other classes and modules.
 * @author Bart Busschots <opensource@bartificer.ie>
 */

/**
 * Linkifier's core link rendering functionality.
 * @module Linkifier
 * @requires LinkData
 * @requires LinkTemplate
 * @requires PageData
 * @requires module:node-fetch
 * @requires module:cheerio
 * @requires module:mustache
 */
import { PageData } from './PageData.class.mjs';
import { LinkData } from './LinkData.class.mjs';
import { LinkTemplate } from './LinkTemplate.class.mjs';
import * as utilities from "./utilities.mjs";
import * as defaults from "./defaults.mjs";

import fetch from 'node-fetch';
import * as cheerio from 'cheerio';
import Mustache from 'mustache';

/**
 * The class providing the link rendering functionality. Instances of this class capture the settings for generating links and generate links using those settings.
 */
export class Linkifier {
    /**
     * Build a Linkifier pre-loaded with the default transformer, the default
     * template mapping (`html`), the default specially capitalised words, and
     * every template exported by the defaults module.
     */
    constructor(){
        /**
         * A mapping of domain names to data transformation functions. Keys are
         * fully qualified (they end in a dot); the key `.` holds the fallback
         * transformer.
         *
         * @private
         * @type {Object.<FQDN, dataTransformer>}
         */
        this._pageDataToLinkDataTransmformers = {
            // prefer the page's main heading over its title, but only when
            // the page has exactly one top-level heading
            '.' : function(pData){
                const text = pData.h1s.length === 1 ? pData.mainHeading : pData.title;
                return new LinkData(pData.url, text);
            }
        };

        /**
         * A mapping of domains names to default template names. Keys are fully
         * qualified (they end in a dot); the key `.` holds the global default.
         *
         * @private
         * @type {Object.<FQDN, templateName>}
         */
        this._pageDataToLinkTemplateName = {
            '.' : 'html' // default to the 'html' template for all domains unless otherwise specified
        };

        /**
         * The registered link templates.
         *
         * @private
         * @type {Object.<templateName, module:@bartificer/linkify.LinkTemplate>}
         */
        this._linkTemplates = {};

        /**
         * The loaded list of words with customised capitalisations. This is a
         * copy, so later edits never leak back into the shared defaults.
         *
         * @private
         * @type {string[]}
         */
        this._speciallyCapitalisedWords = [...defaults.speciallyCapitalisedWords];

        /**
         * A collection of utility functions.
         *
         * @private
         * @type {Object.<string, Function>}
         */
        this._utilities = utilities;

        //
        // -- Create and register the default templates --
        //
        for (const [name, template] of Object.entries(defaults.linkTemplates)) {
            this.registerTemplate(name, template);
        }
    }

    /**
     * Build the error thrown for invalid arguments.
     *
     * This module does not import a `ValidationError` class, so a bare
     * `new ValidationError(...)` would itself fail with a `ReferenceError`. If
     * such a class happens to be in scope it is used; otherwise a plain `Error`
     * whose `name` is `ValidationError` is returned.
     *
     * @private
     * @param {string} message - The error message.
     * @returns {Error}
     */
    static _newValidationError(message){
        if(typeof ValidationError === 'function'){
            return new ValidationError(message);
        }
        const err = new Error(message);
        err.name = 'ValidationError';
        return err;
    }

    /**
     * Coerce a domain name to a string and make sure it ends in a dot.
     *
     * @private
     * @param {domainName} domain
     * @returns {string} The domain with exactly the original text plus a
     * trailing dot if it did not already have one.
     */
    _normaliseFqdn(domain){
        const fqdn = String(domain);
        return fqdn.endsWith('.') ? fqdn : `${fqdn}.`;
    }

    /**
     * List the fully qualified names to try when looking up a per-domain
     * setting, most specific first.
     *
     * Only names with at least two labels are produced, so a single-label name
     * such as `localhost` or a bare TLD yields an empty list and callers fall
     * through to their global default.
     *
     * @private
     * @param {domainName} domain
     * @returns {string[]} e.g. `['www.bartificer.net.', 'bartificer.net.']`
     */
    _domainSearchPath(domain){
        const candidates = [];
        let fqdn = this._normaliseFqdn(domain);
        while(/[.][^.]+[.]$/.test(fqdn)){
            candidates.push(fqdn);
            // strip the left-most label; `*` rather than `+` so an empty label
            // (e.g. in `a..b.com`) is also removed and the loop always advances
            fqdn = fqdn.replace(/^[^.]*[.]/, '');
        }
        return candidates;
    }

    /**
     * Test whether a template has been registered under the given name.
     *
     * Only own properties count, so names such as `constructor` or `toString`
     * that exist on `Object.prototype` are not mistaken for templates.
     *
     * @private
     * @param {string} name
     * @returns {boolean}
     */
    _hasTemplate(name){
        return Object.prototype.hasOwnProperty.call(this._linkTemplates, name)
            && Boolean(this._linkTemplates[name]);
    }

    /**
     * The collection of utility functions.
     * @type {Object.<string, Function>}
     */
    get utilities() {
        return this._utilities;
    }

    /**
     * A shorter alias for the `utilities` property.
     * @type {Object.<string, Function>}
     * @see Linkifier#utilities
     */
    get util(){
        return this._utilities;
    }

    /**
     * @returns {string[]} A copy of the current list of known words with special capitalisations.
     */
    get speciallyCapitalisedWords(){
        return [...this._speciallyCapitalisedWords];
    }

    /**
     * Replace the list of words with special capitalisations. The list is
     * copied, so later changes to the passed array do not affect this instance.
     *
     * @param {string[]} words - a list of words with special capitalisations
     * @throws {ValidationError} A validation error is thrown unless an array is passed.
     */
    set speciallyCapitalisedWords(words){
        if(!Array.isArray(words)){
            throw Linkifier._newValidationError('The list of specially capitalised words must be an array');
        }
        this._speciallyCapitalisedWords = [...words];
    }

    /**
     * Register a data transformer function for a given domain.
     *
     * @param {domainName} domain - The domain for which this transformer should be
     * used.
     * @param {dataTransformer} transformerFn - The data transformer callback.
     * @throws {ValidationError} A validation error is thrown if the transformer
     * is not a function.
     */
    registerTransformer(domain, transformerFn){
        if(typeof transformerFn !== 'function'){
            throw Linkifier._newValidationError('The data transformer must be a function');
        }
        this._pageDataToLinkDataTransmformers[this._normaliseFqdn(domain)] = transformerFn;
    }

    /**
     * Get the data transformer function for a given domain.
     *
     * Note that domains are searched from the subdomain up. For example, if passed
     * the domain `www.bartificer.net` the function will first look for a
     * transformer for the domain `www.bartificer.net`, if there's no transformer
     * registered for that domain it will look for a transformer for the domain
     * `bartificer.net`, if there's no transformer for that domain either it will
     * return the default transformer. Single-label names are never looked up and
     * always get the default transformer.
     *
     * @param {domainName} domain - The domain to get the data transformer for.
     * @returns {dataTransformer}
     */
    getTransformerForDomain(domain){
        // return the most exact match
        for(const fqdn of this._domainSearchPath(domain)){
            const transformer = this._pageDataToLinkDataTransmformers[fqdn];
            if(transformer){
                return transformer;
            }
        }
        return this._pageDataToLinkDataTransmformers['.'];
    }

    /**
     * A list of the names of the registered link templates.
     * @type {string[]}
     */
    get templateNames() {
        return Object.keys(this._linkTemplates);
    }

    /**
     * @returns {string} The name of the default template.
     */
    get defaultTemplateName(){
        return this._pageDataToLinkTemplateName['.'];
    }

    /**
     * @param {string} templateName - The name of the default template to use.
     * @throws {ValidationError} A validation error is thrown if the template name doesn't correspond to a registered template.
     */
    set defaultTemplateName(templateName){
        const tplName = String(templateName);
        if(!this._hasTemplate(tplName)){
            throw Linkifier._newValidationError(`No template named '${tplName}' is registered`);
        }
        this._pageDataToLinkTemplateName['.'] = tplName;
    }

    /**
     * The default link template.
     * @type {LinkTemplate}
     */
    get defaultTemplate(){
        return this._linkTemplates[this._pageDataToLinkTemplateName['.']];
    }

    /**
     * Register a link template, replacing any template already registered
     * under the same name.
     *
     * @param {templateName} name
     * @param {module:@bartificer/linkify.LinkTemplate} template
     */
    registerTemplate(name, template){
        // TO DO - add validation
        this._linkTemplates[String(name)] = template;
    }

    /**
     * Get a registered link template by name.
     *
     * @param {string} templateName
     * @returns {LinkTemplate}
     * @throws {ValidationError} A validation error is thrown unless the name corresponds to a registered template.
     */
    getTemplate(templateName){
        const tplName = String(templateName);
        if(!this._hasTemplate(tplName)){
            throw Linkifier._newValidationError(`No template named '${tplName}' is registered`);
        }
        return this._linkTemplates[tplName];
    }

    /**
     * Register a default template for use with a given domain. This template will
     * override the overall default for this domain and all its subdomains.
     *
     * The template name is stored as-is; whether it names a registered template
     * is only checked when the mapping is looked up.
     *
     * @param {domainName} domain - The domain for which this template should be used by default.
     * @param {templateName} templateName - The name of the template to use.
     */
    registerDefaultTemplateMapping(domain, templateName){
        // TO DO - add validation
        this._pageDataToLinkTemplateName[this._normaliseFqdn(domain)] = templateName;
    }

    /**
     * Get the name of the default template for a given domain.
     *
     * Note that domains are searched from the subdomain up. For example, if passed
     * the domain `www.bartificer.net` the function will first look for a
     * mapping for the domain `www.bartificer.net`, if there's no mapping
     * registered for that domain it will look for a mapping for the domain
     * `bartificer.net`, if there's no mapping for that domain either it will
     * return the global default template name. Single-label names are never
     * looked up and always get the global default.
     *
     * If the first mapping found names a template that is not registered, a
     * warning is logged and the global default template name is returned.
     *
     * @param {domainName} domain - The domain to get the template name for.
     * @returns {templateName}
     */
    getTemplateNameForDomain(domain){
        const globalDefault = this._pageDataToLinkTemplateName['.'];

        // return the most exact match
        for(const fqdn of this._domainSearchPath(domain)){
            const tplName = this._pageDataToLinkTemplateName[fqdn];
            if(!tplName){
                continue;
            }

            // make sure the template exists
            if(!this._hasTemplate(tplName)){
                console.warn(`No template named '${tplName}' is registered, falling back to global default '${globalDefault}'`);
                return globalDefault;
            }
            return tplName;
        }
        return globalDefault;
    }

    /**
     * Fetch the page data for a given URL.
     *
     * The page is downloaded and its `<title>`, `<h1>` and `<h2>` texts are
     * recorded on the returned object. If the download fails or the response
     * status is not OK, a warning is logged and the title is instead derived
     * from the URL via the `extractSlug` utility (or `'Untitled'` if that
     * yields nothing); no headings are recorded in that case.
     *
     * @async
     * @param {URL} url
     * @returns {PageData}
     */
    async fetchPageData(url){
        // TO DO - add validation

        const ans = new PageData(url);

        // then try load the contents from the web
        let webDownloadResponseBody = '';
        try {
            const webDownloadResponse = await fetch(url);
            if(!webDownloadResponse.ok){
                throw new Error(`HTTP ${webDownloadResponse.status}: ${webDownloadResponse.statusText}`);
            }
            webDownloadResponseBody = await webDownloadResponse.text();
        } catch (err) {
            // fall back to extracting the title from the URL slug
            console.warn(`Falling back to de-slugifying URL (${err.message})`);
            ans.title = this.utilities.extractSlug(url, this._speciallyCapitalisedWords) || 'Untitled';
            return ans;
        }

        const $ = cheerio.load(webDownloadResponseBody);
        ans.title = $('title').text().trim();
        // block bodies on purpose: a callback returning `false` would stop the iteration
        $('h1').each((_idx, element) => {
            ans.h1($(element).text().trim());
        });
        $('h2').each((_idx, element) => {
            ans.h2($(element).text().trim());
        });

        // return the answer
        return ans;
    }

    /**
     * Generate a link given a URL. By default the registered template for the
     * URL's domain will be used, or, if none is registered, the overall
     * default will be used (`html`).
     *
     * If a template name is passed but no such template is registered, a
     * warning is logged and the global default template is used instead.
     *
     * @async
     * @param {URL} url
     * @param {templateName} [templateName='html']
     * @returns {string}
     * @throws {ValidationError} A validation error is thrown if the template
     * finally selected is not registered.
     */
    async generateLink(url, templateName){
        // TO DO - add validation

        //
        // -- resolve the template name to use for this URL --
        //
        let tplName = '';

        // if a template name is passed, try use it, otherwise resolve the
        // default for this URL's domain
        if(typeof templateName === 'string' && templateName !== ''){
            tplName = templateName;

            // make sure the template exists
            if(!this._hasTemplate(tplName)){
                console.warn(`No template named '${tplName}' is registered, falling back to global default '${this._pageDataToLinkTemplateName['.']}'`);
                tplName = this._pageDataToLinkTemplateName['.'];
            }
        } else {
            tplName = this.getTemplateNameForDomain((new URL(url)).hostname);
        }

        // the global default is the last resort and may itself be unregistered;
        // fail with a clear message before doing any network work
        if(!this._hasTemplate(tplName)){
            throw Linkifier._newValidationError(`No template named '${tplName}' is registered`);
        }
        const template = this._linkTemplates[tplName];

        // get the page data
        const pageData = await this.fetchPageData(url);

        // transform the page data to link data
        const transformer = this.getTransformerForDomain(pageData.uri.hostname());
        const linkData = transformer(pageData);

        // apply field-specific filters to the link data
        const fieldNames = ['url', 'text', 'description'];
        const templateData = linkData.asPlainObject();
        for(const fieldName of fieldNames){
            for(const filterFn of template.filtersFor(fieldName)){
                templateData[fieldName] = filterFn(templateData[fieldName]);
            }
        }

        // apply the universal filters to all the link data fields
        for(const filterFn of template.filtersFor('all')){
            for(const fieldName of fieldNames){
                templateData[fieldName] = filterFn(templateData[fieldName]);
            }
        }

        // render the link
        return Mustache.render(template.templateString, templateData);
    }
}