/**
* @file Data model for web page information.
* @author Bart Busschots <opensource@bartificer.ie>
* @license MIT
*/
/**
* This module provides the class for representing the information that is extracted from web pages.
* @module page-data
* @requires module:urijs
*/
import {default as URI} from 'urijs';
/**
* The information extracted from web pages that can be used to render a link.
*
* Instances of this class are created from the information extracted from web pages and converted to link information by data transformers before being rendered to links via templates.
* @see {@link dataTransformer} for details of how instances of this class are used in the link generation process.
*/
export class PageData {
/**
* A new empty page metadata object.
* @readonly
* @type {pageMetadataObject}
*/
static get emptyPageMetadataObject(){
return {
author: '',
creator: '',
description: '',
keywords: [],
publisher: ''
};
}
/**
* @param {string} url - The page's full URL.
*/
constructor(url){
// TO DO - add validation
/**
* The page's URL as a URI object.
*
* @private
* @type {module:urijs}
*/
this._uri = URI();
/**
* The page's title.
*
* @private
* @type {string}
*/
this._title = '';
/**
* The page's header metadata
*
* @private
* @type {pageMetadataObject}
*/
this._metadata = PageData.emptyPageMetadataObject;
/**
* The section headings on the page as arrays of strings indexed by
* `h1` and `h2`.
*
* @private
* @type {Object}
* @property {string[]} h1 - The page's top-level headings (`h1` tags).
* @property {string[]} h2 - The page's secondary headings (`h2` tags).
*/
this._headings = {
h1: [],
h2: []
};
/**
* An object to hold any extra fields extracted from the page by a field extractor function, indexed by field name.
* @type {Object.<string, string>}
* @private
*/
this._extraFields = {};
// store the URL using the public setter to ensure it's stored as a URI object
this.url = url;
}
/**
* @type {string}
* @throws {TypeError} on invalid URLs.
*/
get url(){
return this._uri.toString();
}
set url(url){
this._uri = URI(url).normalize();
}
/**
* @type {module:urijs}
* @readonly
*/
get uri(){
return this._uri.clone();
}
/**
* The domain-part of the URL.
* @type {string}
* @readonly
*/
get domain(){
return this._uri.hostname();
}
/**
* The path-part of the URL.
* @type {string}
* @readonly
*/
get path(){
return this._uri.path();
}
/**
* The page's title. Values are coerced to strings with `String(title)`.
* @type {string}
*/
get title(){
return this._title;
}
set title(title){
this._title = String(title);
}
/**
* The page's metadata from the page's HTML header.
*
* When getting this property a shallow clone of the internal metadata fields object is returned.
*
* When setting this property a comma-separated string can be passed for the keywords. All values passed are coerced to strings with `String()` before being written to the internal data structure.
*
* Note that reading this property produces a shallow clone of the internal metadata fields object, and that the values set for metadata fields are coerced to strings with `String(value)`.
* @type {pageMetadataObject}
* @throws {TypeError} if an attempt is made to set this property to a non-object value.
*/
get metadata(){
return {
author: this._metadata.author,
creator: this._metadata.creator,
description: this._metadata.description,
keywords: [...this._metadata.keywords], // shallow clone is OK since the values are coerced to strings by the setter
publisher: this._metadata.publisher
}
}
set metadata(metadata){
if(typeof metadata === 'object' && metadata !== null){
this._metadata.author = metadata.author ? String(metadata.author) : '';
this._metadata.creator = metadata.creator ? String(metadata.creator) : '';
this._metadata.description = metadata.description ? String(metadata.description) : '';
if(typeof metadata.keywords == 'string' && metadata.keywords.length > 0){
this._metadata.keywords = metadata.keywords.split(/,[ ]*/); // split keyword lists on commas
} else if(metadata.keywords && Array.isArray(metadata.keywords)){
this._metadata.keywords = [...metadata.keywords.map((keyword) => { String(keyword) })];
} else {
this._metadata.keywords = [];
}
this._metadata.publisher = metadata.publisher ? String(metadata.publisher) : ''
} else {
throw new TypeError('metadata must be a dictionary object with string keys and string values, except for the key keywords which should be an array of strings');
}
}
/**
* The page's primary and secondary headings.
* @type {Object}
* @property {string[]} h1 - The page's top-level headings (`h1` tags).
* @property {string[]} h2 - The page's secondary headings (`h2` tags).
* @readonly
*/
get headings(){
let ans = {
h1: [],
h2: []
};
for(let h of this._headings.h1){
ans.h1.push(h);
}
for(let h of this._headings.h2){
ans.h2.push(h);
}
return ans;
}
/**
* The page's top-level headings (`h1` tags).
* @type {string[]}
* @readonly
*/
get topLevelHeadings(){
var ans = [];
for(let h of this._headings.h1){
ans.push(h);
}
return ans;
}
/**
* An alias for `.topLevelHeadings`.
* @readonly
* @see {@link module:page-data.PageData#topLevelHeadings}
*/
get h1s(){
return this.topLevelHeadings;
}
/**
* The page's secondary headings (`h2` tags).
* @type {string[]}
* @readonly
*/
get secondaryHeadings(){
var ans = [];
for(let h of this._headings.h2){
ans.push(h);
}
return ans;
}
/**
* An alias for `.secondaryHeadings`.
* @readonly
* @see {@link module:page-data.PageData#secondaryHeadings}
*/
get h2s(){
return this.secondaryHeadings;
}
/**
* The text from the most important heading on the page. If the page
* has `h1` tags, the first one will be used, if not, the first `h2` tag
* will be used, and if there's none of those either, an empty string will
* be returned.
* @type {string}
* @readonly
*/
get mainHeading(){
if(this._headings.h1.length > 0){
return this._headings.h1[0];
}
if(this._headings.h2.length > 0){
return this._headings.h2[0];
}
return '';
}
/**
* Add a top-level heading.
*
* @param {string} h1Text
* @returns {module:page-data.PageData} A reference to self to
* facilitate function chaning.
*/
addTopLevelHeading(h1Text){
// TO DO - add argument validation
this._headings.h1.push(h1Text);
return this;
}
/**
* Add a seconary heading.
*
* @param {string} h2Text
* @returns {module:page-data.PageData} A reference to self to
* facilitate function chaning.
*/
addSecondaryHeading(h2Text){
// TO DO - add argument validation
this._headings.h2.push(h2Text);
return this;
}
/**
* The extra fields extracted from the page by a field extractor function, indexed by field name. If no field extractor was used, this will be an empty object.
*
* Note that reading this property produces a shallow clone of the internal extra fields object, and that the values set for extra fields are coerced to strings with `String(value)`.
* @type {Object.<string, string>}
* @default {}
* @throws {TypeError} if an attempt is made to set this property to a non-object value.
*/
get extraFields(){
return { ...this._extraFields }; // shallow clone is OK since values are coerced to strings by the setter
}
set extraFields(extraFields){
if(typeof extraFields === 'object' && extraFields !== null){
for (let [key, value] of Object.entries(extraFields)) {
this._extraFields[String(key)] = String(value);
}
} else {
throw new TypeError('extraFields must be a dictionary object with string keys and string values');
}
}
/**
* Add an extra field to the page data object.
*
* Note that values are coerced to strings with `String(value)`.
* @param {string} fieldName - The name of the field to add.
* @param {string} value - The value of the field to add. This will be coerced to a string with `String(value)`.
* @returns {module:page-data.PageData} A reference to self to facilitate function chaning.
*/
addExtraField(fieldName, value){
this._extraFields[String(fieldName)] = String(value);
return this;
}
/**
* Get the page data as a plain object.
* @returns {plainPageInformationObject}
*/
asPlainObject(){
let ans = {
url: this.url,
title: this.title,
metadata: this.metadata,
topLevelHeadings: this.topLevelHeadings,
secondaryHeadings: this.secondaryHeadings,
mainHeading: this.mainHeading,
extraFields: this.extraFields,
uri: URI.parse(this._uri.toString())
};
ans.uri.hasPath = ans.uri.path !== '/';
return ans;
}
};
/**
* A shortcut for `.addTopLevelHeading()`.
* @name module:page-data.PageData#h1
* @function
* @see {@link module:page-data.PageData#addTopLevelHeading}
*
*/
PageData.prototype.h1 = PageData.prototype.addTopLevelHeading;
/**
* A shortcut for `.addSecondaryHeading()`.
* @name module:page-data.PageData#h2
* @function
* @see {@link module:page-data.PageData#addSecondaryHeading}
*
*/
PageData.prototype.h2 = PageData.prototype.addSecondaryHeading;