package edu.cmu.casos.automap;

import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.parser.PdfTextExtractor;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.math.BigInteger;
import java.security.DigestOutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Random;
import java.util.regex.Pattern;
import org.apache.poi.hwpf.extractor.WordExtractor;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

/* loaded from: input_file:edu/cmu/casos/automap/WebCrawler.class */
public class WebCrawler extends Crawler {
    private String outputDir;
    private boolean removeRefCh;
    private int wait;
    private boolean randomWait;
    private long quota;
    private boolean fetchDocuments;
    private long downloaded;
    private Random rnd;
    private static final String newline = System.getProperty("line.separator");
    private static final String indexFile = "Index";

    public WebCrawler() throws IOException {
        this("output", true);
    }

    public WebCrawler(String str) throws IOException {
        this(str, true);
    }

    public WebCrawler(boolean z) throws IOException {
        this("output", z);
    }

    public WebCrawler(String str, boolean z) throws IOException {
        this(str, z, 0, false, 0L, false);
    }

    public WebCrawler(String str, boolean z, int i, boolean z2, long j, boolean z3) throws IOException {
        this.rnd = new Random();
        if (str.endsWith(File.separator)) {
            this.outputDir = str;
        } else {
            this.outputDir = str + File.separator;
        }
        this.removeRefCh = z;
        File file = new File(this.outputDir);
        if (!file.exists()) {
            file.mkdir();
        } else if (!file.isDirectory()) {
            throw new IOException(this.outputDir + " is not a directory.");
        }
        this.wait = i * 1000;
        this.randomWait = z2;
        this.quota = j;
        this.fetchDocuments = z3;
        this.downloaded = 0L;
    }

    public boolean shouldVisit(Link link) {
        return true;
    }

    public void visit(Page page) {
        ByteArrayOutputStream byteArrayOutputStream;
        DigestOutputStream digestOutputStream;
        String bigInteger;
        int i;
        String str;
        File file;
        try {
            byteArrayOutputStream = new ByteArrayOutputStream();
            digestOutputStream = null;
            try {
                digestOutputStream = new DigestOutputStream(byteArrayOutputStream, MessageDigest.getInstance("MD5"));
            } catch (NoSuchAlgorithmException e) {
                Debug.exceptHandler(e, "WebCrawler");
            }
            String contentType = page.getContentType();
            if (contentType.startsWith("text/html")) {
                PageTransformer pageTransformer = new PageTransformer(digestOutputStream);
                pageTransformer.writePage(page);
                pageTransformer.close();
            } else if (contentType.startsWith("text/plain")) {
                digestOutputStream.write(page.getContentBytes());
            } else {
                if (!this.fetchDocuments) {
                    return;
                }
                if (contentType.startsWith("application/msword")) {
                    digestOutputStream.write(new WordExtractor(new ByteArrayInputStream(page.getContentBytes())).getText().getBytes());
                } else if (contentType.startsWith("application/pdf")) {
                    try {
                        PdfReader pdfReader = new PdfReader(new ByteArrayInputStream(page.getContentBytes()));
                        PdfTextExtractor pdfTextExtractor = new PdfTextExtractor(pdfReader);
                        for (int i2 = 1; i2 <= pdfReader.getNumberOfPages(); i2++) {
                            digestOutputStream.write(pdfTextExtractor.getTextFromPage(i2).getBytes());
                            digestOutputStream.write(10);
                        }
                        pdfReader.close();
                        pdfReader.close();
                    } catch (Exception e2) {
                    }
                }
            }
            bigInteger = new BigInteger(1, digestOutputStream.getMessageDigest().digest()).toString(36);
            i = 1;
            str = this.outputDir + bigInteger + ".txt";
            file = new File(str);
        } catch (IOException e3) {
            Debug.exceptHandler(e3, "WebCrawler");
        }
        synchronized (this) {
            this.downloaded += page.getContentBytes().length;
            if (this.quota > 0 && this.downloaded > this.quota) {
                stop();
                return;
            }
            while (file.exists()) {
                i++;
                str = this.outputDir + bigInteger + "_" + i + ".txt";
                file = new File(str);
            }
            BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(file));
            bufferedWriter.write(byteArrayOutputStream.toString());
            bufferedWriter.close();
            digestOutputStream.close();
            BufferedWriter bufferedWriter2 = new BufferedWriter(new FileWriter(this.outputDir + indexFile, true));
            bufferedWriter2.write(page.getURL() + "\t" + bigInteger);
            if (i > 1) {
                bufferedWriter2.write("_" + i);
            }
            bufferedWriter2.newLine();
            bufferedWriter2.close();
            if (this.removeRefCh) {
                BufferedReader bufferedReader = new BufferedReader(new FileReader(str));
                String str2 = "";
                while (true) {
                    String readLine = bufferedReader.readLine();
                    if (readLine == null) {
                        break;
                    } else {
                        str2 = str2 + readLine.replaceAll("&#?\\w+;", " ") + newline;
                    }
                }
                bufferedReader.close();
                BufferedWriter bufferedWriter3 = new BufferedWriter(new FileWriter(str));
                bufferedWriter3.write(str2);
                bufferedWriter3.close();
            }
            page.discardContent();
            if (this.wait > 0) {
                try {
                    Thread.sleep(this.randomWait ? (int) (this.wait * (this.rnd.nextDouble() + 0.5d)) : this.wait);
                } catch (InterruptedException e4) {
                }
            }
        }
    }
}
