assume new paradigm

maximo tejeda 2024-05-23 12:03:10 -04:00
parent 151c8f7a7b
commit b5a3123506
19 changed files with 249 additions and 588 deletions

View File

@ -49,7 +49,7 @@ create-descriptors:
 	@envsubst < k8s/deployment.yml.template > k8s/deployment.yml
 deploy: build-image create-descriptors
-	@kubectl apply -f k8s/pvc.yaml
+	#@kubectl apply -f k8s/pvc.yaml
 	@kubectl apply -f k8s/cronjobs.yml
 test:

Binary file not shown.

Binary file not shown.

View File

@ -32,6 +32,6 @@ func main() {
log.Error("selecting crawler adapter", "error", err) log.Error("selecting crawler adapter", "error", err)
panic(err) panic(err)
} }
app := api.NewApplication(crawler) app := api.NewApplication(crawler, dol)
app.Run() app.Run()
} }
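For readers following the refactor: main now hands NewApplication both the parser and a ports.DollarPort client (the dol value above). A rough sketch of that wiring is below; the dollar-client constructor and the selector call shape are assumptions, only the two-argument NewApplication signature comes from this diff.

    // dol satisfies ports.DollarPort (e.g. a gRPC adapter); its constructor is hypothetical.
    dol := newDollarClient(config.GetDollarServiceURL()) // assumption
    // crawler is whichever parser WHO selects; the selector call shape is assumed.
    crawler, err := selectParser(os.Getenv("WHO"), dol) // assumption
    if err != nil {
        log.Error("selecting crawler adapter", "error", err)
        panic(err)
    }
    app := api.NewApplication(crawler, dol) // signature introduced by this commit
    app.Run()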

View File

@ -10,6 +10,43 @@ func GetDollarServiceURL() string {
return getEnvValue("DOLLAR_SERVICE_URL") return getEnvValue("DOLLAR_SERVICE_URL")
} }
func GetAPAPURL() string {
return getEnvValue("APA")
}
func GetBCDURL() string {
return getEnvValue("BCD")
}
func GetBDRURL() string {
return getEnvValue("BDR")
}
func GetBHDURL() string {
return getEnvValue("BHD")
}
func GetBNCURL() string {
return getEnvValue("BNC")
}
func GetBPDURL() string {
return getEnvValue("BPD")
}
func GetINFURL() string {
return getEnvValue("GENERAL")
}
// GetSCTAURL
// Scotia bank URL
func GetSCTAURL() string {
return getEnvValue("SCOTIA")
}
func GetVMCURL() string {
return getEnvValue("VIMENCA")
}
func getEnvValue(key string) string { func getEnvValue(key string) string {
if os.Getenv(key) == "" { if os.Getenv(key) == "" {
panic("key not found " + key) panic("key not found " + key)

View File

@ -1,9 +1,9 @@
 package helpers
 import (
-	"context"
 	"fmt"
 	"log/slog"
+	"math"
 	"math/rand"
 	"os"
 	"strconv"
@ -11,7 +11,7 @@ import (
"unicode" "unicode"
"github.com/maximotejeda/us_dop_db/db" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"golang.org/x/text/runes" "golang.org/x/text/runes"
"golang.org/x/text/transform" "golang.org/x/text/transform"
@ -125,62 +125,95 @@ func Normalize(val string) float64 {
 		if err != nil {
 			fmt.Printf("%s", err)
 		}
-		return cv
+		cvt := math.Round(cv*10000) / 10000
+		return cvt
 	}
 	return 0
 }
 // CreateBrowser
-func CreateBrowser(log *slog.Logger) (chrome *playwright.Browser, firefox *playwright.Browser, webkit *playwright.Browser) {
+func CreateBrowser(log *slog.Logger) (chrome *playwright.BrowserContext, firefox *playwright.BrowserContext, webkit *playwright.BrowserContext) {
 	pw, err := playwright.Run(&playwright.RunOptions{
 		Verbose: true,
 	})
+	ua := helpers.NewMobileUA()
+	headless := true
 	if err != nil {
 		log.Error("running pw, could not start", "error", err)
 		os.Exit(1)
 	}
-	ff, err := pw.Firefox.Launch()
+	ff, err := pw.Firefox.Launch(playwright.BrowserTypeLaunchOptions{Headless: &headless})
 	if err != nil {
-		log.Error("could not start browser", "error", err)
+		log.Error("could not start browser firefox", "error", err)
 		os.Exit(1)
 	}
-	cm, err := pw.Firefox.Launch()
+	ffc, err := ff.NewContext(playwright.BrowserNewContextOptions{
+		IgnoreHttpsErrors: &headless,
+		UserAgent: &ua,
+		HasTouch: &headless,
+		Viewport: &playwright.Size{
+			Width: 412,
+			Height: 915,
+		},
+		Screen: &playwright.Size{
+			Width: 412,
+			Height: 915,
+		},
+		IsMobile: &headless,
+	})
 	if err != nil {
-		log.Error("could not start browser", "error", err)
+		log.Error("could not start browser firefox context", "error", err)
 		os.Exit(1)
 	}
-	sf, err := pw.WebKit.Launch()
+	cm, err := pw.Firefox.Launch(playwright.BrowserTypeLaunchOptions{Headless: &headless})
 	if err != nil {
-		log.Error("could not start browser", "error", err)
+		log.Error("could not start browser chrome", "error", err)
 		os.Exit(1)
 	}
-	return &cm, &ff, &sf
-}
-// ExecTask
-func ExecTask(
-	ctx context.Context,
-	dbi *db.DB,
-	browser []*playwright.Browser,
-	log *slog.Logger,
-	errCounter map[string]int,
-	parserName string,
-	parserExecution func(context.Context, *db.DB, *playwright.Browser, *slog.Logger) error) (err error) {
-	err = parserExecution(ctx, dbi, browser[0], log)
-	if err != nil {
-		errCounter[parserName]++
-		log.Error(err.Error(), "parser", parserName)
-		// todo want a retry with different browser firefox
-		err = parserExecution(ctx, dbi, browser[1], log)
-		if err != nil {
-			errCounter[parserName]++
-		}
-	}
-	log.Info("executed", "parser", parserName, "errors", errCounter[parserName])
-	return err
-}
+	cmc, err := cm.NewContext(playwright.BrowserNewContextOptions{
+		IgnoreHttpsErrors: &headless,
+		UserAgent: &ua,
+		HasTouch: &headless,
+		Viewport: &playwright.Size{
+			Width: 412,
+			Height: 915,
+		},
+		Screen: &playwright.Size{
+			Width: 412,
+			Height: 915,
+		},
+		IsMobile: &headless,
+	})
+	if err != nil {
+		log.Error("could not start browser chorme context", "error", err)
+		os.Exit(1)
+	}
+	sf, err := pw.WebKit.Launch(playwright.BrowserTypeLaunchOptions{Headless: &headless})
+	if err != nil {
+		log.Error("could not start browser safari", "error", err)
+		os.Exit(1)
+	}
+	sfc, err := sf.NewContext(playwright.BrowserNewContextOptions{
+		IgnoreHttpsErrors: &headless,
+		UserAgent: &ua,
+		HasTouch: &headless,
+		Viewport: &playwright.Size{
+			Width: 412,
+			Height: 915,
+		},
+		Screen: &playwright.Size{
+			Width: 412,
+			Height: 915,
+		},
+		IsMobile: &headless,
+	})
+	if err != nil {
+		log.Error("could not start browser safari context", "error", err)
+		os.Exit(1)
+	}
+	return &cmc, &ffc, &sfc
 }
 // RemoveAccent
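A quick illustration of what the new return type means for callers (a sketch, not part of the commit): each value is now a *playwright.BrowserContext, so pages are opened from the context and inherit the mobile UA, viewport and headless settings configured above.

    package main

    import (
        "log/slog"

        "github.com/maximotejeda/us_dop_scrapper/helpers"
    )

    func main() {
        log := slog.Default()
        // CreateBrowser hands back browser contexts, not browsers.
        chrome, _, _ := helpers.CreateBrowser(log)

        // Pages are opened from the context rather than from a Browser.
        page, err := (*chrome).NewPage()
        if err != nil {
            log.Error("creating page", "error", err)
            return
        }
        defer page.Close()
    }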

View File

@ -4,28 +4,25 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type Apap struct { type Apap struct{}
client ports.DollarPort
}
func NewApap(client ports.DollarPort) ports.APIPorts { func NewApap() ports.APIPorts {
return &Apap{client: client} return &Apap{}
} }
func (a Apap) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (a Apap) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
tout := 120000.00 tout := 120000.00
uri := os.Getenv("APA")
log = log.With("scrapper", "apap") log = log.With("scrapper", "apap")
if _, err := page.Goto(uri, playwright.PageGotoOptions{ if _, err := page.Goto(config.GetAPAPURL(), playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -64,43 +61,3 @@ func (a Apap) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger
log.Info("parsed", "value", inst) log.Info("parsed", "value", inst)
return []*domain.History{inst}, nil return []*domain.History{inst}, nil
} }
func (a Apap) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
histList, err := a.Scrape(ctx, page, log)
// here we execute db operations
if err != nil {
return err
}
err = a.client.NewHistory(histList[0])
if err != nil {
return err
}
return err
}

View File

@ -4,30 +4,25 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type bcd struct { type bcd struct{}
client ports.DollarPort
}
func NewBCD(client ports.DollarPort) ports.APIPorts { func NewBCD() ports.APIPorts {
return &bcd{ return &bcd{}
client: client,
}
} }
func (b bcd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (b bcd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
log = log.With("scrapper", "bcd") log = log.With("scrapper", "bcd")
tout := 90000.00 tout := 90000.00
uri := os.Getenv("BCD") if _, err = page.Goto(config.GetBCDURL(), playwright.PageGotoOptions{
if _, err = page.Goto(uri, playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -68,43 +63,3 @@ func (b bcd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger)
} }
return []*domain.History{inst}, nil return []*domain.History{inst}, nil
} }
func (bc bcd) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
inst, err := bc.Scrape(ctx, page, log)
if err != nil {
return err
}
err = bc.client.NewHistory(inst[0])
if err != nil {
return err
}
return err
}

View File

@ -4,29 +4,24 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type bdr struct { type bdr struct{}
client ports.DollarPort
}
func NewBDR(client ports.DollarPort) ports.APIPorts { func NewBDR() ports.APIPorts {
return &bdr{ return &bdr{}
client: client,
}
} }
func (bd bdr) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (bd bdr) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
tout := 120000.00 tout := 120000.00
log = log.With("scrapper", "bdr") log = log.With("scrapper", "bdr")
uri := os.Getenv("BDR") if _, err := page.Goto(config.GetBDRURL(), playwright.PageGotoOptions{
if _, err := page.Goto(uri, playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -72,39 +67,3 @@ func (bd bdr) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger
insts = append(insts, inst) insts = append(insts, inst)
return insts, nil return insts, nil
} }
func (bd bdr) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
insts, err := bd.Scrape(ctx, page, log)
if err != nil {
return err
}
err = bd.client.NewHistory(insts[0])
return err
}

View File

@ -4,33 +4,28 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"strings" "strings"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type bhd struct { type bhd struct{}
client ports.DollarPort
}
func NewBHD(client ports.DollarPort) ports.APIPorts { func NewBHD() ports.APIPorts {
return &bhd{ return &bhd{}
client: client,
}
} }
// Scrape // Scrape
// needs a mobile User Agent // needs a mobile User Agent
func (bh bhd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (bh bhd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
tout := 120000.00 tout := 120000.00
uri := os.Getenv("BHD")
log = log.With("scrapper", "bhd") log = log.With("scrapper", "bhd")
if _, err := page.Goto(uri, playwright.PageGotoOptions{ if _, err := page.Goto(config.GetBHDURL(), playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -111,38 +106,3 @@ func (bh bhd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger
//log.Info(fmt.Sprintf("%v", inst)) //log.Info(fmt.Sprintf("%v", inst))
return []*domain.History{inst}, nil return []*domain.History{inst}, nil
} }
func (bh bhd) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
inst, err := bh.Scrape(ctx, page, log)
if err != nil {
return err
}
bh.client.NewHistory(inst[0])
return err
}

View File

@ -4,30 +4,25 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type bnc struct { type bnc struct{}
client ports.DollarPort
}
func NewBNC(client ports.DollarPort) ports.APIPorts { func NewBNC() ports.APIPorts {
return &bnc{ return &bnc{}
client: client,
}
} }
func (bn bnc) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (bn bnc) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
tout := 120000.00 tout := 120000.00
uri := os.Getenv("BNC")
log = log.With("scrapper", "bnc") log = log.With("scrapper", "bnc")
if _, err := page.Goto(uri, playwright.PageGotoOptions{ if _, err := page.Goto(config.GetBNCURL(), playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -66,39 +61,3 @@ func (bn bnc) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger
} }
return []*domain.History{inst}, nil return []*domain.History{inst}, nil
} }
func (bn bnc) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
inst, err := bn.Scrape(ctx, page, log)
// here we execute db operations
if err != nil {
return err
}
bn.client.NewHistory(inst[0])
return err
}

View File

@ -4,35 +4,28 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"strconv" "strconv"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type bpd struct { type bpd struct{}
client ports.DollarPort
}
func NewBPD(client ports.DollarPort) ports.APIPorts { func NewBPD() ports.APIPorts {
return &bpd{ return &bpd{}
client: client,
}
} }
// Scrape // Scrape
// needs a mobile User Agent // needs a mobile User Agent
func (bp bpd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (bp bpd) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
tout := 120000.00 tout := 120000.00
uri := os.Getenv("BPD")
log = log.With("scrapper", "bpd") log = log.With("scrapper", "bpd")
if _, err := page.Goto(uri, playwright.PageGotoOptions{ if _, err := page.Goto(config.GetBPDURL(), playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -90,38 +83,3 @@ func HoverTasas(page playwright.Page) {
tasasMenu := page.Locator(".footer_est_menu_bpd > li:nth-child(3)") tasasMenu := page.Locator(".footer_est_menu_bpd > li:nth-child(3)")
tasasMenu.Hover() tasasMenu.Hover()
} }
func (bp bpd) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
inst, err := bp.Scrape(ctx, page, log)
if err != nil {
return err
}
bp.client.NewHistory(inst[0])
return err
}

View File

@ -10,23 +10,23 @@ func Selector(who string, client ports.DollarPort) (ports.APIPorts, error) {
 	var parser ports.APIPorts
 	switch who {
 	case "apap":
-		parser = NewApap(client)
+		parser = NewApap()
 	case "bcd":
-		parser = NewBCD(client)
+		parser = NewBCD()
-	case "bdr":
-		parser = NewBDR(client)
+	case "brd":
+		parser = NewBDR()
 	case "bhd":
-		parser = NewBHD(client)
+		parser = NewBHD()
 	case "bnc":
-		parser = NewBNC(client)
+		parser = NewBNC()
 	case "bpd":
-		parser = NewBPD(client)
+		parser = NewBPD()
 	case "inf":
-		parser = NewINF(client)
+		parser = NewINF()
 	case "scotia":
-		parser = NewScotia(client)
+		parser = NewScotia()
 	case "vimenca":
-		parser = NewVimenca(client)
+		parser = NewVimenca()
 	default:
 		return nil, fmt.Errorf("not recognize who: " + who)
 	}

View File

@ -2,34 +2,28 @@ package crawler
 import (
 	"context"
-	"fmt"
 	"log/slog"
-	"os"
 	"strings"
 	"time"
+	"github.com/maximotejeda/us_dop_scrapper/config"
 	"github.com/maximotejeda/us_dop_scrapper/helpers"
 	"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
 	"github.com/maximotejeda/us_dop_scrapper/internal/ports"
 	"github.com/playwright-community/playwright-go"
 )
-type inf struct {
-	client ports.DollarPort
-}
+type inf struct{}
-func NewINF(client ports.DollarPort) ports.APIPorts {
-	return &inf{
-		client: client,
-	}
+func NewINF() ports.APIPorts {
+	return &inf{}
 }
 // Scrape
 func (in inf) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (instList []*domain.History, err error) {
-	uri := os.Getenv("GENERAL")
 	log = log.With("scrapper", "general")
 	tout := float64(120000)
-	if _, err := page.Goto(uri, playwright.PageGotoOptions{
+	if _, err := page.Goto(config.GetINFURL(), playwright.PageGotoOptions{
 		Timeout: &tout,
 		WaitUntil: playwright.WaitUntilStateLoad,
 	}); err != nil {
@ -49,17 +43,13 @@ func (in inf) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger
log.Error("could not get info", "error", err) log.Error("could not get info", "error", err)
return nil, err return nil, err
} }
scotia := false // in this page there are 2 scotia one the change online the other is tha bank
instList = []*domain.History{} instList = []*domain.History{}
for _, entry := range entries { for _, entry := range entries {
inst := &domain.History{ inst := &domain.History{
Parser: "inf", Parser: "inf",
} }
title, _ := entry.Locator("span.nombre").TextContent() title, _ := entry.Locator("span.nombre").TextContent()
if strings.ToLower(title) == "scotiabank" && !scotia {
title = "scotiabank cambio online"
scotia = true
}
name := "" name := ""
if title != "" { if title != "" {
name = helpers.RemoveAccent(strings.ToLower(title)) name = helpers.RemoveAccent(strings.ToLower(title))
@ -81,6 +71,10 @@ func (in inf) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger
log.Warn("skipping", "nombre", inst.Name, "compra", inst.Compra, "venta", inst.Venta) log.Warn("skipping", "nombre", inst.Name, "compra", inst.Compra, "venta", inst.Venta)
continue continue
} }
switch {
case strings.Contains(inst.Name, "banreservas"), strings.Contains(inst.Name,"banco popular"), strings.Contains(inst.Name,"scotia"), strings.Contains(inst.Name,"hipotecario"), strings.Contains(inst.Name,"asociacion popular"), strings.Contains(inst.Name,"vimenca"):
continue
}
instList = append(instList, inst) instList = append(instList, inst)
} }
@ -98,45 +92,3 @@ func getValue(place playwright.Locator) string {
} }
return value return value
} }
// ExecParser
func (in inf) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) error {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
instList, err := in.Scrape(ctx, page, log)
if err != nil {
return err
}
for _, inst := range instList {
log.Info("processing", "name", inst.Name)
err = in.client.NewHistory(inst)
if err != nil {
log.Error(fmt.Sprintf("inspecting %s", inst.Name), "error", err)
}
}
return err
}

View File

@ -4,30 +4,26 @@ import (
"context" "context"
"fmt" "fmt"
"log/slog" "log/slog"
"os"
"time" "time"
"github.com/maximotejeda/us_dop_scrapper/config"
"github.com/maximotejeda/us_dop_scrapper/helpers" "github.com/maximotejeda/us_dop_scrapper/helpers"
"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain" "github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
"github.com/maximotejeda/us_dop_scrapper/internal/ports" "github.com/maximotejeda/us_dop_scrapper/internal/ports"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
) )
type scotia struct { type scotia struct{}
client ports.DollarPort
}
func NewScotia(client ports.DollarPort) ports.APIPorts { func NewScotia() ports.APIPorts {
return &scotia{ return &scotia{}
client: client,
}
} }
func (sct scotia) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) { func (sct scotia) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
tout := 120000.00 tout := 120000.00
uri := os.Getenv("SCOTIA")
log = log.With("scrapper", "scotia") log = log.With("scrapper", "scotia")
if _, err := page.Goto(uri, playwright.PageGotoOptions{ if _, err := page.Goto(config.GetSCTAURL(), playwright.PageGotoOptions{
Timeout: &tout, Timeout: &tout,
WaitUntil: playwright.WaitUntilStateLoad, WaitUntil: playwright.WaitUntilStateLoad,
}); err != nil { }); err != nil {
@ -102,41 +98,3 @@ func (sct scotia) Scrape(ctx context.Context, page playwright.Page, log *slog.Lo
insts = append(insts, instOnline, instOnsite) insts = append(insts, instOnline, instOnsite)
return insts, nil return insts, nil
} }
func (sct scotia) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
insts, err := sct.Scrape(ctx, page, log)
// here we execute db operations
if err != nil {
return err
}
for _, inst := range insts {
sct.client.NewHistory(inst)
}
return err
}

View File

@ -3,30 +3,25 @@ package crawler
 import (
 	"context"
 	"log/slog"
-	"os"
 	"time"
+	"github.com/maximotejeda/us_dop_scrapper/config"
 	"github.com/maximotejeda/us_dop_scrapper/helpers"
 	"github.com/maximotejeda/us_dop_scrapper/internal/application/core/domain"
 	"github.com/maximotejeda/us_dop_scrapper/internal/ports"
 	"github.com/playwright-community/playwright-go"
 )
-type vimenca struct {
-	client ports.DollarPort
-}
+type vimenca struct{}
-func NewVimenca(client ports.DollarPort) ports.APIPorts {
-	return &vimenca{
-		client: client,
-	}
+func NewVimenca() ports.APIPorts {
+	return &vimenca{}
 }
 func (v vimenca) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) (insts []*domain.History, err error) {
-	uri := os.Getenv("VIMENCA")
 	tout := 120000.00
 	log = log.With("scrapper", "vimenca")
-	if _, err := page.Goto(uri, playwright.PageGotoOptions{
+	if _, err := page.Goto(config.GetVMCURL(), playwright.PageGotoOptions{
 		Timeout: &tout,
 		WaitUntil: playwright.WaitUntilStateLoad,
 	}); err != nil {
@ -64,43 +59,3 @@ func (v vimenca) Scrape(ctx context.Context, page playwright.Page, log *slog.Log
log.Info("institution", "value", inst) log.Info("institution", "value", inst)
return []*domain.History{inst}, nil return []*domain.History{inst}, nil
} }
func (v vimenca) ExecParser(
ctx context.Context,
browser *playwright.Browser,
log *slog.Logger) (err error) {
t := true
ua := helpers.NewMobileUA()
b := *browser
page, err := b.NewPage(playwright.BrowserNewPageOptions{
UserAgent: &ua,
// IsMobile: &t,
HasTouch: &t,
Viewport: &playwright.Size{
Width: 412,
Height: 915,
},
Screen: &playwright.Size{
Width: 412,
Height: 915,
},
})
if err != nil {
log.Error("creating page", "error", err)
os.Exit(1)
}
ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
defer page.Close()
defer cancel()
inst, err := v.Scrape(ctx, page, log)
// here we execute db operations
if err != nil {
return err
}
err = v.client.NewHistory(inst[0])
if err != nil {
return err
}
return err
}

View File

@ -2,38 +2,74 @@ package api
 import (
 	"context"
+	"log/slog"
+	"os"
+	"time"
 	"github.com/maximotejeda/us_dop_scrapper/helpers"
 	"github.com/maximotejeda/us_dop_scrapper/internal/ports"
-	"log/slog"
+	"github.com/playwright-community/playwright-go"
 )
 type Application struct {
 	log *slog.Logger
 	api ports.APIPorts
+	client ports.DollarPort
 }
-func NewApplication(crawler ports.APIPorts) *Application {
+func NewApplication(crawler ports.APIPorts, client ports.DollarPort) *Application {
 	log := slog.Default()
 	log = log.With("application", "root")
 	return &Application{
 		log: log,
 		api: crawler,
+		client: client,
 	}
 }
 func (a Application) Run() {
 	ctx := context.Background()
 	ch, ff, wk := helpers.CreateBrowser(a.log)
-	err := a.api.ExecParser(ctx, ch, a.log)
+	err := a.ExecParser(ctx, ch, a.log)
 	if err != nil {
 		a.log.Info("failed on frist browser", "browser", "chrome", "error", err)
-		err := a.api.ExecParser(ctx, ff, a.log)
+		err := a.ExecParser(ctx, ff, a.log)
 		if err != nil {
 			a.log.Error("failed on second browser", "browser", "firefox", "error", err)
-			err := a.api.ExecParser(ctx, wk, a.log)
+			err := a.ExecParser(ctx, wk, a.log)
 			if err != nil {
 				a.log.Error("tried all browsers error", "brwser", "webkit", "error", err)
+				return
 			}
 		}
 	}
 }
+func (a Application) ExecParser(
+	ctx context.Context,
+	browser *playwright.BrowserContext,
+	log *slog.Logger) (err error) {
+	b := *browser
+	page, err := b.NewPage()
+	if err != nil {
+		log.Error("creating page", "error", err)
+		os.Exit(1)
+	}
+	ctx, cancel := context.WithTimeout(ctx, 6*time.Minute)
+	defer page.Close()
+	defer cancel()
+	histList, err := a.api.Scrape(ctx, page, log)
+	// here we execute db operations
+	if err != nil {
+		return err
+	}
+	for _, hist := range histList {
+		err = a.client.NewHistory(hist)
+		if err != nil {
+			a.log.Error("creating new hist", "history", hist, "error", err)
+		}
+		a.log.Info("parsed Success", "parser", hist.Parser, "item", hist)
+	}
+	return err
+}
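Side note on the retry logic in Run above (purely illustrative, not part of the commit): the three nested if blocks could equally be written as a loop over the contexts, keeping the fallback order explicit. The sketch assumes it lives next to Run in the same package, with the imports already present there; everything except the method and field names taken from the diff is an assumption.

    // Sketch only: the chrome -> firefox -> webkit fallback expressed as a loop.
    func (a Application) runWithFallback() {
        ctx := context.Background()
        ch, ff, wk := helpers.CreateBrowser(a.log)
        for _, bctx := range []*playwright.BrowserContext{ch, ff, wk} {
            err := a.ExecParser(ctx, bctx, a.log)
            if err == nil {
                return // first browser context that succeeds wins
            }
            a.log.Warn("parser failed, trying next browser", "error", err)
        }
        a.log.Error("tried all browsers")
    }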

View File

@ -10,5 +10,4 @@ import (
 type APIPorts interface {
 	Scrape(context.Context, playwright.Page, *slog.Logger) ([]*domain.History, error)
-	ExecParser(context.Context, *playwright.Browser, *slog.Logger) error
 }
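With ExecParser dropped from the interface, an adapter now only has to implement Scrape; page setup, timeouts and persistence are handled centrally by Application.ExecParser. A minimal sketch of a new adapter under the slimmed interface, assuming it sits in the crawler package next to the real adapters (the stub name and returned value are invented for illustration):

    type stub struct{}

    func NewStub() ports.APIPorts { return &stub{} }

    // Scrape is the only method APIPorts still demands.
    func (s stub) Scrape(ctx context.Context, page playwright.Page, log *slog.Logger) ([]*domain.History, error) {
        log.Info("stub scrape", "url", page.URL())
        // A real adapter would drive the page here; this stub returns a fixed record.
        return []*domain.History{{Parser: "stub"}}, nil
    }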

View File

@ -10,24 +10,19 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-inf
               image: localhost:32000/crawler:latest
               env:
+                - name: WHO
+                  value: inf
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
                 - name: GENERAL
                   value: https://www.infodolar.com.do/
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -41,26 +36,19 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-bcd
               image: localhost:32000/crawler:latest
               env:
                 - name: BCD
                   value: https://www.bancentral.gov.do/SectorExterno/HistoricoTasas
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: bcd
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -74,26 +62,19 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-bpd
               image: localhost:32000/crawler:latest
               env:
                 - name: BPD
                   value: https://popularenlinea.com/empresarial/Paginas/Home.aspx
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: bpd
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -107,26 +88,19 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-bhd
               image: localhost:32000/crawler:latest
               env:
                 - name: BHD
                   value: https://bhd.com.do/calculators?calculator=DIVISAS
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: bhd
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -140,26 +114,20 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-brd
               image: localhost:32000/crawler:latest
               env:
                 - name: BDR
                   value: https://www.banreservas.com/calculadoras
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: brd
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -173,26 +141,20 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-apap
               image: localhost:32000/crawler:latest
               env:
                 - name: APA
                   value: https://apap.com.do/
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
-                  value: apa
+                  value: apap
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -206,26 +168,20 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-bnc
               image: localhost:32000/crawler:latest
               env:
                 - name: BNC
                   value: https://www.banesco.com.do/
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: bnc
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -239,26 +195,20 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-vimenca
               image: localhost:32000/crawler:latest
               env:
                 - name: VIMENCA
                   value: https://www.bancovimenca.com/
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: vimenca
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
 ---
 apiVersion: batch/v1
 kind: CronJob
@ -272,23 +222,16 @@ spec:
     spec:
       template:
         spec:
+          restartPolicy: OnFailure
           containers:
             - name: crawler-scotia
               image: localhost:32000/crawler:latest
               env:
                 - name: SCOTIA
                   value: https://do.scotiabank.com/banca-personal/tarifas/tasas-de-cambio.html
-                - name: DBURI
-                  value: dolardb/crawler.db
                 - name: NATSURI
                   value: "nats://nats-svc:4222"
                 - name: WHO
                   value: scotia
+                - name: DOLLAR_SERVICE_URL
+                  value: "dolar-grpc-svc:80"
-              volumeMounts:
-                - name: database
-                  mountPath: /app/dolardb
-          volumes:
-            - name: database
-              persistentVolumeClaim:
-                claimName: bank-crawler-pvc
-          restartPolicy: OnFailure
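For orientation (not part of the commit): each CronJob now ships only WHO, its bank-specific URL variable, NATSURI and DOLLAR_SERVICE_URL; the PVC-backed sqlite volume is gone. WHO picks the parser, and the matching config getter panics when the URL variable is missing, so a job with a mismatched env fails fast. A tiny sketch of that relationship, assuming only the getters and the panic message from this diff (the log line is illustrative):

    package main

    import (
        "log"
        "os"

        "github.com/maximotejeda/us_dop_scrapper/config"
    )

    func main() {
        // WHO=bcd selects the bcd parser; its Scrape calls config.GetBCDURL(), which
        // reads the BCD entry set in the CronJob above and panics with
        // "key not found BCD" if the env entry was omitted.
        who := os.Getenv("WHO")
        log.Println("parser:", who, "url:", config.GetBCDURL())
    }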